def get_emails(db):
    """Import every email under ``trec06c/data`` into the per-user store.

    Each sub-directory of the corpus root is handled as one batch: a
    collection handle is obtained with ``MongoDB.choose_user(db, <name>)``
    and all records built from that directory are written with a single
    ``MongoDB.insert_many`` call.

    For each file the raw text is read with ``fc.get_text``, the header
    fields come from ``pe.parse_header``, and the date and body come from
    ``get_date`` and ``get_content``. The body is split into sentences;
    every sentence is segmented with ``JB.participle_text`` and then stored
    with a trailing "。".

    NOTE(review): this file contains a second ``get_emails`` definition
    further down. If both stay in the same module the later one rebinds the
    name and this version is never called -- confirm which one is intended.

    Args:
        db: Database handle, forwarded unchanged to ``MongoDB.choose_user``.

    Returns:
        None. Progress information is printed to stdout.
    """
    path = "trec06c/data"

    # Sentence delimiters. The previous alternation '。|;|·|!|?|\n' left the
    # '?' unescaped, which makes ``re`` raise "nothing to repeat"; inside a
    # character class every delimiter is a literal. Both the ASCII and the
    # full-width forms of ; ! ? are accepted.
    sentence_delimiters = re.compile(r"[。;;·!!??\n]")

    for folder in os.listdir(path):
        user = MongoDB.choose_user(db, folder)
        folder_path = path + "/" + folder

        # A fresh list per directory, so no batch can alias a previous one.
        new_emails = []
        for file_name in os.listdir(folder_path):
            print(folder + "/" + file_name, end=" ")
            text = fc.get_text(folder_path + "/" + file_name)
            print("邮件长度:", len(text))

            msg = email.message_from_string(text)
            title, addresser, addressee, copy = pe.parse_header(msg)
            date = get_date(text)
            print(date)
            content = get_content(text)

            # Drop the empty fragments produced by adjacent delimiters.
            sentences = [s for s in sentence_delimiters.split(content) if s]
            # Segment the bare sentence first, then store it with its
            # terminating full stop restored.
            split = [JB.participle_text(s) for s in sentences]
            doc = [s + "。" for s in sentences]

            email_kind = ""  # no category is assigned at import time
            new_emails.append(
                myEmail.set_email(
                    title, addresser, addressee, copy,
                    date, doc, split, email_kind,
                )
            )

        # An empty directory yields no records, so there is nothing to insert.
        if new_emails:
            MongoDB.insert_many(user, new_emails)
def get_emails(db):
    """Walk the corpus under ``trec06c/dao`` and build one record per email.

    For every sub-directory a collection handle is requested with
    ``MongoDB.choose_user(db, <name>)``. Each file inside it is read with
    ``fc.get_text`` and parsed with ``email.message_from_string``. Its
    header fields come from ``parse_header`` and its body from
    ``parse_body``; the body is then segmented with ``JB.participle_text``.
    The record is assembled by ``myEmail.set_email``.

    The bulk insert is currently commented out, so the records are built
    and then discarded; the only visible effect is the printed file path.

    NOTE(review): an earlier ``get_emails`` definition exists in this file.
    If both live in the same module, this later one is the one that stays
    bound to the name -- confirm that is intended.

    Args:
        db: Database handle, forwarded unchanged to ``MongoDB.choose_user``.

    Returns:
        None.
    """
    path = "trec06c/dao"
    for folder in os.listdir(path):
        # The handle is only needed by the disabled insert below; the call
        # is kept so the function's interaction with MongoDB is unchanged.
        user = MongoDB.choose_user(db, folder)
        folder_path = "/".join((path, folder))

        new_emails = []
        for file_name in os.listdir(folder_path):
            email_path = "/".join((folder_path, file_name))
            print(email_path)

            msg = email.message_from_string(fc.get_text(email_path))
            title, addresser, addressee, copy = parse_header(msg)
            content = JB.participle_text(parse_body(msg))

            new_emails.append(
                myEmail.set_email(title, addresser, addressee, copy, content)
            )
        # Persisting the batch is disabled:
        # mongodb.insert_many(user, new_emails)


# Disabled example: fetch a single message over IMAP and store it.
#
# # Connect to the mailbox
# host = "imap.qq.com"
# username = "******"
# password = "******"
# title, addresser, addressee, copy, content = get_mail(host, username, password)
# # Chinese word segmentation
# content = JB.participle_text(content)
# # Add to the MongoDB database
# new_email = myEmail.set_email(title, addresser, addressee, copy, content)
# myClient = mongodb.connect_mongodb()
# emaildb = mongodb.choose_database(myClient)
# user = mongodb.choose_user(emaildb, "demo")
# mongodb.insert_one(user, new_email)
# mongodb.disconnect_mongodb(myClient)
def statistics_text_max():
    """Print the most frequent multi-character tokens of a text.

    The text comes from ``fc.get_text()`` and is tokenised with
    ``jieba.cut``. Tokens of length 1 are ignored. The remaining tokens are
    counted and at most five of them are printed, most frequent first, one
    per line as ``<token>-><count>``. Tokens with equal counts keep the
    order in which they were first seen.

    NOTE(review): ``fc.get_text`` is called with a path argument elsewhere
    in this file but with no argument here -- confirm it has a usable
    default.

    Returns:
        None. The ranking is written to stdout.
    """
    file_txt = fc.get_text()

    counts = {}
    for word in jieba.cut(file_txt):
        # Single-character tokens are not counted.
        if len(word) != 1:
            counts[word] = counts.get(word, 0) + 1

    # ``sorted`` is stable, so ties stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slicing instead of indexing 0..4 avoids an IndexError when the text
    # has fewer than five distinct multi-character tokens.
    for word, count in ranked[:5]:
        print("{0:<5}->{1:>5}".format(word, count))