Ejemplo n.º 1
0
def get_emails(db):
    """Parse every e-mail under ``trec06c/data`` and store it per user.

    Each sub-directory of the corpus root is handled as one user: its name
    is passed to ``MongoDB.choose_user`` and all e-mail records built from
    the files in that directory are handed to ``MongoDB.insert_many`` in a
    single call.

    For every file the raw text is read with ``fc.get_text``, the headers
    are extracted with ``pe.parse_header``, the date and body with
    ``get_date`` / ``get_content``, and the body is split into sentences
    which are each segmented with ``JB.participle_text``.

    Args:
        db: Database handle forwarded unchanged to ``MongoDB.choose_user``.
    """
    path = "trec06c/data"
    # A character class is used on purpose: written as an alternation
    # ("...|!|?|\n") the bare "?" branch is a regex syntax error
    # ("nothing to repeat"), so the split could never run.
    # NOTE(review): only the delimiters present in the original pattern are
    # kept; confirm whether full-width variants were intended as well.
    sentence_delimiters = re.compile("[。;·!?\n]")

    for dir_name in os.listdir(path):
        user = MongoDB.choose_user(db, dir_name)
        # Fresh list per directory so each insert only sees that user's mail.
        new_emails = []
        for file_name in os.listdir(path + "/" + dir_name):
            relative_name = dir_name + "/" + file_name
            print(relative_name, end=" ")
            text = fc.get_text(path + "/" + relative_name)
            # Progress output: length of the raw e-mail text.
            print("邮件长度:", len(text))

            msg = email.message_from_string(text)
            title, addresser, addressee, copy = pe.parse_header(msg)
            date = get_date(text)
            print(date)

            content = get_content(text)
            # Drop the empty fragments produced by consecutive delimiters.
            sentences = [s for s in sentence_delimiters.split(content) if s]
            # Segment the bare sentences first, then restore a full stop on
            # each stored sentence.
            split = [JB.participle_text(s) for s in sentences]
            doc = [s + "。" for s in sentences]

            # The e-mail kind is not known at import time; stored empty.
            emailKind = ""
            new_email = myEmail.set_email(title, addresser, addressee, copy,
                                          date, doc, split, emailKind)
            new_emails.append(new_email)
        MongoDB.insert_many(user, new_emails)
Ejemplo n.º 2
0
def get_emails(db):
    """Walk the ``trec06c/dao`` corpus and build one e-mail record per file.

    Every sub-directory of the corpus root is looked up as a user through
    ``MongoDB.choose_user``. Each file inside it is read with
    ``fc.get_text``, parsed with ``parse_header`` / ``parse_body``, its body
    segmented with ``JB.participle_text`` and wrapped by
    ``myEmail.set_email``.

    The records are only collected in memory: the database insert at the
    end of each directory is currently commented out, and nothing is
    returned.

    Args:
        db: Database handle forwarded unchanged to ``MongoDB.choose_user``.
    """
    root = "trec06c/dao"
    for user_dir in os.listdir(root):
        user = MongoDB.choose_user(db, user_dir)
        user_root = f"{root}/{user_dir}"
        file_names = os.listdir(user_root)
        records = []
        for file_name in file_names:
            email_path = f"{user_root}/{file_name}"
            print(email_path)
            raw_text = fc.get_text(email_path)
            message = email.message_from_string(raw_text)
            title, addresser, addressee, copy = parse_header(message)
            # Body is extracted first, then replaced by its segmented form.
            segmented_body = JB.participle_text(parse_body(message))
            records.append(
                myEmail.set_email(title, addresser, addressee, copy,
                                  segmented_body)
            )

        # Persisting is disabled in this version:
        # mongodb.insert_many(user, records)

# # Connect to the mailbox
# host = "imap.qq.com"
# username = "******"
# password = "******"
# title, addresser, addressee, copy, content = get_mail(host, username, password)
# # Chinese word segmentation
# content = JB.participle_text(content)
# # Add to the MongoDB database
# new_email = myEmail.set_email(title, addresser, addressee, copy, content)
# myClient = mongodb.connect_mongodb()
# emaildb = mongodb.choose_database(myClient)
# user = mongodb.choose_user(emaildb, "demo")
# mongodb.insert_one(user, new_email)
# mongodb.disconnect_mongodb(myClient)
Ejemplo n.º 3
0
def statistics_text_max():
    """Print the most frequent multi-character words of a text.

    The text comes from ``fc.get_text()`` and is tokenised with
    ``jieba.cut``. Tokens of length 1 are ignored. Up to five words are
    printed, most frequent first, one per line as ``word->count``; words
    with equal counts keep the order in which they were first seen.
    """
    file_txt = fc.get_text()

    counts = {}
    for word in jieba.cut(file_txt):
        # Single-character tokens are not counted.
        if len(word) == 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so ties stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slice rather than index 0..4: a text with fewer than five distinct
    # words would otherwise raise IndexError.
    for word, count in ranked[:5]:
        print("{0:<5}->{1:>5}".format(word, count))