Beispiel #1
0
def get_emails(db):
    """Build an email record for every file under ``trec06c/dao``.

    Each sub-directory of the corpus root is looked up as a user via
    ``MongoDB.choose_user(db, <directory name>)``.  Every file inside it is
    read with ``fc.get_text``, parsed into a message, split into header
    fields and body, and the body is passed through ``JB.participle_text``
    before the record is assembled with ``myEmail.set_email``.

    Args:
        db: Database handle forwarded unchanged to ``MongoDB.choose_user``.

    Returns:
        None.  The records are only collected in memory; the database
        insert at the end of each directory is commented out.
    """
    root = "trec06c/dao"
    for folder in os.listdir(root):
        user = MongoDB.choose_user(db, folder)
        folder_path = root + "/" + folder
        records = []
        for filename in os.listdir(folder_path):
            email_path = folder_path + "/" + filename
            print(email_path)
            msg = email.message_from_string(fc.get_text(email_path))
            title, addresser, addressee, copy = parse_header(msg)
            body = JB.participle_text(parse_body(msg))
            records.append(
                myEmail.set_email(title, addresser, addressee, copy, body)
            )

        # Persisting is disabled, so `user` and `records` are currently unused:
        # mongodb.insert_many(user, records)

# # 连接邮箱
# host = "imap.qq.com"
# username = "******"
# password = "******"
# title, addresser, addressee, copy, content = get_mail(host, username, password)
# # 中文分词
# content = JB.participle_text(content)
# # 添加至MongoDB数据库
# new_email = myEmail.set_email(title, addresser, addressee, copy, content)
# myClient = mongodb.connect_mongodb()
# emaildb = mongodb.choose_database(myClient)
# user = mongodb.choose_user(emaildb, "demo")
# mongodb.insert_one(user, new_email)
# mongodb.disconnect_mongodb(myClient)
def get_emails(db):
    """Load every email under ``trec06c/data`` and insert it per folder.

    For each sub-directory of the corpus root a user handle is obtained via
    ``MongoDB.choose_user(db, <directory name>)``.  Every file in that
    directory is read with ``fc.get_text``; its header fields, date and
    body text are extracted; the body is split into sentences; and each
    sentence is additionally passed through ``JB.participle_text``.  The
    records built with ``myEmail.set_email`` are written with a single
    ``MongoDB.insert_many`` call per directory.

    Progress is printed for every file: its relative name, the length of
    the raw text, and the extracted date.

    Args:
        db: Database handle forwarded unchanged to ``MongoDB.choose_user``.

    Returns:
        None.
    """
    path = "trec06c/data"
    # A character class is used instead of an "a|b|?" alternation: a bare
    # "?" alternative is a regex syntax error ("nothing to repeat").  The
    # class matches the Chinese full stop, the middle dot, a newline, and
    # both the ASCII and full-width (\uff1b, \uff01, \uff1f) forms of
    # ";", "!" and "?".
    sentence_breaks = "[。;\uff1b·!\uff01?\uff1f\n]"

    for folder in os.listdir(path):
        user = MongoDB.choose_user(db, folder)
        # A fresh list per folder, so a batch already handed to the database
        # layer is never mutated afterwards.
        new_emails = []
        for filename in os.listdir(path + "/" + folder):
            print(folder + "/" + filename, end=" ")
            text = fc.get_text(path + "/" + folder + "/" + filename)
            print("邮件长度:", len(text))
            msg = email.message_from_string(text)
            title, addresser, addressee, copy = pe.parse_header(msg)
            date = get_date(text)
            print(date)
            content = get_content(text)

            # Drop the empty fragments produced by adjacent delimiters.
            sentences = [s for s in re.split(sentence_breaks, content) if s]
            # Tokenise the bare sentences first, then store each sentence
            # terminated by a Chinese full stop.
            split = [JB.participle_text(s) for s in sentences]
            doc = [s + "。" for s in sentences]

            emailKind = ""  # the category is left empty at import time
            new_emails.append(
                myEmail.set_email(title, addresser, addressee, copy,
                                  date, doc, split, emailKind)
            )

        # Skip empty folders.  NOTE(review): assumes MongoDB.insert_many
        # delegates to pymongo, which rejects an empty document list.
        if new_emails:
            MongoDB.insert_many(user, new_emails)
def keywords_extraction_tt():
    """Print the top 20 keywords found by ``jieba.analyse.textrank``.

    The text comes from ``fc.get_text_2()``.  Candidates are restricted to
    the part-of-speech tags passed as ``allowPOS``, and each result is
    printed as ``word weight`` on its own line.
    """
    pos_filter = ("ns", "n", "vn", "v")
    ranked = jieba.analyse.textrank(
        fc.get_text_2(), topK=20, withWeight=True, allowPOS=pos_filter
    )
    for term, score in ranked:
        print(term, score)
def keywords_extraction_tf():
    """Print the top 20 keywords found by ``jieba.analyse.extract_tags``.

    The text comes from ``fc.get_text_2()``.  ``allowPOS`` is passed as an
    empty tuple, and each result is printed as ``word weight`` on its own
    line.
    """
    ranked = jieba.analyse.extract_tags(
        fc.get_text_2(), topK=20, withWeight=True, allowPOS=()
    )
    for term, score in ranked:
        print(term, score)
Beispiel #5
0
def participle_text(text):
    """Segment *text* and return the kept words joined by single spaces.

    The tokens come from ``posseg.cut(text)``.  A token's ``word`` is kept
    unless it is found in the collection returned by
    ``fc.get_stop_words()``.  An input with no kept words yields ``""``.
    """
    stop_words = fc.get_stop_words()
    # The default mode is the precise mode, which suits text analysis.
    tokens = posseg.cut(text)
    kept = [token.word for token in tokens if token.word not in stop_words]
    return " ".join(kept)
Beispiel #6
0
def statistics_text_max():
    """Print the (up to) five most frequent tokens of the input text.

    The text comes from ``fc.get_text()`` and is tokenised with
    ``jieba.cut``.  Tokens of length 1 are ignored.  Each output line is
    formatted as ``"{0:<5}->{1:>5}"`` with the token and its count.  If the
    text has fewer than five distinct tokens, only those are printed.
    """
    file_txt = fc.get_text()
    counts = {}
    for word in jieba.cut(file_txt):
        # Single-character tokens are not counted.
        if len(word) == 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so tokens with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slice rather than index 0..4, so a text with fewer than five distinct
    # tokens does not raise IndexError.
    for word, count in ranked[:5]:
        print("{0:<5}->{1:>5}".format(word, count))