Example #1
# coding:utf8

"""
DESC: Filter out words whose frequency is too high or too low
Author:伏草惟存
Prompt: code in Python3 env
"""

from StopWords import readFile, seg_doc
from FreqWord import *


def freqword(fdist):
    wordlist = []
    print('=' * 3, 'Word frequency statistics', '=' * 3)
    for key in fdist.keys():
        # keep only words whose count falls inside the (2, 15) band
        if 2 < fdist.get(key) < 15:
            wordlist.append(key + ':' + str(fdist.get(key)))
    return wordlist


if __name__ == '__main__':
    # 1 Read the text
    path = r'../Corpus/CSCMNews/体育/0.txt'
    str_doc = readFile(path)
    word_list = seg_doc(str_doc)
    # 2 Keep words in the target frequency band
    fdist = nltk_wf_feature(word_list)
    wordlist = freqword(fdist)
    print(wordlist)
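
Note: readFile, seg_doc, and nltk_wf_feature come from helper modules not shown in this listing. A minimal sketch of what they might look like, assuming seg_doc uses jieba for Chinese word segmentation and nltk_wf_feature wraps nltk.FreqDist; the real implementations in StopWords and FreqWord may differ (seg_doc likely also removes stop words):

# Hypothetical reconstructions of the unshown helpers
import jieba
from nltk.probability import FreqDist

def readFile(path):
    # read the whole document as one UTF-8 string
    with open(path, encoding='utf8') as f:
        return f.read()

def seg_doc(str_doc):
    # segment Chinese text into a list of tokens
    return [w for w in jieba.cut(str_doc) if w.strip()]

def nltk_wf_feature(word_list):
    # frequency distribution over the tokens
    return FreqDist(word_list)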
Example #2
        # for key, value in dictonary.items():

    # print(corpus_tfidf)
    return corpus_tfidf
    # Save each category's corpus to its own file:
    # catgs = list(corpus_tfidf.keys())  # ['体育', '时政']
    # for catg in catgs:
    #     savepath = r'./'
    #     corpora.MmCorpus.serialize(r'{f}{s}{c}.mm'.format(f=savepath, s=os.sep, c=catg), corpus_tfidf.get(catg), id2word=dictonary)
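
For reference, the commented-out block above relies on gensim's Matrix Market serializer. A self-contained sketch of that API, using toy data in place of the tf-idf corpora built elsewhere in this file:

from gensim import corpora

docs = [['ball', 'game', 'score'], ['election', 'policy', 'vote']]
dictionary = corpora.Dictionary(docs)         # token -> id mapping
bow = [dictionary.doc2bow(d) for d in docs]   # bag-of-words corpus
# write the corpus to disk in Matrix Market format, then stream it back
corpora.MmCorpus.serialize('demo.mm', bow, id2word=dictionary)
loaded = corpora.MmCorpus('demo.mm')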


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer  # needed below

if __name__ == '__main__':
    path = r"../CSCMNews/体育/0.txt"
    str_doc = readFile(path)

    word_list = seg_doc(str_doc)
    word_str = " ".join(word_list)

    path2 = r"../CSCMNews/时政/339764.txt"
    str_doc2 = readFile(path2)
    word_list2 = seg_doc(str_doc2)
    # print(type(word_list2))
    word_str2 = " ".join(word_list2)

    corpus = []  # each document is one whitespace-joined token string
    classVec = ["体育", "时政"]
    corpus.append(word_str)
    corpus.append(word_str2)
    # Compute word tf-idf weights with sklearn
    a = tfidf_sklearn_feature(
        corpus, classVec)  # len(a["体育"][0]), len(a["时政"][0])   242, 242
    # CountVectorizer turns the texts into a term-frequency matrix:
    # element a[i][j] is the count of word j in document i
    vectorizer = CountVectorizer()  # builds the vocabulary
    transformer = TfidfTransformer()  # computes each word's tf-idf weight
    # the inner fit_transform builds the term-frequency matrix,
    # the outer one converts it to tf-idf
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()  # all words in the bag-of-words vocabulary
    weight = tfidf.toarray()  # weight[i][j] is the tf-idf weight of word j in document i
    # print each document's word weights: outer loop over documents,
    # inner loop over the vocabulary
    for i in range(len(weight)):
        print(u"------- tf-idf weights for document", i, u"-------")
        for j in range(len(word)):
            print(word[j], weight[i][j])
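
The two-step CountVectorizer + TfidfTransformer pipeline above can be collapsed into sklearn's single TfidfVectorizer; a minimal equivalent sketch (get_feature_names_out requires sklearn >= 1.0):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()              # vocabulary + tf-idf in one step
tfidf = vectorizer.fit_transform(corpus)    # corpus: whitespace-joined token strings
print(vectorizer.get_feature_names_out())   # the vocabulary
print(tfidf.toarray())                      # dense tf-idf matrix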

Example #3

if __name__ == '__main__':
    corpus =[]
    # Sample corpus data looks like this (whitespace-separated tokens):
    # corpus = ["我 来到 成都 成都 春熙路 很 开心",
    #           "今天 在 宽窄巷子 耍 了 一天 ",
    #           "成都 整体 来说 还是 挺 安逸 的",
    #           "成都 的 美食 真 巴适 惨 了"]
    # 1 Read the texts
    path1 = r'../dataSet/CSCMNews/体育/0.txt'
    str_doc1 = readFile(path1)
    word_list1 = ' '.join(seg_doc(str_doc1))

    path2 = r'../dataSet/CSCMNews/时政/339764.txt'
    str_doc2 = readFile(path2)
    word_list2 = ' '.join(seg_doc(str_doc2))

    corpus.append(word_list1)
    corpus.append(word_list2)

    sklearn_tfidf_feature(corpus)
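
sklearn_tfidf_feature is not shown in this fragment; a plausible minimal sketch, assuming it mirrors the CountVectorizer + TfidfTransformer pipeline from Example #2:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def sklearn_tfidf_feature(corpus):
    # hypothetical reconstruction: term-frequency matrix, then tf-idf weights
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    words = vectorizer.get_feature_names()
    for i, row in enumerate(tfidf.toarray()):
        print('------- tf-idf weights for document', i, '-------')
        for word, w in zip(words, row):
            print(word, w)
    return tfidf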
Example #4
            catg = folder.split(os.sep)[-1]
            for file in os.listdir(folder):  # level2 directory
                file_path = os.path.join(folder, file)
                # read each regular file and yield (category, content)
                if os.path.isfile(file_path):
                    with open(file_path, 'rb') as this_file:  # 'rb' reads faster
                        content = this_file.read().decode('utf8')
                    yield catg, content


if __name__ == '__main__':
    start = time.time()

    filepath = os.path.abspath(
        r'C:\Users\Chengyu.Dean\Desktop\树维\江西省挖煤\new_江西省报告')
    files = loadFiles(filepath)
    # n = 5  # n is the sampling rate
    for i, msg in enumerate(files):
        catg = msg[0]
        content = msg[1]
        content = seg_doc(content)
        print(
            '{t}***{i}\t docs have been processed'.format(
                i=i,
                t=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())),
            '\n', catg, ':\t', content[:20])
    end = time.time()
    print('total cost time: %.2f' % (end - start) + 's')
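
The fragment above starts mid-function; a minimal reconstruction of the full loadFiles generator, assuming a two-level <root>/<category>/<document> directory layout:

import os

def loadFiles(directory):
    for folder in os.listdir(directory):           # level1: one folder per category
        folder = os.path.join(directory, folder)
        if not os.path.isdir(folder):
            continue
        catg = folder.split(os.sep)[-1]            # category name = folder name
        for file in os.listdir(folder):            # level2: the documents
            file_path = os.path.join(folder, file)
            if os.path.isfile(file_path):
                with open(file_path, 'rb') as f:   # 'rb' + explicit decode reads faster
                    content = f.read().decode('utf8')
                yield catg, content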