Esempio n. 1
0
def seg_sentence(sentence):
    """Segment *sentence* and drop stopwords.

    Tokenizes the stripped sentence with jie_test, filters out stopwords
    (reloaded from the stopword file on every call) and literal tab tokens,
    and returns the surviving tokens each followed by a single space
    (so a non-empty result carries a trailing space, matching the
    original concatenation behavior).
    """
    stopwords = stopwordslist(dir_data + 'stopwords/stops.txt')
    # "".join over a generator replaces the original quadratic
    # `outstr += word; outstr += " "` string accumulation.
    return "".join(
        word + " "
        for word in jie_test.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    )
# Walk every class subdirectory of text_data_dir, assign each directory a
# numeric label, segment every file's text, convert it to pinyin, and collect
# the results into the module-level `texts` / `labels` lists.
text_data = os.listdir(text_data_dir)

for textName in sorted(text_data):
    textpath = os.path.join(text_data_dir, textName)
    # Guard clause: skip plain files at the top level.
    if not os.path.isdir(textpath):
        continue
    # Each directory becomes one class; ids are assigned in sorted order.
    label_id = len(labels_index)
    labels_index[textName] = label_id
    for wordname in sorted(os.listdir(textpath)):
        wordpath = os.path.join(textpath, wordname)
        # Python 2's open() has no `encoding` keyword, hence the branch.
        # Files are assumed to be GBK-encoded — TODO confirm.
        if sys.version_info < (3,):
            f = open(wordpath)
        else:
            f = open(wordpath, encoding='gbk', errors='ignore')
        try:
            text = f.read()
        finally:
            # Exception-safe close: the original only closed after later
            # processing, leaking the handle if read/segmentation raised.
            f.close()
        # Segment with spaces removed first so jie_test sees contiguous text.
        seg_list = jie_test.cut(text.strip().replace(' ', ''), cut_all=False)
        new_content = " ".join(seg_list)
        # The stored sample is the pinyin rendering, not the raw segmentation.
        seg_list1 = PinyinHelper.convertToPinyinFromSentence(new_content)
        print(seg_list1)
        texts.append(seg_list1)
        labels.append(label_id)

for tes in texts:
    print("------>>")
    print(tes)
print('Found %s texts.' % len(texts))



tokenizer = Tokenizer(num_words=max_nb_words, filters="", oov_token="unk")
Esempio n. 3
0
import jie_test
from gensim import corpora,models

train_set = []

# os.walk returns a generator; printing it only shows the generator object.
walk = os.walk(dir_data + datasets)
print(walk)

for root, dirnames, files in walk:  # renamed `dir` -> `dirnames`: don't shadow the builtin
    for name in files:
        # BUG FIX: the read/segment/append lines were dedented out of this
        # inner loop, so only the LAST file opened in each directory was
        # ever processed, and every other file handle leaked unclosed.
        # `with` now guarantees each handle is closed.
        with open(os.path.join(root, name),
                  'r',
                  encoding='utf-8',
                  errors='ignore') as f:
            raw = f.read()
        word_list = list(jie_test.cut(raw, cut_all=False))
        train_set.append(word_list)
        print(word_list)

# Build the token dictionary over all segmented documents.
dic = corpora.Dictionary(train_set)
print(dic)

# Vectorize each document as bag-of-words, reweight with TF-IDF,
# then fit a 10-topic LDA model on the weighted corpus.
corpus = [dic.doc2bow(doc) for doc in train_set]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print(corpus_tfidf)
LDA = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=10)
corpus_LDA = LDA[corpus_tfidf]