def seg_sentence(sentence):
    # Cut the sentence with jie_test and drop stopwords and tab characters.
    # stopwordslist and dir_data are assumed to be defined elsewhere in the script.
    sentence_seged = jie_test.cut(sentence.strip())
    stopwords = stopwordslist(dir_data + 'stopwords/stops.txt')
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr
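# seg_sentence relies on a stopwordslist helper that is not shown in this snippet.
# A minimal sketch, assuming the stopword file stores one word per line:
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        return [line.strip() for line in f]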
# Walk text_data_dir: each sub-directory is one class and every file inside it is one sample.
# texts, labels, labels_index, text_data_dir and max_nb_words are assumed to be defined earlier,
# and PinyinHelper is assumed to be imported elsewhere.
text_data = os.listdir(text_data_dir)
for textName in sorted(text_data):
    textpath = os.path.join(text_data_dir, textName)
    if os.path.isdir(textpath):
        label_id = len(labels_index)
        labels_index[textName] = label_id
        for wordname in sorted(os.listdir(textpath)):
            wordpath = os.path.join(textpath, wordname)
            if sys.version_info < (3,):
                f = open(wordpath)
            else:
                f = open(wordpath, encoding='gbk', errors='ignore')
            text = f.read()
            # Segment the raw text, then convert the segmented string to pinyin.
            seg_list = jie_test.cut(text.strip().replace(' ', ''), cut_all=False)
            new_content = " ".join(seg_list)
            # texts.append(new_content)
            seg_list1 = PinyinHelper.convertToPinyinFromSentence(new_content)
            print(seg_list1)
            texts.append(seg_list1)
            f.close()
            labels.append(label_id)

for tes in texts:
    print("------>>")
    print(tes)

print('Found %s texts.' % len(texts))

tokenizer = Tokenizer(num_words=max_nb_words, filters="", oov_token="unk")
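# A typical next step (not part of the original snippet) is to fit the tokenizer on the
# pinyin texts and turn them into padded integer sequences; max_sequence_length is a
# hypothetical parameter used only for this sketch.
from keras.preprocessing.sequence import pad_sequences

tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
print('Found %s unique tokens.' % len(tokenizer.word_index))
data = pad_sequences(sequences, maxlen=max_sequence_length)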
import os
import jie_test
from gensim import corpora, models

# Build an LDA topic model over the corpus under dir_data + datasets
# (both names are assumed to be defined earlier in the script).
train_set = []
walk = os.walk(dir_data + datasets)
print(walk)
for root, dirs, files in walk:
    for name in files:
        with open(os.path.join(root, name), 'r', encoding='utf-8', errors='ignore') as f:
            raw = f.read()
        word_list = list(jie_test.cut(raw, cut_all=False))
        train_set.append(word_list)
        print(word_list)

# Map each document to a bag-of-words vector, weight it with TF-IDF,
# then train a 10-topic LDA model.
dic = corpora.Dictionary(train_set)
print(dic)
corpus = [dic.doc2bow(text) for text in train_set]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print(corpus_tfidf)
LDA = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=10)
corpus_LDA = LDA[corpus_tfidf]
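# To inspect the result (not shown in the original snippet), the learned topics and the
# per-document topic distributions can be printed; num_words=5 is an arbitrary choice.
for topic in LDA.print_topics(num_topics=10, num_words=5):
    print(topic)
for doc_topics in corpus_LDA:
    print(doc_topics)  # list of (topic_id, probability) pairs for this document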