from argparse import ArgumentParser

import spacy
from gensim.corpora import Dictionary

# get_dump_gen (a wiki-dump article generator) is defined elsewhere in the project.


def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--wiki-dump')
    parser.add_argument('-l', '--limit', default=None, type=int)
    parser.add_argument('-p', '--num-procs', default=1, type=int)
    parser.add_argument('-o', '--out', default='vocab')
    opts = parser.parse_args()

    dump_gen = get_dump_gen(opts.wiki_dump, limit=opts.limit, n_procs=opts.num_procs)

    # Legacy spaCy 1.x API; newer versions use spacy.load() and nlp.pipe(n_process=...).
    nlp = spacy.en.English()

    # Build the vocabulary from lowercased, whitespace-stripped tokens.
    vocab = Dictionary(
        ([token.text.lower().strip() for token in doc if token.text.strip() != ""]
         for doc in nlp.pipe((art['article.text'] for art in dump_gen),
                             n_threads=opts.num_procs,
                             parse=False, tag=False, entity=False)))

    vocab.save('%s.vocab' % opts.out)
    vocab.save_as_text('%s.txt' % opts.out)
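# Usage sketch (an addition, not part of the original script): both files
# written by main() can be reloaded with gensim's Dictionary API; 'vocab' is
# the -o/--out default.
from gensim.corpora import Dictionary

def reload_vocab(basename='vocab'):
    binary = Dictionary.load('%s.vocab' % basename)           # from save()
    textual = Dictionary.load_from_text('%s.txt' % basename)  # from save_as_text()
    return binary, textual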
import csv
import os

from gensim import models
from gensim.corpora import Dictionary
from gensim.test.utils import common_texts


def initialize_lda():
    path = os.path.join("../data", "train.csv")

    # Seed the dictionary and corpus with gensim's bundled example texts.
    dct = Dictionary(common_texts)
    corpus = [dct.doc2bow(text) for text in common_texts]

    # Extend both with the tokenized 'story' column of the training CSV.
    with open(path, 'r') as file:
        csv_file = csv.DictReader(file)
        for row in csv_file:
            new_texts = [row['story'].split()]
            dct.add_documents(new_texts)
            corpus += [dct.doc2bow(text) for text in new_texts]

    lda = models.ldamodel.LdaModel(corpus, num_topics=50)
    lda.save(os.path.join("lda_model", "model"))
    dct.save_as_text(os.path.join("lda_model", "dictionary"))
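# Usage sketch (an addition, not from the original): reload the artifacts
# saved by initialize_lda() and score an unseen document.
import os

from gensim.corpora import Dictionary
from gensim.models import LdaModel

def topics_for(text):
    lda = LdaModel.load(os.path.join("lda_model", "model"))
    dct = Dictionary.load_from_text(os.path.join("lda_model", "dictionary"))
    # The model was trained without id2word, so it works on raw token ids;
    # the reloaded dictionary is needed to map words to those ids.
    bow = dct.doc2bow(text.split())
    return lda.get_document_topics(bow)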
import argparse

from tqdm import tqdm
from gensim.corpora import Dictionary
from gensim.models import LdaModel


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', default='./data/test_arxiv_plain.txt',
                        help='Path to the file where the data is stored')
    parser.add_argument('--model-dir', default='../model',
                        help='Path to directory where the model is stored')
    # argparse does not convert the string "False" to a boolean, so parse it explicitly.
    parser.add_argument('--train', default=True,
                        type=lambda s: str(s).lower() in ('true', '1'),
                        help='True for train, False for test mode')
    parser.add_argument('--n_topic', default=20, type=int,
                        help='Number of topics')
    args = parser.parse_args()

    model_dir = './model/model'
    dict_dir = './model/dict.txt'

    if args.train:
        print('Reading texts')
        with open(args.data_dir) as f_in:
            texts = f_in.read().split('\n')
        del texts[-1]  # drop the empty string after the trailing newline
        for i in tqdm(range(len(texts))):
            texts[i] = texts[i].split()

        print('Generating corpora')
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary.save_as_text(dict_dir)

        print('Training model')
        lda = LdaModel(corpus, num_topics=args.n_topic)
        lda.save(model_dir)
    else:
        lda = LdaModel.load(model_dir, mmap='r')
        # load_from_text is a static method that returns a new Dictionary.
        dictionary = Dictionary.load_from_text(dict_dir)

    print('Processing results')
    topics = lda.print_topics()
    with open('./report.txt', 'w') as f_out:
        for topic_id, topic_pair in topics:
            print(topic_id, end=': ', file=f_out)
            # Without id2word the model reports raw token ids; map them back to words.
            topic_words = topic_pair.split('"')[1::2]
            topic_words = [dictionary.get(int(word)) for word in topic_words]
            print(topic_words, file=f_out)
import logging

from gensim.corpora import Dictionary
from gensim.models import LsiModel

# rcv1_train, rcv1_train_target, rcv1_test, rcv1_test_target and the
# train_classifier / evaluate_classifier helpers are provided elsewhere.

ln = logging.getLogger(__name__)


def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except FileNotFoundError:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        """Stream the RCV1 training documents as bag-of-words vectors."""
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except FileNotFoundError:
        vector_model = LsiModel(corpus=RCV1BowCorpus(),
                                num_topics=100, id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """Must return either numpy array or dictionary"""
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train,
                           train_targets=rcv1_train_target,
                           get_features=get_lsi_features,
                           classifier="sgd")

    evaluate_classifier(clf, rcv1_test, rcv1_test_target,
                        get_features=get_lsi_features)
import argparse
import sys

from gensim.corpora import Dictionary

# fetch_contents_from_url and doc2word_list are project helpers defined elsewhere.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_domain", action="store_true")
    parser.add_argument("--update", action="store_true")
    parser.add_argument("--save_interval", type=int, default=100)
    args = parser.parse_args()

    # With --update, resume from the previously saved dictionary.
    if args.update:
        common_dict = Dictionary.load_from_text("./common_dict.txt")
    else:
        common_dict = Dictionary()

    # One URL per line on stdin; checkpoint every save_interval URLs.
    for i, url in enumerate(sys.stdin):
        print("url " + str(i))
        text = fetch_contents_from_url(url.strip(), use_domain=args.use_domain)
        if not text:
            continue
        word_list = doc2word_list(text)
        common_dict.add_documents([word_list])
        if i % args.save_interval == args.save_interval - 1:
            common_dict.save_as_text("./common_dict.txt")

    common_dict.save_as_text("./common_dict.txt")
from nltk.stem import PorterStemmer
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.test.utils import datapath

# get_lda_base and get_file_base are project helpers defined elsewhere.


def build_lda_model(stem):
    corpus = []
    ps = PorterStemmer()
    number_of_topics = 100

    # Read in data from publications, one document per line.
    with open(get_lda_base(), 'r') as f:
        for line in f:
            if stem:
                # Keep only stems longer than one character.
                stemmed = []
                for w in line.split():
                    s = ps.stem(w)
                    if len(s) > 1:
                        stemmed.append(s)
                corpus.append(stemmed)
            else:
                corpus.append(line.split())

    # Build the vocabulary and transform the texts to bag-of-words format.
    dictionary = Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # Train the LDA model.
    lda = ldamodel.LdaModel(corpus=corpus, num_topics=number_of_topics,
                            passes=20, id2word=dictionary,
                            minimum_probability=0)

    if stem:
        temp_file = datapath('lda_model_stemmed')
        dictionary.save_as_text(get_file_base() + 'lda_data/dict_stemmed')
    else:
        temp_file = datapath('lda_model_unstemmed')
        dictionary.save_as_text(get_file_base() + 'lda_data/dict_unstemmed')
    lda.save(temp_file)
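# Usage sketch (an addition, not from the original): reload one of the models
# saved by build_lda_model(). datapath resolves inside gensim's test-data
# directory, matching the save location above.
from gensim.models import ldamodel
from gensim.test.utils import datapath

def load_lda(stem=True):
    name = 'lda_model_stemmed' if stem else 'lda_model_unstemmed'
    return ldamodel.LdaModel.load(datapath(name))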
import os

from gensim.corpora import Dictionary
from gensim.models.doc2vec import TaggedDocument
from gensim.models.phrases import Phrases, Phraser

# tokenize is a project helper defined elsewhere.


class text_corpus(object):
    """Streams tagged, phrase-merged documents from a tab-separated file."""

    def __init__(self, tsv_path, n_examples=100000):
        print("Getting %s iterator..." % tsv_path)
        self.n_examples = n_examples
        self.document_path = tsv_path
        self.fin = open(self.document_path, 'rb')
        self.instances = sum(1 for line in open(tsv_path))
        # Start with empty phrase models; get_phraser()/load_phraser() replace them.
        self.bigram = Phraser(Phrases())
        self.trigram = Phraser(Phrases())

    def __iter__(self):
        for i, doc in self.indexed_docs(self.n_examples):
            yield TaggedDocument(self.process(doc), [i])

    def process(self, text):
        return self.trigram[self.bigram[tokenize(text)]]

    def docs(self, n_examples=None):
        if n_examples is None:
            n_examples = self.n_examples
        for _, doc in self.indexed_docs(n_examples):
            yield self.process(doc)

    def reset_docs(self):
        self.fin.close()
        self.fin = open(self.document_path, 'rb')

    def indexed_docs(self, n_examples=-1):
        if n_examples == -1:
            # Stream the whole file from a fresh handle.
            with open(self.document_path, 'rb') as fin:
                for line in fin:
                    try:
                        i, doc = line.decode('utf-8', errors='replace').strip().split('\t')
                        yield i, doc
                    except ValueError:  # skip malformed lines
                        pass
        else:
            current_example = 0
            for line in self.fin:
                if current_example < n_examples:
                    try:
                        i, doc = line.decode('utf-8', errors='replace').strip().split('\t')
                        current_example += 1
                        yield i, doc
                    except ValueError:
                        pass
                else:
                    # raise StopIteration inside a generator is a RuntimeError
                    # since PEP 479; returning ends iteration cleanly.
                    return

    def get_phraser(self, directory, sensitivity=3):
        if not os.path.isdir(directory):
            os.makedirs(directory)
        print("\t\tGetting bigram detector...")
        if not os.path.isfile(directory + '/bigrams.pkl'):
            self.bigram = Phraser(Phrases(self.docs(n_examples=-1), min_count=2,
                                          threshold=sensitivity,
                                          max_vocab_size=2000000))
            self.bigram.save(directory + '/bigrams.pkl')
        else:
            self.bigram = Phraser.load(directory + '/bigrams.pkl')
        print("\t\tGetting trigram detector...")
        if not os.path.isfile(directory + '/trigrams.pkl'):
            self.trigram = Phraser(Phrases(self.bigram[self.docs(n_examples=-1)],
                                           min_count=2, threshold=sensitivity + 1,
                                           max_vocab_size=2000000))
            self.trigram.save(directory + '/trigrams.pkl')
        else:
            self.trigram = Phraser.load(directory + '/trigrams.pkl')

    def load_phraser(self, directory):
        print("\tLoading gram detector...")
        self.bigram = Phraser.load(directory + '/bigrams.pkl')
        self.trigram = Phraser.load(directory + '/trigrams.pkl')

    def get_dictionary(self, directory, keep=100000):
        if not os.path.isdir(directory):
            os.makedirs(directory)
        if not os.path.isfile(directory + '/dictionary.dict'):
            print("\tBuilding dictionary...")
            self.dictionary = Dictionary(self.docs(n_examples=-1), prune_at=2000000)
            print("\tFiltering dictionary extremes...")
            self.dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=keep)
            print("\tSaving dictionary...")
            self.dictionary.save(directory + '/dictionary.dict')
            self.dictionary.save_as_text(directory + '/word_list.tsv')
        else:
            self.load_dictionary(directory)

    def get_word_ids(self):
        word_list = set()
        for doc in self.docs(n_examples=-1):
            word_list.update(doc)
        return dict(zip(range(len(word_list)), word_list))

    def load_dictionary(self, directory):
        print("\tLoading dictionary...")
        self.dictionary = Dictionary.load(directory + '/dictionary.dict')
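# Usage sketch (an addition, not from the original): the intended call order
# for text_corpus. The 'docs.tsv' file (id<TAB>text lines) and the 'model'
# directory are assumed names for illustration.
corpus = text_corpus('docs.tsv')
corpus.get_phraser('model')      # fit or load the bigram/trigram detectors
corpus.get_dictionary('model')   # build or load the filtered Dictionary
for tagged_doc in corpus:        # yields TaggedDocument(words, [id])
    pass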
import os

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# parallelize and data_fram_proc are project helpers defined elsewhere.


class LDAModel(object):
    """Train and persist an LDA model over the cleaned per-subject corpora."""

    def __init__(self, path, model_file, dictionary_file, corpus_file, num_topics=21):
        """
        Preprocess the data and build the labeled corpus.

        Expected per-file label assignments:
            class biological分子与细胞_cleaned.csv : 12
            class biological现代生物技术专题_cleaned.csv : 14
            class biological生物技术实践_cleaned.csv : 16
            class biological生物科学与社会_cleaned.csv : 18
            class biological稳态与环境_cleaned.csv : 110
            class biological遗传与进化_cleaned.csv : 112
            class geography人口与城市_cleaned.csv : 42
            class geography区域可持续发展_cleaned.csv : 44
            class geography地球与地图_cleaned.csv : 46
            class geography宇宙中的地球_cleaned.csv : 48
            class geography生产活动与地域联系_cleaned.csv : 410
            class history古代史_cleaned.csv : 52
            class history现代史_cleaned.csv : 54
            class history近代史_cleaned.csv : 56
            class political公民道德与伦理常识_cleaned.csv : 102
            class political时事政治_cleaned.csv : 104
            class political生活中的法律常识_cleaned.csv : 106
            class political科学思维常识_cleaned.csv : 108
            class political科学社会主义常识_cleaned.csv : 1010
            class political经济学常识_cleaned.csv : 1012

        :param path: root directory of the per-category corpus files
        :param model_file: where to save the trained LDA model
        :param dictionary_file: where to save the dictionary (text format)
        :param corpus_file: where to save the merged labeled corpus CSV
        :param num_topics: number of LDA topics
        """
        dirs = os.listdir(path)
        x_list = []
        item_x = []
        labels = []
        multiLabels = []
        label11 = 0
        for file in dirs:
            path2 = os.path.join(path, file)
            if os.path.isdir(path2):
                dirs2 = os.listdir(path2)
                label12 = 0
                for file2 in dirs2:
                    file3 = os.path.join(path2, file2)
                    if os.path.isfile(file3) and file2.endswith('_cleaned.csv'):
                        print('class {}{} : {}{}'.format(file, file2, label11, label12))
                        src_df = pd.read_csv(file3)
                        src_df = parallelize(src_df, data_fram_proc)  # upsampling
                        src_df['item'] = src_df['items'] + src_df['knowledge']
                        x = np.array(src_df['item']).tolist()
                        item_x += x
                        x = [[word for word in doc.split(' ') if word != ""] for doc in x]
                        x_list += x
                        fn = str(file2).replace('_cleaned.csv', '').replace('\t', '').replace('\n', '')
                        labels += ['__label__' + str(file) + '_' + fn for i in range(len(x))]
                        mls = np.array(src_df['label']).tolist()
                        multiLabels += [str(file).replace('_', ' ') + ' ' + fn + ' '
                                        + str(ml).replace('\t', '').replace('\n', '')
                                        for ml in mls]
                        label12 += 1
                label11 += 1

        # Merge labels, items and multi-labels into one DataFrame and persist it.
        c = {'label': labels, 'item': item_x, 'multiLabels': multiLabels}
        df = pd.DataFrame(c)
        df.to_csv(corpus_file, index=None, header=True)

        # Build the token2id dictionary from the tokenized documents.
        self.dictionary = Dictionary(x_list)
        # Convert each document to bag-of-words form (id: freq).
        self.corpus = [self.dictionary.doc2bow(text) for text in x_list]
        # Train the LDA model with num_topics topics.
        self.lda = LdaModel(self.corpus, id2word=self.dictionary, num_topics=num_topics)

        # Inspect the learned topics.
        results = self.lda.print_topics(num_topics, num_words=50)
        for result in results:
            print(result)

        # Save model and dictionary to disk.
        self.lda.save(model_file)
        self.dictionary.save_as_text(dictionary_file)

    def __retrain(self, model_file, other_texts):
        """Update a saved model with additional tokenized documents."""
        lda = LdaModel.load(model_file)
        other_corpus = [self.dictionary.doc2bow(text) for text in other_texts]
        lda.update(other_corpus)
        return lda

    def getDocSVector(self):
        """Return the full topic-probability vector for every training document."""
        self.docSVector = []
        for d in self.corpus:
            self.docSVector.append(self.lda.get_document_topics(d, minimum_probability=0))
        return self.docSVector
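# Usage sketch (an addition, not from the original): the directory layout and
# file names below are assumptions for illustration only.
model = LDAModel(path='./data/subjects',
                 model_file='./lda_model/model',
                 dictionary_file='./lda_model/dictionary',
                 corpus_file='./lda_model/corpus.csv',
                 num_topics=21)
doc_vectors = model.getDocSVector()  # one topic-probability vector per document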