def cluster_questions(topic_num, res_path,
                      q_path=r'datasets\DialogQA\Qall.txt',
                      a_path=r'datasets\DialogQA\Aall.txt'):
    """Group questions by the dominant LDA topic of their paired answer.

    An LDA model with ``topic_num`` topics is trained on the whitespace
    tokenised lines of ``a_path``. Question ``i`` (line ``i`` of ``q_path``)
    is then filed under the most probable topic of answer ``i`` and each
    group is written to ``res_path + '<topic>.txt'``.

    :param topic_num: number of LDA topics / output files.
    :param res_path: output directory, used as a plain string prefix for the
        file names (so it is expected to end with a path separator).
    :param q_path: file with one question per line.
    :param a_path: file with one answer per line, aligned with ``q_path``.
    :return: the value of ``lda.log_perplexity`` on the answer corpus.
    :raises IndexError: if there are more questions than answers.
    """
    with open(a_path, 'r', encoding='utf-8') as f:
        common_texts = [text.split() for text in f]
    with open(q_path, 'r', encoding='utf-8') as f:
        questions = f.readlines()

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, num_topics=topic_num)

    questions_clusterd = [[] for _ in range(topic_num)]
    print('Questions : ', len(questions))
    perp = lda.log_perplexity(common_corpus)

    for i, q in enumerate(questions):
        bow = common_dictionary.doc2bow(common_texts[i])
        # Ask for the full distribution: with the model's default threshold
        # the result can be empty, which previously left ``topic`` unbound
        # (or silently reused the previous question's topic).
        topic_probs = lda.get_document_topics(bow, minimum_probability=0.0)
        topic, _ = max(topic_probs, key=lambda pair: pair[1])
        questions_clusterd[topic].append(q)

    # ``os._exists`` is a private helper that does not look at the file
    # system; ``exist_ok`` gives the intended "create if missing" behaviour.
    os.makedirs(res_path, exist_ok=True)
    for top in range(topic_num):
        with open(res_path + str(top) + '.txt', 'w', encoding='utf-8') as f:
            # Questions keep their original trailing newline.
            f.writelines(questions_clusterd[top])
    return perp
def calcCoherence(lemmatizedTexts, passes=100, nTopics=5, workers = 1):
    """Train an LDA model on the given texts and return its c_v coherence.

    :param lemmatizedTexts: tokenised documents (iterated more than once).
    :param passes: number of training passes handed to ``LdaMulticore``.
    :param nTopics: number of topics to fit.
    :param workers: worker count handed to ``LdaMulticore``.
    :return: the value of ``CoherenceModel.get_coherence()``.
    """
    vocabulary = Dictionary(lemmatizedTexts)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in lemmatizedTexts]

    # Fixed seed and priors, exactly as configured for this experiment.
    lda_options = {
        'corpus': bow_corpus,
        'id2word': vocabulary,
        'num_topics': nTopics,
        'passes': passes,
        'random_state': 100,
        'per_word_topics': False,
        'alpha': 0.01,
        'eta': 0.9,
        'workers': workers,
    }
    topic_model = gensim.models.LdaMulticore(**lda_options)

    scorer = CoherenceModel(
        model=topic_model,
        texts=lemmatizedTexts,
        dictionary=vocabulary,
        coherence='c_v',
        processes=0,
    )
    return scorer.get_coherence()
def build_model(raw_file, ret_file):
    """Train an LDA model on cleaned tweets and write per-tweet topic scores.

    The number of topics ``k`` is taken from the digits that follow
    ``'tweets_lda_'`` in ``ret_file``. Every tweet is written to ``ret_file``
    as one JSON object per line, extended with an ``'lda<k>'`` entry that maps
    topic id (as a string) to its probability.

    :param raw_file: input path handed to ``load_all_tweets``.
    :param ret_file: output path; must contain ``tweets_lda_<k>``.
    :return: None
    :raises ValueError: if ``ret_file`` does not contain ``tweets_lda_<k>``.
    """
    import re

    all_tweets = load_all_tweets(raw_file)

    # Read the whole number, not just the first digit, so that e.g.
    # ``tweets_lda_10`` means ten topics rather than one.
    match = re.search(r'tweets_lda_(\d+)', ret_file)
    if match is None:
        raise ValueError(
            "cannot read topic count from {!r}: expected 'tweets_lda_<k>'".format(ret_file))
    k = int(match.group(1))
    print('k={}'.format(k))

    idx2twetid = []
    common_texts = []
    for key, tweet in all_tweets.items():
        idx2twetid.append(key)
        tokens = tweet['cleaned'].split(' ')
        common_texts.append([token for token in tokens if token not in punc_words])

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    print('begin to train')
    lda_model = LdaModel(common_corpus, id2word=common_dictionary, num_topics=k, random_state=13)
    pprint(lda_model.print_topics(num_words=20))
    print('\nPerplexity: ', lda_model.log_perplexity(common_corpus))

    with open(ret_file, 'w', encoding='utf-8') as fout:
        for i, tweetid in enumerate(idx2twetid):
            # JSON needs string keys and plain floats.
            lda_score = {str(topic_id): float(prob)
                         for topic_id, prob in lda_model[common_corpus[i]]}
            all_tweets[tweetid]['lda' + str(k)] = lda_score
            fout.write(json.dumps(all_tweets[tweetid]))
            fout.write('\n')
def __init__(self, fname, processes=None, dictionary=None, filter_namespaces=('0', )):
    """
    Initialize the corpus.

    :param fname: stored as ``self.fname``.
    :param processes: worker count; when None it defaults to the CPU count
        minus one, but never less than 1.
    :param dictionary: dictionary to use; when None a ``Dictionary`` built
        from a single empty document is created (nothing is scanned here).
    :param filter_namespaces: stored as ``self.filter_namespaces``.
    """
    self.fname = fname
    self.filter_namespaces = filter_namespaces
    self.metadata = False

    default_workers = max(1, multiprocessing.cpu_count() - 1)
    self.processes = default_workers if processes is None else processes

    if dictionary is not None:
        self.dictionary = dictionary
    else:
        self.dictionary = Dictionary([[]])
def main():
    """Read speeches from MongoDB, then train a 10-topic LDA model.

    NOTE(review): the texts collected from MongoDB are only counted; the
    model is trained on the module-level ``common_texts`` - confirm that
    this is intended.
    """
    client = MongoClient('localhost', 27017)
    discursos = client["discursoDB"]["discursos"]

    # One raw text per stored speech document.
    corpus = [disc["Conteudo"] for disc in discursos.find()]
    print(len(corpus))

    # Bag-of-words corpus built from ``common_texts``.
    common_dictionary = Dictionary(common_texts)
    common_corpus = []
    for text in common_texts:
        common_corpus.append(common_dictionary.doc2bow(text))

    # Train the model on the corpus.
    lda = LdaMulticore(common_corpus, num_topics=10)
def create_dictionaries(data, model, feature):
    """Build word->index / word->vector maps and index-encode ``data``.

    :param data: dict of key -> raw text; its values are replaced in place by
        lists of word indices (0 for words that are not in the vocabulary).
    :param model: embedding model exposing ``vocab``, ``vector_size`` and
        ``model[word]`` lookup.
    :param feature: when equal to ``'bow'`` texts and vector keys are
        lower-cased; any other value keeps the original case.
    :return: ``(w2idx, w2vec, data)``.
    """
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)

    # Indices start at 1 so that 0 can stand for "unknown word".
    w2idx = {v: k + 1 for k, v in gensim_dict.items()}
    w2idxl = {v.lower(): k + 1 for k, v in gensim_dict.items()}

    lowercase = feature == 'bow'

    w2vec = {}
    for word in w2idx:
        key = word.lower() if lowercase else word
        try:
            w2vec[key] = model[word]
        except KeyError:
            # Word without a vector: fall back to an all-zero vector.
            w2vec[key] = [0] * model.vector_size

    def parse_dataset(data, feature):
        """Replace every text in ``data`` by its list of word indices."""
        use_lower = feature == 'bow'
        lookup = w2idxl if use_lower else w2idx
        for key in data.keys():
            text = data[key].lower() if use_lower else data[key]
            words = text.replace('\n', '').split()
            # dict.get instead of a bare ``except`` around the lookup, so
            # that unrelated errors are no longer swallowed.
            data[key] = [lookup.get(word, 0) for word in words]
        return data

    out = parse_dataset(data, feature)
    return w2idx, w2vec, out
def create_dictionaries(model=None, combined=None):
    '''
    Build the vocabulary mappings and index-encode the sentences.

    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the space-separated sentences into padded index sequences
    4- Stores the padded sequence length in the global ``input_length``

    Returns ``(w2indx, w2vec, combined)``; when ``model`` or ``combined`` is
    None a message is printed and None is returned.
    '''
    global input_length

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        # Register every word of the embedding vocabulary.
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # word -> index, shifted by one so that 0 means "unknown word"
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word -> embedding vector
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers '''
            # dict.get replaces a bare ``except`` that hid every error.
            return [[w2indx.get(word, 0) for word in sentence.split(' ')]
                    for sentence in combined]

        combined = parse_dataset(combined)
        # Pad with zeros to a common length.
        combined = sequence.pad_sequences(combined)
        input_length = len(combined[0])
        return w2indx, w2vec, combined
    else:
        print('error: 模型或者和并集合combined 为空')
def get_train_data(filename_list, target_qq=None, min_len=4):
    """Collect tokenised chat lines of selected senders from history files.

    :param filename_list: paths of the chat history files to read.
    :param target_qq: sender ids to keep; when empty or None the
        module-level ``qq_filter`` is used instead.
    :param min_len: records with at most this many tokens (including the
        trailing ``'<eos>'``) are dropped.
    :return: ``(dictionary, chat_record)``, or None when any of the listed
        files does not exist.
    """
    # Refuse to do partial work: all files must exist.
    for filename in filename_list:
        if not os.path.isfile(filename):
            return

    # ``None`` replaces the former mutable ``[]`` default; both are falsy,
    # so selection falls back to ``qq_filter`` exactly as before.
    wanted_senders = target_qq or qq_filter

    current_qq = '--'
    is_target = False
    chat_record = []
    for filename in filename_list:
        with open(filename, 'r', encoding='utf-8') as history:
            for line in history:
                line = line.strip().replace('\n', '')
                for word in stopword:
                    line = line.replace(word, '')
                line = line.lower()
                if 'http' in line:
                    continue

                header = HEADER.match(line) or HEADER_MAIL.match(line)
                if header:
                    # A header line switches the current sender.
                    _, current_qq = header.groups()
                    is_target = current_qq in wanted_senders
                elif is_target and line:
                    line = standalize(line)
                    record = [token for token in jieba.cut(line) if token != ' ']
                    record.append('<eos>')
                    if len(record) <= min_len:
                        continue
                    chat_record.append(record)

                # NOTE(review): this only leaves the current file; later
                # files are still opened - confirm whether the cap should
                # apply across files.
                if len(chat_record) >= 100000:
                    break

    vocabulary = Dictionary(chat_record)
    return vocabulary, chat_record
def create_dictionaries(model=None, combined=None):
    '''
    Build the vocabulary mappings and index-encode the sentences.

    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the tokenised sentences into padded index sequences

    Returns ``(w2indx, w2vec, combined)``; when ``model`` or ``combined`` is
    None a message is printed and None is returned. Padding length comes
    from the module-level ``maxlen``.
    '''
    if (combined is not None) and (model is not None):
        # Dictionary filled from the embedding vocabulary (model.vocab).
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        # Index 0 is reserved for words without a vector, hence k + 1.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word -> embedding vector for every vocabulary word
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers '''
            # dict.get replaces a bare ``except`` that hid every error.
            return [[w2indx.get(word, 0) for word in sentence]
                    for sentence in combined]

        combined = parse_dataset(combined)
        # Sentences become index sequences of equal length; unknown words
        # keep index 0.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    '''
    Build the vocabulary mappings and index-encode the sentences.

    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the tokenised sentences into padded index sequences

    Returns ``(w2indx, w2vec, combined)``; when ``model`` or ``combined`` is
    None a message is printed and None is returned. Padding length comes
    from the module-level ``maxlen``.
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Index 0 is reserved for words without a vector, hence k + 1.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word -> embedding vector for every vocabulary word
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers '''
            data = []
            for sentence in combined:
                # dict.get replaces a bare ``except`` that hid every error.
                data.append([w2indx.get(word, 0) for word in sentence])
            return data

        combined = parse_dataset(combined)
        # Sentences become index sequences of equal length; unknown words
        # keep index 0.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('没有提供数据...')
def create_dictionaries(model=None, combined=None):
    """
    Build the vocabulary mappings and index-encode the sentences.

    1- Creates a word to index mapping from the words in ``combined``
    2- Creates a word to vector mapping
    3- Transforms the tokenised sentences into padded index sequences

    Returns ``(w2indx, w2vec, combined)``; when ``model`` or ``combined`` is
    None a message is printed and None is returned. Padding length comes
    from the module-level ``maxlen``.
    """
    from itertools import chain

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        # Flatten in linear time; the former pairwise list concatenation was
        # quadratic and raised TypeError for an empty ``combined``.
        all_words = list(chain.from_iterable(combined))
        gensim_dict.doc2bow(all_words, allow_update=True)
        # Index 0 is reserved for unknown words, hence k + 1.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word -> embedding vector
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers '''
            # dict.get replaces a bare ``except`` that hid every error.
            return [[w2indx.get(word, 0) for word in sentence]
                    for sentence in combined]

        combined = parse_dataset(combined)
        # Sentences become index sequences of equal length.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    '''
    Build the vocabulary mappings and index-encode the sentences.

    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the space-separated sentences into padded index sequences

    Returns ``(w2indx, w2vec, combined)``; when ``model`` or ``combined`` is
    None a message is printed and None is returned.
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        # Register every word of the embedding vocabulary.
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Index 0 is reserved for unknown words, hence k + 1.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word -> embedding vector
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers '''
            data = []
            for sentence in combined:
                words = sentence.split(' ')
                # dict.get replaces a bare ``except`` that hid every error.
                data.append([w2indx.get(word, 0) for word in words])
            return data

        combined = parse_dataset(combined)
        # No maxlen here: sequences are padded to the longest sentence.
        combined = sequence.pad_sequences(combined)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def build_vocab(
        self,
        documents_tokens: List[List[str]]) -> Tuple[List[List[str]], Dict]:
    """
    Build vocabulary.

    Fits a bigram and a trigram phrase model (stored on ``self``), converts
    every document with ``self.create_trigrams`` and builds a dictionary
    and bag-of-words corpus from the result.

    NOTE(review): documents are taken from the END of ``documents_tokens``
    and removed as they are processed, so the caller's list is empty
    afterwards and the returned corpus is in reverse input order - confirm
    callers expect this.

    :param documents_tokens: documents as list of tokens, e.g.
        [ ['the', 'brown', 'fox'], ['another', 'word', ..], ... ]
    :returns: a tuple consisting of list of documents as word counts
        (Bag-of-words), and Id2Word dictionary.
    """
    LOGGER.info('Fitting bigram model..')
    bigram = Phrases(
        documents_tokens,
        min_count=self.min_df,
        threshold=100,
        progress_per=100,
        common_terms=self.stop_words,
    )
    self.bigram_model = Phraser(bigram)

    LOGGER.info('Fitting trigram model..')
    trigram = Phrases(bigram[documents_tokens], threshold=100)
    self.trigram_model = Phraser(trigram)

    documents_trigrams = []
    LOGGER.info('Creating trigrams..')
    # Consume the input back to front, dropping each document once converted.
    while documents_tokens:
        converted = self.create_trigrams(documents_tokens[-1])
        documents_trigrams.append(converted)
        documents_tokens.pop()

    id2word = Dictionary(documents_trigrams)
    bag_of_words = [id2word.doc2bow(text) for text in documents_trigrams]
    return bag_of_words, id2word
def create_dictionaries(model=None, combined=None):
    '''
    Build the vocabulary mappings and index-encode the sentences.

    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the sentences into padded index sequences

    Returns ``(w2indx, w2vec, combined)``; when ``model`` or ``combined`` is
    None a message is printed and None is returned. Padding length comes
    from the module-level ``maxlen``.
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Words that have a vector get an index >= 1 ...
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # ... and their vectors are collected per word.
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers '''
            # A word without a vector is indexed 0. dict.get replaces a bare
            # ``except`` that hid every error.
            return [[w2indx.get(word, 0) for word in sentence]
                    for sentence in combined]

        combined = parse_dataset(combined)
        # Unify sentence length with keras' pad_sequences.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def buildDict(self, no_below=3, no_above=0.7, keep_n=5000):
    """Build ``self.gensim_dict`` by iterating over this reader.

    While the dictionary is built the post-processor is switched to
    ``'postProcess4Dict'`` and post-processing is forced on; both settings
    are restored afterwards, also when building fails.

    :param no_below: ``filter_extremes`` argument; overridden by
        ``self.config['GENSIM_DICT']['no_below']`` when that section exists.
    :param no_above: same, for ``no_above``.
    :param keep_n: same, for ``keep_n``.
    """
    from gensim.corpora.dictionary import Dictionary

    if 'GENSIM_DICT' in self.config:
        dict_config = self.config['GENSIM_DICT']
        no_below = int(dict_config.get('no_below', 3))
        no_above = float(dict_config.get('no_above', 0.7))
        keep_n = int(dict_config.get('keep_n', 5000))

    ori_pp_mode = copy.deepcopy(self.postProcessor.postProcessMethod)
    ori_go_postprocess = copy.deepcopy(self.goPoseprocessor)
    self.postProcessor.postProcessMethod = 'postProcess4Dict'
    self.goPoseprocessor = True
    try:
        self._reset_iter()
        self.gensim_dict = Dictionary(self)
        self._reset_iter()
    finally:
        # Never leave the reader stuck in dictionary mode.
        self.postProcessor.postProcessMethod = ori_pp_mode
        self.goPoseprocessor = ori_go_postprocess

    self.gensim_dict.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    if self.postProcessor:
        self.postProcessor.gensim_dict = self.gensim_dict
def preprocessingGensim(observations):
    """Add a ``'tf-idf'`` column with each row's sorted TF-IDF weights.

    :param observations: DataFrame with a ``'lemmatized'`` column holding
        one token list per row; it is modified in place.
    :return: the same DataFrame; ``'tf-idf'`` holds, per row, the list of
        ``(token_id, weight)`` pairs sorted by descending weight.
    """
    import pandas as pd

    logging.info('Begin preprocessingGensim')

    # Create a corpus
    documents = observations['lemmatized'].tolist()
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]

    # Create a new TfidfModel using the corpus
    tfidf = TfidfModel(corpus)

    # ``corpus`` is positional, so walk it directly instead of indexing it
    # with DataFrame labels (wrong for any non 0..n-1 index).
    ranked = [sorted(tfidf[bow], key=lambda w: w[1], reverse=True) for bow in corpus]

    # Assign the whole column at once; writing a list into a single cell
    # through ``.loc`` is treated as an iterable assignment by pandas.
    observations['tf-idf'] = pd.Series(ranked, index=observations.index, dtype=object)

    logging.info('End preprocessingGensim')
    return observations
def lda_vector(dataset: list, refer_dictionary=None, refer_lda_model=None):
    """Attach LDA topic probabilities to every sample of ``dataset``.

    Each sample is a mapping with ``'essay_lemma'`` (tokens) and
    ``'essay_is_stop'`` (parallel flags). When ``refer_dictionary`` is None
    a dictionary and a 10-topic LDA model are fitted on ``dataset`` itself;
    otherwise the supplied dictionary and model are used. Every topic
    returned for a sample is stored under ``'topic<N>'`` with N = id + 1.

    :return: ``(refer_dictionary, refer_lda_model)`` as used.
    """
    punctuation = [',', '.', '?']

    def content_tokens(sample):
        # Keep lemmas whose stop flag is exactly False and that are not
        # one of the listed punctuation marks.
        flags = sample['essay_is_stop']
        return [token for (i, token) in enumerate(sample['essay_lemma'])
                if flags[i] is False and token not in punctuation]

    doc = [content_tokens(sample) for sample in dataset]

    if refer_dictionary is None:
        refer_dictionary = Dictionary(doc)
        refer_doc2bow = [refer_dictionary.doc2bow(text) for text in doc]
        refer_lda_model = LdaModel(corpus=refer_doc2bow,
                                   id2word=refer_dictionary,
                                   num_topics=10,
                                   dtype=np.float64,
                                   passes=10,
                                   minimum_probability=0.0)

    doc_bow_s = [refer_dictionary.doc2bow(text) for text in doc]
    doc_vecs = [refer_lda_model[doc_bow] for doc_bow in doc_bow_s]

    for sample, doc_vec in zip(dataset, doc_vecs):
        for topic_id, probability in doc_vec:
            sample['topic' + str(topic_id + 1)] = probability
    return refer_dictionary, refer_lda_model
def __init__(self, fname, processes=None, lemmatize=utils.HAS_PATTERN, dictionary=None):
    """
    Initialize the corpus.

    Unless a dictionary is provided, the vocabulary is built from
    ``self.get_texts()``.

    :param fname: stored as ``self.fname``.
    :param processes: worker count; when None it defaults to the CPU count
        minus one, but never less than 1.
    :param lemmatize: stored as ``self.lemmatize``; defaults to
        ``utils.HAS_PATTERN``.
    :param dictionary: dictionary to use instead of building one.
    """
    self.fname = fname

    default_workers = max(1, multiprocessing.cpu_count() - 1)
    self.processes = default_workers if processes is None else processes

    # Must be set before get_texts() is called below.
    self.lemmatize = lemmatize

    if dictionary is not None:
        self.dictionary = dictionary
    else:
        self.dictionary = Dictionary(self.get_texts())
def topic_list(text):
    """Return up to five topic words extracted from ``text`` with LDA.

    The text is tokenised on word characters, lower-cased and stripped of
    ``stop_words``; a 5-topic LDA model is trained on that single document
    and the first five ids listed by ``show_topics`` are mapped back to
    words.

    NOTE(review): the ``(prob, token_id)`` unpacking matches one particular
    return layout of ``show_topics(formatted=False)`` - confirm against the
    installed gensim version.
    """
    # print() with a single argument behaves the same on Python 2 and 3.
    print('Topic modeling...')
    tokenizer = RegexpTokenizer(r'\w+')

    lowered = (token.lower() for token in tokenizer.tokenize(text))
    document = [word for word in lowered if word not in stop_words]

    documents = [document]
    dic = Dictionary(documents)
    corpus = [dic.doc2bow(doc) for doc in documents]
    lda = LdaModel(corpus, num_topics=5)

    topics = [
        dic[int(token_id)]
        for topic in lda.show_topics(formatted=False)
        for prob, token_id in topic
    ][:5]
    print(topics)
    return topics
def create_corpus(documents,
                  field='text',
                  normalizing='lemmatize',
                  language=DEFAULTLANGUAGE):
    """
    Build a vocabulary and a bag-of-words corpus from documents.

    :param documents: an iterable of documents (dictionaries)
    :param field: the field from which to extract data
    :param normalizing: passed to ``generate_word`` as ``normalize``
        (e.g. 'lemmatize' or 'stem'; other values use words as they are)
    :param language: passed through to ``generate_word``
    :return: ``(vocabulary, corpus)``
    """
    print('Creating corpus ...')
    print('caching token represetation from documents ...')

    # Materialise the tokens once; they are needed for both steps below.
    token_lists = []
    for doc_data in get_data_generator(documents, field=field):
        words = generate_word(doc_data, normalize=normalizing, language=language)
        token_lists.append(list(words))

    vocabulary = Dictionary(token_lists)
    corpus = [vocabulary.doc2bow(tokens) for tokens in token_lists]
    return vocabulary, corpus
def get_corpus_dictionary():
    """Crafts a toy corpus and the dictionary associated.

    Returns ``(corpus, dictionary)`` where ``corpus`` is the bag-of-words
    form of eight small documents and ``dictionary.id2token`` has been
    filled from ``dictionary.token2id``.
    """
    documents = [
        ['carrot', 'salad', 'tomato'],
        ['carrot', 'salad', 'dish'],
        ['tomato', 'dish'],
        ['tomato', 'salad'],
        ['car', 'break', 'highway'],
        ['highway', 'accident', 'car'],
        ['moto', 'break'],
        ['accident', 'moto', 'car'],
    ]
    dictionary = Dictionary(documents)

    # Bag-of-words representation of every document.
    corpus = []
    for doc in documents:
        corpus.append(dictionary.doc2bow(doc))

    # Fill the reverse (id -> token) index explicitly.
    reverse_index = {uid: token for token, uid in dictionary.token2id.items()}
    dictionary.id2token.update(reverse_index)

    return corpus, dictionary
def buildDic(self, model=None, words=None):
    '''
    Build the vocabulary mappings and index-encode the sentences.

    :param model: word2vec model
    :param words: tokenised text, one token sequence per sentence
    :return: ``(w2indx, w2vec, combined)`` - word -> index, word -> vector,
        and the sentences as index sequences padded to ``self.maxlen``.
        When ``model`` or ``words`` is None a message is printed and None
        is returned.
    '''
    if (model is not None) and (words is not None):
        # Named so that the builtin ``dict`` is not shadowed.
        gensim_dict = Dictionary()
        # Register every word known to the word2vec model.
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # word -> index; shifted by one so that 0 means "unknown word".
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word -> embedding vector
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parseDataset(words):
            """Map every sentence to its list of word indices."""
            # dict.get replaces a bare ``except`` that hid every error.
            return [[w2indx.get(word, 0) for word in sentence]
                    for sentence in words]

        combined = parseDataset(words)
        # Bring sequences of different length to one common length.
        combined = sequence.pad_sequences(combined, maxlen=self.maxlen)
        return w2indx, w2vec, combined
    else:
        print("模型或数据导入失败")
def preprocess(tweets):
    """Clean tweet texts and turn them into a dictionary plus BoW corpus.

    Works on a copy of ``tweets``; the cleaning helpers are applied in a
    fixed order before tokenising.

    :return: ``(text_dictionary, text_corpus)``
    """
    newTweets = tweets.copy()
    newTweets = remove_airline_tags(newTweets)
    newTweets.text = remove_links(newTweets.text)

    # Symbol conversions, innermost first: arrows, ampersands, then < and >.
    converted = arrow_conversion(newTweets.text)
    converted = ampersand_conversion(converted)
    newTweets.text = lt_gt_conversion(converted)

    newTweets.text = with_without_conversion(newTweets.text)
    newTweets.text = hashtag_to_words(newTweets.text)
    newTweets = translate_all_emoji(newTweets)
    for text_step in (remove_contractions, remove_punctuation,
                      lemmatize_texts, remove_stopwords):
        newTweets.text = text_step(newTweets.text)
    newTweets.text = newTweets.text.str.lower()
    texts = newTweets["text"].values

    # Tokenize; drop words of at most two characters and filtered words.
    tokenized_texts = [
        [word for word in text.split()
         if len(word) > 2 and word not in FILTERED_WORDS]
        for text in texts
    ]

    text_dictionary = Dictionary(tokenized_texts)
    # Drop words found in more than 50% of the texts or in fewer than 5
    # texts, then keep at most half of the original vocabulary size.
    vocabulary_cap = len(text_dictionary) // 2
    text_dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=vocabulary_cap)

    text_corpus = [text_dictionary.doc2bow(text) for text in tokenized_texts]
    return (text_dictionary, text_corpus)
def create_dictionaries(model=None, combined=None):
    '''
    This function does three things:

    1- Creates a word to index mapping
    2- Creates a word to vector mapping
    3- Transforms the tokenised sentences into padded index sequences

    Returns ``(w2indx, w2vec, combined)``; when ``model`` or ``combined`` is
    None a message is printed and None is returned. Padding length comes
    from the module-level ``maxlen``.
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Index 0 is reserved for words without a vector, hence k + 1.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word -> embedding vector for every vocabulary word
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers '''
            encoded = []
            for sentence in combined:
                # dict.get replaces a bare ``except`` that hid every error.
                encoded.append([w2indx.get(word, 0) for word in sentence])
            return encoded

        combined = parse_dataset(combined)
        # Sentences become index sequences of equal length; unknown words
        # keep index 0.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    '''
    Build the vocabulary mappings and index-encode the sentences.

    Returns ``(w2indx, w2vec, combined)`` - word -> index, word -> vector
    and the sentences as index sequences padded to the module-level
    ``maxlen``. The padded array is also written, as text, to the
    module-level file object ``f12``. When ``model`` or ``combined`` is
    None a message is printed and None is returned.
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Index 0 is reserved for words without a vector, hence k + 1.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word -> embedding vector for every vocabulary word
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            ''' Words become integers '''
            # dict.get replaces a bare ``except`` that hid every error.
            return [[w2indx.get(word, 0) for word in sentence]
                    for sentence in combined]

        combined = parse_dataset(combined)
        # Sentences become index sequences of equal length; unknown words
        # keep index 0.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        f12.write(str(combined))
        f12.write('\n')
        return w2indx, w2vec, combined
    else:
        print('没有提供数据...')
def main():
    """Run the topics-over-time pipeline and store its results.

    Loads the dataset, pickles the gensim dictionary and bag-of-words
    corpus for later coherence evaluation, runs the Gibbs sampler and
    writes the resulting matrices as CSV plus the parameters as a pickle.
    """
    articles_path = '/texts_corrected/*.txt'
    stopword_path = '/stopwords.txt'
    resultspath = '/results/'
    location_path = '/locations.txt'

    tot_topic_vectors_path = resultspath + 'time200msc_topic_vectors_beta0_1.csv'
    tot_topic_mixtures_path = resultspath + 'time200msc_topic_mixtures_beta0_1.csv'
    tot_topic_shapes_path = resultspath + 'time200msc_topic_shapes_beta0_1.csv'
    tot_pickle_path = resultspath + 'time200iter_beta0_1.pickle'
    coherence_pickle_path = resultspath + 'coherence.pickle'
    seed_file = resultspath + '/seedwords.txt'

    tot = stot_model()
    articles, date, vocab = tot.initDataset(articles_path, stopword_path, location_path)

    # Save the variables needed for the coherence measures. ``with`` makes
    # sure the file is closed even when pickling fails.
    dictionary = Dictionary(articles)
    corpus = [dictionary.doc2bow(article) for article in articles]
    with open(coherence_pickle_path, 'wb') as coherence_pickle:
        pickle.dump(dictionary, coherence_pickle)
        pickle.dump(corpus, coherence_pickle)

    # Resume with the modelling process.
    tot.init_seedwords(seed_file, vocab)
    param = tot.initParam(articles, date, vocab)
    theta, phi, psi = tot.TopicsOverTimeGibbsSampling(param)

    np.savetxt(tot_topic_vectors_path, phi, delimiter=',')
    np.savetxt(tot_topic_mixtures_path, theta, delimiter=',')
    np.savetxt(tot_topic_shapes_path, psi, delimiter=',')

    with open(tot_pickle_path, 'wb') as tot_pickle:
        pickle.dump(param, tot_pickle)
def create_dictionaries(train=None, test=None, model=None):
    """Build word->index / word->vector maps and index-encode both datasets.

    :param train: dict of key -> raw text; values are replaced in place.
    :param test: dict of key -> raw text; values are replaced in place.
    :param model: embedding model exposing ``vocab`` and ``model[word]``.
    :return: ``(w2indx, w2vec, train, test)``; when any argument is None a
        message is printed and None is returned.
    """
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        # Indices start at 1 so that 0 can stand for "unknown word".
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            """Replace every text by the list of its word indices."""
            for key in data.keys():
                words = data[key].lower().replace('\n', '').split()
                # dict.get replaces a bare ``except`` that hid every error.
                data[key] = [w2indx.get(word, 0) for word in words]
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):
    """Build the vocabulary mappings and index-encode the sentences.

    The word -> index mapping is also dumped to ``word2index.txt`` in the
    current directory, one ``<word> <index>`` pair per line.

    :return: ``(w2indx, w2vec, combined)`` - word -> index, word -> vector
        and the sentences as index sequences padded to the module-level
        ``maxlen``. When ``model`` or ``combined`` is None a message is
        printed and None is returned.
    """
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Index 0 is reserved for unknown words, hence k + 1.
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}

        # ``with`` closes the file even if a write fails.
        with open("word2index.txt", 'w', encoding='utf8') as f:
            for key, index in w2indx.items():
                f.write(str(key) + ' ' + str(index) + '\n')

        # word -> embedding vector for every vocabulary word
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            """Map every sentence to its list of word indices."""
            # dict.get replaces a bare ``except`` that hid every error.
            return [[w2indx.get(word, 0) for word in sentence]
                    for sentence in combined]

        combined = parse_dataset(combined)  # [[1, 2, 3, ...], [...]]
        # Sentences become index sequences of equal length.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def buildDict(self):
    """Build the vocabulary from the training (and optionally test) data.

    Creates a ``Dictionary`` from the training iterator, extends it with
    the test iterator when ``self.testReaderargs`` is truthy, filters it
    with the configured limits and installs the resulting
    ``DictionaryProcess`` on ``self`` and on the post-processor. Finally
    the training data is iterated once more and the mean of ``sum(item)``
    over all items is printed.
    """
    def single_item_batches(data_iter):
        # Same iterator configuration for every pass over the data.
        return BatchIterBert(data_iter,
                             filling_last_batch=False,
                             postProcessor=xonlyBatchProcessor,
                             batch_size=1)

    common_dictionary = Dictionary(single_item_batches(self.trainDataIter))
    print(len(common_dictionary))

    if self.testReaderargs:
        print('update vocab from test set')
        common_dictionary.add_documents(single_item_batches(self.testDataIter))
        print(len(common_dictionary))

    common_dictionary.filter_extremes(no_below=self.dict_no_below,
                                      no_above=self.dict_no_above,
                                      keep_n=self.dict_keep_n)
    self.dictProcess = DictionaryProcess(common_dictionary)
    self.postProcessor.dictProcess = self.dictProcess
    self.vocab_dim = len(self.dictProcess)
    self.have_dict = True

    # Second pass over the training data, now with the dictionary in place.
    self.trainDataIter._reset_iter()
    count_list = [sum(item) for item in single_item_batches(self.trainDataIter)]
    print(sum(count_list) / len(count_list))
def __init__(self, data=None, dictionary=None):
    """Initialize; ``data`` should be provided unless the object is being
    unpickled.

    With data, a dictionary is built when none is supplied and the
    bag-of-words corpus is derived from it. Without data, both dictionary
    and corpus are None (a supplied dictionary is discarded). All model and
    training settings start out as None.
    """
    self.data = data
    for name in ('model', 'num_topics', 'iterations', 'random_state'):
        setattr(self, name, None)

    self.dictionary = dictionary
    if self.data is None:
        self.dictionary = None
        self.corpus = None
    else:
        if self.dictionary is None:
            self.dictionary = Dictionary(self.data)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.data]

    # Training / evaluation settings, filled in later.
    unset_attributes = (
        'distributed', 'chuncksize', 'passes', 'update_every', 'alpha',
        'eta', 'decay', 'offset', 'eval_every', 'gamma_threshold',
        'minimum_probability', 'ns_conf', 'minimum_phi_value',
        'per_word_topics', 'coherence_model', 'coherence', 'coherence_type',
    )
    for name in unset_attributes:
        setattr(self, name, None)