def create_dictionaries(model=None, combined=None):
    """Build vocabulary lookups from ``model`` and index-encode ``combined``.

    Args:
        model: Word-vector model exposing ``vocab`` (a mapping keyed by word)
            and ``model[word]`` vector lookup -- presumably a gensim Word2Vec.
        combined: Iterable of tokenised sentences (each an iterable of words).

    Returns:
        ``(w2indx, w2vec, combined)``: ``w2indx`` maps word -> index (starting
        at 1), ``w2vec`` maps word -> vector, and ``combined`` is the result
        of ``sequence.pad_sequences`` applied to the index-encoded sentences.
        Returns ``None`` after printing a message when either argument is
        ``None``.
    """
    if combined is None or model is None:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    # Shift ids by one so that index 0 stays free for unknown words.
    # NOTE(review): the original comments say the vocabulary only holds words
    # seen at least 10 times; that threshold is decided where the model is
    # trained, not here -- confirm.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Replace each word by its index; words not in ``w2indx`` become 0."""
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # ``maxlen`` is not defined in this block; it is resolved from the
    # enclosing module scope.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
class WordCorpus(BaseCorpus):
    """Thin wrapper around a `gensim.corpora.dictionary.Dictionary`.

    Lightweight alternative to `CableCorpus` for building an initial word
    dictionary::

        wd = WordCorpus()
        wd.add_text('ref-1', 'bla bla')
        # add more texts
        wd.dct.filter_extremes()

        corpus = CableCorpus('/my/directory/', wd.dct)
        corpus.add_text('ref-1', 'bla bla')
        # add more texts
        corpus.close()
    """

    def __init__(self, dct=None, tokenizer=None):
        """Set up the wrapper.

        `dct`
            Dictionary to reuse; a new, empty Dictionary is created when
            ``None`` (the default).
        `tokenizer`
            Tokenizer function or ``None``; passed to `BaseCorpus`.
        """
        super(WordCorpus, self).__init__(tokenizer)
        if dct is None:
            dct = Dictionary()
        self.dct = dct

    def add_words(self, reference_id, words):
        """Register `words` in the dictionary; `reference_id` is not used."""
        self.dct.doc2bow(words, allow_update=True)
def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    """Build the title, body and tag dictionaries from ``n`` split files.

    Args:
        splits_template: ``%``-format string; ``splits_template % eid`` gives
            the argument passed to ``row_stream`` for split ``eid``.
        n: Number of splits; ids ``0 .. n-1`` are read.
        save_pickle_tup: Optional 3-tuple of file names in the order
            ``(title, body, tags)``. An entry that is falsy (e.g. ``None``)
            means the corresponding dictionary is not saved.

    Returns:
        ``(utitledict, ubodydict, utagdict)``.
    """
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    # ``range`` works on both Python 2 and 3 (the original used ``xrange``).
    for eid in range(n):
        for row in row_stream(splits_template % eid):
            # Rows are expected to have exactly four fields; the id is unused.
            _id, title, body, tags = row
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)

    # Every row feeds all three dictionaries, so the counts must agree.
    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs

    named_dicts = (
        ("utitledict", utitledict),
        ("ubodydict", ubodydict),
        ("utagdict", utagdict),
    )

    # Single pre-formatted argument keeps the output identical on
    # Python 2 (print statement) and Python 3 (print function).
    print("Before filtering...")
    for label, dct in named_dicts:
        print("%s: %s" % (label, dct))

    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        for (label, dct), path in zip(named_dicts, save_pickle_tup):
            if path:
                print("saving %s..." % label)
                dct.save(path)

    return (utitledict, ubodydict, utagdict)
def doc_to_gensim(doc, lemmatize=True, filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Turn one ``spacy.Doc`` into a gensim dictionary plus its bag-of-words form.

    Args:
        doc (``spacy.Doc``)
        lemmatize (bool): when exactly ``True``, words are represented by
            their ``lemma_``; any other value uses ``orth_``
        filter_stops (bool): forwarded to ``extract.words``
        filter_punct (bool): forwarded to ``extract.words``
        filter_nums (bool): forwarded to ``extract.words``

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list((int, int)): bag-of-words document, a list of
            (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    words = extract.words(
        doc,
        filter_stops=filter_stops,
        filter_punct=filter_punct,
        filter_nums=filter_nums,
    )
    # Identity check kept on purpose: only the literal ``True`` selects lemmas.
    text_attr = 'lemma_' if lemmatize is True else 'orth_'
    gdoc = gdict.doc2bow(
        (getattr(word, text_attr) for word in words),
        allow_update=True,
    )
    return (gdict, gdoc)
def create_dictionaries(train=None, test=None, model=None):
    """Build vocabulary lookups and index-encode the train/test texts.

    Args:
        train: Mapping of key -> raw text; modified in place.
        test: Mapping of key -> raw text; modified in place.
        model: Word-vector model exposing ``vocab`` and ``model[word]`` --
            presumably a gensim Word2Vec.

    Returns:
        ``(w2indx, w2vec, train, test)`` where ``w2indx`` maps word -> index
        (starting at 1), ``w2vec`` maps word -> vector, and ``train``/``test``
        are the same mapping objects with each text replaced by a list of
        indices. Returns ``None`` after printing a message when any argument
        is ``None``.
    """
    if train is None or model is None or test is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    # Ids are shifted by one; 0 is used for out-of-vocabulary words.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(data):
        """Lower-case, drop newlines, split on whitespace, map to indices."""
        for key in data.keys():
            tokens = data[key].lower().replace('\n', '').split()
            # ``.get`` replaces the former bare ``except:``, which also hid
            # unrelated errors.
            data[key] = [w2indx.get(token, 0) for token in tokens]
        return data

    train = parse_dataset(train)
    test = parse_dataset(test)
    return w2indx, w2vec, train, test
def _load_vocab(self, fname):
    """Load a plain-text vocabulary file into ``self.dictionary``.

    Each line of ``fname`` (decoded as UTF-8) is stripped, lower-cased,
    re-encoded to UTF-8 and split on whitespace; the resulting tokens are
    added to a fresh ``Dictionary`` as one document per line.

    Args:
        fname: Path of the vocabulary file.
    """
    logging.info("loading plain-text file:{}".format(fname))
    dictionary = Dictionary()
    num_instances = 0
    # The context manager closes the file even if a line fails to decode;
    # the original never closed the handle.
    with codecs.open(fname, 'rb', 'utf-8') as src_file:
        for term in src_file:
            tokens = term.strip().lower().encode('utf-8').split()
            dictionary.doc2bow(tokens, allow_update=True)
            num_instances += 1
    logging.info("processed {} instances".format(num_instances))
    self.dictionary = dictionary
def get_corpus_dictionary():
    """Return a small bag-of-words corpus together with its dictionary."""
    # Two loose themes: food (first four docs) and traffic (last four).
    documents = [
        ['carrot', 'salad', 'tomato'],
        ['carrot', 'salad', 'dish'],
        ['tomato', 'dish'],
        ['tomato', 'salad'],
        ['car', 'break', 'highway'],
        ['highway', 'accident', 'car'],
        ['moto', 'break'],
        ['accident', 'moto', 'car'],
    ]
    dictionary = Dictionary(documents)

    # Bag-of-words encoding of every document.
    corpus = list(map(dictionary.doc2bow, documents))

    # Fill the id -> token reverse index from token2id.
    for token, uid in dictionary.token2id.items():
        dictionary.id2token[uid] = token

    return corpus, dictionary
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """Lemmatize and filter documents, then build a filtered dictionary.

    :param corpora: mapping of original id -> raw document text (``None``
        documents are skipped)
    :param stopwords: container of words to drop
    :param allowed_pos: passed to ``lemmatize`` as ``allowed_tags``
    :param max_doc: stop after this many entries of ``corpora`` have been
        visited (skipped ``None`` entries count too)
    :param no_above: passed to ``Dictionary.filter_extremes``
    :param no_below: passed to ``Dictionary.filter_extremes``
    :param keep_n: passed to ``Dictionary.filter_extremes``
    :rtype: gensim.corpora.dictionary.Dictionary
    :return: the dictionary, with three extra attributes attached:
        ``corpus_id2orig_id`` (position -> original id), ``corpus``
        (bag-of-words documents) and ``id2token``
    """
    import sys  # local import: only needed for the progress indicator

    logging.info('Lemmatizing the corpora...')
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []
    for count, (index, corpus) in enumerate(corpora.items(), 1):
        if count > max_doc:
            break
        if corpus is None:
            continue

        # Single-line progress indicator; written through sys.stdout so it
        # behaves the same on Python 2 and 3 (the original used Python-2-only
        # print statements).
        sys.stdout.write('\r%s / %s' % (count, corpus_num))
        sys.stdout.flush()

        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        words = []
        for token in tokens:
            # Tokens look like "word/POS" (presumably gensim's lemmatize
            # output -- confirm). Splitting on the last slash keeps lemmas
            # that contain '/' themselves from raising.
            word, _pos = token.rsplit('/', 1)
            words.append(word)

        # convert compound word into one token
        words = convert_compound(words)
        # drop stop words, too short/long words and anything not all-lowercase
        words = [w for w in words
                 if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(words)
        corpus_id2orig_id.append(index)
    sys.stdout.write('\n\n')

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(doc) for doc in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None, headlines=None):
    """Generate word:index, index:word, and word:vector dictionaries.

    Index 0 is reserved for the newline character, which is mapped to an
    all-zero vector.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean
            Filter the corpus to only those words seen in the bodies/headlines.
        bodies (optional): list of lists
            Must be passed in if `filter_corpus` is True.
        headlines (optional): list of lists
            Must be passed in if `filter_corpus` is True.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict

    Raises:
    ------
        ValueError: `filter_corpus` is True but `bodies` or `headlines` is
            missing or empty. (ValueError is a subclass of the bare
            ``Exception`` raised previously, so ``except Exception`` callers
            still work.)
    """
    if filter_corpus:
        if not bodies or not headlines:
            raise ValueError(
                "Must pass in bodies and headlines with filter_corpus True!")
        wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # One pass over the dictionary fills all three maps.
    # Indices are shifted by one to leave 0 for the newline character.
    word_idx_dct, idx_word_dct, word_vector_dct = {}, {}, {}
    for idx, wrd in gensim_dct.items():
        word_idx_dct[wrd] = idx + 1
        idx_word_dct[idx + 1] = wrd
        word_vector_dct[wrd] = wrd_embedding[wrd]

    # Vector length is taken from the first stored vector, *before* the
    # newline entry is added (as in the original).
    vec_dim = next(len(value) for value in word_vector_dct.values())

    word_idx_dct['\n'] = 0
    idx_word_dct[0] = '\n'
    word_vector_dct['\n'] = np.zeros((vec_dim))

    return word_idx_dct, idx_word_dct, word_vector_dct
def create_mapping_dicts(wrd_embedding, reviews=None, vocab_size=None):
    """Generate word:index, index:word, and word:vector dictionaries.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        reviews (optional): np.array (or array-like) of lists of strings
            When given, the embedding is first passed through
            `_filter_corpus` together with `reviews` and `vocab_size`.
        vocab_size (optional): int
            Forwarded to `_filter_corpus`; only used when `reviews` is given.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """
    if reviews is not None:
        wrd_embedding = _filter_corpus(wrd_embedding, reviews, vocab_size)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # Real words start at index 3. The lower indices are reserved:
    # 0 for masking the padding, 1 for end-of-sequence ('EOS') and
    # 2 for unknown words ('UNK').
    offset = 3
    wrd_idx_dct = {}
    idx_wrd_dct = {}
    for idx, wrd in gensim_dct.items():
        wrd_idx_dct[wrd] = idx + offset
    for idx, wrd in gensim_dct.items():
        idx_wrd_dct[idx + offset] = wrd

    wrd_idx_dct['EOS'] = 1
    idx_wrd_dct[1] = 'EOS'
    wrd_idx_dct['UNK'] = 2
    idx_wrd_dct[2] = 'UNK'

    wrd_vector_dct = {}
    for _, wrd in gensim_dct.items():
        wrd_vector_dct[wrd] = wrd_embedding[wrd]

    # Both special tokens get an all-zero vector of the embedding size.
    embedding_dim = wrd_embedding.vector_size
    wrd_vector_dct['EOS'] = np.zeros((embedding_dim))
    wrd_vector_dct['UNK'] = np.zeros((embedding_dim))

    return wrd_idx_dct, idx_wrd_dct, wrd_vector_dct
class tip_rec:
    """Topic-affinity recommender built on an LDA topic model.

    ``train`` fits an LDA model on the ``context`` column of a data frame and
    accumulates, per sender, a normalised topic-affinity profile.
    ``predict`` scores a text against one or all of those profiles by
    Euclidean distance (smaller means closer).
    """

    def __init__(self, num_topics = 15):
        self.numtopics = num_topics
        # Template profile: topic id -> 0.0, copied for every user in train().
        self.topic_dict = dict(enumerate(np.zeros(num_topics)))
        self.user_dict = {}   # sender -> {topic id -> affinity}
        self.model = None     # fitted LDA model
        self.worddict = {}    # feature index -> term
        self.mydict = None    # gensim Dictionary built from the corpus

    def train(self, df):
        """Fit the topic model and the per-sender profiles.

        Args:
            df: Data frame with ``sender``, ``context`` and ``amt`` columns
                (presumably a pandas DataFrame -- it is used through
                ``df.sender.unique()``, ``df['context']`` and
                ``df.iterrows()``).
        """
        self.user_dict = {el: self.topic_dict.copy() for el in df.sender.unique()}

        # NOTE(review): ``CV`` is presumably scikit-learn's CountVectorizer.
        cv = CV(stop_words='english')
        X = cv.fit_transform(df['context'])
        # ``vocabulary_`` maps term -> column index. Invert it so ids line up
        # with the columns of X; enumerating the keys (as before) paired ids
        # with unrelated terms.
        self.worddict = {idx: term for term, idx in cv.vocabulary_.items()}

        corpus = matutils.Sparse2Corpus(X, documents_columns=False)
        self.mydict = Dictionary.from_corpus(corpus, id2word=self.worddict)
        self.model = LatentDA.LdaModel(corpus, num_topics=self.numtopics,
                                       passes=20, id2word=self.worddict)

        # The original loop referenced undefined names (new_model, mydict,
        # user_dict) and raised NameError; they are the instance attributes.
        for _, row in df.iterrows():
            if row['context'] == '':
                continue
            affinities = self.user_dict[row.sender]
            topics = self.model[self.mydict.doc2bow(row['context'].split())]
            for topic_id, weight in topics:
                if topic_id in affinities:
                    if row.amt == '':
                        continue
                    affinities[topic_id] += weight * float(row.amt)
                    continue
                affinities[topic_id] = weight

        # Normalise every profile to sum to 1. Profiles that are all zero are
        # left untouched instead of being turned into NaNs by 0/0.
        for affinities in self.user_dict.values():
            norm_const = sum(affinities.values())
            if norm_const == 0:
                continue
            for topic_id in affinities:
                affinities[topic_id] = affinities[topic_id] / norm_const

    def _score(self, user, doc_aff):
        """Return the Euclidean distance between a user's profile and doc_aff."""
        # list(...) is required on Python 3, where dict.values() is a view
        # that numpy would otherwise wrap as a 0-d object array.
        user_aff = np.array(list(self.user_dict[user].values()))
        return np.linalg.norm(user_aff - doc_aff)

    def predict(self, text, username = ''):
        """Score ``text`` against user profiles.

        Args:
            text: Whitespace-tokenised on the fly via ``text.split()``.
            username: When empty (default), score against every known user.

        Returns:
            ``{user: score}`` for all users when ``username`` is ``''``,
            otherwise the tuple ``(username, score)``.
        """
        topics = self.model[self.mydict.doc2bow(text.split())]
        doc_aff = np.zeros(self.numtopics)
        for topic_id, weight in topics:
            doc_aff[topic_id] = weight

        if username == '':
            return {user: self._score(user, doc_aff) for user in self.user_dict}
        return (username, self._score(username, doc_aff))
def cluster_questions(topic_num, res_path, q_path=r'datasets\DialogQA\Qall.txt', a_path=r'datasets\DialogQA\Aall.txt'):
    """Cluster questions by the dominant LDA topic of their paired answer.

    An LDA model with ``topic_num`` topics is trained on the answers file;
    question ``i`` is assigned to the most probable topic of answer ``i``.
    One file ``<res_path><topic>.txt`` is written per topic.

    Args:
        topic_num: Number of LDA topics / output clusters.
        res_path: Output directory, created if missing. It is used as a plain
            string prefix for the file names, so it should end with a path
            separator.
        q_path: UTF-8 file with one question per line.
        a_path: UTF-8 file with one whitespace-tokenised answer per line;
            line ``i`` is paired with question ``i``.

    Returns:
        The value of ``lda.log_perplexity`` on the answers corpus.
    """
    # The defaults are raw strings: same values as before, without relying on
    # invalid escape sequences (\D, \Q, \A).
    with open(a_path, 'r', encoding='utf-8') as f:
        common_texts = [text.split() for text in f.readlines()]
    with open(q_path, 'r', encoding='utf-8') as f:
        questions = f.readlines()

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, num_topics=topic_num)

    questions_clusterd = [[] for _ in range(topic_num)]
    print('Questions : ', len(questions))
    perp = lda.log_perplexity(common_corpus)

    for i, question in enumerate(questions):
        bow = common_dictionary.doc2bow(common_texts[i])
        # Ask for every topic so a best topic always exists; previously a
        # document whose topics all fell under the default cut-off reused the
        # previous question's topic (or raised NameError on the first one).
        topic_probs = lda.get_document_topics(bow, minimum_probability=0)
        topic, _ = max(topic_probs, key=lambda pair: pair[1])
        questions_clusterd[topic].append(question)

    # ``os._exists`` is a private helper that does not test the filesystem,
    # so the old guard always called makedirs and failed on an existing
    # directory.
    os.makedirs(res_path, exist_ok=True)
    for top in range(topic_num):
        with open(res_path + str(top) + '.txt', 'w', encoding='utf-8') as f:
            # Lines still carry their trailing newline from readlines().
            f.writelines(questions_clusterd[top])
    return perp
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    Does three things:
      1. builds a word -> index mapping (indices start at 1),
      2. builds a word -> vector mapping,
      3. converts ``combined`` into padded index sequences.

    Args:
        model: Word-vector model exposing ``vocab`` and ``model[word]`` --
            presumably a gensim Word2Vec.
        combined: Iterable of tokenised sentences.

    Returns:
        ``(w2indx, w2vec, combined)``, or ``None`` (after printing a message)
        when either argument is ``None``.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    # Feed the model's vocabulary into a gensim Dictionary to obtain ids.
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    # Invert id -> word into word -> id + 1; 0 is kept for words that are
    # not in the vocabulary.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Words become integers; unknown words become 0."""
        # dict.get replaces a bare ``except:`` that swallowed every error.
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # ``maxlen`` comes from the enclosing module scope.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
class LDATextEncoder(TextEncoder):
    """Text encoder that represents documents by LDA topic probabilities."""

    def __init__(self, language="english", encoding_length=20):
        """Store the configuration; ``encoding_length`` is the topic count."""
        # Attributes are assigned before the base initialiser runs, as in
        # the original.
        self.name = "LDA"
        self.model = None
        self.num_topics = encoding_length
        self.dictionary = None
        super().__init__(language=language, encoding_length=encoding_length)

    def fit(self, docs):
        """Build the dictionary and train the LDA model; returns ``self``."""
        tokenized = self.preprocess_docs(docs)
        self.dictionary = Dictionary(tokenized)
        bows = list(map(self.dictionary.doc2bow, tokenized))
        self.model = LdaModel(
            bows,
            id2word=self.dictionary,
            num_topics=self.num_topics,
            minimum_probability=0.0,
        )
        return self

    def transform(self, docs):
        """Return an array holding the topic probabilities of each document."""
        tokenized = self.preprocess_docs(docs)
        bows = list(map(self.dictionary.doc2bow, tokenized))
        topic_rows = [self.model[bow] for bow in bows]
        # Each row is a list of (topic id, probability); keep probabilities.
        return np.array(topic_rows)[:, :, 1]
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    Jobs:
      1. create a word -> index mapping (indices start at 1),
      2. create a word -> vector mapping,
      3. convert every sentence of ``combined`` into a padded index sequence.

    Side effect: sets the module-level ``input_length`` to the length of the
    first padded sequence.

    Args:
        model: Word-vector model exposing ``wv.vocab`` and ``model[word]``.
        combined: Iterable of sentences, each a single string whose words are
            separated by one space.

    Returns:
        ``(w2indx, w2vec, combined)``, or ``None`` (after printing a message)
        when either argument is ``None``.
    """
    global input_length

    if combined is None or model is None:
        print('error: 模型或者和并集合combined 为空')
        return None

    gensim_dict = Dictionary()
    # Register every vocabulary word to obtain integer ids.
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # word -> id + 1; 0 stays free for unknown words and padding.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    # word -> vector
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Words become integers; unknown words become 0."""
        # dict.get replaces a bare ``except:`` around the lookup.
        return [[w2indx.get(word, 0) for word in sentence.split(' ')]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # No maxlen is passed here (unlike sibling variants of this function).
    combined = sequence.pad_sequences(combined)
    input_length = len(combined[0])
    return w2indx, w2vec, combined
def gensim_lda(pd_df_yelp, text_rev):
    """Describe each document by the words of its first reported LDA topic.

    Args:
        pd_df_yelp: Unused; kept for interface compatibility.
        text_rev: Iterable of tokenised documents (lists of words).

    Returns:
        ``(topic_prob_list_words, topic_prob_list_prob)``: for every document,
        the words of the first topic returned by ``get_document_topics`` and
        the matching word probabilities.
    """
    common_dict = Dictionary(text_rev)
    common_corpus = [common_dict.doc2bow(text) for text in text_rev]
    # The model is trained without id2word, so show_topic() reports words as
    # stringified ids that are mapped back through the dictionary below.
    lda = LdaModel(common_corpus)

    # Materialise the id -> token mapping once instead of once per word.
    id2token = dict(common_dict)

    topic_prob_list_words = []
    topic_prob_list_prob = []
    for doc in common_corpus:
        # First listed topic -- not necessarily the most probable one.
        topic_id = lda.get_document_topics(doc)[0][0]
        # Unpack into tuples right away: on Python 3 ``zip`` is a one-shot
        # iterator, so the original indexing/re-use of it failed.
        term_ids, probs = zip(*lda.show_topic(topic_id))
        topic_prob_list_words.append([id2token[int(term)] for term in term_ids])
        topic_prob_list_prob.append(list(probs))

    return (topic_prob_list_words, topic_prob_list_prob)
def create_dictionaries(data, model, feature):
    """Build vocabulary lookups and index-encode the texts in ``data``.

    Args:
        data: Mapping of key -> raw text; modified in place.
        model: Word-vector model exposing ``vocab``, ``vector_size`` and
            ``model[word]``.
        feature: When equal to ``'bow'`` the texts and the vector keys are
            lower-cased; any other value keeps the original casing.

    Returns:
        ``(w2idx, w2vec, out)``: ``w2idx`` maps word -> index (starting at 1,
        original casing), ``w2vec`` maps word -> vector (keys lower-cased in
        ``'bow'`` mode) and ``out`` is ``data`` with every text replaced by a
        list of indices (0 for unknown words).
    """
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    w2idx = {word: idx + 1 for idx, word in gensim_dict.items()}
    # Case-folded lookup used in 'bow' mode.
    w2idxl = {word.lower(): idx + 1 for idx, word in gensim_dict.items()}

    lowercase = feature == 'bow'
    w2vec = {}
    for word in w2idx:
        key = word.lower() if lowercase else word
        try:
            w2vec[key] = model[word]
        except KeyError:
            # Fall back to a zero vector if the model has no entry.
            w2vec[key] = [0] * model.vector_size

    def parse_dataset(data, feature):
        """Replace every text in ``data`` by its list of word indices."""
        fold_case = feature == 'bow'
        lookup = w2idxl if fold_case else w2idx
        for key in data.keys():
            text = data[key].lower() if fold_case else data[key]
            tokens = text.replace('\n', '').split()
            # dict.get replaces a bare ``except:`` around the lookup.
            data[key] = [lookup.get(token, 0) for token in tokens]
        return data

    out = parse_dataset(data, feature)
    return w2idx, w2vec, out
def get_topic_words(sent, stop_words, cnt=15):
    """Return up to ``cnt`` topic words for ``sent`` from a one-topic LDA model.

    Line breaks are removed, the text is segmented with ``jieba.lcut``,
    words found in ``stop_words`` are dropped, and the remaining words are
    used as the only training document.
    """
    cleaned = re.sub(r'[\r\n]', '', sent)
    kept = [word for word in jieba.lcut(cleaned) if word not in stop_words]

    vocab = Dictionary([kept])
    bows = [vocab.doc2bow(kept)]
    model = LdaModel(bows, id2word=vocab, num_topics=1)

    # print_topics gives (topic id, description); the words appear quoted
    # inside the description string.
    description = model.print_topics(num_words=cnt)[0][1]
    return re.findall('"(.+?)"', description)
def train_model_lda_gensim():
    """Train a 10-topic LDA model on the module-level ``common_texts``.

    Prints the type of ``common_texts`` and its first document. Nothing is
    returned.
    """
    # Vocabulary built from the tokenised documents.
    vocabulary = Dictionary(common_texts)
    print(type(common_texts))
    print(common_texts[0])

    # Bag-of-words representation of every document.
    bow_corpus = list(map(vocabulary.doc2bow, common_texts))

    # LDA model with 10 topics.
    model = LdaModel(bow_corpus, num_topics=10)

    # NOTE(review): the result of print_topic() is discarded here -- probably
    # meant to be printed or returned; confirm.
    model.print_topic(1, topn=2)
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    Jobs:
      1. create a word -> index mapping,
      2. create a word -> vector mapping,
      3. convert the sentences into padded index sequences.

    Args:
        model: Word-vector model exposing ``wv.vocab`` and ``model[word]``.
        combined: Iterable of tokenised sentences.

    Returns:
        ``(w2indx, w2vec, combined)``, or ``None`` (after printing a message)
        when either argument is ``None``.
    """
    if combined is None or model is None:
        print('没有提供数据...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # (id -> word) is inverted to (word -> id + 1). The +1 keeps index 0
    # free for words outside the vocabulary.
    w2indx = {}
    for idx, word in gensim_dict.items():
        w2indx[word] = idx + 1
    # word -> its vector in the model
    w2vec = {}
    for word in w2indx:
        w2vec[word] = model[word]

    def parse_dataset(sentences):
        """Words become integers; unknown words become 0."""
        encoded = []
        for sentence in sentences:
            # Only a missing key is expected here, so use dict.get instead of
            # the previous catch-all ``except:``.
            encoded.append([w2indx.get(word, 0) for word in sentence])
        return encoded

    combined = parse_dataset(combined)
    # ``maxlen`` is read from the enclosing module scope.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    Unlike sibling variants, the vocabulary is taken from the words that
    occur in ``combined`` rather than from the model's own vocabulary.

    Args:
        model: Word-vector model supporting ``model[word]``.
            NOTE(review): every word of ``combined`` is looked up in
            ``model``; presumably the model contains all of them -- confirm.
        combined: Iterable of tokenised sentences.

    Returns:
        ``(w2indx, w2vec, combined)`` where ``w2indx`` maps word -> index
        (starting at 1), ``w2vec`` maps word -> vector and ``combined`` is the
        padded index-encoded sentences. Returns ``None`` after printing a
        message when either argument is ``None``.
    """
    from itertools import chain  # local import keeps the block self-contained

    if combined is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    # Flatten lazily; the former reduce(x + y) copied the growing list on
    # every step (quadratic in the corpus size).
    gensim_dict.doc2bow(chain.from_iterable(combined), allow_update=True)
    # word -> id + 1; 0 is used for unknown words.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Words become integers; unknown words become 0."""
        # dict.get replaces a bare ``except:`` around the lookup.
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # ``maxlen`` is read from the enclosing module scope.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    Jobs:
      1. create a word -> index mapping (indices start at 1),
      2. create a word -> vector mapping,
      3. convert every sentence into a padded index sequence.

    Args:
        model: Word-vector model exposing ``wv.vocab`` and ``model[word]``.
        combined: Iterable of sentences, each a string of space-separated
            words.

    Returns:
        ``(w2indx, w2vec, combined)``, or ``None`` (after printing a message)
        when either argument is ``None``.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # Index of every vocabulary word, shifted by one so 0 means "unknown".
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    # Vector of every vocabulary word.
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Words become integers; unknown words become 0."""
        encoded = []
        for sentence in sentences:
            words = sentence.split(' ')
            # dict.get replaces a bare ``except:`` around the lookup.
            encoded.append([w2indx.get(word, 0) for word in words])
        return encoded

    combined = parse_dataset(combined)
    # No maxlen is passed, so pad_sequences chooses the length itself.
    combined = sequence.pad_sequences(combined)
    return w2indx, w2vec, combined
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    Jobs:
      1. create a word -> index mapping,
      2. create a word -> vector mapping,
      3. convert the sentences into padded index sequences.

    Args:
        model: Word-vector model exposing ``wv.vocab`` and ``model[word]``.
        combined: Iterable of sentences; each sentence is iterated element by
            element (so a plain string is processed per character).

    Returns:
        ``(w2indx, w2vec, combined)``, or ``None`` (after printing a message)
        when either argument is ``None``.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # Words that have a vector never get index 0.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    # Collect the vector of every indexed word.
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Words become integers; a word without a vector is indexed 0."""
        # dict.get replaces a bare ``except:`` around the lookup.
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # Bring all sentences to a common length (``maxlen`` comes from the
    # enclosing module scope).
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
def keywords(corpus):
    """Rank the terms of every document by TF-IDF weight.

    Returns one list per document, in corpus order; each list holds
    ``[term, weight]`` pairs sorted by descending weight.
    """
    docs = list(map(preprocess, corpus))
    dictionary = Dictionary(docs)
    bows = list(map(dictionary.doc2bow, docs))
    tfidf = TfidfModel(bows)

    result = []
    for bow in bows:
        ranked = sorted(tfidf[bow], key=lambda pair: pair[1], reverse=True)
        result.append(
            [[dictionary.get(term_id), weight] for term_id, weight in ranked]
        )
    return result
def lda(domain):
    """Train and return a two-topic LDA model for ``domain``.

    The training texts are whatever ``normalize(domain=domain)`` returns.
    """
    texts = normalize(domain=domain)
    vocabulary = Dictionary(texts)
    bow_corpus = list(map(vocabulary.doc2bow, texts))
    model = LdaModel(
        bow_corpus,
        num_topics=2,
        per_word_topics=True,
        id2word=vocabulary,
    )
    return model
def lda_vector(dataset: list, refer_dictionary=None, refer_lda_model=None):
    """Attach LDA topic probabilities to every sample of ``dataset``.

    Each sample is a dict with ``'essay_lemma'`` (tokens) and
    ``'essay_is_stop'`` (parallel flags). For every topic reported for a
    sample, the key ``'topic<k>'`` (1-based) is set to its probability.

    When ``refer_dictionary`` is ``None`` a dictionary and a 10-topic LDA
    model are fitted on ``dataset`` itself; otherwise the supplied pair is
    used.
    NOTE(review): passing a dictionary without a model is not handled --
    ``refer_lda_model`` would still be ``None`` when it is indexed.

    Returns:
        ``(refer_dictionary, refer_lda_model)``.
    """
    punctuation = (',', '.', '?')

    def content_tokens(sample):
        """Tokens that are neither flagged as stop words nor punctuation."""
        stop_flags = sample['essay_is_stop']
        return [
            token
            for i, token in enumerate(sample['essay_lemma'])
            if stop_flags[i] is False and token not in punctuation
        ]

    docs = [content_tokens(sample) for sample in dataset]

    if refer_dictionary is None:
        refer_dictionary = Dictionary(docs)
        training_bows = [refer_dictionary.doc2bow(tokens) for tokens in docs]
        refer_lda_model = LdaModel(
            corpus=training_bows,
            id2word=refer_dictionary,
            num_topics=10,
            dtype=np.float64,
            passes=10,
            minimum_probability=0.0,
        )

    doc_bows = [refer_dictionary.doc2bow(tokens) for tokens in docs]
    doc_vecs = [refer_lda_model[bow] for bow in doc_bows]

    for sample, doc_vec in zip(dataset, doc_vecs):
        for topic_prob in doc_vec:
            sample['topic' + str(topic_prob[0] + 1)] = topic_prob[1]

    return refer_dictionary, refer_lda_model
def buildDic(self, model=None, words=None):
    """Build the vocabulary lookups and index-encode the sentences.

    :param model: word2vec model (exposes ``wv.vocab`` and ``model[word]``)
    :param words: all text content after word segmentation, as an iterable
        of tokenised sentences
    :return: ``(w2indx, w2vec, combined)`` -- word -> index, word -> vector,
        and the padded index sequence of every sentence; ``None`` (after
        printing a message) when either argument is ``None``
    """
    if model is None or words is None:
        print("模型或数据导入失败")
        return None

    # Named ``vocabulary`` rather than ``dict`` so the builtin is not shadowed.
    vocabulary = Dictionary()
    # Register every word known to the word2vec model.
    vocabulary.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # word -> id + 1, leaving 0 for words that are not in the vocabulary.
    w2indx = {word: idx + 1 for idx, word in vocabulary.items()}
    # word -> word vector
    w2vec = {word: model[word] for word in w2indx}

    def parseDataset(sentences):
        """Return the list of word indices for every sentence."""
        # dict.get replaces a bare ``except:`` around the lookup.
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parseDataset(words)
    # Bring sequences of different length to a common size.
    combined = sequence.pad_sequences(combined, maxlen=self.maxlen)
    return w2indx, w2vec, combined
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    This function does three things:
      1. creates a word -> index mapping,
      2. creates a word -> vector mapping,
      3. converts the sentences into padded index sequences.

    Args:
        model: Word-vector model exposing ``wv.vocab`` and ``model[word]``.
        combined: Iterable of tokenised sentences.

    Returns:
        ``(w2indx, w2vec, combined)``, or ``None`` (after printing a message)
        when either argument is ``None``.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

    # Index 0 is reserved for out-of-vocabulary words, hence id + 1.
    w2indx = dict((word, idx + 1) for idx, word in gensim_dict.items())
    w2vec = dict((word, model[word]) for word in w2indx)

    def parse_dataset(sentences):
        """Convert the words of every sentence to their indices."""
        # Only a missing key is expected; dict.get avoids the former bare
        # ``except:``.
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # ``maxlen`` is read from the enclosing module scope.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    Side effect: the string form of the padded sequences, followed by a
    newline, is written to the module-level file object ``f12``.

    Args:
        model: Word-vector model exposing ``wv.vocab`` and ``model[word]``.
        combined: Iterable of tokenised sentences.

    Returns:
        ``(w2indx, w2vec, combined)`` where ``w2indx`` maps word -> index
        (starting at 1), ``w2vec`` maps word -> vector and ``combined`` is the
        padded index-encoded sentences. Returns ``None`` after printing a
        message when either argument is ``None``.
    """
    if combined is None or model is None:
        print('没有提供数据...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # id + 1 so that 0 can stand for words outside the vocabulary.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Words become integers; unknown words become 0."""
        # dict.get replaces a bare ``except:`` around the lookup.
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # ``maxlen`` and ``f12`` are read from the enclosing module scope.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    f12.write(str(combined))
    f12.write('\n')
    return w2indx, w2vec, combined
def word2vec_train(tokenizedtalkfile, vocabularyfile):
    """Train a skip-gram Word2Vec model and dump its vocabulary.

    Args:
        tokenizedtalkfile: Text file with one talk per line; words are
            separated by single spaces.
        vocabularyfile: Output file; receives one ``word<TAB>index`` line per
            vocabulary word.

    Side effects:
        Saves the model to ``corpus_word2vec_model.pkl`` in the current
        directory and prints the training time.
    """
    wordlist = []
    # ``with`` closes the input file; the original left the handle open.
    with open(tokenizedtalkfile, 'r') as talkfile:
        for line in talkfile:
            # Drop the newline that sticks to the last word of each line.
            wordlist.append([word.replace('\n', '') for word in line.split(' ')])

    print('Start Training ...')
    start = time.time()
    model = Word2Vec(size=50, min_count=1, window=7, workers=4, sg=1, iter=5)
    model.build_vocab(wordlist)
    model.train(wordlist)
    model.save('corpus_word2vec_model.pkl')
    end = time.time()
    print('Training Time: %.5f' % (end - start))

    # Reload from disk, as the original did, before exporting the vocabulary.
    model = Word2Vec.load('corpus_word2vec_model.pkl')
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    word2index = {word: idx for idx, word in gensim_dict.items()}
    with open(vocabularyfile, 'w') as vocabFile:
        for word, idx in word2index.items():
            vocabFile.write(word + '\t' + str(idx) + '\n')
def topic_extraction(corpus, ntopics):
    """Return a DataFrame of per-document LDA topic probabilities.

    Args:
        corpus: Iterable of tokenised documents.
        ntopics: Number of LDA topics.

    Returns:
        ``pandas.DataFrame`` with one row per document, holding the
        probabilities reported by ``get_document_topics``.
    """
    vocabulary = Dictionary(corpus)
    bow_corpus = list(map(vocabulary.doc2bow, corpus))
    model = LdaModel(bow_corpus, num_topics=ntopics, iterations=800, random_state=1)

    doc_topics = model.get_document_topics(bow_corpus, minimum_probability=0)
    # Keep only the probability of each (topic id, probability) pair.
    rows = [[pair[1] for pair in topics] for topics in doc_topics]

    return pd.DataFrame(rows).reset_index(drop=True)
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None, headlines=None):
    """Generate word:index, index:word, and word:vector dictionaries.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean
            Filter the corpus to only those words seen in the articles. Use to
            speed up iteration during initial building/training phases.
        bodies (optional): list of lists
            Must be passed in if `filter_corpus` is True.
        headlines (optional): list of lists
            Must be passed in if `filter_corpus` is True.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict

    Raises:
    ------
        ValueError: `filter_corpus` is True but `bodies` or `headlines` is
            missing or empty. (ValueError is a subclass of the bare
            ``Exception`` raised previously, so ``except Exception`` callers
            still work.)
    """
    if filter_corpus:
        if not bodies or not headlines:
            raise ValueError(
                'Must pass in bodies and headlines with filter_corpus as True!')
        wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # One pass over the dictionary fills all three maps; indices are the
    # dictionary ids, unshifted.
    word_idx_dct, idx_word_dct, word_vector_dct = {}, {}, {}
    for idx, wrd in gensim_dct.items():
        word_idx_dct[wrd] = idx
        idx_word_dct[idx] = wrd
        word_vector_dct[wrd] = wrd_embedding[wrd]

    return word_idx_dct, idx_word_dct, word_vector_dct
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    Jobs:
      1. create a word -> index mapping,
      2. create a word -> vector mapping,
      3. convert the sentences into padded index sequences.

    Args:
        model: Word-vector model exposing ``wv.vocab`` and ``model[word]``.
        combined: Iterable of tokenised sentences.

    Returns:
        ``(w2indx, w2vec, combined)``, or ``None`` (after printing a message)
        when either argument is ``None``.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

    w2indx = {}
    w2vec = {}
    for idx, word in gensim_dict.items():
        # Shifted by one: 0 marks words that are not in the vocabulary.
        w2indx[word] = idx + 1
        w2vec[word] = model[word]

    def parse_dataset(sentences):
        """Words become integers; unknown words become 0."""
        # dict.get replaces a bare ``except:`` around the lookup.
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # ``maxlen`` is read from the enclosing module scope.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
def create_dictionaries(model=None, combined=None):
    """Build word -> index / word -> vector maps and index-encode ``combined``.

    Args:
        model: Word-vector model exposing ``wv.vocab`` and ``model[word]``.
        combined: Iterable of tokenised sentences.

    Returns:
        ``(w2indx, w2vec, combined)``: indices start at 1 (0 is used for
        unknown words) and ``combined`` is the output of
        ``sequence.pad_sequences``. Returns ``None`` after printing a message
        when either argument is ``None``.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Map each word to its index; unknown words map to 0."""
        data = []
        for sentence in sentences:
            # dict.get replaces a bare ``except:`` around the lookup.
            data.append([w2indx.get(word, 0) for word in sentence])
        return data

    combined = parse_dataset(combined)
    # ``maxlen`` is read from the enclosing module scope.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
def create_dictionaries(cls, model=None, combined=None):
    """
    Build the vocabulary lookups and index-encode the sentences:
        1- word -> index mapping
        2- word -> vector mapping
        3- sentences converted to padded index sequences

    Returns ``(w2indx, w2vec, combined)``; prints a message and returns
    ``None`` when ``model`` or ``combined`` is ``None``.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    vocabulary = Dictionary()
    vocabulary.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # Index of every word in the model vocabulary, starting at 1.
    w2indx = {token: token_id + 1 for token_id, token in vocabulary.items()}
    # Vector of every word in the model vocabulary.
    w2vec = {token: model[token] for token in w2indx.keys()}

    def _parse_dataset(sentences):
        """Words become integers.

        Every word is replaced by its vocabulary index; a word that is not
        in the index is marked 0.
        """
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    encoded = _parse_dataset(combined)
    # Pad/truncate to the length configured on the class.
    padded = sequence.pad_sequences(encoded, maxlen=cls.maxlen)
    return w2indx, w2vec, padded
def create_dictionaries(model=None, combined=None):
    """Create the vocabulary lookups and index-encode the sentences.

    Jobs:
      1. create a word -> index mapping,
      2. create a word -> vector mapping,
      3. convert the sentences into padded index sequences.

    Side effect: writes the word -> index mapping to
    ``../model/word2index.txt``, one ``word index`` pair per line (this is
    where that file is generated).

    Args:
        model: Word-vector model exposing ``wv.vocab`` and ``model[word]``.
        combined: Iterable of tokenised sentences.

    Returns:
        ``(w2indx, w2vec, combined)``, or ``None`` (after printing a message)
        when either argument is ``None``.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # word -> index; shifted by one so 0 can mark unknown words.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}

    # ``with`` guarantees the file is closed even if a write fails; the
    # original relied on an explicit close() that an exception would skip.
    with open("../model/word2index.txt", 'w', encoding='utf8') as f:
        for word, idx in w2indx.items():
            f.write('%s %s\n' % (word, idx))

    # word -> vector
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Convert the words of every sentence to their indices."""
        # dict.get replaces a bare ``except:`` around the lookup.
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # ``maxlen`` is read from the enclosing module scope.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
def get_lda_model_byDomains(domains):
    """
    Create an LDA model from the given links.

    The texts returned by ``normilize_texts`` for every domain are
    concatenated, converted to bag-of-words vectors and used to train an
    ``LdaModel`` with as many topics as there are domains.

    :param domains: names of VK communities
    :return: the trained ``LdaModel``
    """
    all_texts = normilize_texts(domains[0])
    for domain in domains[1:]:
        all_texts += normilize_texts(domain)

    vocabulary = Dictionary(all_texts)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in all_texts]
    return LdaModel(bow_corpus, num_topics=len(domains))
def transform_data(model, x_train, y_train, x_test, y_test):
    """Index-encode the train and test texts with the model vocabulary.

    A word -> index mapping (indices start at 1) and a word -> vector
    mapping are built from ``model.wv``.  Every text is lower-cased,
    stripped of newline characters, split on whitespace and replaced IN
    PLACE by the list of its word indices; words missing from the
    vocabulary become 0.

    :param model: model exposing ``wv.vocab`` and ``wv[word]``.
    :param x_train: indexable collection of training texts (mutated).
    :param y_train: training labels; only their count is used here.
    :param x_test: indexable collection of test texts (mutated).
    :param y_test: test labels; only their count is used here.
    :return: ``(w2indx, w2vec, x_train, y_train, x_test, y_test)``.
    """
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # Shift ids by one so that 0 stays free for out-of-vocabulary words.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    w2vec = {word: model.wv[word] for word in w2indx}

    def parse_data(x, y):
        """Encode entries 0..len(y)-1 of ``x`` in place."""
        for key in range(len(y)):
            tokens = x[key].lower().replace('\n', '').split()
            x[key] = [w2indx.get(token, 0) for token in tokens]
        return x, y

    x_train, y_train = parse_data(x_train, y_train)
    x_test, y_test = parse_data(x_test, y_test)
    return w2indx, w2vec, x_train, y_train, x_test, y_test
def pre_process_lda(data_train):
    """Tokenize the documents and build a bag-of-words corpus.

    Each document is lower-cased, stripped and tokenized with the
    module-level ``tokenizer``.  Tokens found in the stop list loaded from
    ``stopword_path`` and tokens of length 1 or less are dropped.

    :param data_train: iterable of raw document strings.
    :return: ``(corpus, dictionary)`` where ``dictionary`` is built from the
        filtered documents and ``corpus`` holds their ``doc2bow`` vectors.
    """
    stoplist = load_stopwords(stopword_path)

    def _keep(token):
        # Same filter as before: not a stop word and longer than one char.
        return token not in stoplist and len(token) > 1

    text_data = [
        [token
         for token in tokenizer.tokenize(raw_doc.lower().strip())
         if _keep(token)]
        for raw_doc in data_train
    ]

    dictionary = Dictionary(text_data)
    corpus = [dictionary.doc2bow(tokens) for tokens in text_data]
    return corpus, dictionary
def create_dictionaries(train=None, test=None, model=None):
    """Build vocabulary lookups and index-encode the train and test texts.

    A word -> index mapping (indices start at 1) and a word -> vector
    mapping are built from the model vocabulary.  Every text in ``train``
    and ``test`` is lower-cased, stripped of newline characters, split on
    whitespace and replaced IN PLACE by the list of its word indices; words
    missing from the vocabulary become 0.

    :param train: mapping of key -> training text (mutated and returned).
    :param test: mapping of key -> test text (mutated and returned).
    :param model: model exposing ``vocab`` and item access by word.
    :return: ``(w2indx, w2vec, train, test)``, or ``None`` after printing a
        message when any argument is ``None``.
    """
    if train is None or model is None or test is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
    # Shift ids by one so that 0 stays free for out-of-vocabulary words.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(data):
        """Encode every text of ``data`` in place and return ``data``."""
        # ``data.keys()`` is kept deliberately: it works for any container
        # that exposes keys, not only for plain dicts.
        for key in data.keys():
            tokens = data[key].lower().replace('\n', '').split()
            data[key] = [w2indx.get(token, 0) for token in tokens]
        return data

    train = parse_dataset(train)
    test = parse_dataset(test)
    return w2indx, w2vec, train, test
def create_dictionaries(model=None, combined=None):
    """Build vocabulary lookups, dump the word index to disk, encode sentences.

    Steps:
      1. create a word -> index mapping (indices start at 1) by inverting
         the id -> word items of a gensim ``Dictionary``;
      2. write that mapping to ``word2index.txt`` in the current working
         directory, one ``<word> <index>`` pair per line, UTF-8 encoded;
      3. create a word -> vector mapping from ``model.wv``;
      4. replace every word of ``combined`` by its index (0 for words
         missing from the vocabulary) and pass the result through
         ``sequence.pad_sequences`` with the module-level ``maxlen``.

    :param model: model exposing ``wv.vocab`` and ``wv[word]``.
    :param combined: iterable of tokenized sentences.
    :return: ``(w2indx, w2vec, combined)``, or ``None`` after printing a
        message when either argument is ``None``.
    """
    if combined is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    # (id -> word) => (word -> id + 1); 0 is reserved for unknown words.
    w2indx = {word: idx + 1 for idx, word in gensim_dict.items()}

    # The context manager closes the file even if a write fails.
    with open("word2index.txt", 'w', encoding='utf8') as f:
        for word, idx in w2indx.items():
            f.write(str(word))
            f.write(' ')
            f.write(str(idx))
            f.write('\n')

    # word -> vector
    w2vec = {word: model.wv[word] for word in w2indx}

    def parse_dataset(sentences):
        """Replace every word by its index, e.g. [[1, 2, 3, ...], [...]]."""
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
def __init__(self, strategy="GREEDY", seed=2020, max_iter=20):
    """
    This class produces a baseline BM25 ranking and uses LDA topic modelling
    in combination with the general re-ranking procedure of Huang and Hu (2009)

    The constructor runs the baseline ranking, reads the retrieved documents
    back from the index and prepares one bag-of-words dataset per query in
    ``self.X``.

    :param strategy: strategy for weighing final topics
    :param seed: stored on the instance as ``self.seed``
    :param max_iter: stored on the instance as ``self.max_iter``
    """
    self.seed = seed
    self.max_iter = max_iter
    self.utils = Utils()

    # Amount of documents to rank and rerank
    self.N = 100
    # Select a strategy for weighing final topics
    self.strategy = strategy
    # K to use in TOP-K-AVG strategy
    self.top_k = 10

    # TODO ideally we don't want to first rank every time for the reranking
    self.baseline = BaselineBM25(k=self.N)
    self.baseline.rank()

    # For each topic, the system outputs N retrieved articles.
    self.batch_hits = self.baseline.get_batch_hits()

    # Read index to retrieve document contents
    # N.B. the `contents` field is currently empty; we stored "raw" instead.
    self.index_loc = self.baseline.get_index_loc()
    reader = IndexReader(self.index_loc)

    # Topics and the retrieved articles are represented as the keyword sequences
    self.topics = self.baseline.get_topics()
    self.topic_keywords = {}
    for topic_id, topic in self.topics.items():
        self.topic_keywords[topic_id] = topic['title'].lower().split()

    self.query_ids = self.baseline.get_query_ids()

    # Preprocessed (analyzed) documents per query
    docs_per_query = {}
    for query_id, hits in self.batch_hits.items():
        docs_per_query[query_id] = [
            reader.analyze(reader.doc(hit.docid).raw()) for hit in hits
        ]

    # Prepare bag-of-words dataset for gensim
    self.X = defaultdict(list)
    for query_id in self.query_ids:
        query_docs = docs_per_query[query_id]
        # Dictionary expects a list of lists, elements being lists of tokens
        dictionary = Dictionary(query_docs)
        self.X[query_id] = [dictionary.doc2bow(doc) for doc in query_docs]
class DigestedDocumentCollection(CorpusABC):
    """A bag-of-words representation of a corpus (collection of documents).

    This serves as direct input to modeling functions.  It is output from
    preprocessing functions.

    Parameters
    ----------
    corpus: A collection of tokenized documents
        Each document is a list of tokens, tokenized and normalized strings
        (either utf8 or unicode) (e.g. output of topik.SimpleTokenizer)

    Readers iterate over tuples (id, content), but discard id in return
    (for compatibility with Gensim.)
    """

    def __init__(self, tokenized_corpus):
        """Wrap ``tokenized_corpus`` and build a Dictionary from its tokens."""
        self.corpus = tokenized_corpus
        id_free_documents = tokenized_corpus.get_generator_without_id()
        self.dict = Dictionary(id_free_documents)
        super(DigestedDocumentCollection, self).__init__()

    def __iter__(self):
        """Yield one bag-of-words vector per document; the id is dropped."""
        for _doc_id, tokens in self.corpus:
            bow = self.dict.doc2bow(tokens)
            yield bow

    def __len__(self):
        """Return the length of the wrapped corpus."""
        return len(self.corpus)

    def get_id2word_dict(self):
        """Return the Dictionary built in the constructor."""
        return self.dict

    def save(self, filename):
        """Delegate saving to the wrapped corpus."""
        self.corpus.save(filename)

    @classmethod
    def load(cls, filename):
        """Build a collection from a corpus persisted under ``filename``."""
        persisted = load_persisted_corpus(filename)
        return cls(persisted)

    @property
    def persistor(self):
        """The ``persistor`` of the wrapped corpus."""
        return self.corpus.persistor

    @property
    def filter_string(self):
        """The ``filter_string`` of the wrapped corpus."""
        return self.corpus.filter_string
module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) texts = [['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] boolean_document_based = ['u_mass'] sliding_window_based = ['c_v', 'c_uci', 'c_npmi'] def testfile(): # temporary data will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') def checkCoherenceMeasure(topics1, topics2, coherence): """Check provided topic coherence algorithm on given topics""" if coherence in boolean_document_based: cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence) cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence) else: cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence)
class WikiCorpus(interfaces.CorpusABC):
    """
    Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id, takes almost 7h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 7.5h, creates a file in MatrixMarket format plus file with id->word
    """
    def __init__(self, fname, noBelow = 20, keep_words = 200000, dictionary = None):
        """
        Initialize the corpus. This scans the corpus once, to determine its
        vocabulary (only the first `keep_words` most frequent words that
        appear in at least `noBelow` documents are kept).

        If `dictionary` is given, it is used as-is and no scan takes place.
        """
        self.fname = fname
        if dictionary is None:
            self.dictionary = Dictionary(self.getArticles())
            self.dictionary.filterExtremes(noBelow = noBelow, noAbove = 0.1, keepN = keep_words)
        else:
            self.dictionary = dictionary

    def __len__(self):
        # `numDocs` is only set once `getArticles()` has been exhausted.
        return self.numDocs

    def __iter__(self):
        """
        The function that defines a corpus -- iterating over the corpus yields
        vectors, one for each document.
        """
        for text in self.getArticles():
            yield self.dictionary.doc2bow(text, allowUpdate = False)

    def saveDictionary(self, fname):
        """
        Store id->word mapping to a file, in format
        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.
        """
        logger.info("saving dictionary mapping to %s" % fname)
        fout = open(fname, 'w')
        try:
            for token, tokenId in sorted(self.dictionary.token2id.iteritems()):
                fout.write("%i\t%s\t%i\n" % (tokenId, token, self.dictionary.docFreq[tokenId]))
        finally:
            # release the handle even if a write fails half-way
            fout.close()

    @staticmethod
    def loadDictionary(fname):
        """
        Load previously stored mapping between words and their ids.

        Lines with two (`id`, `word`) or three (`id`, `word`, `docFreq`)
        tab-separated columns are accepted; any other line is skipped.

        The result can be used as the `id2word` parameter for input to
        transformations.
        """
        result = {}
        fin = open(fname)
        try:
            for line in fin:
                # strip only the line terminator: a last line without a
                # trailing newline must not lose its final character
                cols = line.rstrip('\n').split('\t')
                if len(cols) not in (2, 3):
                    continue
                wordId, word = cols[0], cols[1] # docFreq (3rd column) not used
                result[int(wordId)] = word
        finally:
            fin.close()
        return result

    def saveAsText(self, fname):
        """
        Store the corpus to disk, in a human-readable text format.

        This actually saves two files:

        1. Document-term co-occurence frequency counts (bag-of-words), as
           a Matrix Market file `fname_bow.mm`.
        2. Token to integer mapping, as a text file `fname_wordids.txt`.
        """
        self.saveDictionary(fname + '_wordids.txt')
        matutils.MmWriter.writeCorpus(fname + '_bow.mm', self, progressCnt = 10000)

    def getArticles(self):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles &
        redirects etc are ignored).
        """
        articles, intext = 0, False
        for line in bz2.BZ2File(self.fname):
            # NOTE(review): the prefix literal is reproduced exactly as it
            # appears in this file; confirm its leading whitespace against
            # the dump format.
            if line.startswith(' <text'):
                intext = True
                line = line[line.find('>') + 1 : ]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>') # can be on the same line as <text>
            if pos >= 0:
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filterWiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
                    articles += 1
                    yield tokenize(text) # split text into tokens

        self.numDocs = articles # cache corpus length
# Load the word <-> id mapping that belongs to the working corpus.
logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

# Load the two stored transformations applied to the documents below.
logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

# For every key of `articles`, compute the pairwise cosine similarity of
# its articles and store it in `matrices[key]`.
for key in articles.iterkeys():
    logging.info('current term: %s' % key)

    term_list = articles[key].keys()
    # Bag-of-words vectors of the article texts; the dictionary is not
    # updated while converting.
    text_list = [dictionary.doc2bow(article['text'], allowUpdate=False, returnMissingWords=False)
                 for article in articles[key].values()]
    sim_matrix = np.zeros((len(text_list), len(text_list)))

    logging.info('transform the textlist')
    # Apply the log-entropy model first, then the LSI model.
    text_list = lsi[log_ent[text_list]]

    logging.info('compute similarity matrix')
    # Every ordered pair (i, j) is computed, including the diagonal.
    for i, par1 in enumerate(text_list):
        for j, par2 in enumerate(text_list):
            sim_matrix[i, j] = matutils.cossim(par1, par2)

    matrices[key] = {}
    matrices[key]['term_list'] = term_list
    matrices[key]['sim_matrix'] = sim_matrix

    # One matrix row per entry in the term list.
    assert np.shape(sim_matrix)[0] == len(term_list)
# Build one list of terms per message that contains 'req'.
# NOTE(review): the nesting of this loop was reconstructed from a
# whitespace-collapsed source. Everything after `if j > -1:` is assumed to
# belong to the `if`, because `term_lists` must have one entry per position
# stored in `idx` -- confirm against the original file.
term_lists = []
for i in range(len(df)):
    # Lower-case the message in place.
    df['msg'][i] = df['msg'][i].lower()
    j = df['msg'][i].find('req')
    if j > -1:
        # Keep only the part of the message starting at 'req'.
        df['msg'][i] = df['msg'][i][j:]
        idx.append(i)
        terms = df['msg'][i].split()
        # Skip the first five whitespace-separated tokens.
        terms = terms[5:]
        filtered_terms = [t for t in terms if len(t) > 0]
        term_lists.append(filtered_terms)

# Merge term lists into the main dataframe
d = {'terms':term_lists}
term_df = DataFrame(data=d,columns=['terms'],index=df.index[idx])
df = df.join(term_df)

# Create corpus for topic modeling
corpora_dict = Dictionary(term_lists)
corpus = [corpora_dict.doc2bow(msg) for msg in term_lists]

# Perform topic modeling
lda = LdaModel(corpus=corpus,id2word=corpora_dict,num_topics=5)

# Print out top terms for each topic
topics = lda.show_topics()
# `i` is the 1-based topic number used in the printed label.
i = 0
for topic in topics:
    i += 1
    print "Topic %d: %s" % (i,str(topic))
class CableCorpus(BaseCorpus):
    """\
    The cable corpus consists of several files which are written into a directory.

    * a dictionary with a ``<word id> <word> <frequency>`` mapping saved under "wordids.pickle"
    * a JSON file with a ``<cable reference id> <document number>`` mapping under "id2docid.json"
    * a `Matrix Market format <http://math.nist.gov/MatrixMarket/formats.html>`
      vector space model file "bow.mm"

    All file names are prefixed (``cables_`` by default).

    CAUTION: The corpus overrides any existing files with the same file name
    in the specified directory.

    By default, the corpus creates the word dictionary and the vector space
    model which may lead to a vector space model that is not useful. To
    filter certain words, the corpus may be initialized with a pre-generated
    word dictionary. To make the dictionary immutable, the property
    ``allow_dict_updates`` should be set to ``False`` (updates are allowed by
    default). The resulting vector space model contains only words which are
    in the word dictionary then.

    Example to reduce the clutter::

        corpus = CableCorpus('/my/directory/')
        # Add some texts here
        corpus.add_text('ref-1', u'bla bla bla')
        corpus.add_text('ref-2', u'bla bla blub')
        ...
        corpus.dct.filter_extremes()
        corpus.close()

        from gensim.corpora.dictionary import Dictionary
        # Load previously created dict
        dct = Dictionary.load_from_text('/my/directory/cables_wordids.txt')
        # Create another corpus with the previously word dict
        corpus = CableCorpus('/my/directory/', dct, allow_dict_updates=False)
        # Add some texts
        ....
        corpus.close()
    """
    def __init__(self, path, dct=None, tokenizer=None, allow_dict_updates=True, prefix=None):
        """\
        Initializes the cable corpus.

        `path`
            Directory where the generated files are stored.
        `dct`
            An existing `gensim.corpora.dictionary.Dictionary`
            If it's ``None`` (default) a dictionary will be created.
        `tokenizer`
            A function to tokenize/normalize/clean-up/remove stop words from strings.
            If it's ``None`` (default), a default function will be used to tokenize
            texts.
        `allow_dict_updates`
            Indicates if unknown words should be added to the dictionary (default ``True``).
        `prefix`
            A prefix for the generated file names.

        Raises ``IOError`` if `path` is not an existing directory.
        """
        super(CableCorpus, self).__init__(tokenizer)
        if not os.path.isdir(path):
            raise IOError('Expected a directory path')
        self.dct = Dictionary() if dct is None else dct
        self._path = path
        self._prefix = prefix or 'cables_'
        self._mw = IncrementalMmWriter(os.path.join(path, self._prefix + 'bow.mm'))
        self.allow_dict_updates = allow_dict_updates
        # Reference ids in insertion order; the position is the document number.
        self._cables = []

    def add_words(self, reference_id, words):
        """Record `reference_id` and append the bag-of-words vector of `words`."""
        self._cables.append(reference_id)
        self._mw.add_vector(self.dct.doc2bow(words, self.allow_dict_updates))

    def close(self):
        """\
        Closes the matrix writer and saves the dictionary and the
        ``<reference id> <document number>`` mapping.
        """
        self._mw.close()
        self.dct.save(os.path.join(self._path, self._prefix + 'wordids.pickle'))
        json_filename = os.path.join(self._path, self._prefix + 'id2docid.json')
        # Text mode: ``json.dump`` writes ``str``, which a binary-mode file
        # rejects on Python 3. The context manager closes the handle.
        with open(json_filename, 'w') as json_file:
            json.dump(dict(zip(self._cables, count())), json_file)
class LDA(object):
    def __init__(self, topics = 10, worker = 3, pretrained_model = None, dictionary = None):
        """
        Initialize LDA model training.

        Args:
            topics -- number of topics
            worker -- parallelism setting, usually the number of cores minus one
                      (NOTE(review): stored but not used by any method of
                      this class)
            pretrained_model -- file of a previously trained model; online
                                updates are supported, so the model of the
                                last training run can be loaded
            dictionary -- file of the word -> id dictionary that belongs to
                          the pretrained model

        The stored model and dictionary are only loaded when BOTH
        `pretrained_model` and `dictionary` are given.

        Example:
            >>> lda = LDA(topics = 20, worker = 2,
                          pretrained_model = model_file,
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """
        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        # The check must use the `dictionary` parameter; the previous name
        # `common_dictionary` was undefined and raised NameError as soon as
        # a pretrained model was passed.
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        Save the trained model together with its dictionary.

        Nothing is written for a part that does not exist yet.

        Args:
            model_file -- model file
            dictionary_file -- dictionary file
        Returns:
            None
        """
        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus = [[]]):
        """
        Online update on top of the existing model.

        Without a model, a new dictionary and a new model are created from
        `corpus`; otherwise the documents are added to the dictionary and
        the model is updated with them. An empty `corpus` is a no-op.

        Args:
            corpus -- list of documents (lists of words) used for the update;
                      the default is never mutated
        """
        if len(corpus) == 0:
            return
        if not self._model:
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model = LdaModel(corpus_data, self._topics)
        else:
            self._common_dictionary.add_documents(corpus)
            new_corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model.update(new_corpus_data)

    def inference(self, document = []):
        """
        Infer the topic distribution of a new document.

        Args:
            document -- the document, i.e. a list of words
        Returns:
            the result of `get_document_topics` for the document, or an
            empty list when no model is available
        """
        if self._model:
            doc = [self._common_dictionary.doc2bow(document)]
            return self._model.get_document_topics(doc)
        return []

    @property
    def model(self):
        """The current LDA model, or None."""
        return self._model

    @property
    def dictionary(self):
        """The current word -> id dictionary, or None."""
        return self._common_dictionary
class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors
    (= a gensim corpus) from plain text.

    This is an abstract base class: override the `get_texts()` method to
    match your particular input.

    Given a filename (or a file-like object) in constructor, the corpus
    object will be automatically initialized with a dictionary in
    `self.dictionary` and will support the `iter` corpus method. You must
    only provide a correct `get_texts` implementation.
    """

    def __init__(self, input=None):
        """Store `input` and fill the dictionary from `get_texts()` if given."""
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if input is None:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")
        else:
            self.dictionary.add_documents(self.get_texts())

    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each
        document. With `self.metadata` set, each text is expected to be a
        pair and `(vector, text[1])` is yielded instead.
        """
        for text in self.get_texts():
            if not self.metadata:
                yield self.dictionary.doc2bow(text, allow_update=False)
            else:
                bow = self.dictionary.doc2bow(text[0], allow_update=False)
                yield (bow, text[1])

    def getstream(self):
        """Return the module-level `getstream` applied to `self.input`."""
        return getstream(self.input)

    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A
        document is a sequence of words (strings) that can be fed into
        `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do
        any text preprocessing, lowercasing, tokenizing etc.). There will be
        no further preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample
        # implementation: assume documents are lines in a single file (one
        # document per line). Yield each document as a list of lowercase
        # tokens, via `utils.tokenize`.
        length = 0
        for length, line in enumerate(getstream(self.input), 1):
            yield utils.tokenize(line, lowercase=True)
        # Only reached once the stream is exhausted.
        self.length = length

    def __len__(self):
        return self.length # will throw if corpus not initialized
class DefaultJsonCorpus(object):
    """
    A default JSON corpus based on gensim TextCorpus. It assumes a file or
    list of JSON as input. The methods provided by gensim TextCorpus are
    needed for the GenSim training. Any corpus provided to
    DocumentSimilarity should provide the methods given in this class.
    """

    def __init__(self, input=None,create_dictionary=True):
        """
        Store `input` and optionally build the dictionary from its texts.

        input -- a list of already parsed JSON objects, or a file / file
                 name with one JSON document per line
        create_dictionary -- when true, the dictionary is filled from
                             `get_texts()` right away
        """
        super(DefaultJsonCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())

    def __iter__(self):
        """Yield one bag-of-words vector per document."""
        for text in self.get_texts():
            yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        """Open `self.input` via `utils.file_or_filename`."""
        return utils.file_or_filename(self.input)

    def __len__(self):
        """Return the number of documents, counting them on first use."""
        if not hasattr(self, 'length'):
            # cache the corpus length
            self.length = sum(1 for _ in self.get_texts())
        return self.length

    def get_json(self):
        """Yield the JSON objects: list items as-is, file lines parsed."""
        if isinstance(self.input,list):
            for j in self.input:
                yield j
        else:
            with self.getstream() as lines:
                for line in lines:
                    yield json.loads(line.rstrip())

    def get_texts(self,raw=False):
        """
        yield raw text or tokenized text
        """
        for j in self.get_json():
            text = j["text"]
            if raw:
                yield text
            else:
                yield utils.tokenize(text, deacc=True, lowercase=True)

    def get_meta(self):
        """
        return a json object with meta data for the documents. It must return:
        id - id for this document
        optional title and tags. Tags will be used as base truth used to
        score document similarity results.

        Each yielded object is a deep copy of the input document with `id`
        converted to an integer and `corpus_seq_id` set to the 0-based
        position of the document.
        """
        for doc_id, j in enumerate(self.get_json()):
            m = copy.deepcopy(j)
            # `int` replaces the Python-2-only `long`; Python 2 promotes
            # large values automatically, so the result is the same there.
            m['id'] = int(m['id'])
            m['corpus_seq_id'] = doc_id
            yield m

    def get_dictionary(self):
        """Return the dictionary of this corpus."""
        return self.dictionary