def get_tfidf_model():
    if os.path.isfile(TFIDF_FILE):
        return TfidfModel.load(TFIDF_FILE)
    else:
        model = TfidfModel(get_corpus(), get_dictionary())
        model.save(TFIDF_FILE)
        return model
def __init__(self):
    self.inner_model = None

    # load dictionary and corpus
    vocabulary = "raw"
    corpora_folder = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'data', 'corpora')
    self.dictionary = corpora.Dictionary.load(
        os.path.join(corpora_folder, "%s.dict" % (vocabulary,)))
    self.corpus = corpora.MmCorpus(
        os.path.join(corpora_folder, "%s.mm" % (vocabulary,)))

    # parameters
    self.dataset = "CASEREPORT"

    # data file path
    models_folder = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'data', 'models')
    filename = "TFIDF_%s" % (self.dataset,)
    self.filepath = os.path.join(models_folder, filename)

    model_exists = os.path.isfile(self.filepath)
    if model_exists:
        logging.info("found data file %s" % (self.filepath,))
        self.inner_model = TfidfModel.load(self.filepath)
    else:
        self.inner_model = TfidfModel(corpus=self.corpus)
        self.inner_model.save(self.filepath)
def tf_idf_transform(self, doc):
    """Perform tf-idf transformation on doc."""
    self.dictionary = corpora.Dictionary(doc)
    corpus = [self.dictionary.doc2bow(text) for text in doc]
    self.tfIdfModel = TfidfModel(corpus)

    conf.mk_dir(self.tfIdfPath)
    self.dictionary.save(self.dictPath)
    logger.info('Dictionary has been saved in %s.' % self.dictPath)

    self.tfIdfModel.save(self.tfIdfPath)
    logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

    tfidf_corpus = self.tfIdfModel[corpus]
    tfidf_corpus_path = conf.get_filename_via_tpl(
        'tfidf', n_users=self.nUsers, postfix='mm', n_samples=self.nSamples)
    corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
    logger.info('TF-IDF corpus with a shape of %s has been saved in %s.'
                % (np.array(tfidf_corpus).shape, tfidf_corpus_path))

    return tfidf_corpus
def cal_tfidf(documents, topk=10) -> List:
    """
    Train (or load) a tf-idf model and extract the top-scoring words per document.
    :param documents: documents to train on, one string each
    :param topk: how many of the highest tf-idf scored words to extract per
        document; if topk exceeds the number of words found, all words are returned
    :return: list of (word, score) lists, one per document
    """
    # split each document into a token list
    docs = [[word for word in document.split(' ')] for document in documents]
    # build the dictionary
    dictionary = corpora.Dictionary(docs)
    # build the bag-of-words representation
    docs_bow = [dictionary.doc2bow(doc) for doc in docs]
    if os.path.isfile(tfidfmodel):
        model = TfidfModel.load(tfidfmodel)
    else:
        model = TfidfModel(docs_bow)
        model.save(tfidfmodel)
    # vectorize the documents
    docs_vector = list(model[docs_bow])
    # sort each document's vector by score and keep the top-k entries
    docs_sort_vector = [
        sorted(doc, key=lambda x: x[1], reverse=True)[:topk]
        for doc in docs_vector
    ]
    # map term ids back to words: a list of (word, tf-idf score) pairs per document
    docs_sort_chinese = [[(dictionary[vec[0]], vec[1]) for vec in doc]
                         for doc in docs_sort_vector]
    return docs_sort_chinese
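# A minimal usage sketch for cal_tfidf above. The two documents are made up,
# and the module-level `tfidfmodel` path that the function reads and writes
# is assumed to point somewhere writable.
sample_docs = ['the cat sat on the mat', 'the dog chased the cat']
for doc_terms in cal_tfidf(sample_docs, topk=3):
    print(doc_terms)  # [(word, tf-idf score), ...] for each document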
class TFIDFmodel(object):

    def __init__(self):
        self.inner_model = None

        # load dictionary and corpus
        vocabulary = "raw"
        corpora_folder = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'corpora')
        self.dictionary = corpora.Dictionary.load(
            os.path.join(corpora_folder, "%s.dict" % (vocabulary,)))
        self.corpus = corpora.MmCorpus(
            os.path.join(corpora_folder, "%s.mm" % (vocabulary,)))

        # parameters
        self.dataset = "CASEREPORT"

        # data file path
        models_folder = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'models')
        filename = "TFIDF_%s" % (self.dataset,)
        self.filepath = os.path.join(models_folder, filename)

        model_exists = os.path.isfile(self.filepath)
        if model_exists:
            logging.info("found data file %s" % (self.filepath,))
            self.inner_model = TfidfModel.load(self.filepath)
        else:
            self.inner_model = TfidfModel(corpus=self.corpus)
            self.inner_model.save(self.filepath)

    def __contains__(self, item):
        return item in self.inner_model
def load(self):
    if os.path.exists(self._lexicon_path):
        self.lexicon = Dictionary.load(self._lexicon_path)
    if os.path.exists(self._tfidf_path):
        # load is a classmethod, so no throwaway instance is needed
        self.tfidf = TfidfModel.load(self._tfidf_path)
def fit(self, documents, labels=None):
    self.lexicon = Dictionary(documents)
    self.tfidf = TfidfModel(
        [self.lexicon.doc2bow(doc) for doc in documents],
        id2word=self.lexicon)
    self.save()
    return self
def buildTfidfModel(corpus):
    print('get tfidf model...')
    if not os.path.exists(modelpath + 'tfidf.model'):
        # build the tf-idf model
        tfidf = TfidfModel(corpus)
        tfidf.save(modelpath + 'tfidf.model')
    else:
        tfidf = TfidfModel.load(modelpath + 'tfidf.model')
    print('done')
    return tfidf
def fit(self, documents, labels=None):
    if self.lexicon is None or self.tfidf is None:
        inputDocuments = list(documents)
        self.lexicon = Dictionary(inputDocuments)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in inputDocuments],
            id2word=self.lexicon)
        self.save()
    return self
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator,
        otherwise keep False if the next thing is a Gensim model.
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")
        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull
        self.load()

    def load(self):
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)
        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in documents],
            id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec)
                else:
                    yield vec
        return list(generator())
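# A brief usage sketch for the vectorizer above, assuming the working
# directory is writable (fit() persists corpus.dict and tfidf.model there).
# The token lists are hypothetical.
docs = [['human', 'machine', 'interface'],
        ['graph', 'minors', 'survey']]
vectorizer = GensimTfidfVectorizer(dirpath=".", tofull=False)
vectorizer.fit(docs)
vectors = vectorizer.transform(docs)  # sparse (term_id, weight) lists per doc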
def __init__(self, docs, strip_diac=True, num_option=OPTION_GROUP,
             usr_option=OPTION_GROUP, url_option=OPTION_GROUP,
             emo_option=OPTION_GROUP, lc=True, del_dup1=True,
             token_list=[-1], lang=None, **kwargs):
    self.strip_diac = strip_diac
    self.num_option = num_option
    self.usr_option = usr_option
    self.url_option = url_option
    self.emo_option = emo_option
    self.emoclassifier = EmoticonClassifier()
    self.lc = lc
    self.del_dup1 = del_dup1
    self.token_list = token_list
    if lang:
        self.lang = LangDependency(lang)
    else:
        self.lang = None
    self.kwargs = {k: v for k, v in kwargs.items() if k[0] != '_'}
    docs = [self.tokenize(d) for d in docs]
    self.dictionary = corpora.Dictionary(docs)
    corpus = [self.dictionary.doc2bow(d) for d in docs]
    self.model = TfidfModel(corpus)
def calculate_embedding(corpus: Corpus, *, rank=2, svd_dims=50,
                        perplexity=30, seed=0):
    """
    Calculate a document embedding that assigns each document in the corpus
    an N-d position based on its word usage.

    :returns: A list of N-d tuples for the documents in the corpus.
    """
    from gensim.models.tfidfmodel import TfidfModel
    from sklearn.decomposition import TruncatedSVD
    from sklearn.manifold import TSNE

    dic = corpus.dictionary
    freqs = corpus.frequencies
    tfidf = corpus2dense(TfidfModel(dictionary=dic)[freqs], len(dic)).T

    if svd_dims is not None:
        svd = TruncatedSVD(n_components=svd_dims, random_state=seed)
        components = svd.fit_transform(tfidf)
    else:
        components = tfidf

    model = TSNE(rank, metric='cosine', square_distances=True,
                 perplexity=perplexity, random_state=seed)
    return model.fit_transform(components)
def transformModel(modelType, inputModel="", dictionary=""):
    # check if using the default dictionary or a location passed as parameter
    if dictionary == "":
        dictionary = corpora.Dictionary.load('dictionaries/testNewsgroupsDictionary.dict')
        print(dictionary)
        # sys.exit(1)
    else:
        fileName = 'dictionaries/' + str(dictionary)
        dictionary = corpora.Dictionary.load(fileName)

    # use the default stored model (mm format) or build one from the given corpus
    if inputModel == "":
        inputModel = TfidfModel.load("models/testNewsgroups.tfidf_model")
        # print(inputModel)
    else:
        fileName = 'models/' + str(inputModel)
        corpus = corpora.MmCorpus(fileName)
        inputModel = models.TfidfModel(corpus)

    # create model handlers
    if modelType == "":
        print("Choose an output model for the selected input file:\n"
              " 1 -> LSI model\n 2 -> LDA model\n 3 -> LogEntropy model\n"
              " Pass it as the third parameter")
        sys.exit(1)
    elif modelType == 1:
        model = models.LsiModel(inputModel, id2word=dictionary)
    elif modelType == 2:
        model = models.LdaModel(inputModel, id2word=dictionary)
    elif modelType == 3:
        model = models.LogEntropyModel(inputModel, id2word=dictionary)
    else:
        errorMessage("Something went wrong with the type identifier")
    return model
def tf_idf_weight(spacy_contexts):
    """
    @param spacy_contexts Spacy-fied contexts

    Returns a list of dicts; each dict corresponds to one document and maps
    words to their tf-idf weights.
    """
    docs_dict = Dictionary(spacy_contexts)
    docs_dict.compactify()
    docs_corpus = [docs_dict.doc2bow(doc) for doc in spacy_contexts]
    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]

    # Now generate a list of dicts with k,v = "word": tfidf_frequency;
    # each dict contains words from one document (sentence)
    doc_tfidf_dicts = []
    for doc in docs_tfidf:
        d = dict()
        for term, freq in doc:
            d[docs_dict[term]] = freq
        doc_tfidf_dicts.append(d)
    return doc_tfidf_dicts
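# A hedged usage sketch for tf_idf_weight: it only assumes the contexts are
# already tokenized into lists of strings (the "spacy-fied" form the
# docstring refers to). The sentences are invented.
contexts = [['apple', 'pie', 'recipe'], ['apple', 'stock', 'price']]
for weights in tf_idf_weight(contexts):
    # 'apple' occurs in every document, so its idf (and weight) is zero
    # and gensim drops it from the sparse vector
    print(weights)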
def gensim_similarity(data_c):
    """
    Compute similarities with the Gensim package:
        term counts: COUNT, LDA, LSI
        tf-idf: TFIDF, LDA, LSI
    """
    # merge both columns to build the bag of words
    data_c['s1'] = data_c['s1'].apply(lambda text: list(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: list(text))
    data_c_all = data_c['s1'].append(data_c['s2'], ignore_index=True).to_frame(name='s')

    # build the dictionary
    print("starting create dic....")
    dic = corpora.Dictionary(data_c['s1'].values)
    dic.add_documents(data_c['s2'].values)
    print("number of documents:", dic.num_docs)

    print("starting create count bow...")
    data_c['s1'] = data_c['s1'].apply(lambda text: dic.doc2bow(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: dic.doc2bow(text))
    data_c_all['s'] = data_c_all['s'].apply(lambda text: dic.doc2bow(text))
    # cps1 = [dic.doc2bow(text) for text in list(data_c['s1'].values)]
    # cps2 = [dic.doc2bow(text) for text in list(data_c['s2'].values)]
    cps1 = list(data_c['s1'])
    cps2 = list(data_c['s2'])
    cps = list(data_c_all['s'])

    # count-based similarity between s1 and s2
    print("starting count similarity....")
    sm = similarities.SparseMatrixSimilarity(corpus=cps1, num_features=10000)
    count_sm = np.diag(sm[cps2])

    # LDA (and optionally LSI) similarity on the count vectors
    count_lda_sm = lda_similarity(cps, cps1, cps2, dic)
    # count_lsi_sm = lsi_similarity(cps, cps1, cps2, dic)

    # tf-idf vectors for s1 and s2
    print("starting tfidf similarity....")
    tfidf = TfidfModel(corpus=cps, id2word=dic)
    cps1_tfidf = tfidf[cps1]
    cps2_tfidf = tfidf[cps2]
    cps_tfidf = tfidf[cps]

    # tf-idf similarity between s1 and s2
    sm = similarities.SparseMatrixSimilarity(corpus=cps1_tfidf, num_features=10000)
    tfidf_sm = np.diag(sm[cps2_tfidf])

    # LDA and LSI similarity on the tf-idf vectors
    tfidf_lda_sm = lda_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)
    tfidf_lsi_sm = lsi_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)

    return count_sm, count_lda_sm, tfidf_sm, tfidf_lda_sm, tfidf_lsi_sm
def get_lda_feature():
    doc_train = pd.read_csv(id_content_path)
    documents = doc_train['content'].apply(lambda x: x.split(' '))

    # build the word <-> id mapping (id: word)
    dictionary = corpora.Dictionary(documents)
    # convert each document into a bag of words: a list of (id, count) tuples
    ds_df = [dictionary.doc2bow(document) for document in documents]
    # build the tf-idf model from the corpus term/document frequencies
    tfidf_model = TfidfModel(ds_df)
    # apply tf-idf weighting to the corpus
    ds_tfidf = tfidf_model[ds_df]

    # number of topics per document
    n = 60
    # train the LDA model on the tf-idf corpus with the chosen topic count
    lda_model = LdaModel(ds_tfidf, num_topics=n, passes=10, random_state=12)

    vec_size = (len(documents), n)
    lda_feature = np.zeros(vec_size)
    i = 0
    for doc in ds_tfidf:
        topics = lda_model.get_document_topics(doc, minimum_probability=0.01)
        for topic in topics:
            num_topic = topic[0]
            prob = round(topic[1], 5)
            lda_feature[i, num_topic] = prob
        i += 1

    f_names = get_lda_feacture_name(n)
    pd.DataFrame(lda_feature, columns=f_names).to_csv(id_content_lda_path, index=0)
def get_tfidf(self):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]
    # densify: one row per document, one column per dictionary term
    docs_vecs = np.vstack(
        [sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
    return docs_vecs
def get_tfidf_model(path="data/swiki.json", save_path="data/swiki_dict.txt", stem=False):
    """
    :param path: JSON corpus to load
    :param save_path: where the dictionary is cached as text
    :return: (dictionary, tfidf model) tuple
    """
    # materialize as a list, since the texts are iterated more than once below
    texts = [_preprocess_text(x, stem=stem) for x in _load_json_list(path)]

    def _get_swiki_dictionary():
        dict_file = os.path.join(BASE_DIR, save_path)
        if os.path.exists(dict_file):
            dictionary = corpora.Dictionary.load_from_text(dict_file)
        else:
            dictionary = corpora.Dictionary(texts)
            dictionary.save_as_text(dict_file)
        return dictionary

    dct = _get_swiki_dictionary()

    bow_texts = [dct.doc2bow(text) for text in texts]
    tfidf = TfidfModel(bow_texts)
    return dct, tfidf
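# Usage sketch, assuming data/swiki.json and the private helpers above exist
# in the expected layout:
dct, tfidf = get_tfidf_model()
query_bow = dct.doc2bow("sample query text".split())
print(tfidf[query_bow])  # sparse list of (term_id, tf-idf weight) pairs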
def load_model(self, model_type):
    model = None
    try:
        if model_type == 'tfidf':
            model = TfidfModel.load(self.tfIdfPath, mmap='r')
            self.tfIdfModel = model
        elif model_type == 'lsi':
            model = LsiModel.load(self.lsiPath, mmap='r')
            self.lsiModel = model
        elif model_type == 'lda':
            model = LdaModel.load(self.ldaPath, mmap='r')
            self.ldaModel = model
        elif model_type == 'w2v':
            model = Word2Vec.load(self.w2vPath, mmap='r')
            self.w2vModel = model
        else:
            logger.error('Model type error. Unexpected %s' % model_type)
            return None

        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        logger.info('%s model loaded completely.' % model_type)
    except IOError:
        logger.error(
            'The %s model doesn\'t exist. Please train the model before loading it.'
            % model_type)
    finally:
        return model
def lda(documents, topicNum):
    texts = [[word for word in document.split(' ')] for document in documents]
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(len(texts)))

    dictionary = corpora.Dictionary(texts)
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' get corpus..')
    corpusD = [dictionary.doc2bow(text) for text in texts]
    # id2word = dictionary.id2word

    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' tfidf Model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]

    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' train lda Model...')
    ldaModel = gensim.models.ldamulticore.LdaMulticore(
        corpus_tfidf, workers=8, num_topics=topicNum, chunksize=8000,
        passes=10, random_state=12)
    # ldaModel = gensim.models.ldamodel.LdaModel(corpus=corpusD, num_topics=topicNum, update_every=1, chunksize=8000, passes=10)

    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' get lda feature...')
    ldaFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpus_tfidf:
        topic = ldaModel.get_document_topics(doc, minimum_probability=0.01)
        for t in topic:
            ldaFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
        if i % 1000 == 1:
            print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(i))
    return ldaFeature
def getLsiFeature(documents, topicNum):
    '''
    Function: generate LSI features by training an LSI model
    Input:
        documents: list of preprocessed sentences
        topicNum: output vector dimension
    Output: LSI features (DataFrame format)
    '''
    # get corpus
    # LogInfo('   Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lsi model
    # LogInfo('   Train LSI model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    # note: the LSI model is trained on the raw bag-of-words corpus,
    # not on corpus_tfidf
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000,
                     extra_samples=100)  # , distributed=True)  # , sample=1e-5, iter=10, seed=1

    # generate lsi features
    LogInfo('   Generate LSI features...')
    lsiFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
            lsiFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
    colName = getColName(topicNum, "qlsi")
    lsiFeature = pd.DataFrame(lsiFeature, columns=colName)
    return lsiFeature
def getLdaFeature(documents, topicNum):
    '''
    Function: generate LDA features by training an LDA model
    Input:
        documents: list of preprocessed sentences
        topicNum: output vector dimension
    Output: LDA features (DataFrame format)
    '''
    # get corpus
    # LogInfo('   Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lda model
    # LogInfo('   Train LDA model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    # ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers=8, num_topics=topicNum, chunksize=8000, passes=10, random_state=12)
    ldaModel = LdaModel(corpus_tfidf, num_topics=topicNum, chunksize=8000,
                        passes=10, random_state=12)

    # generate lda features
    LogInfo('   Generate LDA features...')
    ldaFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpus_tfidf:
        topic = ldaModel.get_document_topics(doc, minimum_probability=0.01)
        for t in topic:
            ldaFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
    colName = getColName(topicNum, "qlda")
    ldaFeature = pd.DataFrame(ldaFeature, columns=colName)
    return ldaFeature
def lsi(documents, topicNum):
    texts = [[word for word in document.split(' ')] for document in documents]
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(len(texts)))

    dictionary = corpora.Dictionary(texts)
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' get corpus..')
    corpusD = [dictionary.doc2bow(text) for text in texts]

    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' tfidf Model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]

    # note: the LSI model is trained on the raw bag-of-words corpus, not corpus_tfidf
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000,
                     extra_samples=100)  # , distributed=True)  # , sample=1e-5, iter=10, seed=1

    lsiFeature = np.zeros((len(texts), topicNum))
    print('translate...')
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
            lsiFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
        if i % 1000 == 1:
            print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(i))
    return lsiFeature
def loadCorpus(self, mmfile, dictfile, doctuplesfile=None):
    self.corpus = corpora.MmCorpus(mmfile)
    self.dictionary = corpora.Dictionary.load(dictfile)
    if doctuplesfile is not None:
        with open(doctuplesfile, 'rb') as docpicklef:
            self.doctuples = pickle.load(docpicklef)
    if self.toweight:
        self.tfidf = TfidfModel(self.corpus)
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, dirpath=".", tofull=False):
        """
        Gensim vectorizer
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")
        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull
        self.load()

    def load(self):
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)
        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        if self.lexicon is None or self.tfidf is None:
            inputDocuments = list(documents)
            self.lexicon = Dictionary(inputDocuments)
            self.tfidf = TfidfModel(
                [self.lexicon.doc2bow(doc) for doc in inputDocuments],
                id2word=self.lexicon)
            self.save()
        return self

    def transform(self, documents):
        returnDocs = []
        for document in documents:
            vec = self.tfidf[self.lexicon.doc2bow(document)]
            if self.tofull:
                returnDocs.append(sparse2full(vec))
            else:
                returnDocs.append(vec)
        return returnDocs
def compute_tfidf():
    from gensim.models.tfidfmodel import TfidfModel
    keys, unstem_map, paragraph_lengths, int2word, word2int = compute_all_words()
    with time_code('compute_tfidf'):
        # 'ltc' SMART scheme: logarithmic tf, idf weighting, cosine normalization
        tfidf = TfidfModel(corpus, smartirs='ltc', id2word=int2word)
    return tfidf
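# Toy demonstration of the 'ltc' SMART scheme used above, independent of the
# project-specific corpus and helpers (the tokens are made up):
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel

toy_texts = [['a', 'b', 'b'], ['a', 'c']]
toy_dict = Dictionary(toy_texts)
toy_model = TfidfModel([toy_dict.doc2bow(t) for t in toy_texts], smartirs='ltc')
# 'a' appears in every document, so it gets zero weight and is dropped;
# the remaining weights are log-scaled tf times idf, cosine-normalized
print(toy_model[toy_dict.doc2bow(toy_texts[0])])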
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--output-file')
    args = parser.parse_args()

    encoding = args.encoding
    output_fn = args.output_file
    if not output_fn:
        sys.exit(-1)

    if encoding:
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)
        sys.stdin = codecs.getreader(encoding)(sys.stdin)

    texts = (line.split() for line in sys.stdin)

    logging.info('Creating vocabulary ...')
    vocab = Dictionary(texts)
    logging.info('Saving vocabulary to %s ...' % (output_fn + '.bz2'))
    vocab.save(output_fn)
    logging.info('Compressing vocabulary ...')
    with open(output_fn, 'rb') as input:
        with bz2.BZ2File(output_fn + '.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)
    os.remove(output_fn)

    logging.info('Creating IDF model ...')
    tfidf = TfidfModel(dictionary=vocab)
    logging.info('Saving IDF model to %s ...' % (output_fn + '.tfidf.bz2'))
    tfidf.save(output_fn + '.tfidf')
    logging.info('Compressing IDF model ...')
    with open(output_fn + '.tfidf', 'rb') as input:
        with bz2.BZ2File(output_fn + '.tfidf.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)
    os.remove(output_fn + '.tfidf')
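# Worth noting about the step above: a TfidfModel built from a Dictionary
# alone derives its IDFs from the document frequencies the Dictionary already
# tracks, so no second pass over the corpus is needed. A toy sketch:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

vocab_demo = Dictionary([['spam', 'eggs'], ['spam', 'ham']])
idf_only = TfidfModel(dictionary=vocab_demo)
print(idf_only[vocab_demo.doc2bow(['eggs', 'ham'])])  # 'spam' would score zero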
def predict_on_group(model, docs_data, word2vec_model300,
                     length=5) -> 'pd.DataFrame of type : pair_id || target':
    """
    Parameters:
        model -- model object with methods train and predict
        docs_data -- pandas DataFrame with fields pair_id, content, target
        word2vec_model300 -- w2v model (object)

    Returns:
        pd.DataFrame of type { pair_id || target } with the predicted target
        for each pair_id
    """
    dictionary = corpora.Dictionary()
    for i in docs_data.content:
        try:
            dictionary.add_documents([i])
        except Exception:
            dictionary.add_documents([['a']])
    docs_data['vector'] = docs_data.content.apply(doc_opti, args=(dictionary,))
    # except:
    #     docs_data['vector'] = docs_data.content.apply(dictionary.doc2bow)

    corpus = []
    for line in docs_data.content:
        try:
            if math.isnan(line):
                line = ["мимо"]
        except Exception:
            pass
        corpus = corpus + [dictionary.doc2bow(line)]

    similarity_matrix = word2vec_model300.similarity_matrix(
        dictionary, tfidf=TfidfModel(corpus, dictionary=dictionary),
        threshold=0.0, exponent=2.0, nonzero_limit=100)

    docs_data['dist_vec'] = docs_data.vector.apply(
        make_dist_vec, args=(docs_data.vector, similarity_matrix))

    features = [str(i) for i in range(length)]
    for i in range(length):
        docs_data[str(i)] = docs_data.dist_vec.apply(take_S, args=(i,))

    docs_data['target'] = model.predict(np.array(docs_data[features]))
    return docs_data[['pair_id', 'target']]
def train_model_on_group(model, docs_data, word2vec_model300, length=5):
    """
    Parameters:
        model -- model object with methods train and predict
        docs_data -- pandas DataFrame with fields pair_id, content, target
        word2vec_model300 -- w2v model (object)

    Returns:
        model trained on data
    """
    dictionary = corpora.Dictionary()
    for i in docs_data.content:
        try:
            dictionary.add_documents([i])
        except Exception:
            dictionary.add_documents([['a']])
    docs_data['vector'] = docs_data.content.apply(doc_opti, args=(dictionary,))

    corpus = []
    for line in docs_data.content:
        try:
            if math.isnan(line):
                line = ["мимо"]
        except Exception:
            pass
        corpus = corpus + [dictionary.doc2bow(line)]

    similarity_matrix = word2vec_model300.similarity_matrix(
        dictionary, tfidf=TfidfModel(corpus, dictionary=dictionary),
        threshold=0.0, exponent=2.0, nonzero_limit=100)

    docs_data['dist_vec'] = docs_data.vector.apply(
        make_dist_vec, args=(docs_data.vector, similarity_matrix))

    features = [str(i) for i in range(length)]
    for i in range(length):
        docs_data[str(i)] = docs_data.dist_vec.apply(take_S, args=(i,))

    print(docs_data.head())
    model = model.fit(docs_data[features], docs_data['target'])
    print(model.score(docs_data[features], docs_data['target']))
    return model
def corpus_vec(docs, model, corpus, size=DEFAULT_SAMPLE_SIZE):
    """Creates an NxD array of document vectors, one row per document in a list."""
    tfidf = TfidfModel(corpus)
    N, D = len(docs), model.wv.syn0.shape[1]
    arr = np.empty((N, D))
    for i in range(N):
        arr[i, :] = doc_vec(docs[i], model, corpus, size, tfidf)
    return arr
def __init__(self, docs, num_option=OPTION_GROUP, usr_option=OPTION_GROUP,
             url_option=OPTION_GROUP, emo_option=OPTION_GROUP,
             lc=True, del_dup=True, del_punc=False, del_diac=True,
             token_list=[-1], token_min_filter=-1, token_max_filter=1.0,
             tfidf=True, **kwargs):
    self.del_diac = del_diac
    self.num_option = num_option
    self.usr_option = usr_option
    self.url_option = url_option
    self.emo_option = emo_option
    self.lc = lc
    self.del_dup = del_dup
    self.del_punc = del_punc
    self.token_list = token_list
    self.token_min_filter = token_min_filter
    self.token_max_filter = token_max_filter
    self.tfidf = tfidf
    self.kwargs = {k: v for k, v in kwargs.items() if k[0] != '_'}

    if emo_option == OPTION_NONE:
        self.emo_map = None
    else:
        # self.emo_map = get_compiled_map(os.path.join(os.path.dirname(__file__), 'resources', 'emoticons.json'))
        self.emo_map = EmoticonClassifier()

    docs = [self.tokenize(d) for d in docs]
    self.dictionary = corpora.Dictionary(docs)
    corpus = [self.dictionary.doc2bow(d) for d in docs]

    if self.token_min_filter != 1 or self.token_max_filter != 1.0:
        # negative filter values are absolute thresholds; positive values
        # are interpreted relative to the corpus size
        if self.token_min_filter < 0:
            self.token_min_filter = abs(self.token_min_filter)
        else:
            self.token_min_filter = int(len(corpus) * self.token_min_filter)
        if self.token_max_filter < 0:
            self.token_max_filter = abs(self.token_max_filter) / len(corpus)
        self.dictionary.filter_extremes(no_below=self.token_min_filter,
                                        no_above=self.token_max_filter,
                                        keep_n=None)

    if self.tfidf:
        self.model = TfidfModel(corpus)
    else:
        self.model = None
def __init__(self, documents):
    self.documents = documents
    self.texts = [[word for word in document.lower().split()]
                  for document in documents]
    self.dictionary = corpora.Dictionary(self.texts)
    self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
    self.tfidf = TfidfModel(self.corpus)
    self._make_random_indexing()
    print("initialized!")
def tf_idf(dataSeg_save):
    corpus = pd.read_csv(dataSeg_save, header=None)[0]
    texts = [sentence.split(' ') for sentence in corpus]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tf_idf_model = TfidfModel(corpus, normalize=False)
    word_tf_tdf = list(tf_idf_model[corpus])
    # print('dictionary:', dictionary.token2id)
    # print('term counts:', corpus)
    # print('tf-idf values:', word_tf_tdf)
    return word_tf_tdf, dictionary.token2id
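# Usage sketch; the CSV path is hypothetical and is expected to hold one
# space-tokenized sentence per row in its first column. Note normalize=False
# above, so the returned weights are raw tf * idf values.
weights, token2id = tf_idf('data/segmented.csv')
print(weights[0])  # [(term_id, tf-idf), ...] for the first sentence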
def __init__(self, analyzed_items_path=None, dictionary_path=None,
             corpus_path=None, tfidf_model_path=None):
    if dictionary_path:
        self.dictionary = Dictionary.load(dictionary_path)
    else:
        self.dictionary = None

    if analyzed_items_path:
        self.analyzed_items_path = analyzed_items_path
    else:
        self.analyzed_items_path = None

    if corpus_path:
        self.corpus = MmCorpus(corpus_path)
    else:
        self.corpus = None

    if tfidf_model_path:
        self.tfidf_model = TfidfModel.load(tfidf_model_path)
    else:
        self.tfidf_model = None
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path, p['corpus_path'], p['dict_name']))
    dictionary.save(path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow, id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow], id2word=dictionary,
                          num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' %
                os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text, allow_update=False, return_missing=False)
                     for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # compute pairwise similarity matrix of transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
def construct_tfidf_model(self, model_path):
    model = TfidfModel(self.corpus)
    model.save(model_path)
    return model
def load_tfidf_model(self, filename='../data/models/tfidf_model'):
    self.tfidf_model = TfidfModel.load(filename)
from gensim.corpora import Dictionary


class JsonCorpus(object):
    def __iter__(self):
        data = json.load(open('data/nasa.json'))
        desc = [TextBlob(dataset['description'].lower()).tokens
                for dataset in data['dataset']]
        self.dictionary = Dictionary(desc)
        for d in desc:
            yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if os.path.exists('tfidf.pkl') and os.path.exists('nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('tfidf.pkl')
        dictionary = Dictionary.load('nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        # training iterates the corpus, which is what populates
        # corpus.dictionary, so the model must be built before the
        # dictionary can be saved
        tfidf = TfidfModel(corpus)
        corpus.dictionary.save('nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf.save('tfidf.pkl')

    print(score('project completed', tfidf=tfidf, dictionary=dictionary))