def load_model(self, model_type):
    """Load a saved model from disk and cache it on the instance.

    Args:
        model_type: One of ``'tfidf'``, ``'lsi'``, ``'lda'`` or ``'w2v'``.

    Returns:
        The loaded model. ``None`` is returned when ``model_type`` is not
        one of the values above, or when an ``IOError`` is raised before
        the model has been read. If the model loads but the dictionary
        load raises ``IOError``, the already-loaded model is returned.
    """
    model = None
    try:
        if model_type == 'tfidf':
            model = TfidfModel.load(self.tfIdfPath, mmap='r')
            self.tfIdfModel = model
        elif model_type == 'lsi':
            model = LsiModel.load(self.lsiPath, mmap='r')
            self.lsiModel = model
        elif model_type == 'lda':
            model = LdaModel.load(self.ldaPath, mmap='r')
            self.ldaModel = model
        elif model_type == 'w2v':
            model = Word2Vec.load(self.w2vPath, mmap='r')
            self.w2vModel = model
        else:
            logger.error('Model type error. Unexpected %s' % model_type)
            return None

        # Load the dictionary only if it is not set yet and its file exists.
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)
        logger.info('%s model loaded completely.' % model_type)
    except IOError:
        logger.error(
            "The %s model doesn't exist. Please train the model before load it."
            % model_type)

    # Deliberately NOT inside a ``finally`` clause: a ``return`` there would
    # discard any exception still propagating (including KeyboardInterrupt),
    # hiding real failures behind a silent ``None``.
    return model
def get_tfidf_model():
    """Return the TF-IDF model, reusing the copy saved at ``TFIDF_FILE``.

    When ``TFIDF_FILE`` is not an existing file, a model is built from
    ``get_corpus()`` and ``get_dictionary()``, saved to ``TFIDF_FILE`` and
    returned.
    """
    if not os.path.isfile(TFIDF_FILE):
        fresh_model = TfidfModel(get_corpus(), get_dictionary())
        fresh_model.save(TFIDF_FILE)
        return fresh_model
    return TfidfModel.load(TFIDF_FILE)
def cal_tfidf(documents, topk=10) -> List:
    """Fit a TF-IDF model on ``documents`` and return each document's top terms.

    :param documents: iterable of documents, each a string of tokens
        separated by single spaces
    :param topk: number of highest-scoring terms to keep per document; when a
        document has fewer scored terms than ``topk``, all of them are returned
    :return: one list per document holding ``(word, tfidf_score)`` tuples,
        sorted by descending score
    """
    # Split every document into its token list.
    docs = [document.split(' ') for document in documents]
    # Build the token <-> id dictionary for this call's documents.
    dictionary = corpora.Dictionary(docs)
    # Convert each document to its bag-of-words representation.
    docs_bow = [dictionary.doc2bow(doc) for doc in docs]

    # The dictionary above is rebuilt from ``documents`` on every call, so a
    # model cached by an earlier call would carry idf weights keyed by a
    # different id mapping and silently score the wrong words. Always fit on
    # the current corpus; the model is still written to ``tfidfmodel`` as
    # before.
    model = TfidfModel(docs_bow)
    model.save(tfidfmodel)

    # Transform the corpus into TF-IDF vectors.
    docs_vector = list(model[docs_bow])
    # Sort every vector by score and keep the first ``topk`` entries.
    docs_sort_vector = [
        sorted(doc, key=lambda x: x[1], reverse=True)[:topk]
        for doc in docs_vector
    ]
    # Map term ids back to the words they stand for.
    docs_sort_chinese = [
        [(dictionary[vec[0]], vec[1]) for vec in doc]
        for doc in docs_sort_vector
    ]
    return docs_sort_chinese
def transformModel(modelType, inputModel="", dictionary=""):
    """Build an LSI, LDA or LogEntropy model on top of a TF-IDF model.

    Parameters:
        modelType:  1 -> LSI, 2 -> LDA, 3 -> LogEntropy. An empty string
                    prints the usage text and exits with status 1.
        inputModel: file name, resolved under ``models/``, of a Matrix Market
                    corpus from which a TF-IDF model is built. An empty
                    string loads the stored default TF-IDF model instead.
        dictionary: file name, resolved under ``dictionaries/``, of a saved
                    dictionary. An empty string loads the default dictionary.

    Returns:
        The constructed model, or ``None`` when ``modelType`` is not
        recognised and ``errorMessage`` returns control to this function.
    """
    # Dictionary: default file unless a file name was passed as parameter.
    if dictionary == "":
        dictionary = corpora.Dictionary.load('dictionaries/testNewsgroupsDictionary.dict')
        print(dictionary)
    else:
        fileName = 'dictionaries/' + str(dictionary)
        dictionary = corpora.Dictionary.load(fileName)

    # Input model: default stored TF-IDF model unless a corpus file is named.
    if inputModel == "":
        inputModel = TfidfModel.load("models/testNewsgroups.tfidf_model")
    else:
        fileName = 'models/' + str(inputModel)
        # Use the resolved path; the bare name was previously passed here and
        # the computed ``fileName`` was never used.
        corpus = corpora.MmCorpus(fileName)
        inputModel = models.TfidfModel(corpus)

    # NOTE(review): the TF-IDF model object itself is handed to the model
    # constructors below as their first argument; confirm that this is what
    # they expect rather than a transformed corpus.
    model = None
    if modelType == "":
        print("Chose output model for selected input file: \n 1 -> LSI model\n 2 -> LDA model\n 3 -> LogEntropy model\n Pass it as the third parameter")
        sys.exit(1)
    elif modelType == 1:
        model = models.LsiModel(inputModel, id2word=dictionary)
    elif modelType == 2:
        model = models.LdaModel(inputModel, id2word=dictionary)
    elif modelType == 3:
        # Was ``type == 3``: that compared the builtin ``type`` with 3, which
        # is never true, so this branch could not be reached.
        model = models.LogEntropyModel(inputModel, id2word=dictionary)
    else:
        errorMessage("Something went wrong with the type identificator")
    return model
def __init__(self):
    """Load the "raw" dictionary and corpus, then load or train the TF-IDF model.

    Paths are resolved under the ``data`` directory located two levels above
    this file. If the model file ``TFIDF_<dataset>`` exists it is loaded;
    otherwise a model is trained on the corpus and saved to that path.
    """
    self.inner_model = None

    # Directory two levels above this file.
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Dictionary and corpus for the chosen vocabulary.
    vocabulary = "raw"
    corpora_folder = os.path.join(base_dir, 'data', 'corpora')
    dict_path = os.path.join(corpora_folder, "%s.dict" % (vocabulary, ))
    self.dictionary = corpora.Dictionary.load(dict_path)
    corpus_path = os.path.join(corpora_folder, "%s.mm" % (vocabulary, ))
    self.corpus = corpora.MmCorpus(corpus_path)

    # Dataset name, used to build the model file name.
    self.dataset = "CASEREPORT"

    # Location of the persisted model.
    models_folder = os.path.join(base_dir, 'data', 'models')
    self.filepath = os.path.join(models_folder, "TFIDF_%s" % (self.dataset, ))

    if not os.path.isfile(self.filepath):
        # No saved model yet: train on the corpus and persist the result.
        self.inner_model = TfidfModel(corpus=self.corpus)
        self.inner_model.save(self.filepath)
    else:
        logging.info("found data file %s" % (self.filepath, ))
        self.inner_model = TfidfModel.load(self.filepath)
def buildTfidfModel(corpus):
    """Return the TF-IDF model stored at ``modelpath + 'tfidf.model'``.

    If that path does not exist, a model is built from ``corpus`` and saved
    there first. Progress messages are printed before and after.
    """
    print('get tfidf model...')
    model_file = modelpath + 'tfidf.model'
    if os.path.exists(model_file):
        tfidf = TfidfModel.load(model_file)
    else:
        # Build the TF-IDF vectors from the corpus and persist the model.
        tfidf = TfidfModel(corpus)
        tfidf.save(model_file)
    print('done')
    return tfidf
def __getitem__(self, modelo):
    '''
    Return the model registered under the given name.

    Parameters:
        modelo (str) --> Name of the model: "tfidf", "tfidf_pivot", "lsi",
                         "lda" or "doc2vec"
    Returns:
        The loaded model, or None (after printing a message) when the name
        is not registered, its file does not exist, or no loader is known
        for it.
    '''
    registry = self._arqs['modelos']
    try:
        path = registry[modelo]
    except KeyError:
        # Unregistered name: report it below instead of raising before the
        # "not implemented" message can be shown.
        path = None

    if path is not None and os.path.isfile(path):
        if modelo in ['tfidf', 'tfidf_pivot']:
            return TfidfModel.load(path)
        if modelo == 'lsi':
            return LsiModel.load(path)
        if modelo == 'lda':
            return LdaModel.load(path)
        if modelo == 'doc2vec':
            return Doc2Vec.load(path)

    # Reached for a missing file, an unregistered name, or a registered name
    # without a loader (which previously hit an unbound local variable).
    print(f'O modelo "{modelo}" não foi implementado ou montado.')
    return None
def init():
    """Initialise the module-level dictionary, TF-IDF model and law tables.

    Loads ``dictionary`` and ``tfidf`` from ``modelpath``, appends one
    ``[int, int]`` pair to ``law_list`` for every non-blank line of the file
    at ``lawPath`` (first two whitespace-separated fields), then fills
    ``law_dic`` (``str(pair) -> index``) and ``tobe_law`` (``index -> pair``)
    from the whole of ``law_list``.
    """
    global dictionary
    global tfidf
    global accusation_list
    global law_list
    dictionary = corpora.Dictionary.load(modelpath + 'dictionary.model')
    tfidf = TfidfModel.load(modelpath + 'tfidf.model')

    # The context manager closes the file even if a line fails to parse.
    with open(lawPath, 'r') as fin:
        for line in fin:
            fields = line.split()
            if not fields:
                # A blank line previously raised IndexError.
                continue
            law_list.append([int(fields[0]), int(fields[1])])

    for i, v in enumerate(law_list):
        law_dic[str(v)] = i
        tobe_law[i] = v
def __init__(self, analyzed_items_path=None, dictionary_path=None,
             corpus_path=None, tfidf_model_path=None):
    """Load whichever resources were given a path; leave the rest as ``None``.

    A falsy path (``None``, ``''``) means "not provided". The dictionary,
    corpus and TF-IDF model are loaded from their paths, while
    ``analyzed_items_path`` is simply stored.
    """
    self.dictionary = Dictionary.load(dictionary_path) if dictionary_path else None
    self.analyzed_items_path = analyzed_items_path or None
    self.corpus = MmCorpus(corpus_path) if corpus_path else None
    self.tfidf_model = TfidfModel.load(tfidf_model_path) if tfidf_model_path else None
def __init__(self, analyzed_items_path=None, dictionary_path=None,
             corpus_path=None, tfidf_model_path=None):
    """Set up the instance from optional resource paths.

    Each attribute is ``None`` unless its path argument is truthy:
    ``dictionary``, ``corpus`` and ``tfidf_model`` are loaded from disk,
    and ``analyzed_items_path`` keeps the path itself.
    """
    loaded_dictionary = None
    if dictionary_path:
        loaded_dictionary = Dictionary.load(dictionary_path)
    self.dictionary = loaded_dictionary

    self.analyzed_items_path = analyzed_items_path or None

    loaded_corpus = None
    if corpus_path:
        loaded_corpus = MmCorpus(corpus_path)
    self.corpus = loaded_corpus

    loaded_model = None
    if tfidf_model_path:
        loaded_model = TfidfModel.load(tfidf_model_path)
    self.tfidf_model = loaded_model
def __init__(self):
    """Prepare the dictionary, the corpus and the TF-IDF model.

    The "raw" dictionary and Matrix Market corpus are read from
    ``data/corpora`` two directory levels above this file. The model is read
    from ``data/models/TFIDF_<dataset>`` when that file exists; otherwise it
    is trained on the corpus and written there.
    """
    self.inner_model = None

    def _data_dir(name):
        # ``data/<name>`` under the directory two levels above this file.
        return os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', name)

    # Dictionary and corpus.
    vocabulary = "raw"
    corpora_folder = _data_dir('corpora')
    self.dictionary = corpora.Dictionary.load(
        os.path.join(corpora_folder, "%s.dict" % (vocabulary,)))
    self.corpus = corpora.MmCorpus(
        os.path.join(corpora_folder, "%s.mm" % (vocabulary,)))

    # Parameters.
    self.dataset = "CASEREPORT"

    # Model file path.
    filename = "TFIDF_%s" % (self.dataset, )
    self.filepath = os.path.join(_data_dir('models'), filename)

    if os.path.isfile(self.filepath):
        logging.info("found data file %s" % (self.filepath, ))
        self.inner_model = TfidfModel.load(self.filepath)
        return

    # No saved model: train one and store it for the next run.
    self.inner_model = TfidfModel(corpus=self.corpus)
    self.inner_model.save(self.filepath)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('words')
words = set(nltk.corpus.words.words())
from gensim.models.tfidfmodel import TfidfModel
from gensim import similarities, models, corpora, utils
from gensim.test.utils import datapath, get_tmpfile

# Raw string: '\D' is not a valid escape sequence, and newer Python versions
# warn about it. The resulting path is unchanged.
os.chdir(r'K:\DS project')
# ============================================
path = os.getcwd()
df = pd.read_csv('df3.csv')

# Saved artefacts, all located next to the working directory.
dictionary = utils.SaveLoad.load(path + '\\Ds projectdim_items_terms.dict')
corpus = corpora.MmCorpus(path + '\\Ds projectdim_items_terms.mm')
tfidf = TfidfModel.load(fname=path + '\\Ds projectdim_items_terms.tfidf')
# NOTE(review): ``_load_specials`` is a private gensim helper and is called
# here on the class with the file name as its only argument; confirm this
# loads the similarity index as intended.
sims = utils.SaveLoad._load_specials(path + '\\Ds projectdim_items_terms.similarity')

app = Flask(__name__)


@app.route("/")
def projectname():
    """Render the landing page template."""
    return render_template("name.HTML")


@app.route("/intro")
def intro():
    """Render the first page template."""
    # NOTE(review): ``methods`` is passed to the template as a context
    # variable here, not to ``app.route``; confirm which was intended.
    return render_template("1stpage.HTML", methods=["POST"])
class JsonCorpus(object):
    """Bag-of-words corpus over the dataset descriptions in ``data/nasa.json``."""

    def __init__(self):
        # Build the dictionary at construction time: the script below reads
        # ``corpus.dictionary`` before iterating, which used to fail because
        # the attribute was only created inside ``__iter__``.
        with open('data/nasa.json') as handle:
            data = json.load(handle)
        self._descriptions = [
            TextBlob(dataset['description'].lower()).tokens
            for dataset in data['dataset']
        ]
        self.dictionary = Dictionary(self._descriptions)

    def __iter__(self):
        """Yield the bag-of-words vector of every description."""
        for tokens in self._descriptions:
            yield self.dictionary.doc2bow(tokens)


def score(text, tfidf, dictionary):
    """Return the TF-IDF vector of ``text`` (lower-cased and tokenised)."""
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    # Reuse the saved model and dictionary when both files exist; otherwise
    # build them from the JSON data and save them.
    if os.path.exists('tfidf.pkl') and os.path.exists('nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('tfidf.pkl')
        dictionary = Dictionary.load('nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        corpus.dictionary.save('nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf = TfidfModel(corpus, dictionary=corpus.dictionary)
        tfidf.save('tfidf.pkl')
    print(score('project completed', tfidf=tfidf, dictionary=dictionary))
def __iter__(self):
    """Yield the bag-of-words vector of every dataset description.

    Reads ``../data/nasa.json``, tokenises the lower-cased descriptions,
    builds ``self.dictionary`` from them and yields one vector per
    description. The enclosing class header is outside this chunk.
    """
    # The context manager closes the file as soon as it has been parsed.
    with open('../data/nasa.json') as handle:
        data = json.load(handle)
    desc = [
        TextBlob(dataset['description'].lower()).tokens
        for dataset in data['dataset']
    ]
    self.dictionary = Dictionary(desc)
    for d in desc:
        yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    """Return the TF-IDF vector of ``text`` (lower-cased and tokenised)."""
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if os.path.exists('../data/tfidf.pkl') and os.path.exists(
            '../data/nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('../data/tfidf.pkl')
        dictionary = Dictionary.load('../data/nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        # ``corpus.dictionary`` only exists once the corpus has been
        # iterated, so materialise the vectors before touching it.
        bows = list(corpus)
        # ``save`` takes just the file name; the stray ``self`` argument was
        # an undefined name at module level.
        corpus.dictionary.save('../data/nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf = TfidfModel(bows, dictionary=corpus.dictionary)
        tfidf.save('../data/tfidf.pkl')
    print(score('project completed', tfidf=tfidf, dictionary=dictionary))
def load_tfidf_model(self, filename='../data/models/tfidf_model'):
    """Load the TF-IDF model saved at ``filename`` into ``self.tfidf_model``."""
    loaded_model = TfidfModel.load(filename)
    self.tfidf_model = loaded_model
comments_dictionary.save(FLAGS.dictFile) else: print("Loading dictionary...") comments_dictionary = Dictionary.load(FLAGS.dictFile) print("Converting to BOW vectors...") comments_corpus = [comments_dictionary.doc2bow(d) for d in docs] model_tfidf = None if doTrain: print("Creating tfidf model...") model_tfidf = TfidfModel(comments_corpus) model_tfidf.save(FLAGS.tfidfFile) else: print("Loading tfidf model...") model_tfidf = TfidfModel.load(FLAGS.tfidfFile) print("Converting to tfidf vectors...") comments_tfidf = model_tfidf[comments_corpus] comments_vecs = np.vstack( [sparse2full(c, len(comments_dictionary)) for c in comments_tfidf]) chi2_features = None if doTrain: # Find most descrimitive words for any of the labels print("Finding discrimitive features...") labels = np.array(data['any']) model_fpr = SelectFpr(chi2, alpha=0.025) model_fpr.fit(comments_vecs, labels) chi2_features = model_fpr.get_support(indices=True) np.save(FLAGS.chi2File, chi2_features)
def train_model(corpus_path, dic_conf, lda_conf):
    """Train an LDA model on the corpus at ``corpus_path`` and save it.

    ``dic_conf`` supplies the dictionary settings (``min_tf``, ``max_df``,
    ``vocab_size``, ``dic``). ``lda_conf`` supplies the training settings and
    selects the implementation: ``new`` chooses the keyword-aware models
    (single- or multi-core depending on ``threads``), otherwise the stock
    gensim model is used. The model is saved to ``lda_conf["model_path"]``.
    """
    logging.info('Loading corpus from file {}'.format(corpus_path))
    corpus = FastTextCorpus(corpus_path, bufsize=20000000, length=5926250)
    print('-' * 80)

    # Build a new dictionary or load the stored one.
    if not lda_conf["build_dict"]:
        logging.info("Loading dictionary ..")
        dic = Dictionary.load(dic_conf["dic"])
    else:
        logging.info("Building dictionary ...")
        dic = Dictionary(corpus)
        dic.filter_extremes(no_below=dic_conf["min_tf"],
                            no_above=dic_conf["max_df"],
                            keep_n=dic_conf["vocab_size"])
        dic.compactify()
        logging.info("Saving dictionary ...")
        dic.save(dic_conf["dic"])

    bow = IntCorpus(corpus, dic)
    bow_size = len(bow)
    print(bow_size)
    tfMod = TfidfModel.load(lda_conf["tfmod"])
    print('-' * 80)

    if not lda_conf["new"]:
        logging.info("Training ldamodel implemented in gensim")
        model = LdaModelOld(corpus=bow, id2word=dic,
                            iterations=lda_conf["iterations"],
                            num_topics=lda_conf["num_topics"],
                            passes=lda_conf["passes"],
                            chunksize=lda_conf["chunksize"],
                            alpha='auto',
                            eval_every=lda_conf["eval_every"])
    else:
        logging.info("Training new lda model")
        logging.info("Loading defined keywords ...")
        keywords = {}
        topics = []
        # Each line reads ``<topic id>:<topic name>:<kw1>,<kw2>,...``.
        with codecs.open(lda_conf["kw_file"], "r", "utf-8") as kw_file:
            for raw_line in kw_file:
                fields = raw_line.strip().split(':')
                topic = int(fields[0])
                topics.append(fields[1])
                for kw in fields[2].split(','):
                    # A keyword may belong to several topics.
                    keywords.setdefault(kw, set()).add(topic)
        logging.info("Number of defined keywords: {}".format(len(keywords)))

        if lda_conf["threads"] > 1:
            logging.info("Training model using mutlicore lda version")
            model = LdaMulticoreNew(corpus=bow, id2word=dic,
                                    workers=lda_conf["threads"],
                                    iterations=lda_conf["iterations"],
                                    num_topics=lda_conf["num_topics"],
                                    passes=lda_conf["passes"],
                                    defined_kws=keywords,
                                    alpha='symmetric',
                                    chunksize=lda_conf["chunksize"],
                                    eval_every=lda_conf["eval_every"],
                                    tfMod=tfMod, topic_names=topics)
        else:
            # NOTE(review): unlike the multicore call, this one passes
            # neither ``tfMod`` nor ``topic_names``; confirm that is intended.
            model = LdaModelNew(corpus=bow, id2word=dic,
                                iterations=lda_conf["iterations"],
                                num_topics=lda_conf["num_topics"],
                                passes=lda_conf["passes"],
                                chunksize=lda_conf["chunksize"],
                                defined_kws=keywords,
                                alpha='auto',
                                eval_every=lda_conf["eval_every"])

    logging.info('Saving lda model to {}'.format(lda_conf["model_path"]))
    model.save(lda_conf["model_path"])
    logging.info('Saving model done!')
from gensim.corpora import Dictionary


class JsonCorpus(object):
    """Bag-of-words corpus over the dataset descriptions in ``data/nasa.json``."""

    def __init__(self):
        # The dictionary is created here rather than inside ``__iter__`` so
        # that ``corpus.dictionary`` is usable before the first iteration;
        # the script below saves it straight after construction.
        with open('data/nasa.json') as source:
            payload = json.load(source)
        self._token_lists = [
            TextBlob(dataset['description'].lower()).tokens
            for dataset in payload['dataset']
        ]
        self.dictionary = Dictionary(self._token_lists)

    def __iter__(self):
        """Yield one bag-of-words vector per dataset description."""
        for tokens in self._token_lists:
            yield self.dictionary.doc2bow(tokens)


def score(text, tfidf, dictionary):
    """Return the TF-IDF vector of ``text`` (lower-cased and tokenised)."""
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    # Load the saved model and dictionary when both exist; build and save
    # them otherwise.
    if os.path.exists('tfidf.pkl') and os.path.exists('nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('tfidf.pkl')
        dictionary = Dictionary.load('nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        corpus.dictionary.save('nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf = TfidfModel(corpus, dictionary=corpus.dictionary)
        tfidf.save('tfidf.pkl')
    print(score('project completed', tfidf=tfidf, dictionary=dictionary))