def lsi_transform(self, corpus_tf_idf):
    logger.info('Training lsi model with a n_dims of %d...' % self.nDims)
    if self.dictionary is None and os.path.exists(self.dictPath):
        self.dictionary = corpora.Dictionary.load(self.dictPath)

    self.lsiModel = LsiModel(corpus=corpus_tf_idf, num_topics=self.nDims, id2word=self.dictionary)
    # print self.lsiModel[corpus]

    conf.mk_dir(self.lsiPath)
    self.lsiModel.save(self.lsiPath)
    logger.info('Lsi model has been saved in %s.' % self.lsiPath)

    lsi_corpus = self.lsiModel[corpus_tf_idf]
    lsi_corpus_path = conf.get_filename_via_tpl('lsi', n_users=self.nUsers, n_samples=self.nSamples,
                                                n_dims=self.nDims, postfix='mm')
    conf.mk_dir(lsi_corpus_path)
    corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
    logger.info('Lsi corpus with a shape of %s has been saved in %s.'
                % (np.array(lsi_corpus).shape, lsi_corpus_path))
    return lsi_corpus
def createLsiModelforCorpus(corpusfile, dictfile, numtop):
    print "\nLoading dictionary..."
    dict = corpora.Dictionary.load_from_text(dictfile)
    print(dict)

    print "\nLoading corpus..."
    corpus = corpora.MmCorpus(corpusfile)
    print(corpus)

    print "\nPerforming Latent Semantic Indexing..."
    lsi = LsiModel(corpus=corpus, num_topics=numtop, id2word=dict, distributed=False)
    ## This is the fancy stochastic (aka truncated) SVD, however it throws runtime memory errors for me (e.g. segmentation fault)
    #lsi = stochastic_svd(corpus,rank=100,num_terms=args.ntopics)

    corpustopics = lsi.show_topics(num_words=10, log=True, formatted=False)

    rootdir = os.getcwd()
    foldername = 'lsi_output'
    folderpath = os.path.join(rootdir, foldername)
    if os.path.exists(folderpath):
        shutil.rmtree(folderpath)
        os.makedirs(folderpath)
    else:
        os.makedirs(folderpath)
    os.chdir(folderpath)

    lsimodelfile = (str(args.corpus).replace('.mm', '')) + '_lsi.model'
    lsi.save(lsimodelfile)
    filename1 = (str(args.corpus).replace('.mm', '')) + '_lsi_topics.pkl'
    with open(filename1, 'wb') as output:
        pickle.dump(corpustopics, output)
    os.chdir(rootdir)

    return corpustopics, lsi
def lsi_model(self, num_topics: int = 10, stochastic: bool = False):
    """
    Construct LSI topic models for each year in a corpus, given a set of parameters.
    """
    if self.word_to_id is None or self.corpora is None:
        self.build_dictionaries_and_corpora()
    if self.tf_idf_models is None:
        self.build_tf_idf_models()

    results = num_dict(self.year_list)

    if not stochastic:
        for year in self.year_list[:-1]:
            results[year] = LsiModel(
                corpus=self.tf_idf_models[year][self.corpora[year]],
                id2word=self.word_to_id[year],
                num_topics=num_topics
            )
    else:
        for year in self.year_list[:-1]:
            results[year] = LsiModel(
                corpus=self.tf_idf_models[year][self.corpora[year]],
                id2word=self.word_to_id[year],
                num_topics=num_topics,
                onepass=False
            )

    return TopicResults(results, self.num_docs)
def train_model(filename, output_name, data={}):
    output = data
    output['dataset'] = filename
    output['output_name'] = output_name

    df = pd.read_csv('./data/dataset/%s' % filename)

    lemmas_list = []
    for lemmas in df['lemmas']:
        lemmas = str(lemmas)
        lemmas = lemmas.replace('[', '').replace(']', '').replace(',', '').replace('\'', '')
        lemmas_list.append(lemmas.split())

    dictionary = corpora.Dictionary(lemmas_list)
    make_dir('./data/dicts/')
    dictionary.save('./data/dicts/%s_corpus.dict' % output_name)
    output['dict'] = '%s_corpus.dict' % output_name

    clean_doc = [dictionary.doc2bow(text) for text in lemmas_list]
    tfidf = models.TfidfModel(clean_doc, normalize=True)
    lsi = LsiModel(corpus=tfidf[clean_doc], id2word=dictionary, num_topics=200)

    make_dir('./data/models')
    lsi.save('./data/models/%s_model.txt' % output_name)
    output['model'] = '%s_model.txt' % output_name

    return output
def lsi(self):
    self.tf_idf()
    if self.corpus_tf_idf and self.dictionary:
        self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
        self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
        # With num_topics=2, valid topic indices are 0 and 1.
        print self.lsi_model.print_topic(1)
    elif self.corpus_tf_idf:
        self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
        self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
def encoder_lsi(self, num_components=100, chunksize=500, is_tfidf=False):
    """Train an LSI model on the training corpus and project the corpus into the LSI space."""
    self.num_components = num_components
    # Train LSI based on training dataset
    self.lsi = LsiModel(corpus=self.training_corpus, id2word=self.dictionary,
                        num_topics=num_components, chunksize=chunksize)  # initialize an LSI transformation
    # Convert bow into LSI projections
    self.corpus_lsi = self.lsi[self.training_corpus]
def train(self, tokens):
    """
    Trains the LSI model

    Parameters
    ----------
    tokens: list of list of str
        e.g. [['hi', 'ho'], ['my', 'name', ...], ...]
    """
    self.fill_dictionary(tokens)
    corpus = self.to_corpus(tokens)
    self.tfidf = TfidfModel(corpus)
    corpus = self.tfidf[corpus]
    self.lsi = LsiModel(corpus, num_topics=self.num_topics)
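# A minimal companion sketch (not part of the original class): assuming the same
# attributes (self.tfidf, self.lsi) and the to_corpus() helper used above exist,
# new documents would typically be projected through the same tf-idf -> LSI chain.
def transform(self, tokens):
    """Project new tokenized documents into the trained LSI space."""
    corpus = self.to_corpus(tokens)            # bag-of-words via the fitted dictionary
    corpus = self.tfidf[corpus]                # reuse the tf-idf weights fitted in train()
    return [self.lsi[doc] for doc in corpus]   # one (topic_id, weight) vector per document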
def load_model(self, model_type):
    model = None
    try:
        if model_type == 'tfidf':
            model = TfidfModel.load(self.tfIdfPath, mmap='r')
            self.tfIdfModel = model
        elif model_type == 'lsi':
            model = LsiModel.load(self.lsiPath, mmap='r')
            self.lsiModel = model
        elif model_type == 'lda':
            model = LdaModel.load(self.ldaPath, mmap='r')
            self.ldaModel = model
        elif model_type == 'w2v':
            model = Word2Vec.load(self.w2vPath, mmap='r')
            self.w2vModel = model
        else:
            logger.error('Model type error. Unexpected %s' % model_type)
            return None

        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        logger.info('%s model loaded completely.' % model_type)
    except IOError:
        logger.error('The %s model doesn\'t exist. Please train the model before load it.' % model_type)
    finally:
        return model
def compute_lda():
    # from gensim.models.ldamulticore import LdaMulticore
    from gensim.models.lsimodel import LsiModel

    keys, unstem_map, paragraph_lengths, int2word, word2int = compute_all_words()
    try:
        len(corpus)
    except:
        for doc in iter(corpus):
            pass

    host = os.environ.get('pyro_ns_host', None)
    port = int(os.environ.get('pyro_ns_port', 0)) or None

    tfidf = compute_tfidf()
    with time_code('compute_lda'):
        corpus_tfidf = tfidf[corpus]
        # Note: despite the function name, a distributed LSI model is trained here.
        lda = LsiModel(corpus_tfidf,
                       num_topics=500,
                       id2word=int2word,
                       distributed=True,
                       ns_conf=dict(
                           host=host,
                           port=port,
                           broadcast=port and host,
                       ))
        # lda = LdaMulticore(corpus_tfidf, num_topics=500, id2word=int2word, workers=None)
    return lda
def getLsiFeature(documents, topicNum):
    '''
    Function: generate lsi features by training lsi model
    Input:
        documents: list of preprocessed sentences
        topicNum: output vector dimension
    Output:
        lsi features (DataFrame format)
    '''
    # get corpus
    # LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lsi model
    # LogInfo(' Train LSI model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples=100)  # , distributed=True)#, sample = 1e-5, iter = 10,seed = 1)

    # generate lsi features
    LogInfo(' Generate LSI features...')
    lsiFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
            lsiFeature[i, t[0]] = round(t[1], 5)
        i = i + 1

    colName = getColName(topicNum, "qlsi")
    lsiFeature = pd.DataFrame(lsiFeature, columns=colName)
    return lsiFeature
def lsi(documents, topicNum):
    texts = [[word for word in document.split(' ')] for document in documents]
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(len(texts)))

    dictionary = corpora.Dictionary(texts)
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' get corpus..')
    corpusD = [dictionary.doc2bow(text) for text in texts]

    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' tfidf Model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples=100)  # , distributed=True)#, sample = 1e-5, iter = 10,seed = 1)

    lsiFeature = np.zeros((len(texts), topicNum))
    print('translate...')
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
            lsiFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
        if i % 1000 == 1:
            print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(i))

    return lsiFeature
def fit_lda(X, vocab, num_topics=50, passes=1):
    """
    Fit LDA from a scipy CSR matrix (X).
    Note: despite the name, this builds a gensim LsiModel.
    """
    print 'fitting lda...'
    return LsiModel(gensim.matutils.Sparse2Corpus(X, documents_columns=False),
                    num_topics=num_topics, chunksize=10000, id2word=vocab)
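# A short companion sketch (not from the original source): once the model returned by
# fit_lda() above is available, its document-topic vectors can be turned back into a
# dense numpy matrix for scikit-learn style code. `X` is the same CSR matrix passed
# to fit_lda(), with documents as rows.
from gensim import matutils

def lsi_doc_matrix(model, X):
    """Return an (n_docs, num_topics) dense matrix of LSI projections for X."""
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    return matutils.corpus2dense(model[corpus], num_terms=model.num_topics,
                                 num_docs=X.shape[0]).T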
def generate_docs_lsi(self, dictionary_file_path, tfidf_file_path, lsi_file_path, num_topics=100):
    """
    Generate the LSI dimensionality-reduction file for the document library.
    :param dictionary_file_path:
    :param tfidf_file_path:
    :param lsi_file_path:
    :param num_topics:
    :return:
    """
    try:
        dictionary = corpora.Dictionary.load(dictionary_file_path)
        tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
        print tfidf_corpus
        lsi = LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics)
        # lsi.print_topics(10)
        with open(lsi_file_path, 'wb') as f:
            pickle.dump(lsi, f)
        logger.info('lsi model file building finished')
        # doc_lsi = lsi[doc_bow]
    except Exception as e:
        logger.error('generate documents library lsi model file failed for %s' % str(e))
def load(self, path='default'):
    """
    :param path: the path of trained model.
    :return:
    """
    if path == 'default':
        path = 'model'

    file_list = os.listdir(path)
    for file in file_list:
        if file.endswith('.model'):
            self.model_name = file.split('.')[0]

    if self.model_name == 'lda':
        self.model = LdaModel.load(str(path + '/lda.model'))
    if self.model_name == 'lsi':
        self.model = LsiModel.load(str(path + '/lsi.model'))
    if self.model_name == 'hdp':
        self.model = HdpModel.load(str(path + '/hdp.model'))

    self.id2word = self.model.id2word
    if self.model_name == 'hdp':
        self.num_topics = self.model.get_topics().shape[0]
    else:
        self.num_topics = self.model.num_topics
    #self.iterations = self.model.iterations

    f = open(str(path + '/original_data.pickle'), 'rb')
    self.original_data = pickle.load(f)
    f.close()
    f = open(str(path + '/text.pickle'), 'rb')
    self.text = pickle.load(f)
    f.close()
    f = open(str(path + '/token.pickle'), 'rb')
    self.token = pickle.load(f)
    f.close()
    f = open(str(path + '/corpus.pickle'), 'rb')
    self.corpus = pickle.load(f)
    f.close()

    path = path + '/result'
    f = open(str(path + '/topic_key.pickle'), 'rb')
    self.topic_key = pickle.load(f)
    f.close()
    f = open(str(path + '/doc_topic.pickle'), 'rb')
    self.doc_topic = pickle.load(f)
    f.close()
    f = open(str(path + '/topic_doc.pickle'), 'rb')
    self.topic_doc = pickle.load(f)
    f.close()
    f = open(str(path + '/topic_sent.pickle'), 'rb')
    self.topic_sent = pickle.load(f)
    f.close()

    self.id2word = self.model.id2word
    if self.model_name == 'hdp':
        self.num_topics = self.topic_doc.shape[0]
    else:
        self.num_topics = self.model.num_topics
def run():
    try:
        print "starting to build LSI Model"
        start = datetime.now()

        documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
        number_of_documents = len(documents)
        print "number_of_documents:", number_of_documents

        stopwords = []
        stopwords += [month.lower() for month in month_to_number.keys()]
        stopwords += nltk_stopwords.words('english')
        print "stopwords:", len(stopwords)
        with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
            stopwords.extend([word for word in f.read().decode("utf-8").split("\n") if word and not word.startswith("#")])
        stopwords = set(stopwords)

        texts = [[word for word in document.lower().replace("#", " ").replace("_", " ").replace("(", " ").replace(")", " ").replace("/", " ").replace(":", " ").replace(".", " ").split() if word not in stopwords and len(word) > 3] for document in documents]

        counter = Counter()
        for text in texts:
            counter.update(text)

        texts = [[token for token in text if counter[token] > 1] for text in texts]

        dictionary = Dictionary(texts)
        print "dictionary:", dictionary
        dictionary.save(path_to_directory_of_this_file + "/dictionary")

        corpus = [dictionary.doc2bow(text) for text in texts]
        print "corpus:", type(corpus)

        print "generating lsi model"
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)

        print "saving LSI model"
        lsi.save(path_to_directory_of_this_file + "/model")

        Topic.objects.all().delete()
        topics = []
        for topic in lsi.show_topics():
            topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))
        Topic.objects.bulk_create(topics)

    except Exception as e:
        print e
def lsi_similarity(cps, cps1, cps2, dic):
    # Compute the LSI similarity between the term-frequency corpora of s1 and s2
    print("starting lsi similarity....")
    lsi = LsiModel(corpus=cps, num_topics=100, id2word=dic)
    s1_lsi = lsi[cps1]
    s2_lsi = lsi[cps2]
    sm = similarities.MatrixSimilarity(corpus=s1_lsi, num_features=lsi.num_topics)
    lsi_sm = np.diag(sm[s2_lsi])
    return lsi_sm
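# A minimal usage sketch (assumptions: `s1_texts` and `s2_texts` are aligned lists of
# token lists for the two sentence sets; the helper name below is illustrative only).
from gensim import corpora

def build_lsi_inputs(s1_texts, s2_texts):
    dic = corpora.Dictionary(s1_texts + s2_texts)   # shared vocabulary for both sides
    cps1 = [dic.doc2bow(t) for t in s1_texts]       # bag-of-words for side 1
    cps2 = [dic.doc2bow(t) for t in s2_texts]       # bag-of-words for side 2
    cps = cps1 + cps2                               # training corpus for the LSI model
    return cps, cps1, cps2, dic

# pairwise similarities of aligned (s1_i, s2_i) pairs:
# sims = lsi_similarity(*build_lsi_inputs(s1_texts, s2_texts))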
def from_text_files_in_path(self, path, extension=".txt"):
    doc_id = 0
    for tokens in self.training_documents_from_path(path, extension):
        document = {'id': "doc_" + str(doc_id), 'tokens': tokens}
        doc_id = doc_id + 1
        if self.model:
            self.model.add_documents(document)
        else:
            self.model = LsiModel(document)
    return self.model
def representation(self):
    if not self.model:
        print("LOAD MODEL...")
        self.model = LsiModel.load(
            os.path.join(self.preprocessor.source.path,
                         self.preprocessor.source.info + '.model'))
        self.dictionary = Dictionary.load(
            os.path.join(self.preprocessor.source.path,
                         self.preprocessor.source.info + '.dic'))
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except:
        vector_model = LsiModel(corpus=RCV1BowCorpus(), num_topics=100, id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train, train_targets=rcv1_train_target,
                           get_features=get_lsi_features, classifier="sgd")

    evaluate_classifier(clf, rcv1_test, rcv1_test_target, get_features=get_lsi_features)
def __init__(self):
    self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
    self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
    self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
    self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
    self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
    self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
    self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
    self.job_labels = {
        int(k): v
        for k, v in (line.split("=")
                     for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
    }
def get_lsa_model(self, n_topics=50, recalculate=False, from_scratch=True):
    filepath = self.paths.get_lsa_filepath(n_topics)

    if not os.path.isfile(filepath) or recalculate:
        if not from_scratch:
            raise ValueError('No LSA file exists but from_scratch is False')

        trigram_dictionary = self.lda_builder.get_corpus_dict()
        trigram_bow_corpus = self.lda_builder.get_trigram_bow_corpus(trigram_dictionary)

        print('Building LSA model...')
        lsi = LsiModel(trigram_bow_corpus, id2word=trigram_dictionary, num_topics=n_topics)

        lsi.save(filepath)
        print('LSA model (n_topics={}) written to {}'.format(n_topics, filepath))
    else:
        print('Loading LSA model (n_topics={})...'.format(n_topics))
        lsi = LsiModel.load(filepath)

    return lsi
def cluster(sentences):
    my_stop_words = {'okay', 'don', 've', 'didn', 'know', 'think', 'really'}

    corpus = [c['text'].replace("%hesitation", "").lower() for c in sentences]
    corpus = np.array(corpus)
    tf_vectorizer = TfidfVectorizer(decode_error='ignore', max_df=0.7,
                                    stop_words=my_stop_words.union(stop_words),
                                    ngram_range=(1, 1))
    tf_mat = tf_vectorizer.fit_transform(corpus)
    id2word = {i: s for i, s in enumerate(tf_vectorizer.get_feature_names())}

    n_topics = 5
    lsi = LsiModel(matutils.Sparse2Corpus(tf_mat.T), num_topics=n_topics,
                   id2word=id2word, onepass=False)
    gs_lsi_mat = lsi[matutils.Sparse2Corpus(tf_mat.T)]
    lsi_mat = matutils.corpus2dense(gs_lsi_mat, n_topics).T
    norm = Normalizer(copy=False)
    lsi_mat = norm.fit_transform(lsi_mat)

    valid_indices = np.where(lsi_mat.any(axis=1))[0]
    valid_sent = lsi_mat[valid_indices]

    n_clusters = 7
    cluster = KMeans(n_clusters, n_init=100)
    cluster.fit(valid_sent)

    clusters = {}
    for i in range(n_clusters):
        clusters[i] = np.where(cluster.labels_ == i)[0]

    # iterate over a copy of the keys so clusters can be dropped while looping
    for i in list(clusters.keys()):
        if np.sum(np.square(valid_sent[clusters[i]] - cluster.cluster_centers_[i])) > cluster.inertia_ / n_clusters:
            del clusters[i]

    last_cluster = [
        valid_indices[clusters[i][np.where(
            np.sum(np.square(valid_sent[clusters[i]] - cluster.cluster_centers_[i]), axis=1)
            < cluster.inertia_ / len(corpus))]].tolist()
        for i in clusters
    ]

    return last_cluster
class MyModel:
    def __init__(self, dict_file=None, corpus_model=None, corpus_file=None):
        self.dict_file = dict_file
        self.dictionary = None
        self.corpus = None
        if dict_file is not None:
            self.dictionary = corpora.Dictionary.load(dict_file)
        if corpus_model:
            self.corpus = corpus_model
        elif corpus_file:
            self.corpus = corpora.MmCorpus(corpus_file)
        self.tf_idf_model = None
        self.corpus_tf_idf = None
        self.lsi_model = None
        self.corpus_lsi = None
        self.lda_model = None
        self.corpus_lda = None

    def tf_idf(self):
        self.tf_idf_model = models.TfidfModel(corpus=self.corpus, normalize=True)
        # corpus_vector = [vector for vector in self.corpus]
        self.corpus_tf_idf = self.tf_idf_model[self.corpus]

    def lsi(self):
        self.tf_idf()
        if self.corpus_tf_idf and self.dictionary:
            self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
            self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
            # With num_topics=2, valid topic indices are 0 and 1.
            print self.lsi_model.print_topic(1)
        elif self.corpus_tf_idf:
            self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
            self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]

    def lda(self):
        self.lda_model = models.LdaModel(corpus=self.corpus)
        self.corpus_lda = self.lda_model[self.corpus]

    def add_document_lsi(self, addition_corpus_tf_idf, addition_vector_tf_idf):
        self.lsi_model.add_documents(addition_corpus_tf_idf)
        lsi_vector = self.lsi_model[addition_vector_tf_idf]
        return lsi_vector

    def save_lsi(self, name='/serialise/model.lsi'):
        self.lsi_model.save(name)

    def save_lda(self, name='/serialise/model.lda'):
        self.lda_model.save(name)

    @staticmethod
    def load_lsi(name='/tmp/model.lsi'):
        my_model = MyModel()
        my_model.lsi_model = models.LsiModel.load(name)
        return my_model
def __create_model(self, algo, topic_qtt):
    model = None

    if algo == TopicModelingAlgorithm.LDA:
        model = LdaModel(corpus=self.__corpus, num_topics=topic_qtt,
                         id2word=self.__id2_words, random_state=1)
    elif algo == TopicModelingAlgorithm.LSA:
        model = LsiModel(corpus=self.__corpus, num_topics=topic_qtt,
                         id2word=self.__id2_words)
    elif algo == TopicModelingAlgorithm.NMF:
        model = Nmf(corpus=self.__corpus, num_topics=topic_qtt, random_state=1)

    return model
def __getitem__(self, modelo):
    '''
    Returns the requested model.
    Parameters:
        modelo (str) --> Identifier of the model: "tfidf", "tfidf_pivot", "lsi", "lda" or "doc2vec"
    Returns:
        the requested model, if it exists
    '''
    if not os.path.isfile(self._arqs['modelos'][modelo]):
        print(f'The model "{modelo}" has not been implemented or built.')
        return None
    if modelo in ['tfidf', 'tfidf_pivot']:
        model = TfidfModel.load(self._arqs['modelos'][modelo])
    elif modelo == 'lsi':
        model = LsiModel.load(self._arqs['modelos'][modelo])
    elif modelo == 'lda':
        model = LdaModel.load(self._arqs['modelos'][modelo])
    elif modelo == 'doc2vec':
        model = Doc2Vec.load(self._arqs['modelos'][modelo])
    return model
def train(self, path, num_topics=20, iterations=1000, n_gram=True, lemmatization=True, stop_words=True, tfidf=True, model='lda'):
    """
    Train the topic cluster model.

    Input value:
        data: pd.DataFrame format ['id','title','content','summary']
        num_topics: (int) the number of topics
        iterations: (int) total number of iteration times

    example:
        >>> lda = LDA_Model
        >>> lda.train(text)
    """
    data = load_data(str(path + '/output/data.csv'))
    self.original_data = data
    self.text = list(data['content'])
    self.num_topics = num_topics
    self.iterations = iterations
    self.model_name = model

    print('preprocessing...')
    self.token = self._preprocess(self.text, lemma=lemmatization, stop_words=stop_words)
    self.id2word = Dictionary(self.token)
    self.corpus = [self.id2word.doc2bow(text) for text in self.token]

    if tfidf == True:
        print('calculate tfidf...')
        tfidf_model = TfidfModel(self.corpus)
        self.corpus = tfidf_model[self.corpus]

    if model == 'lda':
        self.model = LdaModel(corpus=self.corpus, id2word=self.id2word,
                              num_topics=self.num_topics, iterations=self.iterations)
    if model == 'lsi':
        self.model = LsiModel(corpus=self.corpus, id2word=self.id2word,
                              num_topics=self.num_topics)
    if model == 'hdp':
        self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
        self.num_topics = self.model.get_topics().shape[0]

    self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
    self.doc_topic = self._doc_topic()
    self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
    self.topic_sent = pd.DataFrame(self._readable_topic(), columns=['topic_id', 'most relative sentence'])
def build_similarity(self, corpus: List[tuple], model='tfidf') -> None:
    """
    Builds a similarity model for a bag of words corpus

    :param corpus: to build the similarity model
    :param model: strategy
    """
    from gensim.models.tfidfmodel import TfidfModel
    from gensim.models.lsimodel import LsiModel
    from gensim import similarities

    self.dictionary.compactify()
    if model == 'tfidf':
        self.model = TfidfModel(corpus, id2word=self.dictionary)
    elif model == 'lsi':
        # todo: remove magic number
        self.model = LsiModel(corpus, id2word=self.dictionary, num_topics=2)
    feature_cnt = len(self.dictionary.token2id)
    self.index = similarities.SparseMatrixSimilarity(
        self.model[corpus], num_features=feature_cnt)
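# A minimal query sketch (an assumption, not part of the original class): after
# build_similarity() has run, the same self.dictionary / self.model / self.index
# attributes can be used to rank corpus documents against a new token list.
def most_similar(self, tokens, topn=5):
    """Return (corpus_position, score) pairs for the documents closest to `tokens`."""
    query_bow = self.dictionary.doc2bow(tokens)   # bag-of-words for the query
    sims = self.index[self.model[query_bow]]      # similarity against every indexed document
    return sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:topn]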
parser.add_argument("--nwords","-nw",help="Input desired number of words to show per topic",default=10, required=False, type=int) args = parser.parse_args() start_time=time.time() print "\nLoading dictionary..." dict = corpora.Dictionary.load_from_text(args.dict) print(dict) print "\nLoading corpus..." corpus = corpora.MmCorpus(args.corpus) print(corpus) print "\nPerforming Latent Semantic Indexing..." lsi = LsiModel(corpus=corpus, num_topics=args.ntopics, id2word=dict, distributed=False) ## This is the fancy stochastic (aka truncated) SVD, however it throws runtime memory errors for me (e.g. segmentation fault) #lsi = stochastic_svd(corpus,rank=100,num_terms=args.ntopics) #if len(args.query)!=1: #print corpus[args.query] queryresult = lsi[corpus[args.query]] sortedqueryresult = sorted(list(queryresult), key=lambda query: abs(query[1]), reverse=True) #screenqueryresult = sorted(list(queryresult), key=itemgetter(1)) #screenoutput = lsi.print_topics(num_topics=10, num_words=1) #output = lsi.print_topics(num_topics=10, num_words=10) #print "\nResult:" #pp.pprint(screenoutput) #lsi.save('lsi_result.txt')
def modelSelectionLSI(self): """ Lets find the optimal parameters for LSI for all fields. We see the optimal number of parameters for the training set of experts. """ coverages = numpy.zeros((len(self.ks), len(self.minDfs), len(self.gammas), len(self.fields))) logging.getLogger('gensim').setLevel(logging.INFO) maxK = numpy.max(self.ks) logging.debug("Starting model selection for LSI") for t, minDf in enumerate(self.minDfs): logging.debug("Using minDf=" + str(minDf)) self.minDf = minDf self.vectoriseDocuments() self.loadVectoriser() corpus = gensim.corpora.mmcorpus.MmCorpus(self.docTermMatrixFilename + ".mtx") id2WordDict = dict(zip(range(len(self.vectoriser.get_feature_names())), self.vectoriser.get_feature_names())) logging.debug("Running LSI with " + str(maxK) + " dimensions") lsi = LsiModel(corpus, num_topics=maxK, id2word=id2WordDict, chunksize=self.chunksize, distributed=False, onepass=False) for i, k in enumerate(self.ks): lsi.num_topics = k logging.debug("Creating index") index = gensim.similarities.docsim.Similarity(self.indexFilename, lsi[corpus], num_features=k) for j, field in enumerate(self.fields): logging.debug("k="+str(k) + " and field=" + str(field)) newX = self.vectoriser.transform([field]) newX = [(s, newX[0, s])for s in newX.nonzero()[1]] result = lsi[newX] similarities = index[result] for u, gamma in enumerate(self.gammas): self.gamma = gamma expertsByDocSimilarity, expertsByCitations = self.expertsFromDocSimilarities(similarities, len(self.trainExpertDict[field]), field) expertMatches = self.matchExperts(expertsByDocSimilarity, set(self.trainExpertDict[field])) coverages[i, t, u, j] = float(len(expertMatches))/len(self.trainExpertDict[field]) for u, gamma in enumerate(self.gammas): logging.debug("Mean coverage for gamma=" + str(gamma) + " " + str(numpy.mean(coverages[i, t, u, :]))) meanCoverges = numpy.mean(coverages, 3) logging.debug(meanCoverges) bestInds = numpy.unravel_index(numpy.argmax(meanCoverges), meanCoverges.shape) self.k = self.ks[bestInds[0]] logging.debug("Chosen k=" + str(self.k)) self.minDf = self.minDfs[bestInds[1]] logging.debug("Chosen minDf=" + str(self.minDf)) self.gamma = self.gammas[bestInds[2]] logging.debug("Chosen gamma=" + str(self.gamma)) logging.debug("Coverage = " + str(numpy.max(meanCoverges))) return meanCoverges
def main():
    start_time = time.time()

    rootdir = os.getcwd()
    foldername = 'lsi_output'
    folderpath = os.path.join(rootdir, foldername)

    if (os.path.exists(folderpath) == False or (os.path.exists(folderpath) == True and args.force == True)):
        topics, lsi = createLsiModelforCorpus(args.corpus, args.dict, args.ntopics)
    else:
        os.chdir(folderpath)
        lsimodelfile = (str(args.corpus).replace('.mm', '')) + '_lsi.model'
        topicsfile = (str(args.corpus).replace('.mm', '')) + '_lsi_topics.pkl'
        modelpath = os.path.join(folderpath, lsimodelfile)
        topicspath = os.path.join(folderpath, topicsfile)
        lsi = LsiModel.load(modelpath)
        topics = pickle.load(open(topicspath, 'r'))
        f = open('lsi_corpus_topics.txt', 'w')
        f.seek(0)
        f.write(str(topics))
        f.close()
        os.chdir(rootdir)

    pp.pprint(lsi.show_topics(num_topics=args.ntopics, num_words=10, log=False, formatted=True))

    corpus = corpora.MmCorpus(args.corpus)

    if args.query != -1:
        queryresult = lsi[corpus[args.query]]
        sortedqueryresult = sorted(list(queryresult), key=lambda query: abs(query[1]), reverse=True)
        print "\nSimilarity of document number {0} in corpus with corpus topics:".format(args.query)
        pp.pprint(sortedqueryresult)

    # Generate topic probability-document matrix, along with vector containing most probable topic (assumed to be the label) for each document
    #os.chdir(folderpath)
    outlabel_name = 'lsi_document_labels_{0}.txt'.format((args.corpus).replace('.mm', ''))
    outtopic_name = 'lsi_topic_vectors_{0}.txt'.format((args.corpus).replace('.mm', ''))
    outlabelpath = os.path.join(folderpath, outlabel_name)
    outtopicpath = os.path.join(folderpath, outtopic_name)

    if (os.path.exists(outlabelpath) == False or os.path.exists(outtopicpath) == False):
        outtopic = open(outtopic_name, 'w')
        outlabel = open(outlabel_name, 'w')

        for idx, doc in enumerate(corpus):
            tops = lsi[doc]
            doc_tops = []
            for j in range(args.ntopics):
                search = [v[1] for v in tops if v[0] == j]
                if len(search) > 0:
                    doc_tops.append(search[0])
                else:
                    doc_tops.append(0.)
            most_important = doc_tops.index(max(doc_tops))
            outlabel.write('{0}\n'.format(most_important))
            outtopic.write('\t'.join([str(d) for d in doc_tops]) + '\n')

        outlabel.close()
        outtopic.close()
        shutil.move(outlabel_name, folderpath)
        shutil.move(outtopic_name, folderpath)
        #os.chdir(rootdir)

    end_time = time.time()
    runtime = end_time - start_time
    print "\nRuntime: {0} seconds\n".format(runtime)
for chunksize in np.arange(10000, 10001, 10000):
    lsi_models[num_topics][chunksize] = {}
    lsi_similarity_indices[num_topics][chunksize] = {}

    for power_iters in np.arange(1, 2):
        lsi_models[num_topics][chunksize][power_iters] = {}
        lsi_similarity_indices[num_topics][chunksize][power_iters] = {}

        for onepass in np.arange(1):
            print('Number of topics: {}. Chunksize: {}. Number of power iterations: {}. One-pass: {}'
                  .format(num_topics, chunksize, power_iters, bool(onepass)))

            # onepass expects a bool and power_iters an int (matching the print above)
            lsi = LsiModel(corpus, id2word=id2token, num_topics=num_topics,
                           chunksize=chunksize, onepass=bool(onepass),
                           power_iters=power_iters)

            lsi_models[num_topics][chunksize][power_iters][onepass] = lsi
            lsi_similarity_indices[num_topics][chunksize][power_iters][onepass] = similarities.MatrixSimilarity(
                lsi[corpus], num_features=num_topics
            )

run_time = int((time.time() - start_time) / 60)
print('Grid search took {} minutes.'.format(run_time))

with open('lsi_models.pickle', 'wb') as f:
    pickle.dump(lsi_models, f)
print('Models saved.')
def main(param_file=None): # setup p, base_path, output_dir = tools.setup(param_file) result_path = path.join(base_path, p['result_path']) lee_corpus = path.join(base_path, p['lee_corpus']) logger = tools.get_logger('gensim', path.join(output_dir, "run.log")) logger.info("running %s" % ' '.join(sys.argv)) # remember starting time for runtime evaluation start = datetime.now() # load model and corpus logger.info('loading word mapping') dictionary = Dictionary.load(path.join(result_path, p['run'], p['dict_extension'])) model_path = path.join(result_path, p['run'], p['lsi_ext']) logger.info('load model from: %s' % model_path) lsi = LsiModel.load(model_path) pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext'])) logging.info('load smal lee corpus and preprocess') with open(lee_corpus, 'r') as f: preproc_lee_texts = preprocessing.preprocess_documents(f.readlines()) bow_lee_texts = [dictionary.doc2bow(text, allow_update=False, return_missing=False) for text in preproc_lee_texts] logger.info('transforming small lee corpus (only pre model)') corpus_pre = pre[bow_lee_texts] # read the human similarity data and flatten upper triangular human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file'])) sim_m_size = np.shape(human_sim_matrix)[0] human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)] max_topics = lsi.num_topics logger.info("iterate from %d to %d dimensions (stepsize: %d)" % (p['min_dim'], max_topics, p['dim_step'])) iter_range = range(p['min_dim'], max_topics, p['dim_step']) res = np.zeros(len(iter_range)) for k, l in enumerate(iter_range): # do the lower dimensionality transformation lsi.num_topics = l corpus_lsi = lsi[corpus_pre] # compute pairwise similarity matrix of transformed corpus sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi))) for i, par1 in enumerate(corpus_lsi): for j, par2 in enumerate(corpus_lsi): sim_matrix[i, j] = matutils.cossim(par1, par2) sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)] # compute correlations cor = np.corrcoef(sim_vector, human_sim_vector) logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1])) res[k] = cor[0, 1] plt.figure() plt.plot(iter_range, res) plt.savefig(os.path.join(output_dir, 'cor_plot.' + p['plot_extension'])) plt.close() np.save(path.join(output_dir, 'model_dim_res.npy'), res) dif = datetime.now() - start logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
k = 40  # wanted number of topics

### SVD DECOMPOSITION (LSA) ##
### USING GENSIM #############

ans = raw_input("Start Latent Semantic Analysis with Gensim ? ")
if ans != "y":
    exit()

from gensim.models.lsimodel import LsiModel
from gensim.matutils import Sparse2Corpus, corpus2dense

co = Sparse2Corpus(X, documents_columns=False)
lsi = LsiModel(corpus=co, num_topics=k)
list_topics = lsi.show_topics(formatted=False)
topics = map(lambda li: [(value, feature_names[int(key)]) for (value, key) in li], list_topics)
print(topics)

genreMat = []
for genre in Genre.objects.all():
    index = filmsbygenre[genre.name]
    if index != []:
        obj = lsi[Sparse2Corpus(X[index, :], documents_columns=False)]
        E = corpus2dense(obj, k).transpose()
        genreMat.append(np.hstack([[genre.name], np.mean(E, axis=0)]))
    else:
        genreMat.append(np.hstack([[genre.name], np.zeros(k)]))
genreMat = np.vstack(genreMat)
matrices = {}

logging.info('load the articles pickle')
with open(results_path + "sparql_wiki.pickle", 'r') as f:
    articles = pickle.load(f)

logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

for key in articles.iterkeys():
    logging.info('current term: %s' % key)
    term_list = articles[key].keys()
    text_list = [dictionary.doc2bow(article['text'], allowUpdate=False, returnMissingWords=False)
                 for article in articles[key].values()]
    sim_matrix = np.zeros((len(text_list), len(text_list)))

    logging.info('transform the textlist')
    text_list = lsi[log_ent[text_list]]

    logging.info('compute similarity matrix')
    for i, par1 in enumerate(text_list):
    if formula.find('=') == -1:
        print "invalid formula"
    else:
        query = ("SELECT sentence,sentence_id from sentences where sentence_id between %s and %s")
        cursor.execute(query, (sent_id - 1, sent_id))
        sent_list = cursor.fetchall()
        sentence = ''
        for sent in sent_list:
            sentence += ' '
            sentence += sent[0]
        yield dictionary.doc2bow(cleanSent(sentence).lower().split())

corpus = MyCorpus()
print(dictionary)

lsi = LsiModel(corpus, num_topics=50, id2word=dictionary)
print(lsi[doc_tfidf])  # project some document into LSI space
lsi.add_documents(corpus2)  # update LSI on additional documents
print(lsi[doc_tfidf])
lsi.show_topics(num_topics=-1, num_words=10, log=False, formatted=True)
print lsi.projection.u

# finding embeddings of valid formulae
V = gensim.matutils.corpus2dense(lsi[corpus], len(lsi.projection.s)).T / lsi.projection.s

import numpy as np
np.asarray
from datetime import datetime
from date_extractor import month_to_number
from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel
from nltk.corpus import stopwords as nltk_stopwords
from os.path import dirname, realpath

try:
    path_to_directory_of_this_file = dirname(realpath(__file__))

    stopwords = []
    with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
        stopwords.extend([word for word in f.read().decode("utf-8").split("\n") if word and not word.startswith("#")])
    stopwords = set(stopwords)

    lsi = LsiModel.load(path_to_directory_of_this_file + "/model")
    dictionary = Dictionary.load(path_to_directory_of_this_file + "/dictionary")
except Exception as e:
    print e


def run(text):
    try:
        words = text.lower().replace("#", " ").replace("_", " ").replace("(", " ").replace(")", " ").replace("/", " ").replace(":", " ").replace(".", " ").split()
        words = [word for word in words if len(word) > 3 and word not in stopwords]
        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
new_vec = dictionary.doc2bow(new_doc.lower().split())
#print(new_vec)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
print corpus
# tfidf = models.TfidfModel(corpus)
# vec = [(0, 1), (4, 1)]
# print(tfidf[vec])
# index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
# sims = index[tfidf[vec]]
# print(list(enumerate(sims)))

corpora.MmCorpus.save_corpus('file.mm', corpus)
#id2word= corpora.Dictionary.load('deerwester.dict')
mmCorpus = corpora.MmCorpus("file.mm")
print mmCorpus

lsi = LsiModel(mmCorpus, id2word=dictionary, num_topics=10)
print "lsi:"
#print(lsi[new_vec])
lsi.print_debug(4, 4)
lsi.print_topics(4, 2)
lsi.show_topic(9, 10)  # topic indices run 0-9 when num_topics=10

lda = LdaModel(mmCorpus, id2word=dictionary, num_topics=10)
lda.print_topics(4, 4)
doc_lda = lda[new_vec]
print "lda:"
#print doc_lda

# corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
#           [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
print(lda[test_doc_bow2])

!pip install pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, journals_corpus, journals_dictionary)

from gensim.models import CoherenceModel
lda_cm = CoherenceModel(model=lda, corpus=journals_corpus, dictionary=journals_dictionary,
                        texts=journals['Full title'], coherence='c_v')
LDA_cm = lda_cm.get_coherence()
LDA_cm

from gensim.models.lsimodel import LsiModel
lsi = LsiModel(corpus=journals_corpus, id2word=journals_dictionary, num_topics=20)
lsi_topics = lsi.print_topics()
for topic in lsi_topics:
    print(topic)

test_doc = 'Journal of medicines and herbs'
test_doc = custom_preprocess(test_doc)
test_doc_bow = journals_dictionary.doc2bow(test_doc)
print(test_doc_bow)
print(lsi[test_doc_bow])

test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
def main(): parser = ArgumentParser( description= 'wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information' ) parser.add_argument('-ds', '--dataset', default='wiki', help='What kind of dataset to use. (wiki,es,file)') parser.add_argument('-d', '--dump-file', help='Wiki: bz2 dump file with wiki in it') parser.add_argument('-l', '--limit', help='Wiki: How many documents to extract from wiki') parser.add_argument('--model-id', default='model', help='Filename for created model.') parser.add_argument( '--model-type', default='lsi', help='Model type (lsi, lda, word2vec, hdp, vocabulary).') parser.add_argument('--n-topics', default=10, help='Number of topics to model.') parser.add_argument('--n-passes', default=1, help='Number of passes for LDA model.') parser.add_argument('--w2v-size', default=100, help='size of Word2Vec context.') parser.add_argument('--w2v-window', default=5, help='window for Word2Vec.') parser.add_argument('-q', '--query', default=None, help='Elasticsearch: Query to use to fetch documents') parser.add_argument('--index', help='Elasticsearch: index to read from.') parser.add_argument('--doc_type', default='doc', help='Elasticsearch: data type in index.') parser.add_argument( '--data-dir', help='Directory to save the generated models and vocabularies into.') parser.add_argument( '--vocab', help= 'Prebuilt Vocabulary file. Use this to avoid having to generate one.') opts = parser.parse_args() model_type = opts.model_type.lower() if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']: logging.error("Invalid model type %s" % model_type) parser.print_usage() exit(-1) logging.info("Using model type %s" % model_type) dump_fn = opts.dump_file limit = int(opts.limit) if opts.limit else None data_type = opts.dataset.lower() if data_type not in ['es', 'wiki', 'file']: logging.error("Invalid dataset type %s" % data_type) parser.print_usage() exit(-1) limit = None if opts.limit: limit = int(opts.limit) if not dump_fn and data_type in ['wiki']: logging.error('--dump-file required for wiki dataset') sys.exit(1) query = opts.query index = opts.index doc_type = opts.doc_type if data_type == 'es' and index is None: logging.error( "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter" ) sys.exit(1) n_topics = int(opts.n_topics) n_passes = int(opts.n_passes) logging.info("Using %d topics." % n_topics) data_dir = opts.data_dir model_id = opts.model_id model_fn = '%s_%s_%d' % (model_id, model_type, n_topics) if data_dir: model_fn = '%s/%s' % (data_dir, model_fn) if model_type == 'word2vec': w2v_size = int(opts.w2v_size) w2v_window = int(opts.w2v_window) model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size) logging.info("Writing models to %s." 
% model_fn) if data_type == 'es': logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query)) dataset = ElasticsearchDataset(read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es) elif data_type == 'wiki': logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki) elif data_type == 'file': logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file) vocab_file = opts.vocab vocab = Dictionary() sw = set(stopwords.words('norwegian')) if not vocab_file or model_type == 'vocabulary': vocab.add_documents([get_tokenized(page, sw) for page in dataset]) vocab.filter_extremes() vocab.compactify() vocab.save(model_fn + '.vocab') else: vocab = Dictionary.load(vocab_file) if model_type == 'vocabulary': return tfidf = TfidfModel(dictionary=vocab) if model_type == 'lsi': corpus = IterableDataset(dataset, sw, vocab) model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab) elif model_type == 'lda': corpus = IterableDataset(dataset, sw, vocab) model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab) elif model_type == 'word2vec': corpus = IterableDataset(dataset, sw, vocab, doc2bow=False) corpus.dictionary = vocab model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size) elif model_type == 'hdp': corpus = IterableDataset(dataset, sw, vocab) model = HdpModel(corpus=tfidf[corpus], id2word=vocab) logging.info(model) model.save(model_fn)
def main(param_file=None): # setup p, base_path, output_dir = tools.setup(param_file) model_path = path.join(base_path, p['result_path'], p['model_label']) logger = tools.get_logger('gensim', path.join(output_dir, "run.log")) logger.info("running %s" % ' '.join(sys.argv)) # train the model on the small marketing corpus preprocess = [] if 'stoplist' in p.as_dict(): stoplist = open(path.join(base_path, p['stoplist'])).readlines() stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist] def remove_stopwords(sentence): return [word for word in sentence if not word in stoplist] preprocess.append(remove_stopwords) if 'stemmer' in p.as_dict(): stemmer = Stemmer.Stemmer(p['stemmer']) preprocess.append(stemmer.stemWords) if not p['model_label']: cor = TextFilesCorpus(path.join(base_path, p['corpus_path']), no_below=p['no_below'], no_above=p['no_above'], preprocess=preprocess) dictionary = cor.dictionary pre = LogEntropyModel(cor, id2word=dictionary, normalize=True) lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics']) else: dictionary = Dictionary.load(path.join(model_path, p['dict_name'])) pre = SaveLoad.load(path.join(model_path, 'pre.model')) lsi = LsiModel.load(path.join(model_path, 'lsi.model')) lsi.num_topics = p['num_topics'] test_cor_path = path.join(base_path, p['test_cor_path']) test_answers, gold_answers, ratings = [], [], [] flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt')) for file in flist: match = re.search('data3_(\d)_\d+.txt', file) ratings.append(int(match.group(1))) with open(file) as f: doc = string.join(map(string.strip, f.readlines())) doc = utils.tokenize(doc, lower=True) for func in preprocess: doc = func(doc) corpus = lsi[pre[dictionary.doc2bow(doc)]] test_answers.append(corpus) flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt')) for file in flist: with open(file) as f: doc = string.join(map(string.strip, f.readlines())) doc = utils.tokenize(doc, lower=True) for func in preprocess: doc = func(doc) corpus = lsi[pre[dictionary.doc2bow(doc)]] gold_answers.append(corpus) sim = MatrixSimilarity(test_answers)[gold_answers] mean_sim = np.mean(sim, axis=0) print 'pearsons corrcoef: %f' % np.corrcoef(ratings, mean_sim)[0,1] print 'spearmans r: %f with p: %f' % stats.spearmanr(ratings, mean_sim)
dicto = corpora.Dictionary(texts)
corpus = [dicto.doc2bow(text) for text in texts]

lsi_models = {}
lsi_similarity_indices = {}

start_time = time.time()

for chunksize in np.arange(5000, 30001, 5000):
    print('Chunksize: {}'.format(chunksize))
    iter_start_time = time.time()

    lsi = LsiModel(corpus, id2word=id2token, num_topics=50,
                   chunksize=chunksize, onepass=False, power_iters=2)

    lsi_models[chunksize] = lsi
    lsi_similarity_indices[chunksize] = similarities.MatrixSimilarity(
        lsi[corpus], num_features=100)

    print('{} seconds'.format(int(time.time() - iter_start_time)))

run_time = int((time.time() - start_time) / 60)
print('Parameter search took {} minutes.'.format(run_time))

with open('lsi_models_num_topics_chunksize.pickle', 'wb') as f:
    pickle.dump(lsi_models, f)
print('Models saved.')
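# A small follow-up sketch (assumptions: `dicto`, `lsi_models` and
# `lsi_similarity_indices` from the loop above are still in scope; `query_tokens`
# is any token list): pick one of the trained models and rank the corpus
# documents against a new query.
query_tokens = ['latent', 'semantic', 'indexing']
chunksize = 5000
model = lsi_models[chunksize]
index = lsi_similarity_indices[chunksize]
query_lsi = model[dicto.doc2bow(query_tokens)]       # project the query into LSI space
sims = index[query_lsi]                              # cosine similarity to every document
top_docs = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:10]
print(top_docs)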
class TextProcessor: def __init__(self, n_users, n_samples, n_dims): self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = self.dictionary = None self.dictPath, self.tfIdfPath, self.lsiPath, self.ldaPath, self.w2vPath, self.w2vVecPath =\ conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='dict'), \ conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='tfidf'),\ conf.get_filename_via_tpl('model', model_type='lsi', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lsi_model'), \ conf.get_filename_via_tpl('model', model_type='lda', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lda_model'),\ conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='w2vmodel'), \ conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='vec.txt') def load_model(self, model_type): model = None try: if model_type == 'tfidf': model = TfidfModel.load(self.tfIdfPath, mmap='r') self.tfIdfModel = model elif model_type == 'lsi': model = LsiModel.load(self.lsiPath, mmap='r') self.lsiModel = model elif model_type == 'lda': model = LdaModel.load(self.ldaPath, mmap='r') self.ldaModel = model elif model_type == 'w2v': model = Word2Vec.load(self.w2vPath, mmap='r') self.w2vModel = model else: logger.error('Model type error. Unexpected %s' % model_type) return None if self.dictionary is None and os.path.exists(self.dictPath): self.dictionary = corpora.Dictionary.load(self.dictPath) logger.info('%s model loaded completely.' % model_type) except IOError: logger.error( 'The %s model doesn\'t exist. Please train the model before load it.' % model_type) finally: return model def tf_idf_transform(self, doc): """ Perform tf-idf transformation on doc. """ self.dictionary = corpora.Dictionary(doc) corpus = [self.dictionary.doc2bow(text) for text in doc] self.tfIdfModel = TfidfModel(corpus) conf.mk_dir(self.tfIdfPath) self.dictionary.save(self.dictPath) logger.info('Dictionary has been saved in %s.' % self.dictPath) self.tfIdfModel.save(self.tfIdfPath) logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath) tfidf_corpus = self.tfIdfModel[corpus] tfidf_corpus_path = conf.get_filename_via_tpl('tfidf', n_users=self.nUsers, postfix='mm', n_samples=self.nSamples) corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus) logger.info('TF-IDF corpus with a shape of %s has been saved in %s.' % (np.array(tfidf_corpus).shape, tfidf_corpus_path)) return tfidf_corpus def lsi_transform(self, corpus_tf_idf): logger.info('Training lsi model with a n_dims of %d...' % self.nDims) if self.dictionary is None and os.path.exists(self.dictPath): self.dictionary = corpora.Dictionary.load(self.dictPath) self.lsiModel = LsiModel(corpus=corpus_tf_idf, num_topics=self.nDims, id2word=self.dictionary) # print self.lsiModel[corpus] conf.mk_dir(self.lsiPath) self.lsiModel.save(self.lsiPath) logger.info('Lsi model has been saved in %s.' % self.lsiPath) lsi_corpus = self.lsiModel[corpus_tf_idf] lsi_corpus_path = conf.get_filename_via_tpl('lsi', n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims, postfix='mm') conf.mk_dir(lsi_corpus_path) corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus) logger.info('Lsi corpus with a shape of %s has been saved in %s.' 
% (np.array(lsi_corpus).shape, lsi_corpus_path)) return lsi_corpus def lda_transform(self, corpus_tf_idf, train_separated=False, is_update=False): """ Init a lda model with a n_topics whose default is 500, then fit it with corpus_tf_idf and transform it. :param corpus_tf_idf: Corpus which has been transformed into tf-idf matrix. :param train_separated: The model is going to be train with all corpus one time or some of them separately one time. :param is_update: Whether the training to be perform is to construct a new model or update one existed. :return: lda corpus. """ logger.info('Training lda model with a n_dims of %d...' % self.nDims) if self.dictionary is None and os.path.exists(self.dictPath): self.dictionary = corpora.Dictionary.load(self.dictPath) if is_update: # A ldaModel had been trained before and now update the model with other corpus. if self.ldaModel is None: self.load_model('lda') self.ldaModel.update(corpus_tf_idf) logger.info('Lda model has been updated successfully.') return self.ldaModel[corpus_tf_idf] if train_separated: # corpus = [] # spacing = 10000 # for i in range(int(len(corpus_tf_idf)/spacing)): # corpus.append(corpus_tf_idf[i*spacing: i]) # self.ldaModel = LdaModel() pass self.ldaModel = LdaModel(corpus=corpus_tf_idf, num_topics=self.nDims, id2word=self.dictionary) conf.mk_dir(self.ldaPath) self.ldaModel.save(self.ldaPath) logger.info('lda model has been saved in %s' % self.ldaPath) lda_corpus = self.ldaModel[corpus_tf_idf] lda_corpus_path = conf.get_filename_via_tpl('lda', n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims, postfix='mm') conf.mk_dir(lda_corpus_path) corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus) logger.info('Lda corpus with a shape of %s has been saved in %s.' % (np.array(lda_corpus).shape, lda_corpus_path)) return lda_corpus def w2v_transform(self, sentences): """ Perform word2vec on texts and obtain a w2v model. :param sentences: Sentences that each one of it contains a list of words of a text. :return: W2v model. """ logger.info('Training w2v model with a dim of %d...' % self.nDims) # file = open(infile_path, 'r', encoding='utf-8') if infile_path.find('\n') < 0 else StringIO(infile_path) # sentences = [] # for sen in file.readlines(): # sentences.append(sen.strip().split(' ')) # print(sentences) self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0) conf.mk_dir(self.w2vPath) self.w2vModel.save(self.w2vPath) self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False) # print(model['[']) # Construct w2v corpus w2v_corpus = [] for sen in sentences: vec = [0] * self.nDims if len(sen) > 0: for word in sen: vec = list( map(lambda m, n: m + n, vec, self.w2vModel[word])) # vec += self.w2vModel[word] w2v_corpus.append(vec) w2v_corpus_path = conf.get_filename_via_tpl('w2v', n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims) conf.mk_dir(w2v_corpus_path) with open(w2v_corpus_path, 'w') as fp: csv_writer = csv.writer(fp) for line in w2v_corpus: csv_writer.writerow(line) logger.info('W2v corpus has been saved in %s. 
' % w2v_corpus_path) return w2v_corpus def load_corpus(self, model_type, dense=False): corpus = None try: if model_type == 'tfidf': corpus = corpora.MmCorpus( conf.get_filename_via_tpl('tfidf', n_users=self.nUsers, postfix='mm', n_samples=self.nSamples)) elif model_type in ['lsi', 'lda']: corpus = corpora.MmCorpus( conf.get_filename_via_tpl(model_type, n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims, postfix='mm')) elif model_type == 'w2v': corpus = np.loadtxt(conf.get_filename_via_tpl( model_type, n_users=self.nUsers, n_samples=self.nSamples, n_dims=self.nDims), dtype=np.float, delimiter=',') logger.info('%s corpus with a shape of %s has been loaded. ' % (model_type, np.array(corpus).shape)) if dense and model_type in ['tfidf', 'lsi', 'lda']: corpus = matutils.corpus2dense(corpus, self.nDims, self.nSamples * self.nUsers, dtype=np.float).T else: corpus = np.array(corpus) except Exception as e: raise e return corpus @staticmethod def corpus2dense(corpus, n_terms, n_docs=conf.N_SAMPLES, dtype=np.float): return matutils.corpus2dense(corpus, n_terms, n_docs, dtype).T def load_vec(self, vec_type): logger.info('Loading %s vectors...' % vec_type) try: corpus_vec = self.load_corpus(vec_type, True) except Exception as e: raise e data = [] for i in range(self.nUsers): data.append(corpus_vec[i * self.nSamples:(i + 1) * self.nSamples]) data = np.array(data, dtype=np.float) return data
punc_free = "".join(ch for ch in stop_free if ch not in exclude) normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split()) return normalized # Creating a list of documents from the complaints column list_of_docs = df["message"].tolist() # Implementing the function for all the complaints of list_of_docs doc_clean = [clean(doc).split() for doc in list_of_docs] # Code starts here # Creating the dictionary from our cleaned word list doc_clean dictionary = corpora.Dictionary(doc_clean) # Creating the corpus doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean] # Creating the LSi model lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary) pprint(lsimodel.print_topics()) # -------------- from gensim.models import LdaModel from gensim.models import CoherenceModel # doc_term_matrix - Word matrix created in the last task # dictionary - Dictionary created in the last task # Function to calculate coherence values def compute_coherence_values(dictionary, corpus, texts, limit,