def load(lsi_path=None, id2word_path=None, index_path=None):
    """
    If specified, attempts to load a gensim LsiModel from `lsi_path`,
    a gensim Dictionary from `id2word_path`, or a gensim
    MatrixSimilarity index from `index_path`.

    Parameters
    ----------
    lsi_path : str
        File path from which the LsiModel should be loaded.
    id2word_path : str
        File path from which the Dictionary should be loaded.
    index_path : str
        File path from which the MatrixSimilarity index should be loaded.
    """
    if lsi_path is not None:
        from gensim.models import LsiModel
        if not os.path.exists(lsi_path):
            raise IOError(
                'The provided file path to the LsiModel was not found. '
                'Please ensure that the argument is the correct path.')
        return LsiModel.load(lsi_path)
    if id2word_path is not None:
        from gensim.corpora.dictionary import Dictionary
        if not os.path.exists(id2word_path):
            raise IOError(
                'The provided file path to the Dictionary was not found. '
                'Please ensure that the argument is the correct path.')
        return Dictionary.load(id2word_path)
    if index_path is not None:
        from gensim.similarities import MatrixSimilarity
        if not os.path.exists(index_path):
            raise IOError(
                'The provided file path to the MatrixSimilarity index was not found. '
                'Please ensure that the argument is the correct path.')
        return MatrixSimilarity.load(index_path)
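# Usage sketch for the load() helper above; the file names are hypothetical,
# not taken from the original source. Only the first non-None argument is
# honored per call, so each artifact is loaded separately.
lsi = load(lsi_path='saved/model.lsi')
dictionary = load(id2word_path='saved/model.dict')
index = load(index_path='saved/model.index')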
def create_lsi_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level
    model_fname += '.lsi.gz'

    if not os.path.exists(model_fname) or force:
        model = LsiModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=project.num_topics,
                         )
        if corpus:
            model.save(model_fname)
    else:
        model = LsiModel.load(model_fname)
    return model, model_fname
def train_models():
    models = dict()
    if settings["models"]["msda"]:
        dims = settings["dimensionalities"]["msda"]
        try:
            msda = mSDA.load("reuters_msda_%sdims" % dims)
            # the line below is for testing a model I have locally on my machine
            #msda = mSDA.load("persist/mSDA/mSDA_wiki_dim-1000_stem-False_tfidf-False_noise-0.5_num_layers-3")
        except:
            ln.info("Training mSDA...")
            prototype_ids = [id_ for id_, freq in
                             sorted(dictionary.dfs.items(), key=lambda item: item[1], reverse=True)[:dims]]
            msda = mSDA(0.5, 5, len(dictionary), dims, prototype_ids=prototype_ids)
            msda.train(bow_corpus())
            msda.save("reuters_msda_%sdims" % dims)
        msda.__out_size = dims
        models["msda"] = msda

    if settings["models"]["lsi"]:
        dims = settings["dimensionalities"]["lsi"]
        try:
            lsi = LsiModel.load("reuters_lsi_%sdims" % dims)
        except:
            ln.info("Training LSI...")
            lsi = LsiModel(corpus=bow_corpus(), num_topics=dims, id2word=dictionary)
            lsi.save("reuters_lsi_%sdims" % dims)
        lsi.__out_size = dims
        models["lsi"] = lsi

    return models
def __init__(self, dictionary_path, corpus_path, tfidf_path, corpus_tfidf_path,
             tfidf_index_sim_path, lsi_path, lsi_index_path, stopwords_path,
             tweet_corpus_path):
    self.dictionary = gensim.corpora.Dictionary.load(dictionary_path)
    self.corpus = MmCorpusMeta(corpus_path, id2word=self.dictionary, metadata=True)
    self.tweet_corpus = MmCorpusMeta(tweet_corpus_path, id2word=self.dictionary, metadata=True)
    self.tfidf = TfidfModel.load(tfidf_path)
    self.corpus_tfidf = gensim.utils.unpickle(corpus_tfidf_path)
    self.tfidf_index = gensim.similarities.MatrixSimilarity.load(tfidf_index_sim_path)
    self.lsi = LsiModel.load(lsi_path)
    self.lsi_index = gensim.similarities.MatrixSimilarity.load(lsi_index_path)
    with open(stopwords_path) as f:
        self.stopwords = json.load(f)
    self.tfidf_tweets = self.tfidf[self.tweet_corpus]
    self.lsi_tweets = self.lsi[self.tfidf_tweets]
    self.sim_tweets = gensim.similarities.MatrixSimilarity(self.lsi_tweets)
    print("loaded")
def load_and_cluster():
    corpus_tfidf, dictionary, titles = train_tfidf_model()
    # fname = save_lsi_model(corpus_tfidf, dictionary)
    print("fname")
    fname = "/var/folders/ft/jlv83lxd58zb3v6bjqtzlr0c0000gn/T/lsi.model"
    print(fname)
    lsi = LsiModel.load(fname)
    print("lsi corpus")
    # create a double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
    corpus_lsi = lsi[corpus_tfidf]
    X = [[row[1] for row in document] for document in corpus_lsi]
    print("Kmeans")
    print([row for row in X if len(row) != 3000])
    np_array = np.array([np.array(row) for row in X if len(row) == 3000])
    print(type(np_array))
    print(type(np_array[0]))
    print(np.unique([len(row) for row in np_array]))
    kmeans_clustering(np_array, 1000)
    numeric_labels = []
    with open("pickle_model.pkl", 'rb') as file:
        kmeans_model = pickle.load(file)
        cluster_centers = kmeans_model.cluster_centers_
        numeric_labels = kmeans_model.labels_  # , model.predict(X), model.labels_
    print(np.unique(numeric_labels))
    for i in range(1000):
        if i % 100 == 0:
            print(i)
        for j, label in enumerate(numeric_labels):
            if label == i:
                print(titles[j])
def train(self, data):
    """
    Fit LSA model to the data, set document topic vectors and calculate distances.

    :param data: Data to fit model on
    """
    if self.word_dict is None:
        print("Dictionary must be assigned to model before training. "
              "This function call does nothing")
        return
    if self.model is None:
        self.model = LsiModel(num_topics=self.vector_length, id2word=self.word_dict)
    self.name = '%s_%strain' % (self.name, data.name)
    self.path = Path('modelfiles/%s/%s' % (data.name, self.name))
    try:
        self.model = LsiModel.load(str(self.path / '.model'))
    except:
        self.path.mkdir(parents=True, exist_ok=True)
        print("Training model...", end='')
        time.sleep(0.1)
        datastream = GetBow(data, self.remove_stopwords, self.word_dict)
        self.model.add_documents(datastream)
        self.model.save(str(self.path / '.model'))
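# A minimal inference sketch to pair with train() above. This method is an
# assumption, not part of the original class; it assumes self.word_dict is a
# gensim Dictionary.
def infer(self, tokens):
    """Fold a tokenized document into the trained LSA topic space."""
    bow = self.word_dict.doc2bow(tokens)
    return self.model[bow]  # list of (topic_id, weight) pairs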
def __init__(self, docs, num_topics=500, chunksize=20000, no_below=50,
             no_above=0.5, tfidf=True, model_path="./lsi_data"):
    # Set training parameters.
    self.num_topics = num_topics
    self.chunksize = chunksize
    self.no_below = no_below
    self.no_above = no_above
    self.tfidf = tfidf
    self.model_path = model_path
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    index_path = './data.index'
    if os.path.exists(index_path):
        assert os.path.exists("./corpus_bow") and os.path.exists("./corpus_tfidf"), \
            "Corpus file missing! Please rebuild index."
        with open(index_path, "rb") as reader:
            index = pkl.load(reader)
            self.index = index["index"]
            self.index2docid = index["index2docid"]
        with open("./corpus_bow", "rb") as reader:
            self.corpus_bow = pkl.load(reader)
        with open("./corpus_tfidf", "rb") as reader:
            self.corpus_tfidf = pkl.load(reader)
        if os.path.exists(os.path.join(self.model_path, "lsi.model")):
            self.model = LsiModel.load(os.path.join(self.model_path, "lsi.model"))
        else:
            self.model = self.train()
    else:
        self.rebuild_index(docs, index_path)
def create_gensim_lsa_model(doc_clean, number_of_topics, lsa_training=True):
    """
    Input  : clean document and number of topics
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    if lsa_training:
        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        # generate and train the LSA model
        lsi_model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)
        #coherence_value = CoherenceModel(model=lsi_model, texts=doc_clean, dictionary=dictionary, coherence='c_v').get_coherence()
        #print("Coherence value : ", coherence_value)
        print('Saving lsi_model...')
        lsi_model.save(lsi_model_path)
        print('lsi_model saved!')
        corpus_lsi = lsi_model[doc_term_matrix]
        with open(corpus_lsi_path, 'wb') as handle:
            pickle.dump(corpus_lsi, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Corpus_lsi saved.')
    else:
        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        print('Loading lsi_model...')
        lsi_model = LsiModel.load(lsi_model_path)
        print('lsi_model loaded!')
        corpus_lsi = lsi_model[doc_term_matrix]
    return lsi_model, corpus_lsi, dictionary
def lsi(dataframe, num_topics=300):
    """Returns an LSI model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or
    generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 300)
        The number of topics to train the LSI model with.

    Returns
    -------
    model : Gensim LsiModel
        LSI model for documents stored in the DataFrame.
    """
    filename = 'caches/models/lsi.model'
    if not os.path.isfile(filename):
        dictionary = dictionary_corpus(dataframe)
        bow = bow_corpus(dataframe)
        tfidf_model = tfidf(dataframe)
        tfidf_corpus = tfidf_model[bow]
        lsi_model = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=num_topics)
        lsi_model.save(filename)
    else:
        lsi_model = LsiModel.load(filename)
    return lsi_model
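# Hypothetical follow-up to lsi() above: folding a new document into the
# cached topic space. `articles` (a DataFrame) and the query string are
# illustrative only; dictionary_corpus() and tfidf() are the same helpers
# the function itself relies on.
model = lsi(articles, num_topics=300)
dictionary = dictionary_corpus(articles)
bow = dictionary.doc2bow('a new unseen document'.split())
topic_vector = model[tfidf(articles)[bow]]  # (topic_id, weight) pairs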
def __init__(self, dict_path, model_path):
    """Load an LSA space from a file.

    :dict_path: path to the dictionary file.
    :model_path: path to the model file.
    """
    self._dictionary = Dictionary.load_from_text(dict_path)
    self._lsi_model = LsiModel.load(model_path)
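# A plausible companion method for the LSA-space class above (an assumption,
# not in the original): projecting a tokenized document into the space.
def vector(self, tokens):
    """Return the LSI topic vector for a tokenized document."""
    bow = self._dictionary.doc2bow(tokens)
    return self._lsi_model[bow]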
def train_tsne(training_size=2000, metric='cosine', n_components=3,
               perplexity=100, angle=.12):
    # adjust this downward to see if it affects accuracy
    np = pd.np
    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]
    gc.collect()  # reclaim RAM released above
    # labels3 = tweets.isbot.apply(lambda x: int(x * 3))
    labels = tweets.isbot.apply(lambda x: int(x * 2))
    lsa = LsiModel.load(os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)
    bows = np.array([lsa.id2word.doc2bow(txt.split()) for txt in tweets.text])
    # tfidfs = tfidf[bows]
    X = pd.DataFrame([pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
                     index=tweets.index)
    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index
    # >>> sum(~mask)
    # 99
    # >>> tweets.loc[mask.argmin()]
    # isbot                                 0.17
    # strict                                  13
    # user                      b'CrisParanoid:'
    # text                         b'#sad again'
    # Name: 571, dtype: object
    X = X[mask]
    y = tweets.isbot[mask]
    labels = labels[mask]
    test_size = 1.0 - training_size if training_size < 1 else \
        float(len(X) - training_size) / len(X)
    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest, y, ytest = (X.loc[Xindex], X.loc[Xindex_test],
                          y.loc[yindex], y.loc[yindex_test])
    # labels_test = labels.loc[yindex_test]
    labels = labels.loc[yindex]
    tsne = TSNE(metric='precomputed', n_components=n_components,
                angle=angle, perplexity=perplexity)
    tsne = tsne.fit(positive_distances(X.values, metric=metric))
    return tsne, X, Xtest, y, ytest
def get_model(self, num_topics):
    tmp_fname = self.path + self.model_type + "_model"
    if os.path.exists(tmp_fname):
        return LsiModel.load(tmp_fname)
    else:
        print("Training model.")
        return self.train_model(num_topics)
def fit(self, raw_documents, y=None):
    self.analyzer_func = self.build_analyzer()
    self.model = LsiModel.load(self.model_fn)
    if os.path.exists(self.model_fn + '.tfidf'):
        self.tfidf = TfidfModel.load(self.model_fn + '.tfidf')
    return self
def load_corpus():
    dictionary = corpora.Dictionary.load(os.path.join(HERE, "sogou.dict"))
    tfidf_model = tfidfmodel.TfidfModel.load(os.path.join(HERE, "sogou.model"))
    lsi_model = LsiModel.load(os.path.join(HERE, "sogou.lsi"))
    try:
        sg_class = joblib.load(os.path.join(HERE, "sgdc_clf.pkl"))
    except:
        sg_class = None
    return dictionary, tfidf_model, lsi_model, sg_class
def __init__(self, model, namespace2idx):
    if isinstance(namespace2idx, str):
        idx2namespace, namespace2idx = utils.read_vocab(namespace2idx)
    if isinstance(model, str):
        from gensim.models import LsiModel
        model = LsiModel.load(model)
    self.vocab = pd.Series(namespace2idx).sort_values()
    self.weights = pd.DataFrame(model.projection.u, index=self.vocab.index)
    super(LSI, self).__init__(model)
def getLsiModel(tfidfModel) -> LsiModel:
    modelPath = os.path.join('.cache', 'lsi.gensim_model')
    try:
        lsiModel = LsiModel.load(modelPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        lsiModel = LsiModel(corpus, num_topics=200)
        lsiModel.save(modelPath)
    return lsiModel
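# Usage sketch for getLsiModel() above; `tfidfModel` is assumed to expose a
# sparse term-document matrix via `.vectors`, as the function body implies.
lsiModel = getLsiModel(tfidfModel)
print(lsiModel.print_topics(5))  # inspect the five strongest topics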
def _load_model(model_type, fname='../../model/'):
    try:
        if model_type == 'lsi':
            return LsiModel.load(fname)
        elif model_type == 'lda':
            return LdaModel.load(fname)
        elif model_type == 'mallet':
            return LdaMallet.load(fname)
    except:
        return None
def load_topic_model(vec_method, model_path, index_path, dict_path, corpus_path):
    if vec_method == 'LDA':
        model = LdaModel.load(model_path)
    elif vec_method == 'LSI':
        model = LsiModel.load(model_path)
    index = similarities.MatrixSimilarity.load(index_path)
    dictionary = corpora.Dictionary.load(dict_path)
    corpus = corpora.MmCorpus(corpus_path)
    # vec_lda = lda[corpus]
    return model, index, dictionary, corpus
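# Hypothetical usage of load_topic_model() above; the file names are
# placeholders, not taken from the original source.
model, index, dictionary, corpus = load_topic_model(
    'LSI', 'lsi.model', 'lsi.index', 'corpus.dict', 'corpus.mm')
query_vec = model[dictionary.doc2bow('example query terms'.split())]
sims = index[query_vec]  # cosine similarity against every indexed document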
def load_corpus(self, corpus_name):
    '''
    This is where we load the corpus files.
    This needs to be moved to a more general class initialization.
    (FIXME Freija)
    '''
    corpusfile = corpus_name + '.mm'
    corpusdict = corpus_name + '_wordids.txt'
    lsimodel = corpus_name + '.lsi_model'
    lsiindex = corpus_name + '-lsi.index'
    self.corpus_name = corpus_name
    self.corpus_mm = MmCorpus(corpusfile)
    self.corpus_dict = Dictionary.load_from_text(corpusdict)
    self.model = LsiModel.load(lsimodel)
    self.index = similarities.MatrixSimilarity.load(lsiindex)
def export_model(model_file, out_file):
    """Saves the model. The output will be utf-8 encoded."""
    # model = model_mapping[model_type].load(model_file)
    model = LsiModel.load(model_file)
    with FileWriter(out_file, "w").open() as out:
        out.write(u"{0}\t{1}\n".format(model.numTerms, model.numTopics))
        for term in xrange(model.numTerms):
            word = model.id2word.id2token[term].decode("utf-8")
            while len(word) > 0 and not word[-1].isalnum():
                word = word[0:-1]
            out.write(u"{0}\n".format(word))
            out.write(u"{0}\n".format(u"\t".join(
                str(f) for f in numpy.asarray(model.projection.u.T[:, term]).flatten())))
def _load_model(model_type, fname):
    logger.info(f'{model_type} type of {fname} is loading..')
    try:
        if model_type == 'lsi':
            return LsiModel.load(f'../model/lsi_model/{fname}')
        elif model_type == 'lda':
            return LdaModel.load(f'../model/lda_model/{fname}')
        elif model_type == 'mallet':
            return LdaMallet.load(f'../model/mallet_model/{fname}')
        elif model_type == 'hdp':
            return HdpModel.load(f'../model/hdp_model/{fname}')
    except Exception as ex:
        logger.warning(f'{model_type} type of {fname} could not be loaded.', exc_info=ex)
        return None
def get_lsi_model(doc_term_matrix, id2word, fname):
    if fname is not None:
        try:
            return LsiModel.load(fname)
        except:
            pass
    lsi_model = LsiModel(corpus=doc_term_matrix,
                         id2word=id2word,
                         num_topics=params['num_topics'],
                         chunksize=params['chunksize'])
    _save_model(lsi_model, fname)
    return lsi_model
def _load_model(self, param_id, nb_topics):
    """Load an LSI or LDA model."""
    if self.lsi:
        model_dir = join(LSI_PATH, self.version, self.corpus_type)
        model_file = f'{self.dataset}_LSImodel_{nb_topics}'
        model_path = join(model_dir, model_file)
        model = LsiModel.load(model_path)
    else:
        model_dir = join(self.directory, self.corpus_type, param_id)
        model_file = f'{self.dataset}_LDAmodel_{param_id}_{nb_topics}_{self.epochs}'
        model_path = join(model_dir, model_file)
        model = LdaModel.load(model_path)
    self.logg(f'Loading model from {model_path}')
    return model
def __init__(self, series, dictionary, lsi, index, sim_opt, rank_opt):
    super().__init__()
    self.norm = LookupNormalization()
    self.dictionary: Dictionary = Dictionary.load(dictionary)
    self.lsi: LsiModel = LsiModel.load(lsi)
    self.index: MatrixSimilarity = MatrixSimilarity.load(index)
    sr = SerializationReader(series)
    self.documents, self.doc2idx, self.idx2doc = sr.read()
    sim_class = globals()[self.SIM_OPTS[sim_opt]["cls"]]
    self.sim_strategy: SimilarityStrategy = sim_class(self.SIM_OPTS[sim_opt]["constant"])
    rank_class = globals()[self.RANK_OPTS[rank_opt]]
    self.rank_strategy: RankingStrategy = rank_class()
def stacking(text, results, infos):
    integrated = copy.deepcopy(results[0])
    tfidf = TfidfModel.load('./modules/models/tfidf.model')
    dct = Dictionary.load('./modules/models/dic.model')
    lsi = LsiModel.load('./modules/models/lsi.model')

    # generate feature vector
    text = u' '.join(unicode(text))
    word_list = text.split()
    corpus = dct.doc2bow(word_list)
    sent_feature = [item[1] for item in lsi[tfidf[corpus]]]
    x = list()
    x += sent_feature
    for info in infos:
        for result in results:
            x.append(len(unicode(result['%s' % info])) / 10.0)
            try:
                pos = result['%s_p' % info][0]
            except IndexError:
                pos = 0
            try:
                x.append(pos / float(len(unicode(text))))
            except ZeroDivisionError:
                x.append(0)
            x.append(result['%s_confidence' % info])

    # predict every type of info
    for info in infos:
        probs = list()
        for i in range(len(results)):
            model = joblib.load('./modules/models/integrator_%s%s.model' % (info, i))
            # print model.predict_proba(x)
            probs.append(model.predict_proba([x])[0][1])
        y = probs.index(max(probs))
        integrated[info] = results[y][info]
        integrated['%s_p' % info] = results[y]['%s_p' % info]
        conf = 1.0
        for result in results:
            conf *= (1 - result['%s_confidence' % info])
        integrated['%s_confidence' % info] = 1 - conf
    return integrated
def train_lda(training_size=2000, metric='cosine'):
    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]
    # labels3 = tweets.isbot.apply(lambda x: int(x * 3))
    labels = tweets.isbot.apply(lambda x: int(x * 2))
    lsa = LsiModel.load(os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)
    bows = np.array([lsa.id2word.doc2bow(txt.split()) for txt in tweets.text])
    # tfidfs = tfidf[bows]
    X = pd.DataFrame([pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
                     index=tweets.index)
    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index
    X = X[mask]
    y = tweets.isbot[mask]
    labels = labels[mask]
    # labels3 = labels3[mask]
    test_size = 1.0 - training_size if training_size < 1 else \
        float(len(X) - training_size) / len(X)
    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest, y, ytest = (X.loc[Xindex], X.loc[Xindex_test],
                          y.loc[yindex], y.loc[yindex_test])
    labels_test = labels.loc[yindex_test]
    labels = labels.loc[yindex]
    lda = LDA('lsqr', 'auto', n_components=3)
    print(cross_val_score(lda, Xtest, labels_test, cv=7))
    lda = LDA('lsqr', 'auto', n_components=3)
    lda = lda.fit(X.values, labels.values)
    y_lda = lda.predict(Xtest)
    print(mean_squared_error(y_lda, ytest))
    df_test = pd.DataFrame(lda.predict(Xtest), index=Xtest.index, columns=['predict'])
    df_test['truth'] = labels_test
    return lda, df_test
def __init__(self, name, model, components=None):
    if name == "lsa":
        self.vsm = LsiModel.load(model)
        self.vocab = self.vsm.id2word.token2id
        self.vector_size = self.vsm.num_topics
    elif name == "w2v":
        self.vsm = keyedvectors.KeyedVectors.load_word2vec_format(
            model, binary=True, unicode_errors='ignore')
        self.vocab = self.vsm.vocab
        self.vector_size = self.vsm.syn0.shape[1]
        # https://github.com/RaRe-Technologies/gensim/blob/master/gensim/models/keyedvectors.py
    elif name == "pickle":
        vsm_obj = pickle.load(open(model, "rb"))
        self.vsm = vsm_obj["vsm"]
        self.vocab = vsm_obj["map"]
        self.vector_size = self.vsm.shape[1]
    try:
        self.components = load(components)
    except (IOError, AttributeError):
        self.components = 1
def __init__(self, corpus, embedding="bow", num_topics=500, chunksize=20000): self.lsi_model_path = "./saved_models/gensim-lsi-{}-model-nt-{}.mm".format(embedding, num_topics) self.lsi_corpus_path = "./saved_models/gensim-{}-lsi-nt-{}-corpus.crp".format(embedding, num_topics) self.sim_matrix_path = "./saved_models/sim-matrix-{}-{}.mm".format(embedding, num_topics) self.sim_matrix_temp_path = "./saved_models/sim_temps/sim_temp-{}-{}.tmp".format(embedding, num_topics) self.embedding = embedding self.corpus = corpus self.num_topics = num_topics if os.path.exists(self.lsi_model_path): print("LSI {} model already trained, loading from disk.".format(embedding)) self.model = LsiModel.load(self.lsi_model_path) else: # Make a index to word dictionary. temp = corpus.dictionary[0] # This is only to "load" the dictionary. id2word = corpus.dictionary.id2token print("Training LSI model.") self.model = LsiModel( corpus=list(corpus.get_corpus()), id2word=id2word, chunksize=chunksize, num_topics=num_topics ) print("Saving LSI model.") self.model.save(self.lsi_model_path) self.lsi_corpus = ModelCorpus(corpus.get_corpus(), self.model, path=self.lsi_corpus_path) if os.path.exists(self.sim_matrix_path): print("Similarities matrix {} model already trained, loading from disk.".format(embedding)) self.index = similarities.Similarity.load(self.sim_matrix_path) else: print("Creating similarities index.") Path(self.sim_matrix_temp_path).touch(exist_ok=True) self.index = similarities.Similarity(self.sim_matrix_temp_path, self.lsi_corpus, num_features=self.num_topics) self.index.save(self.sim_matrix_path)
def load(cls, save_dir='./'):
    """
    Load a SimSearch object and its underlying KeySearch from the
    specified directory. Returns both objects.
    """
    # First create and load the underlying KeySearch.
    ksearch = KeySearch.load(save_dir)

    # Create a SimSearch object.
    ssearch = SimSearch(ksearch)

    # Load the LSI index.
    ssearch.index = similarities.MatrixSimilarity.load(save_dir + 'index.mm')

    # Load the LSI model.
    ssearch.lsi = LsiModel.load(save_dir + 'lsi.model')

    return (ksearch, ssearch)
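# Usage sketch for the load() classmethod above (the directory name is a
# placeholder): the keyword-search and similarity-search objects come back
# together, already wired to the persisted LSI model and index.
ksearch, ssearch = SimSearch.load(save_dir='./saved_search/')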
def train_lsi(corpus, dictionary, num_topics, corpus_type):
    """
    Train the LSI model given the dataset for a given amount of topics.
    """
    # train model and save for later use
    model_filename = 'lsi_' + str(corpus_type) + '_num_topics=' + str(num_topics) + '.model'
    model_path = './tmp/' + model_filename
    if not os.path.exists(model_path):
        print(('Starting training {} lsi for num_topics = {}').format(corpus_type, num_topics))
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, onepass=False)
        lsi.save(model_path)
    else:
        print(('{} Lsi for num_topics = {} is already created, loading now...').format(corpus_type, num_topics))
        lsi = LsiModel.load(model_path)

    # construct BOW index for trained lsi model, save for later use
    index_filename = 'index_' + str(corpus_type) + '_num_topics=' + str(num_topics) + '.mm.index'
    index_path = './tmp/' + index_filename
    if not os.path.exists(index_path):
        print(('Starting construction {} index for num_topics = {}').format(corpus_type, num_topics))
        index = similarities.MatrixSimilarity(lsi[corpus])
        index.save(index_path)
    else:
        print(('index for {} corpus with num_topics = {} is already created, loading now...').format(corpus_type, num_topics))
        index = similarities.MatrixSimilarity.load(index_path)
    return lsi, index
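# Hypothetical use of train_lsi() above to rank documents for a query;
# `bow_corpus` and `dictionary` are placeholders for the caller's corpus.
lsi, index = train_lsi(bow_corpus, dictionary, num_topics=200, corpus_type='abstracts')
sims = index[lsi[dictionary.doc2bow('query terms here'.split())]]
top10 = sorted(enumerate(sims), key=lambda item: -item[1])[:10]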
def load(model_file):
    """Load the LSI model into memory."""
    lsi = LsiModel.load(model_file)
    return lsi
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 800)
pd.set_option('precision', 2)
get_ipython().magic(u'precision 4')
get_ipython().magic(u'pprint')

# In[3]:

from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR

# In[6]:

lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi100'))
lsi2 = LsiModel.load(os.path.join(DATA_PATH, 'lsi2'))

# In[7]:

with gzip.open(os.path.join(DATA_PATH, 'tweet_topic_vectors.csv.gz'), 'rb') as f:
    topics = pd.DataFrame.from_csv(f, encoding='utf8')
topics = topics.fillna(0)

# In[8]:

dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
from gensim.matutils import cossim
from gensim.models import LsiModel

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", help="path to word2vec/model/timestamp.model")
parser.add_argument("-d", "--data", help="path to training.tsv")
args = parser.parse_args()

# Load model
# Note - model contains dictionary that intentionally omits stopwords
model = LsiModel.load(args.model, mmap='r')

# Load 'training' data
training_data = open(args.data)
training_data.readline()  # advance past header line

correct = 0
total = 0
for line in training_data:
    elements = line.split("\t")
    question_id = elements.pop(0)
    correct_answer = elements.pop(1)
    # Get bag-of-words representation of question and answers
    doc_vectors = [model.id2word.doc2bow(element.split()) for element in elements]
    corpora.MmCorpus.serialize(corpus_filename, corpus)
else:
    corpus = corpora.MmCorpus(corpus_filename)

# we will use Latent Semantic Indexing to try to categorize the abstracts
print("lsi")
lsi_filename = 'model.lsi'
if not os.path.isfile(lsi_filename):
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=5)  # initialize an LSI transformation, 5 topics
    # lsi.save(lsi_filename)  # same for tfidf, lda, ...
else:
    lsi = LsiModel.load(lsi_filename)

lsi_topics = 5  # predefined number of topics


def print_topic(lsi, topicno, topn=7):
    """
    Return a single topic as a formatted string. See `show_topic()` for parameters.

    >>> lsimodel.print_topic(topicno, topn)
    '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'
    """
    return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in show_topic(lsi, topicno, topn)])


def show_topic(lsi, topicno, topn=7):
    """
tfidf[bows[0]]

# In[19]:

dict([(vocab[i], freq) for i, freq in tfidf[bows[0]]])

# Notice how "you" didn't get as much weight as "enjoy"
# Let's look at some other tweets

# In[9]:

from gensim.models import LsiModel

lsi = LsiModel.load('../../data/lsi100')
len(lsi.id2word)

# This is starting to look a lot like a set of vectors that we could use as features
# But wait, if I used the IDs as the vector index (column) numbers,
# how many features or "columns" would I have?

# In[ ]:

len(vocab)

# 100k dimensions isn't a good idea
# Even for a massively parallel deep learning project this would be big
# Like the cat/dog picture classification on 256x256 images
# What about PCA (Principal Component Analysis) like is used on images?