def train_model(self):
    """
    Read the preprocessed data and generate the corpus dictionary, the tf-idf
    model and the (cosine) matrix similarity index.
    :return: status of training
    """
    try:
        data = pd.read_csv(self.processed_data)
        del data['Unnamed: 0']
        # create tokens for the doc column
        corpus = data['doc'].map(break_to_tokens)
        # create a dictionary of the words in the movie dataset
        dictionary = gensim.corpora.Dictionary(corpus)
        dictionary.save(self.corpus_dictionary)
        # create bag-of-words vectors for the corpus
        vector = [dictionary.doc2bow(d) for d in corpus]
        # compute tf-idf values for the vectors
        tfidf = models.TfidfModel(vector)
        tfidf.save(self.tfidf_model)
        corpus_tfidf = tfidf[vector]
        # compute similarities
        similarity = MatrixSimilarity(corpus_tfidf, num_features=len(dictionary))
        similarity.save(self.matrix_similarity)
        return "Model Trained Successfully"
    except Exception:
        return "Error While Training Model"
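# A possible query-side counterpart (not from the original class): reload the
# three artifacts saved by train_model and rank documents against a new text.
# The attribute names and break_to_tokens are assumed from the method above.
def recommend(self, query_doc, top_n=10):
    dictionary = gensim.corpora.Dictionary.load(self.corpus_dictionary)
    tfidf = models.TfidfModel.load(self.tfidf_model)
    similarity = MatrixSimilarity.load(self.matrix_similarity)
    bow = dictionary.doc2bow(break_to_tokens(query_doc))
    sims = similarity[tfidf[bow]]
    # (document index, cosine score) pairs for the top_n most similar documents
    return sorted(enumerate(sims), key=lambda item: -item[1])[:top_n]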
def calAuthorSim():
    conn = sqlite3.connect(config.db_path)
    db = conn.cursor()
    model = AuthorTopicModel.load(config.author_model128_path)
    poets = list(model.id2author.values())
    print(len(poets))
    # vec = model.get_author_topics('苏轼')
    index = MatrixSimilarity(model[poets], num_best=30)
    index.save(config.author_simMatrix_path)
    # index = MatrixSimilarity.load(config.author_simMatrix_path)
    for name in poets:
        sims = index[model[name]]
        sims = sorted(sims, key=lambda item: -item[1])
        sims = [[poets[sim[0]], sim[1]] for sim in sims]
        # parameterized query instead of string formatting (avoids the SQL
        # injection and quoting bugs of the original format-string version)
        db.execute("UPDATE author SET sims=? WHERE id=?", (toJson(sims), name))
    conn.commit()
def create_model_tfidf_model(documents, model_name, matrix_name, dic_name):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidfmodel = TfidfModel(corpus)
    index = MatrixSimilarity(tfidfmodel[corpus], num_features=len(dictionary))
    index.save(matrix_name)
    tfidfmodel.save(model_name)
    dictionary.save(dic_name)
    return tfidfmodel, index, dictionary
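# Minimal usage sketch for create_model_tfidf_model; the toy documents and
# file names are illustrative only.
docs = [["human", "machine", "interface"],
        ["graph", "minors", "survey"],
        ["human", "survey", "graph"]]
tfidfmodel, index, dictionary = create_model_tfidf_model(
    docs, "toy.tfidf", "toy.index", "toy.dict")
query_bow = dictionary.doc2bow(["graph", "survey"])
print(sorted(enumerate(index[tfidfmodel[query_bow]]), key=lambda item: -item[1]))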
def main(self):
    print("Recommendation using TF_IDF")

    # Load preprocessed data
    vagas_ti = pd.read_csv(self.dataPrepFile)
    vagas_ids = pickle.load(open(self.out + "preprocessing/vagas_ids.array", "rb"))
    vagas_words = pickle.load(open(self.out + "preprocessing/vagas_words.list", "rb"))
    cvs_words = pickle.load(open(self.out + "preprocessing/cvs_words.series", "rb"))
    cvs = pd.read_csv(self.dataCvsFile)
    cvs = cvs.fillna("")
    cvs.isnull().any()
    # print("Loading cvs done!")

    # Create a dictionary and store it, for future reference
    dictionary = gcorp.Dictionary(vagas_words)
    dictionary.save(self.out + 'preprocessing/tf_idf/vagas.dict')

    # Compile the corpus (vectors of how many times each element appears) and store it to disk
    raw_corpus = [dictionary.doc2bow(v) for v in vagas_words]
    gcorp.MmCorpus.serialize(self.out + 'preprocessing/tf_idf/vagas.mm', raw_corpus)
    print("Dictionary size: " + str(len(dictionary)))

    # STEP 2: similarity between corpora
    dictionary = gcorp.Dictionary.load(self.out + 'preprocessing/tf_idf/vagas.dict')
    corpus = gcorp.MmCorpus(self.out + 'preprocessing/tf_idf/vagas.mm')

    # Transform the text with TF-IDF
    tfidf = gsm.TfidfModel(corpus)  # step 1 -- initialize a model
    corpus_tfidf = tfidf[corpus]

    # STEP 3: create the similarity matrix of all files
    index = MatrixSimilarity(corpus_tfidf, num_features=len(dictionary), num_best=10)
    index.save(self.out + 'preprocessing/tf_idf/vagas.index')
    index = MatrixSimilarity.load(self.out + 'preprocessing/tf_idf/vagas.index')

    self.recommendationTf_idf(cvs, vagas_ti, vagas_ids, cvs_words, dictionary, tfidf, index)
    print("Recommendation using TF_IDF done!")
def get_similarity_index(self, bow_corpus, lsa: LsiModel, recalculate=False, from_scratch=True):
    filepath = self.paths.get_lsa_index(lsa.num_topics)
    if not os.path.isfile(filepath) or recalculate:
        if not from_scratch:
            raise ValueError('No similarity index file exists but from_scratch is False')
        print('Building index...')
        index = MatrixSimilarity(lsa[bow_corpus])
        index.save(filepath)
    else:
        print('Loading index...')
        index = MatrixSimilarity.load(filepath)
    return index
def train(self):
    print("Reading serializations...")
    sr = SerializationReader(self.series)
    documents, doc2idx, idx2doc = sr.read()

    print("Building dictionary...")
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    print("Building model...")
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=self.dimensions)

    print("Building index...")
    index = MatrixSimilarity(lsi[corpus])

    print("Saving...")
    dictionary.save(self.dictionary)
    lsi.save(self.lsi)
    index.save(self.index)
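# Hedged sketch of the matching query path: load the artifacts persisted by
# train() above and rank the corpus against a new document. The file paths and
# the plain-split tokenizer are placeholders, not part of the original class.
dictionary = Dictionary.load("model.dict")
lsi = LsiModel.load("model.lsi")
index = MatrixSimilarity.load("model.index")
bow = dictionary.doc2bow("query text goes here".split())
sims = sorted(enumerate(index[lsi[bow]]), key=lambda item: -item[1])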
class GensimTopicModeler(LatentTopicModeler):
    """
    This class facilitates the creation of topic models (options: LDA (latent
    Dirichlet allocation), LSI (latent semantic indexing), and random
    projections) with the given short text training data, and converts future
    short texts into topic vectors using the trained topic model.

    This class extends :class:`LatentTopicModeler`.
    """
    def __init__(self,
                 preprocessor=textpreprocess.standard_text_preprocessor_1(),
                 algorithm='lda',
                 toweigh=True,
                 normalize=True):
        """ Initialize the topic modeler.

        :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`)
        :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda)
        :param toweigh: whether to weigh the words using tf-idf. (Default: True)
        :param normalize: whether the retrieved topic vectors are normalized. (Default: True)
        :type preprocessor: function
        :type algorithm: str
        :type toweigh: bool
        :type normalize: bool
        """
        LatentTopicModeler.__init__(self, preprocessor=preprocessor, normalize=normalize)
        self.algorithm = algorithm
        self.toweigh = toweigh

    def train(self, classdict, nb_topics, *args, **kwargs):
        """ Train the topic modeler.

        :param classdict: training data
        :param nb_topics: number of latent topics
        :param args: arguments to pass to the `train` method for gensim topic models
        :param kwargs: arguments to pass to the `train` method for gensim topic models
        :return: None
        :type classdict: dict
        :type nb_topics: int
        """
        self.nb_topics = nb_topics
        self.generate_corpus(classdict)
        if self.toweigh:
            self.tfidf = TfidfModel(self.corpus)
            normcorpus = self.tfidf[self.corpus]
        else:
            self.tfidf = None
            normcorpus = self.corpus
        self.topicmodel = gensim_topic_model_dict[self.algorithm](
            normcorpus, num_topics=self.nb_topics, *args, **kwargs)
        self.matsim = MatrixSimilarity(self.topicmodel[normcorpus])
        # change the flag
        self.trained = True

    def retrieve_corpus_topicdist(self, shorttext):
        """ Calculate the topic vector representation of the short text, in the corpus form.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param shorttext: text to be represented
        :return: topic vector in the corpus form
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: list
        """
        if not self.trained:
            raise e.ModelNotTrainedException()
        bow = self.retrieve_bow(shorttext)
        return self.topicmodel[self.tfidf[bow] if self.toweigh else bow]

    def retrieve_topicvec(self, shorttext):
        """ Calculate the topic vector representation of the short text.

        This function calls :func:`~retrieve_corpus_topicdist`.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param shorttext: text to be represented
        :return: topic vector
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        if not self.trained:
            raise e.ModelNotTrainedException()
        topicdist = self.retrieve_corpus_topicdist(shorttext)
        topicvec = np.zeros(self.nb_topics)
        for topicid, frac in topicdist:
            topicvec[topicid] = frac
        if self.normalize:
            topicvec /= np.linalg.norm(topicvec)
        return topicvec

    def get_batch_cos_similarities(self, shorttext):
        """ Calculate the score, which is the cosine similarity of the short text with the topic vector of the model, against each class label.

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.
        :param shorttext: short text
        :return: dictionary of scores of the text to all classes
        :raise: ModelNotTrainedException
        :type shorttext: str
        :rtype: dict
        """
        if not self.trained:
            raise e.ModelNotTrainedException()
        simdict = {}
        similarities = self.matsim[self.retrieve_corpus_topicdist(shorttext)]
        for label, similarity in zip(self.classlabels, similarities):
            simdict[label] = similarity
        return simdict

    def loadmodel(self, nameprefix):
        """ Load the topic model with the given prefix of the file paths.

        Given the prefix of the file paths, load the corresponding topic model. The
        files include a JSON (.json) file that specifies various parameters, a gensim
        dictionary (.gensimdict), a topic model (.gensimmodel), and a similarity
        matrix (.gensimmat). If weighing is applied, load also the tf-idf model
        (.gensimtfidf).

        :param nameprefix: prefix of the file paths
        :return: None
        :type nameprefix: str
        """
        # load the JSON file (parameters)
        parameters = json.load(open(nameprefix + '.json', 'r'))
        self.nb_topics = parameters['nb_topics']
        self.toweigh = parameters['toweigh']
        self.algorithm = parameters['algorithm']
        self.classlabels = parameters['classlabels']
        # load the dictionary
        self.dictionary = Dictionary.load(nameprefix + '.gensimdict')
        # load the topic model
        self.topicmodel = gensim_topic_model_dict[self.algorithm].load(nameprefix + '.gensimmodel')
        # load the similarity matrix
        self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')
        # load the tf-idf model
        if self.toweigh:
            self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')
        # flag
        self.trained = True

    def savemodel(self, nameprefix):
        """ Save the model with names according to the prefix.

        Given the prefix of the file paths, save the corresponding topic model. The
        files include a JSON (.json) file that specifies various parameters, a gensim
        dictionary (.gensimdict), a topic model (.gensimmodel), and a similarity
        matrix (.gensimmat). If weighing is applied, save also the tf-idf model
        (.gensimtfidf).

        If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.

        :param nameprefix: prefix of the file paths
        :return: None
        :raise: ModelNotTrainedException
        :type nameprefix: str
        """
        if not self.trained:
            raise e.ModelNotTrainedException()
        parameters = {
            'nb_topics': self.nb_topics,
            'toweigh': self.toweigh,
            'algorithm': self.algorithm,
            'classlabels': self.classlabels,
        }
        # text mode ('w'/'r'), since json.dump cannot write to a binary file in Python 3
        json.dump(parameters, open(nameprefix + '.json', 'w'))
        self.dictionary.save(nameprefix + '.gensimdict')
        self.topicmodel.save(nameprefix + '.gensimmodel')
        self.matsim.save(nameprefix + '.gensimmat')
        if self.toweigh:
            self.tfidf.save(nameprefix + '.gensimtfidf')
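# Illustrative driver for GensimTopicModeler (not from the source); the tiny
# classdict and the 'mymodel' prefix are made up, and the training-data shape
# (label -> list of short texts) follows the convention the class assumes.
modeler = GensimTopicModeler(algorithm='lsi')
modeler.train({'travel': ['cheap flights to Paris', 'hotel booking tips'],
               'food': ['easy pasta recipe', 'how to grill cheese']},
              nb_topics=2)
print(modeler.get_batch_cos_similarities('weekend flight deals'))
modeler.savemodel('mymodel')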
# from src.engine.preprocess import preprocess_body_lda
# query = preprocess_body_lda(query)
# corpus_query = [dictionary.doc2bow(query.split(" "))]
# transformed = tfidf[corpus_query]
#
# logentropy = models.LogEntropyModel(tfidf[corpus], id2word=dictionary, normalize=True)
# logentropy.save(settings.LOGENTROPY_MODEL)
# logentropy_query = logentropy[transformed]

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=30, passes=3,
                      alpha='auto', chunksize=4000)
lda.save(settings.LDA_MODEL)
lda = models.LdaModel.load(settings.LDA_MODEL)

from gensim.similarities import MatrixSimilarity

# num_features must cover the dimensionality of the LDA vectors, i.e. the
# number of topics (the original hard-coded 100, which works but is oversized)
similarity_matrix = MatrixSimilarity(lda[corpus], num_features=lda.num_topics)
similarity_matrix.save(settings.SIMILARITY_MATRIX)

# similarities = similarity_matrix.get_similarities(lda[logentropy_query])
# lda_query = lda[logentropy_query]

from gensim import matutils
# matutils.cossim(...)

# passes = 1, per = 11000; alpha='auto', per = 9200
# passes = 2, per = 5100;  alpha='auto', per = 3200
# passes = 3, per = 4400;  alpha='auto', per = 2000
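# Sketch of an active query path, mirroring the commented-out lines above; the
# query string is a placeholder and the simple whitespace split stands in for
# the project's preprocess_body_lda.
query_bow = dictionary.doc2bow("sample query text".split())
sims = similarity_matrix[lda[query_bow]]
print(sorted(enumerate(sims), key=lambda item: -item[1])[:10])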
import codecs
import json

from gensim import corpora
from gensim.similarities import MatrixSimilarity

import utils

# real file paths
huffPostDataFilePath = '../../lda-ner-result-data/rawHuffPostData.json'
gensimDictionaryBaseFilePath = '../../5w1h-result-data/gensim-in-time/gensimDictionary'
gensimCorpusBaseFilePath = '../../5w1h-result-data/gensim-in-time/gensimCorpus'
writingSimilarityIndexBaseFilePath = '../../5w1h-result-data/gensim-in-time/similarityIndex'

huffPostData = json.load(codecs.open(huffPostDataFilePath, 'r', 'utf-8-sig'))
year_months = utils.make_year_months_from_huff_post_data(huffPostData)
print('year_months', year_months)

# read the dictionary and corpus for each month and build its similarity index
for year_month in year_months:
    dictionary_in_time_file_path = gensimDictionaryBaseFilePath + '_' + year_month + '.dict'
    corpus_in_time_file_path = gensimCorpusBaseFilePath + '_' + year_month + '.mm'
    dictionary = corpora.Dictionary.load(dictionary_in_time_file_path)
    corpus = corpora.MmCorpus(corpus_in_time_file_path)
    similarityIndex = MatrixSimilarity(corpus, num_best=10, num_features=len(dictionary))
    similarityIndex.save(writingSimilarityIndexBaseFilePath + '_' + year_month)
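# A possible follow-up (not in the original script): load one monthly index
# back and run a top-10 query against it; the query keywords are illustrative.
month = year_months[0]
dictionary = corpora.Dictionary.load(gensimDictionaryBaseFilePath + '_' + month + '.dict')
index = MatrixSimilarity.load(writingSimilarityIndexBaseFilePath + '_' + month)
query_bow = dictionary.doc2bow(['election', 'president'])
print(index[query_bow])  # num_best=10 -> list of (doc_id, score) pairs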
import logging

import gensim
from gensim.similarities import Similarity, MatrixSimilarity

# from pgfin_timing import Timer
from pgfin_helpers import tokenize

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

# load the corpora
print("\n Loading corpora.\n")
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_tfidf.mm')
# lsi_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_lsa.mm')
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfin_tfidf.mm')
lsi_corpus = gensim.corpora.MmCorpus('./data/pgfin_lsa.mm')
# print(tfidf_corpus)
# print(lsi_corpus)

print("\n Start similarity index.\n")
# sharded on-disk index
index = Similarity('./data/pgfin_index', lsi_corpus, num_features=lsi_corpus.num_terms)
index.save('./data/pgfin_index.index')  # save to disk
# print(index)

# dense in-memory index
index_dense = MatrixSimilarity(lsi_corpus, num_features=lsi_corpus.num_terms)
index_dense.save('./data/pgfin_matrixindex.index')  # save to disk
# print(index_dense)
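# Quick sanity-check sketch (not in the original): query both indexes with the
# first LSI document; Similarity is the sharded on-disk variant and
# MatrixSimilarity the dense in-memory one, so the scores should agree.
first_doc = next(iter(lsi_corpus))
print(index[first_doc][:10])
print(index_dense[first_doc][:10])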
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if i not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate the LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word=dictionary, passes=20)

index = MatrixSimilarity(ldamodel[corpus])
index.save("simIndex.index")

print(ldamodel.print_topics(num_topics=30, num_words=2))

doc = stories['cast56']
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lda = ldamodel[vec_bow]

sims = index[vec_lda]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)
# tokenize to vectors
corpus = [dictionary.doc2bow(text) for text in texts]
# MmCorpus.serialize('./gen_sim_corpus.mm', corpus)

from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

lsi = LsiModel(corpus, id2word=dictionary)
corpus_lsi = lsi[corpus]
lsi.save('gensim_lsi_model.lsi')

# transform the corpus to LSI space and index it
index = MatrixSimilarity(corpus_lsi)
index.save('./gensim_lsi_matrix_similarity.index')

query = documents[0]
query_vec = dictionary.doc2bow(query.lower().split())

# convert the query to LSI space
vec_lsi = lsi[query_vec]

# perform a similarity query against the corpus
sims = index[vec_lsi]

# sorted (document number, similarity score) 2-tuples
sims_s = sorted(list(enumerate(sims)), key=lambda tup: tup[1], reverse=True)

print('\n')
print('Printing first 10')
real_documents = bugs.subject
c = 0
for item in sims_s:
    # loop body reconstructed: the original snippet was cut off here
    print(real_documents[item[0]], item[1])
    c += 1
    if c == 10:
        break
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpora.MmCorpus.serialize('corpus.mm', corpus_tfidf)
tfidf.save("my_model.tfidf")
tfidf = models.TfidfModel.load("my_model.tfidf")

print('Building LsiModel...')
corpus_tfidf = corpora.MmCorpus('corpus.mm')
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)

print('Building MatrixSimilarity...')
from gensim.similarities import MatrixSimilarity
index = MatrixSimilarity(lsi[corpus_tfidf])
index.save('deerwester.index')
index = MatrixSimilarity.load('deerwester.index')

print('Testing...')
result = np.zeros((20, 300)).astype('str')
j = 0
for doc in query_test['Query']:
    doc = jieba.cut(doc)
    tokens = []
    for word in doc:
        tokens.append(word)
    vec_bow = dictionary.doc2bow(tokens)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
def k_cluster_wiki(input_prefix, output_prefix):
    k = 2000
    delta = 0.001
    max_iters = 10
    error = float('nan')
    old_error = float('nan')
    relative_error_change = float('nan')
    logger.info("Starting k-means clustering with k=%d, max iters=%d, delta=%f",
                k, max_iters, delta)

    m = ESAModel(input_prefix)
    similarity_index = m.similarity_index
    dictionary = m.dictionary
    num_topics = len(similarity_index)
    num_terms = len(dictionary)

    # Create initial cluster centroids.
    # L2-normalize them so we can calculate cosine similarity with a simple dot product.
    cluster_centroids = normalize(np.random.uniform(size=(k, num_terms)))

    # The cluster that each document belongs to.
    cluster_assignments = None

    logger.info("Preloading memory-mapped shards...")
    for i, shard in enumerate(similarity_index.shards):
        shard.get_index()

    iteration = 0  # renamed from `iter`, which shadows the builtin
    while iteration < max_iters:
        # Calculate cosine similarities between each centroid and each topic.
        # To save time, we also calculate the error for the previous assignment during this step.
        logger.info("Calculating cosine similarity of each cluster with each document...")
        previous_cluster_assignments = np.copy(cluster_assignments)
        previous_cluster_centroids = np.copy(cluster_centroids)
        cluster_counts = np.ones(k)  # Use ones instead of zeros to avoid divide by zero.
        cluster_centroids = np.zeros((k, num_terms))
        previous_centroid_distances = np.zeros(k)
        cluster_assignments = []
        docid = 0
        num_shards = len(similarity_index.shards)
        for i, shard in enumerate(similarity_index.shards):
            logger.info("Processing shard %d/%d ...", i, num_shards)

            # Calculate a (Cluster x Document) cosine similarity matrix for the current shard.
            # (C x T) . (T x D) = (C x D)
            logger.info("    Calculating similarities...")
            cluster_shard_similarities = previous_cluster_centroids * shard.get_index().index.transpose()

            # Select the most similar cluster for each document.
            logger.info("    Calculating argmax...")
            cluster_selections = np.argmax(cluster_shard_similarities, axis=0)
            cluster_assignments = np.hstack([cluster_assignments, cluster_selections])
            shard_first_docid = docid

            # Calculate errors for the previous assignment.
            # We don't calculate errors on the first iteration since we don't
            # have an assignment yet.
            if previous_cluster_assignments.size != 1:  # np.copy() of None has size 1
                logger.info("    Calculating error...")
                for doc_cluster_sims in cluster_shard_similarities.transpose():
                    cluster = previous_cluster_assignments[docid]
                    previous_centroid_distances[cluster] += (1 - doc_cluster_sims[cluster])
                    docid += 1

            # Iteratively recalculate the centroid of each cluster, so we don't
            # have to swap each shard out and back in.
            docid = shard_first_docid  # Reset the docid counter to before the error calcs.
            logger.info("    Computing new cluster centroids...")
            for topic_vec in shard.get_index().index:
                cluster = cluster_assignments[docid]
                cluster_centroids[cluster] += topic_vec
                cluster_counts[cluster] += 1
                docid += 1

        # print("Cluster assignments:", cluster_assignments)
        cluster_centroids /= cluster_counts[:, None]      # Take the average (off by one to avoid /0)
        cluster_centroids = normalize(cluster_centroids)  # And normalize.

        # We just use the sum of all cosine distances as our error metric.
        old_error = error
        error = np.sum(previous_centroid_distances)
        relative_error_change = abs(1 - error / old_error)
        logger.info("> Iteration: %d, previous error: %f, old error: %f, rel change: %f",
                    iteration, error, old_error, relative_error_change)

        # TODO: Drop clusters with zero members assigned and merge clusters that
        # have converged to the same centroid.
        # Checkpoint the clusterings in every iteration so we can test them
        # before they converge.

        # Save centroids.
        centroids_fname = "%s.cluster.%d.centroids" % (output_prefix, k)
        logger.info("Saving clusters to file: %s", centroids_fname)
        s = MatrixSimilarity(None, dtype=np.float64, num_features=num_terms)
        s.index = cluster_centroids
        s.save(centroids_fname)
        del s  # Free any RAM the similarity index might use.

        # Save assignments.
        assignments_fname = "%s.cluster.%d.assignments" % (output_prefix, k)
        logger.info("Saving cluster assignments to file: %s", assignments_fname)
        np.save(open(assignments_fname, 'wb'), cluster_assignments)

        if relative_error_change < delta:
            logger.info("Converged.")
            break

        iteration += 1

    logger.info("Done.")
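# Sketch (not in the original script): reload a saved checkpoint for offline
# inspection. The "%s.cluster.%d.*" file names match the save calls above;
# the "wiki" prefix and k=2000 defaults are illustrative placeholders.
def load_checkpoint(output_prefix="wiki", k=2000):
    s = MatrixSimilarity.load("%s.cluster.%d.centroids" % (output_prefix, k))
    centroids = s.index  # (k x num_terms) centroid matrix
    assignments = np.load("%s.cluster.%d.assignments" % (output_prefix, k))
    return centroids, assignments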
fnames = [line.strip() for line in
          open(os.path.join(settings.PERSIST_DIR,
                            'document_index{}'.format(fname_suffix)))]
doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames),
                    dtype=object)

matrix_sim_loc = os.path.join(
    settings.PERSIST_DIR,
    'tfidf_corpus_lsi{}-200_matrix_similarity'.format(fname_suffix))
if not os.path.exists(matrix_sim_loc):
    logger.info('building matrix similarity')
    doc_topic = MatrixSimilarity(tfidf_corpus_lsi,
                                 num_features=tfidf_corpus_lsi.num_terms)
    logger.info('persisting matrix similarity index')
    doc_topic.save(matrix_sim_loc)
else:
    logger.info('matrix similarity already available. using that')
    doc_topic = MatrixSimilarity.load(matrix_sim_loc)


def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
    mbk.fit(doc_topic.index[group['original_id'], :TOPIC_LIMIT])
    return mbk
def get_user_vec(user_doc):
    user_vec = []
    for doc in user_doc:
        tokens = tokenizer.tokenize(doc)
        bow = dictionary.doc2bow(tokens)
        bow_tfidf = tfidf[bow]
        vec = ldamodel.get_document_topics(bow_tfidf)
        temp = np.zeros(num_topics)
        for topic in vec:
            temp[topic[0]] = topic[1]
        user_vec.append(temp)
    return user_vec

user_doc = get_user_doc(review_data, info_data)
user_vec = get_user_vec(user_doc)
pickle_data(picklepath, "lda_inputs", user_vec)

# MAKE SIMILARITY MATRIX
from gensim.similarities import MatrixSimilarity

index = MatrixSimilarity(ldamodel[corpus_tfidf])
index_file = dp(os.path.join(modelpath, "lda_similarity.index"))
index.save(index_file)

'''
vector = prediction[:, -num_topics:][0]
sims = index[vector]
sims = sorted(enumerate(sims), key=lambda item: -item[1])[:10]
'''
class Model:
    def __init__(self, filename):
        self.docs = loads(open(filename, "r").read())
        self.docmap = hoist_dict(self.docs, "id")

        if isfile("data.dict"):
            self.dictionary = Dictionary.load("data.dict")
        else:
            self.dictionary = Dictionary(iterate_summaries(self.docs))
            self.dictionary.save("data.dict")

        if isfile("data.mm"):
            self.corpus = MmCorpus("data.mm")
        else:
            corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
            MmCorpus.serialize("data.mm", corpus)
            self.corpus = MmCorpus("data.mm")

        self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

        if isfile("data.sim"):
            self.sim = MatrixSimilarity.load("data.sim")
        else:
            self.sim = MatrixSimilarity(self.lsi[self.corpus])
            self.sim.save("data.sim")

        # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100,
        #                     update_every=1, chunksize=10000, passes=1)

        self.sentiment_model = Doc2Vec.load("imdb.d2v")
        # named sentiment_clf rather than sentiment, so the classifier does not
        # shadow the sentiment() method below
        self.sentiment_clf = LogisticRegression()
        self.sentiment_clf.fit(
            [self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
            [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
            asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

        if isfile("arxiv.d2v"):
            self.doc_model = Doc2Vec.load("arxiv.d2v")
        else:
            tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")])
                      for doc in self.docs]
            doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4,
                                negative=5, workers=7)
            doc_model.build_vocab(tagged)
            shuffle(tagged)  # Replace with functional stuff
            for epoch in range(10):
                doc_model.train(tagged, total_examples=doc_model.corpus_count,
                                epochs=doc_model.iter)
            doc_model.save("arxiv.d2v")
            self.doc_model = doc_model  # was missing: keep the freshly trained model

    def similar(self, query):
        """ Return all documents with similarity scores for the given query """
        return [{**{"similarity": float(sim)}, **doc}
                for doc, sim in zip(
                    self.docs,
                    self.sim[self.lsi[self.dictionary.doc2bow(query.lower().split())]])]

    def sentiment(self, id):
        """ Return a sentiment score for this document. """
        # TODO: Gensim
        import random
        return random.uniform(0, 1)

    def doc(self, id):
        return self.docmap.get(id)

    @property
    def positions(self):
        """ Return the positions for all documents. Coordinates are arbitrary,
        but similar documents are physically close. """
        vectors = [self.sentiment_model[s] for s in self.sentiment_model.wv.index2word]
        pca = PCA(n_components=3, whiten=True)
        return pca.fit(vectors).transform(vectors)

    @property
    def everything(self):
        """ Return all documents with coordinates and sentiment scores """
        # TODO: Sentiment
        return [{**{"coordinates": list(pos)}, **doc}
                for doc, pos in zip(self.docs, self.positions)]
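# Hypothetical driver for the Model class above; "arxiv.json" stands in for the
# document dump the constructor expects.
model = Model("arxiv.json")
top = sorted(model.similar("neural networks"), key=lambda d: -d["similarity"])[:5]
for doc in top:
    print(doc["id"], doc["similarity"])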
# gensimDictionaryFilePath = '../../5w1h-test-data/gensimDictionary.dict'
# gensimCorpusFilePath = '../../5w1h-test-data/gensimCorpus.mm'
# writingSimilarityIndexFilePath = '../../5w1h-test-data/similarityIndex'

# for real file paths
gensimDictionaryFilePath = '../../5w1h-result-data/gensimDictionary.dict'
gensimCorpusFilePath = '../../5w1h-result-data/gensimCorpus.mm'
writingSimilarityIndexFilePath = '../../5w1h-result-data/similarityIndex'

# load gensim_dictionary and gensim_corpus
dictionary = corpora.Dictionary.load(gensimDictionaryFilePath)
corpus = corpora.MmCorpus(gensimCorpusFilePath)

# make the similarity index
similarityIndex = MatrixSimilarity(corpus, num_best=9, num_features=len(dictionary))

# save the similarity index
similarityIndex.save(writingSimilarityIndexFilePath)

# # test query_keywords
# queryKeywords = ['advance', 'april', 'april']
# # make doc2bow
# queryBow = dictionary.doc2bow(queryKeywords)
# print('queryBow', queryBow)
# similarityResult = similarityIndex[queryBow]
# print('similarityResult', similarityResult)