import gensim
from gensim.similarities import MatrixSimilarity


def create_similarity_matrix(name):
    """Build a dense cosine-similarity index over an LDA-transformed corpus and persist it."""
    lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
    corpus = gensim.corpora.MmCorpus(name + '.mm')
    lda_corpus = lda[corpus]  # stream each document as a vector in topic space
    # the transformed vectors live in topic space, so size the index by topic count
    index = MatrixSimilarity(lda_corpus, num_features=lda.num_topics)
    index.save(name + '.sim')
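# A minimal usage sketch under assumed filenames: the prefix 'mycorpus' is
# hypothetical, and mycorpus.lda / mycorpus.mm / mycorpus.dict must already
# exist from an earlier training step.
create_similarity_matrix('mycorpus')

lda = gensim.models.ldamodel.LdaModel.load('mycorpus.lda')
dictionary = gensim.corpora.Dictionary.load('mycorpus.dict')
index = MatrixSimilarity.load('mycorpus.sim')

query_bow = dictionary.doc2bow('some query text'.lower().split())
sims = index[lda[query_bow]]  # cosine similarity of the query against every indexed document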
import sys
import time
import pickle
from os import path

import numpy as np
import matplotlib.pyplot as plt
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

import tools


def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    logger = tools.get_logger('gensim', path.join(output_dir, 'run.log'))
    logger.info('running %s' % ' '.join(sys.argv))

    logger.info('loading models and dictionary')
    dictionary = Dictionary.load(path.join(result_path, p['model_label'], 'dic.dict'))
    model_path = path.join(result_path, p['model_label'])
    lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
    with open(path.join(model_path, 'pre.model'), 'rb') as f:
        pre = pickle.load(f)
    lsi.num_topics = p['num_topics']

    logger.info('load wikipedia articles')
    article_path = path.join(result_path, p['article_label'])
    with open(path.join(article_path, 'articles.pickle'), 'rb') as f:
        wiki = pickle.load(f)

    times = np.zeros(len(wiki))
    for count, (query_key, query) in enumerate(wiki.items()):
        logger.info('working on: %s' % query_key)

        n = len(query)
        human = [val['rating'] for val in query.values()]

        # pairwise cosine similarities between all articles of this query
        t0 = time.time()
        corpus = [lsi[pre[dictionary.doc2bow(val['text'])]] for val in query.values()]
        sim_res = MatrixSimilarity(corpus, num_features=lsi.num_topics)[corpus]
        np.save(path.join(output_dir, 'sim_' + query_key), sim_res)
        avg = np.mean(sim_res, axis=0)
        idx = np.argsort(avg)
        times[count] = time.time() - t0

        # mean human rating of the articles above each similarity cutoff
        res = np.zeros((n, 1))
        for i in range(n):
            human_r = [human[j] for j in idx[i:]]
            res[i, 0] = np.mean(human_r)

        fig = plt.figure()

        # mean rating as a function of the similarity cutoff
        ax = fig.add_subplot(3, 1, 1)
        ax.plot(res)

        # human rating vs. average similarity
        ax = fig.add_subplot(3, 1, 2)
        ax.scatter(avg[idx], [human[i] for i in idx])

        # similarity distribution
        ax = fig.add_subplot(3, 1, 3)
        ax.bar(range(n), avg[idx])

        # label each bar with the article key and its rating, rotated for readability
        ax.set_xticks(range(n))
        k = [key + ' ' + str(query[key]['rating']) for key in query.keys()]
        ax.set_xticklabels([k[i] for i in idx])
        fig.autofmt_xdate()

        plt.savefig(path.join(output_dir, query_key + '.' + p['format']))
        plt.close()

    logger.info('average similarity calculation time: %f' % np.mean(times))
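# A short follow-up sketch, assuming the hypothetical query key 'example_query';
# np.save above appends the '.npy' suffix to the stored matrix:
#
#     sim_res = np.load(path.join(output_dir, 'sim_example_query.npy'))
#     avg = sim_res.mean(axis=0)
#     most_central = int(np.argmax(avg))  # article most similar to all the others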
# build one centroid per agglomerative cluster by averaging its LSI topic vectors
centroid_index = []
group_centroids = []
for cluster_no, group in top_topic_words_u_df.groupby('cluster_number'):
    gsum = group.loc[:, 'topic_0':'topic_199'].to_numpy().sum(axis=0)
    gsize = len(group)
    c = gsum / gsize  # centroid = mean topic vector of the cluster
    centroid_index.append(cluster_no)
    group_centroids.append(c)
group_centroids = np.array(group_centroids)

centroid_df = pd.DataFrame(group_centroids, index=centroid_index)
centroid_df.to_csv('persistence/lsi_topic-agglom_centroids.csv')
cluster_centroid_matrix = centroid_df.to_numpy()

logger.info('building similarity matrix')
word_mat_sim = MatrixSimilarity(cluster_centroid_matrix, num_features=200)
tfidf_corpus_lsi = np.load('persistence/tfidf_corpus_lsi-200_matrix_similarity.index.npy')
word_mat_sim.num_best = 1  # return only the best-matching centroid for each document
word_mat_sim.save('persistence/lsi_word-agglom_word-similarity-matrix')

# assign every document to its nearest cluster centroid; fnames is the list of
# source filenames built in the globbing step of this pipeline
with open('persistence/tfidf-lsi_sim_word-topic-hier.csv', 'w') as fout:
    with open('stats/tfidf-lsi_sim_problems.txt', 'w') as errout:
        csvw = csv.writer(fout)
        for doc_id, sim in enumerate(word_mat_sim[tfidf_corpus_lsi]):
            try:
                # with num_best == 1, sim is a singleton list [(centroid_id, cosine)]
                csvw.writerow((doc_id, sim[0][0], sim[0][1]))
            except Exception as e:
                errout.write(str(fnames[doc_id]) + '\n')
                logger.error(e)
                continue
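# A minimal reuse sketch for the persisted index; 'new_doc_vec' stands in for a
# hypothetical 200-dimensional LSI vector of an unseen document:
#
#     word_mat_sim = MatrixSimilarity.load('persistence/lsi_word-agglom_word-similarity-matrix')
#     word_mat_sim.num_best = 1
#     best = word_mat_sim[new_doc_vec]  # e.g. [(7, 0.83)]: nearest centroid and its cosine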
logger.info("loading lsi model") lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, "lsi_model-200")) logger.info("globbing filenames") fnames = iglob(os.path.join(settings.PROC_DIR, "*.json")) from gensim.similarities.docsim import MatrixSimilarity, SparseMatrixSimilarity, Similarity logger.info("building matrix similarity") sim_matrix = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms) logger.info("persisting matrix similarity index") sim_matrix.save(os.path.join(settings.PERSIST_DIR, "tfidf_corpus_lsi-200_matrix_similarity")) logger.info("survey of neighbor groupings") with open(os.path.join(settings.STATS_DIR, "num_neighbors.csv", "w")) as fout: csv_writer = csv.writer(fout) for i, doc in fnames: try: result = sim_matrix[matutils.unitvec(tfidf_corpus_lsi.docbyoffset(tfidf_corpus_lsi.sim_matrix[i]))] n_similar = np.argwhere(result > 0.5).flatten().size csv_writer.writerow((doc, n_similar)) except Exception as e: logger.error(e) continue ## MiniBatch K-means