Example #1
import gensim
from gensim.similarities import MatrixSimilarity


def create_similarity_matrix(name):
    # load the trained LDA model, the bag-of-words corpus and the dictionary
    lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
    corpus = gensim.corpora.MmCorpus(name + '.mm')
    lda_corpus = lda[corpus]
    dictionary = gensim.corpora.Dictionary.load(name + '.dict')
    num_tokens = len(dictionary)
    # index the LDA-transformed corpus for fast cosine-similarity queries
    index = MatrixSimilarity(lda_corpus, num_features=num_tokens)
    index.save(name + '.sim')
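The saved index can later be reloaded and queried without retraining; a minimal sketch, assuming the files written above exist (the prefix 'my_model' and the query tokens are illustrative):

import gensim
from gensim.similarities import MatrixSimilarity

name = 'my_model'  # illustrative file prefix
lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
dictionary = gensim.corpora.Dictionary.load(name + '.dict')
index = MatrixSimilarity.load(name + '.sim')

tokens = ['graph', 'minors', 'survey']  # illustrative pre-tokenized query
sims = index[lda[dictionary.doc2bow(tokens)]]  # cosine similarity vs. every document
top10 = sims.argsort()[::-1][:10]  # indices of the ten most similar documents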
Example #2
import sys
import time
import pickle
from os import path

import numpy as np
import matplotlib.pyplot as plt
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

import tools  # project-local helpers (setup, get_logger)


def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    logger.info('loading models and dictionary')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['model_label'],
                                           'dic.dict'))
    model_path = path.join(result_path, p['model_label'])
    lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
    pre = pickle.load(open(path.join(model_path, 'pre.model'), 'rb'))
    lsi.num_topics = p['num_topics']

    logger.info('load wikipedia articles')
    article_path = path.join(result_path, p['article_label'])
    wiki = pickle.load(open(path.join(article_path, 'articles.pickle'), 'rb'))

    times = np.zeros(len(wiki))
    count = 0
    for query_key, query in wiki.items():
        logger.info("working on: %s" % query_key)
        n = len(query)
        human = [val['rating'] for val in query.values()]

        t0 = time.time()
        corpus = [lsi[pre[dictionary.doc2bow(val['text'])]]
                  for val in query.values()]
        sim_res = MatrixSimilarity(corpus, num_features=lsi.num_topics)[corpus]
        # sim_res is a plain numpy array, so persist it with np.save
        np.save(path.join(output_dir, 'sim_' + query_key), sim_res)
        avg = np.mean(sim_res, axis=0)
        idx = np.argsort(avg)
        times[count] = time.time() - t0
        count += 1

        # mean human rating of the documents above each similarity cutoff
        # (idx sorts ascending, so idx[i:] are the most similar documents)
        res = np.zeros((n, 1))
        for i in range(n):
            human_r = [human[j] for j in idx[i:]]
            res[i, 0] = np.mean(human_r)

        # plot correlation
        fig = plt.figure()
        ax = fig.add_subplot(3, 1, 1)
        ax.plot(res)

        ax = fig.add_subplot(3, 1, 2)
        # human ratings reordered by average similarity
        ax.scatter(avg[idx], [human[i] for i in idx])

        # plot similarity distribution
        ax = fig.add_subplot(3, 1, 3)
        ax.bar(range(n), avg[idx])

        # Set the x tick labels to the group_labels defined above and rotate
        ax.set_xticks(range(n))
        k = [key + ' ' + str(query[key]['rating']) for key in query.keys()]
        ax.set_xticklabels([k[i] for i in idx])
        fig.autofmt_xdate()
        plt.savefig(path.join(output_dir, query_key + '.' + p['format']))
        plt.close()
    logger.info('average similarity calculation time: %f' % np.mean(times))
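The "correlation" block above plots running means of the human ratings along the similarity ranking rather than a correlation coefficient. If an actual rank correlation is wanted, scipy (assumed available) gives it in one call per query group, e.g. inside the loop:

from scipy.stats import spearmanr

# rank correlation between average machine similarity and human ratings;
# avg and human are the per-group arrays computed in the loop above
rho, pval = spearmanr(avg, human)
logger.info('spearman rho=%.3f (p=%.3g)' % (rho, pval))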
Example #3
import csv

import numpy as np
import pandas as pd
from gensim.similarities import MatrixSimilarity

centroid_index = []
group_centroids = []

# average the topic vectors of each agglomerative cluster into a centroid
for cluster_no, group in top_topic_words_u_df.groupby('cluster_number'):
    gsum = group.loc[:, 'topic_0':'topic_199'].to_numpy().sum(axis=0)
    gsize = len(group)
    c = gsum / gsize
    centroid_index.append(cluster_no)
    group_centroids.append(c)

group_centroids = np.array(group_centroids)
centroid_df = pd.DataFrame(group_centroids, index=centroid_index)
centroid_df.to_csv('persistence/lsi_topic-agglom_centroids.csv')
cluster_centroid_matrix = centroid_df.to_numpy()

logger.info('building similarity matrix')
word_mat_sim = MatrixSimilarity(cluster_centroid_matrix, num_features=200)

tfidf_corpus_lsi = np.load('persistence/tfidf_corpus_lsi-200_matrix_similarity.index.npy')
word_mat_sim.num_best = 1
word_mat_sim.save('persistence/lsi_word-agglom_word-similarity-matrix')
with open('persistence/tfidf-lsi_sim_word-topic-hier.csv', 'w') as fout, \
        open('stats/tfidf-lsi_sim_problems.txt', 'w') as errout:
    csvw = csv.writer(fout)
    # with num_best=1, each query returns a list of (index, similarity) pairs
    for doc_id, sim in enumerate(word_mat_sim[tfidf_corpus_lsi]):
        try:
            csvw.writerow((doc_id, sim[0][0], sim[0][1]))
        except Exception as e:
            # fnames is assumed to be a list of source filenames defined elsewhere
            errout.write(str(fnames[doc_id]) + '\n')
            logger.error(e)
            continue
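Setting num_best changes the return type: instead of a dense row of similarities, each query yields a list of the num_best highest-scoring (document index, similarity) pairs, which is why the loop above reads sim[0][0] and sim[0][1]. A toy illustration with made-up vectors:

import numpy as np
from gensim.similarities import MatrixSimilarity

vectors = np.eye(3, dtype=np.float32)  # three toy, already-normalized documents
index = MatrixSimilarity(vectors, num_features=3)
index.num_best = 2

hits = index[np.array([1.0, 0.2, 0.0], dtype=np.float32)]
# hits is e.g. [(0, 0.98), (1, 0.20)] -- (document index, cosine similarity)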
logger.info("loading lsi model")
lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, "lsi_model-200"))


logger.info("globbing filenames")
fnames = iglob(os.path.join(settings.PROC_DIR, "*.json"))


from gensim.similarities.docsim import MatrixSimilarity, SparseMatrixSimilarity, Similarity

logger.info("building matrix similarity")
sim_matrix = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)

logger.info("persisting matrix similarity index")
sim_matrix.save(os.path.join(settings.PERSIST_DIR, "tfidf_corpus_lsi-200_matrix_similarity"))

logger.info("survey of neighbor groupings")
with open(os.path.join(settings.STATS_DIR, "num_neighbors.csv"), "w") as fout:
    csv_writer = csv.writer(fout)

    for i, doc in enumerate(fnames):
        try:
            # an IndexedCorpus can be addressed directly by document position
            result = sim_matrix[matutils.unitvec(tfidf_corpus_lsi[i])]
            n_similar = np.argwhere(result > 0.5).flatten().size
            csv_writer.writerow((doc, n_similar))
        except Exception as e:
            logger.error(e)
            continue
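MatrixSimilarity holds the entire index in RAM. The Similarity class imported above shards the index to disk instead, which is the safer choice once the corpus outgrows memory; a sketch reusing the corpus from above (the shard prefix path is a placeholder):

from gensim.similarities.docsim import Similarity

# shard files are written under this prefix (placeholder path)
index = Similarity('/tmp/lsi_shards', tfidf_corpus_lsi,
                   num_features=tfidf_corpus_lsi.num_terms)
for i, sims in enumerate(index):  # similarities of document i against all others
    n_similar = np.argwhere(sims > 0.5).flatten().size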

## MiniBatch K-means