Example #1
def create_similarity_matrix(name):
  lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
  corpus = gensim.corpora.MmCorpus(name + '.mm')
  lda_corpus = lda[corpus]
  dictionary = gensim.corpora.Dictionary.load(name + '.dict')
  numTokens = len(dictionary.values())
  index = MatrixSimilarity(lda_corpus, num_features=numTokens)
  index.save(name + '.sim')
  return
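`create_similarity_matrix` assumes that `name + '.dict'`, `name + '.mm'` and `name + '.lda'` already exist on disk. A minimal sketch of producing them, assuming `texts` is a list of tokenized documents and using an arbitrary topic count (`build_artifacts` is a hypothetical helper, not part of the original code):

def build_artifacts(name, texts, num_topics=50):
  # texts: list of token lists, e.g. [['solar', 'panel'], ['wind', 'turbine'], ...]
  dictionary = gensim.corpora.Dictionary(texts)
  dictionary.save(name + '.dict')
  bow = [dictionary.doc2bow(text) for text in texts]
  gensim.corpora.MmCorpus.serialize(name + '.mm', bow)
  lda = gensim.models.ldamodel.LdaModel(bow, id2word=dictionary, num_topics=num_topics)
  lda.save(name + '.lda')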
Example #2
    def __init__(self,
                 links,
                 stopwords=True,
                 num_topics=40,
                 num_clusters=40,
                 **kwargs):
        from gensim.models import TfidfModel
        from gensim.similarities.docsim import MatrixSimilarity

        if 'n_below' not in kwargs:
            kwargs['n_below'] = 5

        if 'n_above' not in kwargs:
            kwargs['n_above'] = 0.7

        if 'iterations' not in kwargs:
            kwargs['iterations'] = 200

        self.meta = _compute_meta_dataframe(links)
        self.lexicon, self.bow = _compute_lex_bow(self.meta,
                                                  stopwords=stopwords,
                                                  no_below=kwargs['n_below'],
                                                  no_above=kwargs['n_above'])
        self.tfidf = TfidfModel(self.bow)
        self.matsim = MatrixSimilarity(self.bow,
                                       num_features=len(self.lexicon))
        self.lda = _compute_lda(self.bow,
                                self.lexicon,
                                num_topics=num_topics,
                                iterations=kwargs['iterations'])
        self.clust = _compute_spectral_clust(self.similarity_matrix(),
                                             num_clusters=num_clusters)
Example #3
def compute_documents_similarity(target, name):
  dictionary = gensim.corpora.Dictionary.load(name + '.dict')
  index = MatrixSimilarity.load(name + '.sim')
  print(index)
  sims = index[target]
  sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
  top_documents = sort_sims[:200]
  return [item[0] for item in top_documents]
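The `target` argument must already be expressed in the same LDA topic space as the indexed corpus. A sketch of preparing it, assuming the file naming from Example #1 and a tokenized query `query_tokens` (both assumptions):

lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
dictionary = gensim.corpora.Dictionary.load(name + '.dict')
target = lda[dictionary.doc2bow(query_tokens)]
top_ids = compute_documents_similarity(target, name)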
Example #4
def predict_movies(input_list,
                   corpus_path='data/corpus.txt',
                   dic_path='data/dic.dict'):
    with open(corpus_path, "rb") as f:
        corpus = pickle.load(f)
    dic = Dictionary.load(dic_path)
    dic.add_documents([input_list])
    corpus.append(dic.doc2bow(input_list))
    lsi = LsiModel(corpus, num_topics=200, id2word=dic)
    vectorized_corpus = lsi[corpus]
    doc_index = MatrixSimilarity(vectorized_corpus)
    sims = doc_index[vectorized_corpus]
    return sims[-1:][0][:-1]
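The final slice returns the similarity of the newly appended query document to every original corpus document, as a NumPy array. A hedged usage sketch; the input token list and the `movie_titles` lookup are assumptions, not part of the original code:

import numpy as np

scores = predict_movies(['space', 'robot', 'rebellion'])
top = np.argsort(-scores)[:10]             # indices of the 10 most similar movies
for idx in top:
    print(movie_titles[idx], scores[idx])  # movie_titles: assumed index -> title mapping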
Example #5
def get_proposals_auto_grouped(topics_count=100, threshold=.5):
    ids, words = _get_raw_docs()

    dictionary = corpora.Dictionary(words)
    corpus = [dictionary.doc2bow(x) for x in words]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf,
                          id2word=dictionary,
                          num_topics=topics_count)
    lsi_corpus = lsi[corpus_tfidf]

    ms = MatrixSimilarity(lsi_corpus)

    neighbors = {}
    for frm, row in zip(ids, lsi_corpus):
        neighbors[frm] = [
            ids[n] for n, match in enumerate(ms[row])
            if match > threshold and ids[n] != frm
        ]

    results = []
    groups = {}
    for root, children in neighbors.items():
        target = groups.get(root, None)
        if not target:
            target = set()
            results.append(target)
        target.add(root)
        target.update(children)
        for c in children:
            groups[c] = target

    rv = {}
    for n, row in enumerate(results):
        for x in row:
            rv[x] = n

    return rv
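The returned dict maps each proposal id to an auto-detected group number, so proposals sharing a value belong to the same group. A small usage sketch built only on that return shape:

groups = get_proposals_auto_grouped(topics_count=100, threshold=0.5)
by_group = {}
for proposal_id, group_no in groups.items():
    by_group.setdefault(group_no, []).append(proposal_id)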
Example #6
def get_proposals_auto_grouped(topics_count=20, cutoff=0.75):
    doc_words, ids, titles = _get_raw_docs()

    dictionary = corpora.Dictionary(doc_words)
    corpus = [dictionary.doc2bow(x) for x in doc_words]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf,
                          id2word=dictionary,
                          num_topics=topics_count)
    lsi_corpus = lsi[corpus_tfidf]
    ms = MatrixSimilarity(lsi_corpus)

    neighborhoods = []
    seen = set()
    for n in range(len(lsi_corpus)):
        if n in seen:
            continue
        near = neighbors(n, ms, lsi_corpus, cutoff)
        neighborhoods.append({
            'talks': [{
                'id': ids[x],
                'title': titles[x],
                'row': x
            } for x in near]
        })
        seen.update(near)

    for group in neighborhoods:
        rows = [x['row'] for x in group['talks']]
        #Horrible way to get closest topic, but just looking for a hint.
        closest_topic = sorted(lsi[lsi_corpus[rows[0]]],
                               key=lambda x: x[-1])[0][0]
        topic = sorted(lsi.show_topic(closest_topic), key=lambda x: -x[-1])
        group['topic'] = ', '.join('{} ({:.2f})'.format(x, score)
                                   for x, score in topic)

    return neighborhoods
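The `neighbors` helper used above is not included in the snippet; a plausible sketch of its behaviour, offered purely as an assumption:

def neighbors(n, ms, lsi_corpus, cutoff):
    # indices of all documents whose cosine similarity to document n exceeds the cutoff
    sims = ms[lsi_corpus[n]]
    return [i for i, score in enumerate(sims) if score > cutoff]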
Example #7
    texts = [[
        word for word in document.lower().split() if word not in WORD_BLACKLIST
    ] for document in lines]
    #print("{}".format(texts))

    #EXERCISE 2
    # Build the lexicon (gensim.corpora.Dictionary), discarding words with frequency 1

    lessico = corpora.Dictionary(texts)
    lessico.filter_extremes(no_below=2, no_above=1.0, keep_n=None)  # drop words that occur in only one document
    print(lessico)

    #EXERCISE 3
    # Represent the documents as vectors (gensim.corpora.Dictionary.doc2bow)
    documents = [document.lower().split() for document in lines]
    vector_documents = [lessico.doc2bow(document) for document in documents]
    print(vector_documents)

    #EXERCISE 4
    # Compute the similarity between (the vector of) any document and all the other documents (gensim.similarities.MatrixSimilarity)
    S = MatrixSimilarity(vector_documents)
    print("{}".format(S[lessico.doc2bow(documents[3])]))

    #EXERCISE 5
    # Build a function that, given (the vector of) any document as input, returns the n=5 documents in the
    # collection that are most similar to it, sorted by similarity score in descending order
    def mostSimilar(S, lessico, document, n=5):
        similars = S[lessico.doc2bow(document)]
        ranked = sorted(enumerate(similars), key=lambda item: -item[1])
        return ranked[1:n + 1]  # skip the first hit, which is the document itself

    print("{}".format(mostSimilar(S, lessico, documents[1])))
Example #8
         if word not in ENGLISH_STOP_WORDS
     ] for document in pieces]
     # remove words that appear only once
     frequency = Counter([token for text in texts for token in text])
     texts = [[token for token in text if frequency[token] > 1]
              for text in texts]
     dictionary = corpora.Dictionary(texts)
     logger.info('dictionary size: {}'.format(len(dictionary)))
     corpus_ = [dictionary.doc2bow(text) for text in texts]
     lsi = models.LsiModel(corpus_,
                           id2word=dictionary,
                           num_topics=lsi_topic_count)
     lsi.show_topics(num_topics=lsi_topic_count,
                     num_words=100,
                     log=True)
     matrix_similarity = MatrixSimilarity(
         lsi[corpus_], num_features=similarity_feature_count)
 elif mode in {modes[2], modes[3]}:
     texts = [[
         word for word in tokenize_by_word(document.lower())
         if word not in ENGLISH_STOP_WORDS
     ] for document in pieces]
     # remove words that appear only once
     frequency = Counter([token for text in texts for token in text])
     texts = [[token for token in text if frequency[token] > 1]
              for text in texts]
     documents = [
         TaggedDocument(doc, [i]) for i, doc in enumerate(texts)
     ]
     doc2vec_model = Doc2Vec(
         documents,
         epochs=doc2vec_epochs,
Example #9
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    logger.info('loading models and dictionary')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['model_label'],
                                           'dic.dict'))
    model_path = path.join(result_path, p['model_label'])
    lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
    pre = pickle.load(open(path.join(model_path, 'pre.model'), 'rb'))
    lsi.num_topics = p['num_topics']

    logger.info('load wikipedia articles')
    article_path = path.join(result_path, p['article_label'])
    wiki = pickle.load(open(path.join(article_path, 'articles.pickle'), 'rb'))

    times = np.zeros(len(wiki))
    count = 0
    for query_key, query in wiki.items():
        logger.info("working on: %s" % query_key)
        n = len(query)
        human = [val['rating'] for val in query.values()]

        t0 = time.time()
        corpus = [lsi[pre[dictionary.doc2bow(val['text'])]]
                    for val in query.values()]
        sim_res = MatrixSimilarity(corpus)[corpus]
        np.save(path.join(output_dir, 'sim_' + query_key), sim_res)
        avg = np.mean(sim_res, axis=0)
        idx = np.argsort(avg)
        times[count] = time.time() - t0
        count += 1

        # compute correlation with human rating
        res = np.zeros((n, 1))
        for i in range(n):
            human_r = [human[j] for j in idx[i:]]
            res[i, 0] = np.mean(human_r)

        # plot correlation
        fig = plt.figure()
        ax = fig.add_subplot(3, 1, 1)
        ax.plot(res)

        ax = fig.add_subplot(3, 1, 2)
        ratings = [val['rating'] for val in query.values()]
        ax.scatter(avg[idx], [ratings[i] for i in idx])

        # plot similarity distribution
        ax = fig.add_subplot(3, 1, 3)
        ax.bar(range(n), avg[idx])

        # Set the x tick labels to the group_labels defined above and rotate
        ax.set_xticks(range(n))
        k = [key + ' ' + str(query[key]['rating']) for key in query.keys()]
        ax.set_xticklabels([k[i] for i in idx])
        fig.autofmt_xdate()
        plt.savefig(path.join(output_dir, query_key + '.' + p['format']))
        plt.close()
    logger.info('average similarity calculation time: %f' % np.mean(times))
Example #10
centroid_index = []
group_centroids = []

for cluster_no, group in top_topic_words_u_df.groupby('cluster_number'):
    gsum = group.loc[:, 'topic_0':'topic_199'].to_numpy().sum(axis=0)
    gsize = len(group)
    c = gsum / gsize
    centroid_index.append(cluster_no)
    group_centroids.append(c)

group_centroids = np.array(group_centroids)
centroid_df = pd.DataFrame(group_centroids, index=centroid_index)
centroid_df.to_csv('persistence/lsi_topic-agglom_centroids.csv')
cluster_centroid_matrix = centroid_df.to_numpy()

logger.info('building similarity matrix')
word_mat_sim = MatrixSimilarity(cluster_centroid_matrix, num_features=200)

tfidf_corpus_lsi = np.load('persistence/tfidf_corpus_lsi-200_matrix_similarity.index.npy')
word_mat_sim.num_best = 1
word_mat_sim.save('persistence/lsi_word-agglom_word-similarity-matrix')
with open('persistence/tfidf-lsi_sim_word-topic-hier.csv','w') as fout:
    with open('stats/tfidf-lsi_sim_problems.txt', 'w') as errout:
        csvw = csv.writer(fout)
        for doc_id, sim in enumerate(word_mat_sim[tfidf_corpus_lsi]):
            try:
                csvw.writerow((doc_id, sim[0][0], sim[0][1]))
            except Exception as e:
                errout.write(str(fnames[doc_id])+'\n')
                logger.error(e)
                continue
Example #11
logger.info("deserializing tfidf_corpus_lsi")
tfidf_corpus_lsi = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, "tfidf_corpus_lsi-200"))

logger.info("loading lsi model")
lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, "lsi_model-200"))


logger.info("globbing filenames")
fnames = iglob(os.path.join(settings.PROC_DIR, "*.json"))


from gensim.similarities.docsim import MatrixSimilarity, SparseMatrixSimilarity, Similarity

logger.info("building matrix similarity")
sim_matrix = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)

logger.info("persisting matrix similarity index")
sim_matrix.save(os.path.join(settings.PERSIST_DIR, "tfidf_corpus_lsi-200_matrix_similarity"))

logger.info("survey of neighbor groupings")
with open(os.path.join(settings.STATS_DIR, "num_neighbors.csv"), "w") as fout:
    csv_writer = csv.writer(fout)

    for i, doc in enumerate(fnames):
        try:
            result = sim_matrix[matutils.unitvec(tfidf_corpus_lsi[i])]
            n_similar = np.argwhere(result > 0.5).flatten().size
            csv_writer.writerow((doc, n_similar))
        except Exception as e:
            logger.error(e)
Example #12
    
    series = pd.Series(similarities_to_group)
    series.index.names = ['docx','docy']
    return series

def get_tfidf_similarities(doc_index_series):
    tfidfs_from = reviews_tfidf_docs[doc_index_series.name]
    similarities_to = similarity_indices[tfidfs_from]
    return _filter_similarities(doc_index_series,similarities_to)


# In[47]:


#similarity indices for each doc
similarity_indices = MatrixSimilarity(reviews_tfidf_docs)


# In[48]:


#get index:name mappings
doc_mapping = (
    review_df
    .groupby('business_id')
    ['name']
    .apply(lambda x: x.unique()[0])
    .to_frame()
    .assign(doc_index=range(701))
)
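A hedged sketch of querying the index for one review document and translating the best match back to a business name, assuming `reviews_tfidf_docs` is indexable and NumPy is imported as `np`:

sims = similarity_indices[reviews_tfidf_docs[0]]
best = int(np.argsort(-sims)[1])  # position 0 is the document itself
name_lookup = doc_mapping.reset_index().set_index('doc_index')['name']
print(name_lookup.loc[best])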