Example no. 1
def search(corpus):
    """Search Demo function"""

    normalized_corpus = normalize_corpus(corpus)
    vectorizer, feature_matrix = build_feature_matrix(
        normalized_corpus, 'tfidf')

    q = input('Enter search query. Press "Enter" to stop: \n')

    while q != '':
        # normalize the query the same way as the corpus, then project
        # it into the same TF-IDF space
        q = normalize_document(q)
        q_tfidf = vectorizer.transform([q])
        # score every document by its dot product with the query vector
        ans_mat = q_tfidf.dot(feature_matrix.transpose())
        ans_list = []
        for j in range(ans_mat.shape[1]):
            if ans_mat[0, j] > 0:
                ans_list.append((j, ans_mat[0, j]))
        # rank the matching documents by descending score
        ans_list.sort(key=lambda x: x[1], reverse=True)

        print()
        print('************ {} ************'.format(q))

        # show the five highest-ranked documents
        for item in ans_list[:5]:
            print()
            print('Document no. {}, rank: {}'.format(item[0], item[1]))
            print(corpus[item[0]])
            print()
            print()

        q = input('Enter search query. Press "Enter" to stop: \n')
    print()
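normalize_corpus, normalize_document, and build_feature_matrix are project helpers that are not shown in these examples. A minimal sketch of plausible implementations, assuming build_feature_matrix wraps scikit-learn vectorizers (names and defaults here are assumptions; Example no. 6 suggests the real normalize_corpus also takes options such as lemmatize):

import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def normalize_document(doc):
    # assumed behavior: lowercase and strip non-alphanumeric characters
    return re.sub(r'[^a-z0-9\s]', '', doc.lower()).strip()


def normalize_corpus(corpus):
    return [normalize_document(doc) for doc in corpus]


def build_feature_matrix(corpus, feature_type='tfidf',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    # assumed wrapper: choose a vectorizer, fit it, return both pieces
    if feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                     min_df=min_df, max_df=max_df)
    else:
        vectorizer = CountVectorizer(ngram_range=ngram_range,
                                     min_df=min_df, max_df=max_df)
    feature_matrix = vectorizer.fit_transform(corpus)
    return vectorizer, feature_matrix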
Example no. 2
def cosine_similarity_classification(norm_corpus, norm_query_docs):

    tfidf_vectorizer, tfidf_features = build_feature_matrix(
        norm_corpus,
        feature_type='tfidf',
        ngram_range=(1, 1),
        min_df=0.0,
        max_df=1.0)
    query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)

    # display_features (defined in Example no. 6) expects the dense matrix
    # and the feature names
    display_features(tfidf_features.todense(),
                     tfidf_vectorizer.get_feature_names())

    def compute_cosine_similarity(doc_features, corpus_features, top_n=3):

        # get document vectors
        doc_features = doc_features.toarray()[0]
        corpus_features = corpus_features.toarray()

        # compute similarities
        similarity = np.dot(doc_features, corpus_features.T)

        # get docs with highest similarity scores
        top_docs = similarity.argsort()[::-1][:top_n]
        top_docs_with_score = [(index, round(similarity[index], 3))
                               for index in top_docs]

        return top_docs_with_score

    print('Document Similarity Analysis using Cosine Similarity')
    print('=' * 60)

    try:
        # query_post is the module-level list of raw query strings
        # (see Example no. 6); it is index-aligned with norm_query_docs
        for index, doc in enumerate(query_post):

            doc_tfidf = query_docs_tfidf[index]
            top_similar_docs = compute_cosine_similarity(doc_tfidf,
                                                         tfidf_features,
                                                         top_n=2)
            print('Document', index + 1, ':', doc)
            print('Top', len(top_similar_docs), 'similar docs:')
            print('-' * 40)
            for doc_index, sim_score in top_similar_docs:
                if sim_score > 0.2:
                    print(
                        'Doc num: {} Similarity Score: {}\nDoc: {}\nCategory: {}'
                        .format(doc_index + 1, sim_score,
                                description[doc_index], category[doc_index]))
                print('-' * 40)
            print()
    except Exception as exc:
        print('Cosine similarity method failed:', exc)
Example no. 3
def hellinger_bhattacharya():
    # relies on the module-level norm_corpus and norm_query_docs
    # prepared in Example no. 6

    tfidf_vectorizer, tfidf_features = build_feature_matrix(
        norm_corpus,
        feature_type='tfidf',
        ngram_range=(1, 1),
        min_df=0.0,
        max_df=1.0)
    query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)

    display_features(tfidf_features.todense(),
                     tfidf_vectorizer.get_feature_names())

    def compute_hellinger_bhattacharya_distance(doc_features,
                                                corpus_features,
                                                top_n=2):
        # get document vectors
        doc_features = doc_features.toarray()[0]
        corpus_features = corpus_features.toarray()
        # compute Hellinger-Bhattacharya distances to every corpus document;
        # the sum over axis=1 already yields a flat 1-D array of distances
        distance = np.sqrt(0.5 * np.sum(
            np.square(np.sqrt(doc_features) - np.sqrt(corpus_features)),
            axis=1))
        # get docs with lowest distance scores
        top_docs = distance.argsort()[:top_n]
        top_docs_with_score = [(index, round(distance[index], 3))
                               for index in top_docs]
        return top_docs_with_score

    print('Document Similarity Analysis using Hellinger-Bhattacharya distance')
    print('=' * 60)
    for index, doc in enumerate(query_post):

        doc_tfidf = query_docs_tfidf[index]
        top_similar_docs = compute_hellinger_bhattacharya_distance(
            doc_tfidf, tfidf_features, top_n=2)
        print('Document', index + 1, ':', doc)
        print('Top', len(top_similar_docs), 'similar docs:')
        print('-' * 40)
        for doc_index, dist_score in top_similar_docs:
            # note: this is a distance, so lower values mean more similar
            print('Doc num: {} Distance Score: {}\nDoc: {}\nCategory: {}'.format(
                doc_index + 1, dist_score, description[doc_index],
                category[doc_index]))
            print('-' * 40)
        print()
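The vectorized expression in compute_hellinger_bhattacharya_distance implements the Hellinger distance h(p, q) = sqrt(0.5 * sum_i (sqrt(p_i) - sqrt(q_i))^2). A tiny self-contained check with made-up vectors (the values below are illustrative only):

import numpy as np

p = np.array([0.5, 0.3, 0.2])
q = np.array([0.4, 0.4, 0.2])

# 0 for identical distributions; larger means less similar
h = np.sqrt(0.5 * np.sum(np.square(np.sqrt(p) - np.sqrt(q))))
print(round(h, 4))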
Example no. 4

print_topics_gensim(topic_model=lsi,
                    total_topics=total_topics,
#                     num_terms=5,
                    display_weights=True)
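print_topics_gensim and the lsi model come from earlier notebook cells that are not shown. A minimal sketch of how such an LSI model is typically built with gensim (variable names here are assumptions, not the notebook's actual cells):

from gensim import corpora, models

# tokenize the normalized corpus, then build the gensim dictionary and
# bag-of-words corpus that the LSI model is trained on
tokenized = [doc.split() for doc in norm_corpus]
dictionary = corpora.Dictionary(tokenized)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized]
lsi = models.LsiModel(bow_corpus, id2word=dictionary,
                      num_topics=total_topics)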


# ### LSI custom built topic model (using SVD)

# In[51]:


norm_corpus = normalize_corpus(toy_corpus)

vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                feature_type='tfidf')
td_matrix = tfidf_matrix.transpose()

# keep only strictly positive weights (a safeguard; TF-IDF is non-negative)
td_matrix = td_matrix.multiply(td_matrix > 0)

total_topics = 2
feature_names = vectorizer.get_feature_names()

u, s, vt = low_rank_svd(td_matrix, singular_count=total_topics)
# scale each topic's term loadings by its singular value
weights = u.transpose() * s[:, None]

topics = get_topics_terms_weights(weights, feature_names)

print_topics_udf(topics=topics,
                 total_topics=total_topics,
                 weight_threshold=0.15)
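low_rank_svd is another project helper that is not shown. A minimal sketch, assuming it wraps scipy's truncated sparse SVD (the project's actual implementation may differ):

from scipy.sparse.linalg import svds

def low_rank_svd(matrix, singular_count=2):
    # truncated SVD keeping the top `singular_count` singular values;
    # svds returns them in ascending order, so flip to descending
    u, s, vt = svds(matrix, k=singular_count)
    return u[:, ::-1], s[::-1], vt[::-1, :]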
Example no. 5
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
bigram_measures = BigramAssocMeasures()

print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()

print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print get_top_ngrams(corpus, ngram_val=2, limit=10)
from feature_extractors import build_feature_matrix
import networkx
import numpy as np
import matplotlib
from normalization import normalize_corpus

norm = normalize_corpus(corpus)
# construct a weighted document-term matrix
vec, dt_matrix = build_feature_matrix(norm, feature_type='tfidf')

similarity_matrix = (dt_matrix * dt_matrix.T)
# view document similarity matrix
print(np.round(similarity_matrix.todense(), 2))

# build similarity graph (in networkx >= 3.0 use from_scipy_sparse_array)
similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
# networkx.draw_networkx(similarity_graph)
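The snippet stops after building the similarity graph. A natural continuation, if the goal is TextRank-style document ranking, is to score the nodes with PageRank (this step is an assumption, not part of the original code):

# score each document by its PageRank in the weighted similarity graph
scores = networkx.pagerank(similarity_graph)
ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
print(ranked_docs[:5])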
Example no. 6
# df is assumed to be a pandas DataFrame loaded earlier, with
# 'Description' and 'Category' columns
description = list(df['Description'])
category = list(df['Category'])
# Hinglish query: "The internet is not working and the dogs bark a lot at night"
problem = 'Internet nahi chal raha hai aur kutte raat ko bahut bhokte hai'
# split the problem text into sentences, dropping empty fragments
temp_post = problem.split('.')
query_post = [t for t in temp_post if t != '']
print(query_post)

# Normalize and extract features from the corpus
norm_corpus = normalize_corpus(description, lemmatize=True)
tfidf_vectorizer, tfidf_features = build_feature_matrix(
    norm_corpus, feature_type='tfidf', ngram_range=(1, 1),
    min_df=0.0, max_df=1.0)

features = tfidf_features.todense()
feature_name = tfidf_vectorizer.get_feature_names()

import pandas as pd

def display_features(features, feature_name):
    # render the dense feature matrix as a labelled DataFrame
    dff = pd.DataFrame(data=features, columns=feature_name)
    print(dff)

# display_features(features, feature_name)
# Normalize and extract features from the query corpus
norm_query_docs = normalize_corpus(query_post, lemmatize=True)
#print norm_corpus
#print norm_query_docs
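These module-level variables (description, category, query_post, norm_corpus, norm_query_docs) are exactly the globals that Examples no. 2 and 3 read. Assuming those functions are defined in the same module, the full pipeline can then be driven with:

cosine_similarity_classification(norm_corpus, norm_query_docs)
hellinger_bhattacharya()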