Beispiel #1
0
 def cos_similarity(self, first, second):
     """Return the cosine similarity between the vectors of two items.

     Both arguments are resolved through ``self.get_vector``. If either
     lookup yields ``None`` the result is ``None``; otherwise it is
     ``1 - cos_distance(a, b)`` computed on the ``.A`` attribute of each
     vector (presumably the dense-array form of a matrix -- TODO confirm).
     """
     vec_first = self.get_vector(first)
     vec_second = self.get_vector(second)
     # Bail out early when either item has no vector.
     if vec_first is None or vec_second is None:
         return None
     return 1 - cos_distance(vec_first.A, vec_second.A)
Beispiel #2
0
def get_best_doc_num(query, sentences):
    """Return the index of the row of ``sentences`` closest to ``query``.

    Closeness is measured with ``cos_distance(query, sentences[i])``; the
    row with the smallest distance wins and ties go to the earliest row.

    Args:
        query: Vector compared against every row.
        sentences: Indexable collection exposing ``shape[0]`` as its row
            count.

    Returns:
        The integer index of the nearest row. ``0`` is returned when there
        are no rows or when no distance compares as smaller than infinity
        (for example when every distance is NaN).
    """
    # Start from +inf rather than 1: cosine distance can exceed 1 for
    # vectors pointing in opposing directions, and those rows must still
    # be ranked against each other.
    best_dist = float("inf")
    best_index = 0
    for i in range(sentences.shape[0]):
        dist = cos_distance(query, sentences[i])
        if dist < best_dist:
            best_dist = dist
            best_index = i
    return best_index
Beispiel #3
0
def cos_similarity(ref_counts, gen_counts):
    """
    Cosine similarity of two count mappings of the form {name: count}.

    A name that is missing from one mapping contributes a zero there:

        sim = <r, g> / (||r|| * ||g||)

    Returns NaN when either mapping is empty.
    """
    if not len(ref_counts) or not len(gen_counts):
        return np.nan

    # Sorted union of the names seen in either mapping.
    all_names = np.unique([*ref_counts.keys(), *gen_counts.keys()])

    def as_vector(counts):
        # Align one mapping onto the shared name order, filling gaps with 0.
        return np.array([counts.get(name, 0) for name in all_names])

    ref_vec = as_vector(ref_counts)
    gen_vec = as_vector(gen_counts)
    distance = cos_distance(ref_vec, gen_vec)
    return 1 - distance
Beispiel #4
0
def scoreDocuments(query, matrix, reference):
    """Pair every row of ``matrix`` with its cosine distance to ``query``.

    Args:
        query: Vector compared against each row.
        matrix: Iterable, indexable collection of row vectors.
        reference: Sequence whose i-th entry labels ``matrix[i]``.

    Returns:
        A list of ``[reference[i], cos_distance(query, matrix[i])]`` pairs
        in row order; the list is not sorted.
    """
    scored = []  # each entry is [reference entry, distance]
    for position, _row in enumerate(matrix):
        distance = cos_distance(query, matrix[position])
        scored.append([reference[position], distance])
    return scored
Beispiel #5
0
    texts.append(get_BOW(paras))

# Build the word-by-paragraph frequency matrix. The vectorizer output has one
# row per entry of `texts`, so transposing it puts one word on each row.
vectorizer = DictVectorizer()
brownMatrix = vectorizer.fit_transform(texts).transpose()

# Reduce every word row to a dense vector of length 500 using truncated SVD.
svd = TruncatedSVD(n_components=500)
brownMatrixSVD = svd.fit_transform(brownMatrix)

# LSA similarity for each filtered word pair: 1 - cosine distance between the
# SVD rows of the two words.
# `vocabulary_` is the fitted name -> index mapping, so each lookup is a dict
# access instead of a linear scan of `feature_names_` per word. A word that
# is not in the vocabulary raises KeyError.
cosineSimilarityDict = {}
for word1, word2 in wordSimDict:
    word1Index = vectorizer.vocabulary_[word1]
    word2Index = vectorizer.vocabulary_[word2]
    cosSim = 1 - cos_distance(brownMatrixSVD[word1Index, :],
                              brownMatrixSVD[word2Index, :])
    cosineSimilarityDict[word1, word2] = cosSim

print(cosineSimilarityDict)

# word2vec similarity for the same filtered word pairs, trained on the
# sentences of the Brown corpus.
# NOTE(review): `size` and `iter` are the pre-4.0 gensim keyword names
# (later renamed `vector_size` / `epochs`) -- confirm the pinned version.
brownSentences = nltk.corpus.brown.sents()
model = Word2Vec(brownSentences, min_count=5, size=500, iter=50)
word2vecSimilarityDict = {}
for word1, word2 in wordSimDict:
    word2vecSimilarityDict[word1, word2] = model.wv.similarity(word1, word2)

print(word2vecSimilarityDict)

# Gold-standard similarity values, to be compared with the computed ones via
# the Pearson correlation coefficient.
wordSimGoldStanardList = list(wordSimDict.values())
Beispiel #6
0
def cosine_distance(x, y):
    """Return the cosine distance between x and y via ``cos_distance``."""
    distance = cos_distance(x, y)
    return distance