Example #1
    def sim_bow(self, pred, pred_lens, ref, ref_lens):
        """
        :param pred - ndarray [batch_size x seqlen]
        :param pred_lens - list of integers
        :param ref - ndarray [batch_size x seqlen]
        """
        # look up word embeddings for prediction and reference
        emb_pred = self.embedding(pred)  # [batch_sz x seqlen1 x emb_sz]
        emb_ref = self.embedding(ref)  # [batch_sz x seqlen2 x emb_sz]

        ext_emb_pred = self.extrema(emb_pred, pred_lens)
        ext_emb_ref = self.extrema(emb_ref, ref_lens)
        bow_extrema = cosine(ext_emb_pred,
                             ext_emb_ref)  # [batch_sz_pred x batch_sz_ref]

        avg_emb_pred = self.mean(emb_pred,
                                 pred_lens)  # Calculate mean over seq
        avg_emb_ref = self.mean(emb_ref, ref_lens)
        bow_avg = cosine(avg_emb_pred,
                         avg_emb_ref)  # [batch_sz_pred x batch_sz_ref]

        batch_pred, seqlen_pred, emb_size = emb_pred.shape
        batch_ref, seqlen_ref, emb_size = emb_ref.shape
        cos_sim = cosine(emb_pred.reshape(
            (-1, emb_size)), emb_ref.reshape(
                (-1, emb_size)))  # [(batch_sz*seqlen1)x(batch_sz*seqlen2)]
        cos_sim = cos_sim.reshape(
            (batch_pred, seqlen_pred, batch_ref, seqlen_ref))
        # Find words with max cosine similarity
        max12 = cos_sim.max(1).mean(2)  # max over seqlen_pred
        max21 = cos_sim.max(3).mean(1)  # max over seqlen_ref
        bow_greedy = (max12 + max21) / 2  # [batch_pred x batch_ref(1)]
        return np.max(bow_extrema), np.max(bow_avg), np.max(bow_greedy)
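Throughout these examples, cosine is called on 2-D arrays and returns the full pairwise similarity matrix; Example #30 below imports it explicitly as sklearn's cosine_similarity. A minimal sketch of that behaviour, assuming the same import:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cosine

emb_pred = np.random.rand(4 * 7, 300)  # flattened [batch_sz * seqlen1, emb_sz]
emb_ref = np.random.rand(4 * 9, 300)   # flattened [batch_sz * seqlen2, emb_sz]
sim = cosine(emb_pred, emb_ref)        # pairwise matrix, shape (28, 36)
print(sim.shape)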
Example #2
def build_wgraph(alpha=2):
    # alpha == 2 (the default) returns the raw pairwise cosine-similarity matrix;
    # any other alpha returns a binary adjacency matrix thresholded at alpha
    if alpha != 2:
        return [[
            int(cosine(H[i], H[j])[0][0] > alpha) for i in range(len(H))
        ] for j in range(len(H))]
    else:
        return [[cosine(H[i], H[j])[0][0] for i in range(len(H))]
                for j in range(len(H))]
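build_wgraph reads a module-level H, and the [0][0] indexing implies each H[i] is a (1, d) row vector. A hypothetical usage sketch under those assumptions:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cosine

# hypothetical node features: one (1, d) row vector per node
H = [np.random.rand(1, 50) for _ in range(5)]

sim_matrix = build_wgraph()           # default alpha == 2: raw cosine similarities
adj_matrix = build_wgraph(alpha=0.8)  # any other alpha: binary adjacency thresholded at alpha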
Example #3
def parallelSimilarity(paramList):
    protein_embedding_dataframe = representation_dataframe
    i = paramList[0]
    j = paramList[1]
    aspect = paramList[2]
    if j > i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in protein_names and protein2 in protein_names:
            prot1vec = np.asarray(
                protein_embedding_dataframe.query("Entry == @protein1")
                ['Vector'].item())
            prot2vec = np.asarray(
                protein_embedding_dataframe.query("Entry == @protein2")
                ['Vector'].item())
            # cosine_similarity returns a (1, 1) matrix for these (1, d) inputs, so .item() extracts the scalar
            cos = cosine(prot1vec.reshape(1, -1), prot2vec.reshape(1,
                                                                   -1)).item()
            manhattanDist = cdist(prot1vec.reshape(1, -1),
                                  prot2vec.reshape(1, -1), 'cityblock')
            manhattanDistNorm = manhattanDist / (norm(prot1vec, 1) +
                                                 norm(prot2vec, 1))
            euclideanDist = cdist(prot1vec.reshape(1, -1),
                                  prot2vec.reshape(1, -1), 'euclidean')
            euclideanDistNorm = euclideanDist / (norm(prot1vec, 2) +
                                                 norm(prot2vec, 2))
            real = paramList[3]
            # To keep the real and calculated values aligned, they are appended together as one tuple and decoupled later
            similarity_list.append((real, cos, 1 - manhattanDistNorm.item(),
                                    1 - euclideanDistNorm.item()))
    return similarity_list
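parallelSimilarity reads several module-level globals (representation_dataframe, proteinListNew, protein_names, similarity_list) and takes an index tuple, which suggests it is meant to be mapped over protein pairs with a multiprocessing pool. A hypothetical driver under that assumption; note that with separate worker processes the results should be collected from the returned lists rather than from a shared similarity_list:

from itertools import combinations
from multiprocessing import Pool

# hypothetical: (i, j, aspect, real_similarity) tuples for every protein pair;
# the aspect string and the real similarity lookup are placeholders
param_list = [(i, j, "MF", 0.0) for i, j in combinations(range(len(proteinListNew)), 2)]
with Pool(processes=4) as pool:
    results = pool.map(parallelSimilarity, param_list)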
Example #4
def infer_images():
    '''
    This function uses the trained model to predict the similarity
    between two test samples.
    '''
    trained_model = load_model('custom_model.h5')
    '''
    Create a Keras Model instance with the desired input and output,
    mainly for usability.
    '''
    similarity_model = Model(inputs=trained_model.input,
                             outputs=trained_model.get_layer(
                                 trained_model.layers[-3].name).output)
    '''
    Load the dataset.
    '''
    (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
    '''
    Reshape the inputs before prediction.
    '''
    one = similarity_model.predict(x_test[0].reshape((-1, 28, 28, 1)))
    two = similarity_model.predict(x_test[1].reshape((-1, 28, 28, 1)))
    sim = cosine(one, two)
    print("Similarity between the above : {} ".format(round(sim.item(0), 2)))
    return round(sim.item(0), 2)
Example #5
def instance_predict(instance, collection, labels, vectors, vect, doc_term_mtx, ks):
    
    """ 
    - predicts the label of a new instance ('instance') with a Knn approach w.r.t. a collection ('collection') in terms of both the WMD and cosine similarity with TFIDF vectors for various values of K (stored in the 'ks' list) 
    - returns a dictionary with two keys (names of the two methods) and lists of length len(ks) as values. Each list contains the predictions of the method for each value of K	
	"""
    
    #### COMPUTE COSINE SIMILARITY WITH TFIDF VECTORS ####	
    	
    # get tfidf vector of the instance (using vocab from training set only - hence the 'transform' only (no fit))
    instance_tfidf = vect.transform([instance[0]])
    # compute cosine similarity between new instance and all elements in the collection
    sims = cosine(doc_term_mtx, Y=instance_tfidf, dense_output=True).tolist()
    sims = [elt[0] for elt in sims]
    # get indexes of elements sorted by DECREASING order (the greater the better for cosine sim) and store them in 'idx_st_cos'
    idx_st_cos = sorted(range(len(sims)), key=lambda x: sims[x], reverse=True)
    
    #### COMPUTE WMD ####
    
    # wmdistance accepts lists of tokens (2nd entry of each tuple)
    dists = [vectors.wmdistance(instance[1],tuple[1]) for tuple in collection]
    # get indexes of elements sorted by INCREASING order (the smaller the better for the WMD) and store them in 'idx_st_wmd'
    idx_st_wmd = sorted(range(len(dists)), key=lambda x: dists[x])
    
    #### GENERATE PREDICTIONS ####
    
    predictions = {}
    # assumed signature: majority_voting(sorted neighbour indexes, labels, ks)
    predictions['tfidf'] = majority_voting(idx_st_cos, labels, ks)
    predictions['wmd'] = majority_voting(idx_st_wmd, labels, ks)
        
    return predictions	
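majority_voting is not defined in this snippet; a hypothetical implementation matching the signature assumed above (sorted neighbour indexes, their labels, and the list of K values):

from collections import Counter

def majority_voting(sorted_idx, labels, ks):
    # hypothetical helper: for each K, vote among the K nearest neighbours' labels
    predictions = []
    for k in ks:
        nn_labels = [labels[i] for i in sorted_idx[:k]]
        predictions.append(Counter(nn_labels).most_common(1)[0][0])
    return predictions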
Example #6
def compute_context_sis_score(source_word, sis_context, substitution_selection, fasttext_dico, fasttext_emb):
    context_sis = []

    word_context = []

    for con in sis_context:
        if con == source_word or (con not in fasttext_dico):
            continue

        word_context.append(con)

    if len(word_context) != 0:
        for sub in substitution_selection:
            sub_emb = fasttext_emb[fasttext_dico.index(sub)].reshape(1, -1)
            all_sis = 0
            for con in word_context:
                token_index_fast = fasttext_dico.index(con)
                # extract the scalar from the (1, 1) similarity matrix
                all_sis += cosine(sub_emb, fasttext_emb[token_index_fast].reshape(1, -1))[0][0]

            context_sis.append(all_sis / len(word_context))
    else:
        for i in range(len(substitution_selection)):
            context_sis.append(len(substitution_selection) - i)

    return context_sis
Example #7
def my_cos_similarity(word1, word2, wv):
    # cosine similarity between the (1, d) word vectors returned by my_vector_getter
    sim = cosine(
        my_vector_getter(word1, wv),
        my_vector_getter(word2, wv)
    )
    return round(float(sim), 4)
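my_vector_getter is referenced but not shown; a hypothetical version that looks the word up in a gensim-style keyed-vectors object and reshapes it to (1, d), so the cosine call above receives 2-D inputs:

import numpy as np

def my_vector_getter(word, wv):
    # hypothetical helper: fetch the word's embedding and reshape to a (1, d) row vector
    return np.array(wv[word]).reshape(1, -1)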
Example #8
def similarity(x, y):
    k = split_coord(x)
    rt = []
    for i in k:
        rt.append(
            cosine(np.array(i).reshape(1, -1),
                   np.array(y).reshape(1, -1))[0][0])
    return rt
Example #9
def cosseno(matriz):
    # fill every entry with the overall mean, then restore the observed (non-zero) entries
    mu = mediaMatriz(matriz)
    copia = full(matriz.shape, mu)
    linhas, colunas = matriz.nonzero()

    for l, c in zip(linhas, colunas):
        copia[l, c] = matriz[l, c]

    # pairwise cosine similarity between the rows of the filled matrix
    return cosine(copia)
Example #10
def compute_similarities(vector, corpus, top_n=10):
    """Given an embedding and a corpus, returns the closest k embeddings."""
    similarities = {
        k: (cosine(vector.reshape(1, -1), v.reshape(1, -1)))[0, 0]
        for k, v in corpus.items()
    }
    similarities = dict(
        sorted(similarities.items(), key=itemgetter(1), reverse=True)[:top_n])
    return similarities
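A hypothetical call, assuming corpus maps names to 1-D embeddings of the same dimension as vector:

import numpy as np
from operator import itemgetter
from sklearn.metrics.pairwise import cosine_similarity as cosine

corpus = {"cat": np.random.rand(50), "dog": np.random.rand(50), "car": np.random.rand(50)}
query = np.random.rand(50)
print(compute_similarities(query, corpus, top_n=2))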
Example #11
def main():
    # load the dataset
    dataset = dataloader()
    dicl = list(dataset.keys())
    for dic in dicl:
        print(dic, ': ', dataset[dic].shape)

    test_x = dataset[dicl[1]]  # (2933, 1024)
    test_y = dataset[dicl[2]]  # (2933, 1)

    x_train, x_test, y_train, y_test = train_test_split(test_x,
                                                        test_y,
                                                        test_size=0.3)

    # preprocessing
    # standardize with StandardScaler
    scaler = preprocessing.StandardScaler().fit(dataset[dicl[-1]])
    train_att = scaler.transform(dataset[dicl[-1]])
    scaler = preprocessing.StandardScaler().fit(dataset[dicl[0]])
    test_att = scaler.transform(dataset[dicl[0]])

    # MinMaxScaler
    #  train_att = preprocessing.MinMaxScaler().fit_transform(dataset[dicl[-1]])
    #  test_att = preprocessing.MinMaxScaler().fit_transform(dataset[dicl[0]])

    # Normalization
    #  train_att = preprocessing.normalize(dataset[dicl[-1]], norm="l1")
    #  test_att = preprocessing.normalize(dataset[dicl[0]], norm="l1")
    #  train_att = preprocessing.normalize(dataset[dicl[-1]], norm="l2")
    #  test_att = preprocessing.normalize(dataset[dicl[0]], norm="l2")

    # compute W
    W = sae(dataset[dicl[-3]].T, train_att.T, lambda_)
    #  print(W.shape)

    # reconstruct s
    s_ = s2f(W, dataset[dicl[1]])
    print(s_.shape)

    # compute cosine similarity between s_ and test_att
    dist = cosine(s_.T, test_att)
    #  print(dist.shape)

    # get the index of the most similarity label
    y_ = np.argmax(dist, axis=1)
    #  print(y_.shape)
    #  print(y_)

    # compute the accuracy on the test set
    print("The accuracy is : ", (np.equal(dataset[dicl[2]],
                                          dataset[dicl[-2]][y_])).mean())

    x_train = s2f(W, test_x)
    score = nlpway(x_train.T, test_y)
    print('Score : ', score)
Example #12
def build_weight(rate, adj, user_to_idx, idx_to_user, ratings):
    '''Do some tests, and compute cosine weights'''
    print("user_to_idx[1] = ", user_to_idx[1])
    print("idx_to_user[0] = ", idx_to_user[0])
    print("Rating list of user 1, rate[1] = ", rate[user_to_idx[1]])
    print("Rating list of user 18157, rate[user_to_idx[18157]] = ",
          rate[user_to_idx[18157]])
    print("Rating list of user 48524, rate[user_to_idx[48524]] = ",
          rate[user_to_idx[48524]])
    rate_set = [0 for i in range(len(rate))]
    for i in rate:
        rate_set[i] = set(rate[i])
    print("Rating set of user 1, rate[user_to_idx[0]] = ",
          rate_set[user_to_idx[1]])
    print("Rating set of user 18157, rate[user_to_idx[18157]] = ",
          rate_set[user_to_idx[18157]])
    print("Rating set of user 48524, rate[user_to_idx[48524]] = ",
          rate_set[user_to_idx[48524]])
    weight = [{} for _ in range(len(adj))]
    for u in range(len(adj)):
        if u % 1000 == 0:
            print("User {0}/{1}".format(u, len(adj)))
        for v in adj[u]:
            if v < u:
                continue
            mutual_set = rate_set[u].intersection(rate_set[v])
            vector_u, vector_v = np.zeros(len(mutual_set)), np.zeros(
                len(mutual_set))
            for i, ele in enumerate(mutual_set):
                # print(u, v, ele)
                vector_u[i] = ratings[(u, ele)]
                vector_v[i] = ratings[(v, ele)]
            weight[u][v] = cosine(vector_u, vector_v)
            weight[v][u] = weight[u][v]
            if u == 1788 and v == 6897 or u == 1162 and v == 37593:
                print(u, v, vector_u, vector_v)
            # if len(mutual_set) > 10 and random.random() < 0.0001:
            #     print(u, v, idx_to_user[u], idx_to_user[v], mutual_set)
    print(adj[1788][6897])
    print(weight[6897][1788])
    #save weight matrix to file, to save computation time (~8mins using NumPy cosine, ~20mins using scipy cosine)
    with open("weight.txt", "w+") as f:
        for i in range(len(adj)):
            print("User {0}/ {1}".format(i, len(adj)))
            for v in adj[i]:
                if v < i:
                    f.writelines(
                        str(i) + " " + str(v) + " " + str(weight[i][v]) + "\n")
        f.close()
    return weight
Example #13
def compute_scores_and_similarities(dataset):
    score_list = []
    similarity_list = []
    for i in tqdm(range(len(dataset))):
        query_pair = dataset[i]
        question1_vec = (query_pair["id_1_title_vec"] +
                         query_pair["id_1_body_vec"]) / 2.0
        question2_vec = (query_pair["id_2_title_vec"] +
                         query_pair["id_2_body_vec"]) / 2.0
        score = (1 - cosine(question1_vec.numpy(), question2_vec.numpy()))[0]
        similarity = query_pair["similarity"]
        score_list.append(score)
        similarity_list.append(similarity)
    return score_list, similarity_list
Example #14
def preprocess_SR(source_word, substitution_selection, fasttext_dico, fasttext_emb, word_count):
    ss = []
    # ss_score=[]
    sis_scores=[]
    count_scores=[]
    # source_count = 10
    # if source_word in word_count:
    #     source_count = word_count[source_word]

    isFast = True

    if (source_word not in fasttext_dico):
        isFast = False
    else:
        source_emb = fasttext_emb[fasttext_dico.index(source_word)].reshape(1,-1)

    if isFast == False and source_word.lower() in fasttext_dico:
        isFast = True
        source_emb = fasttext_emb[fasttext_dico.index(source_word.lower())].reshape(1,-1)

    # ss.append(source_word)

    for sub in substitution_selection:

        if sub.lower() not in word_count:
            continue
        else:
            sub_count = word_count[sub.lower()]

        # if sub_count<source_count:
        #     continue
        if isFast:
            if sub not in fasttext_dico:
                if sub.lower() not in fasttext_dico:
                    continue
                else:
                    sub_emb = fasttext_emb[fasttext_dico.index(sub.lower())].reshape(1, -1)
            else:
                sub_emb = fasttext_emb[fasttext_dico.index(sub)].reshape(1, -1)

            sis = cosine(source_emb, sub_emb)[0][0]

            # if sis<0.35:
            #    continue
            sis_scores.append(sis)

        ss.append(sub)
        count_scores.append(sub_count)

    return ss, sis_scores, count_scores
Example #15
def neighbors_predict(instance, collection, my_labels, wmd_or_tfidf):

    # compute WMD or cosine similarity between the new (never seen) instance and each instance in the collection

    if wmd_or_tfidf == 'wmd':
        sims = []
        for doc in collection:
            # ! wmdistance works on lists of strings (idx 1 in the tuples)
            sims.append(
                my_model.wmdistance(' '.join(instance[1]).lower().split(),
                                    ' '.join(doc[1]).lower().split()))
            # get indexes of elements sorted by INCREASING order (!distance)
            sorted_idx = sorted(range(len(sims)), key=lambda x: sims[x])

    elif wmd_or_tfidf == 'tfidf':
        # ! tfidf_vectorizer works on raw text (idx 0 in the tuples)
        doc_term_matrix = tfidf_vectorizer.fit_transform(
            [elt[0] for elt in collection])

        # note that we just transform
        # fitting has been done on the collection
        instance_vector = tfidf_vectorizer.transform([instance[0]])

        # computes cosine similarity between new instance and all elements in the collection
        sims = cosine(doc_term_matrix, Y=instance_vector,
                      dense_output=True).tolist()

        sims = [elt[0] for elt in sims]

        # get indexes of elements sorted by DECREASING order
        sorted_idx = sorted(range(len(sims)),
                            key=lambda x: sims[x],
                            reverse=True)

    predictions = []

    # we use odd numbers to break ties
    for k_nn in [3, 7, 11, 17]:
        # get labels of k_nn nearest neighbors
        nn_labels = [my_labels[i] for i in sorted_idx][:k_nn]

        # get most represented label
        counts = dict(Counter(nn_labels))
        max_counts = max(counts.values())
        prediction = [k for k, v in counts.items() if v == max_counts][0]

        predictions.append(prediction)

    return predictions
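The snippet relies on module-level tfidf_vectorizer and my_model objects; a hypothetical setup consistent with the calls above (wmdistance needs word embeddings, e.g. a gensim KeyedVectors model):

from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.downloader as api

tfidf_vectorizer = TfidfVectorizer()
# hypothetical choice of embeddings; any gensim KeyedVectors with wmdistance support would do
my_model = api.load("word2vec-google-news-300")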
Example #16
def pre_SR(source_word, substitution_selection, fasttext_dico, fasttext_emb,
           word_count):
    ss = []
    ##ss_score=[]
    sis_scores = []
    count_scores = []
    can = {}
    score = []
    isFast = True

    if (source_word not in fasttext_dico):
        isFast = False
    else:
        source_emb = fasttext_emb[fasttext_dico.index(source_word)].reshape(
            1, -1)

    #ss.append(source_word)

    for sub in substitution_selection:

        if sub not in word_count:
            continue
        else:
            sub_count = word_count[sub]

        if (sub_count <= 3):
            continue

        #if sub_count<source_count:
        #   continue
        if isFast:
            if sub not in fasttext_dico:
                continue

            token_index_fast = fasttext_dico.index(sub)
            sis = cosine(source_emb,
                         fasttext_emb[token_index_fast].reshape(1, -1))
            sis1 = sis[0, 0]
            score.append(sis1)
            #if sis<0.35:
            #    continue
            sis_scores.append(sis)

        ss.append(sub)
        count_scores.append(sub_count)
    can = dict(zip(ss, score))
    can = sorted(can.items(), key=lambda d: d[1], reverse=True)
    return can
Example #17
def test_auc_step(nn_model, batch):
    title1 = batch[ID1_TITLE_VEC]
    body1 = batch[ID1_BODY_VEC]

    title2 = batch[ID2_TITLE_VEC]
    body2 = batch[ID2_BODY_VEC]

    question1_vec = nn_model.evaluate(title1, body1).data.cpu().numpy()[:, :, 0]
    question2_vec = nn_model.evaluate(title2, body2).data.cpu().numpy()[:, :, 0]

    assert question1_vec.shape == question2_vec.shape
    scores = 1 - cosine(question1_vec, question2_vec)
    similarities = batch[SIMILARITY].cpu().numpy().flatten()
    return torch.FloatTensor(scores), torch.LongTensor(similarities)
Example #18
def evaluate_tfidf(data, tfidf_vectors, query_to_index, eval_func):
    rrs = []
    for entry_id, eval_query_result in data.items():
        similar_ids = eval_query_result.similar_ids
        candidate_ids = eval_query_result.candidate_ids

        entry_encoding = tfidf_vectors[query_to_index[entry_id]]
        candidate_similarities = []
        for candidate_id in candidate_ids:
            candidate_encoding = tfidf_vectors[query_to_index[candidate_id]]
            similarity = cosine(entry_encoding, candidate_encoding)
            candidate_similarities.append((candidate_id, similarity))
        ranked_candidates = sorted(candidate_similarities, key=lambda x: x[1], reverse=True)
        ranked_candidates = [x[0] for x in ranked_candidates]
        rrs.append(eval_func(similar_ids, ranked_candidates))
    return np.mean(rrs)
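The rrs list and the final mean suggest eval_func computes a reciprocal rank (making the return value an MRR); a hypothetical eval_func under that reading:

def reciprocal_rank(similar_ids, ranked_candidates):
    # hypothetical: 1 / rank of the first relevant candidate, 0.0 if none is retrieved
    relevant = set(similar_ids)
    for rank, candidate_id in enumerate(ranked_candidates, start=1):
        if candidate_id in relevant:
            return 1.0 / rank
    return 0.0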
Example #19
def preprocess_SR(source_word, substitution_selection, fasttext_dico,
                  fasttext_emb, word_count):
    ss = []
    ##ss_score=[]
    sis_scores = []
    count_scores = []
    source_count = 10
    if source_word in word_count:
        source_count = word_count[source_word]

    isFast = True

    if (source_word not in fasttext_dico):
        isFast = False
    else:
        source_emb = fasttext_emb[fasttext_dico.index(source_word)].reshape(
            1, -1)

    #ss.append(source_word)

    for sub in substitution_selection:

        if sub not in word_count:
            continue
        else:
            sub_count = word_count[sub]

        #if sub_count<source_count:
        #   continue
        if isFast:
            if sub not in fasttext_dico:
                continue

            token_index_fast = fasttext_dico.index(sub)
            sis = cosine(source_emb,
                         fasttext_emb[token_index_fast].reshape(1, -1))

            #if sis<0.35:
            #    continue
            sis_scores.append(sis)

        ss.append(sub)
        count_scores.append(sub_count)

    return ss, sis_scores, count_scores
Example #20
def evaluate_tfidf_auc(data, tfidf_vectors, query_to_index):
    auc = AUCMeter()
    for entry_id, eval_query_result in data.items():
        similar_ids = eval_query_result.similar_ids
        positives = set(similar_ids)
        candidate_ids = eval_query_result.candidate_ids

        entry_encoding = tfidf_vectors[query_to_index[entry_id]]
        candidate_similarities = []
        targets = []
        for candidate_id in candidate_ids:
            candidate_encoding = tfidf_vectors[query_to_index[candidate_id]]
            similarity = cosine(entry_encoding, candidate_encoding)
            candidate_similarities.append(similarity.item(0))
            targets.append(IS_SIMMILAR_LABEL if candidate_id in positives else NOT_SIMMILAR_LABEL)

        similarities = torch.Tensor(candidate_similarities)
        auc.add(similarities, torch.Tensor(targets))
    return auc.value(MAXIMUM_FALSE_POSITIVE_RATIO)
Example #21
def predict_similarity(image1, image2):
    '''
	This function calculates the similarity of two images
	the model used is pretrained vgg on imagenet dataset
	we eliminate the last layers and take the output from
	last convolution layer then flattens it to calculate 
	similarity
	'''
    pretrained_model = VGG16(include_top=False,
                             weights="imagenet",
                             input_shape=(224, 224, 3))
    data1 = np.array(Image.open(image1).convert('RGB').resize((224, 224)))
    data2 = np.array(Image.open(image2).convert('RGB').resize((224, 224)))
    data1 = (data1.reshape(-1, 224, 224, 3) / 255).astype(np.float32)
    data2 = (data2.reshape(-1, 224, 224, 3) / 255).astype(np.float32)
    pred1 = pretrained_model.predict(data1).reshape(1, -1)
    pred2 = pretrained_model.predict(data2).reshape(1, -1)
    sim = cosine(pred1, pred2)
    print("similarity between images {} ".format(round(sim.item(0), 2)))
    return round(sim.item(0), 2)
Example #22
    def get_similarity(self, feat, stat, cls):
        max_id = -1
        max_cos = -1
        if stat:
            nID = self.id_count
        else:
            nID = self.id_count

        a = feat[None, :]
        b = self.embedding_bank[:nID, :]
        if len(b) > 0:
            alive = np.array(self.alive, dtype=int) - 1
            cosim = cosine(a, b)
            cosim = np.reshape(cosim, newshape=(-1))
            cosim[alive] = -2
            cosim[nID - 1] = -2
            cosim[np.where(self.cat_bank[:nID] != cls)[0]] = -2
            max_id = int(np.argmax(cosim) + 1)
            max_cos = np.max(cosim)
        return max_id, max_cos
Example #23
def raw_score_substitutions(source_word, substitution_selection, wv_dict,
                            wv_emb, word_count):
    """
        Scoring substitutions according to cosine similarity of word vectors to the replaced word's one
        and according to the counts
    """

    filtered_substitutions = []
    cosine_distance_scores = []
    count_scores = []

    is_fast = True
    source_emb = None

    # computing source word's word vector
    if source_word not in wv_dict:
        is_fast = False
        print("NOT FAST!")
    else:
        source_emb = wv_emb[wv_dict.index(source_word)].reshape(1, -1)

    for sub in substitution_selection:

        # skipping substitution not in word stats dictionary
        if sub in word_count:
            sub_count = word_count[sub]

            if is_fast:
                if sub not in wv_dict:
                    continue

                # computing substitution's word vector's distance to the source word's
                sub_embedding = wv_emb[wv_dict.index(sub)].reshape(1, -1)
                cosine_distance_scores.append(cosine(source_emb,
                                                     sub_embedding))

            filtered_substitutions.append(sub)
            count_scores.append(sub_count)

    return filtered_substitutions, cosine_distance_scores, count_scores
Example #24
def compute_baselines_part2():
    android_database = AndroidDatabase(use_count_vectorizer=True)
    validation_set = android_database.get_validation_dataset()
    testing_set = android_database.get_testing_dataset()

    metrics_list = []

    for dataset in (validation_set, testing_set):
        meter = metrics.AUCMeter()
        for query_pair in tqdm(dataset):
            question1_vec = (query_pair["id_1_title_vec"] +
                             query_pair["id_1_body_vec"]) / 2.0
            question2_vec = (query_pair["id_2_title_vec"] +
                             query_pair["id_2_body_vec"]) / 2.0
            score = (1 -
                     cosine(question1_vec.numpy(), question2_vec.numpy()))[0]
            similarity = query_pair["similarity"]

            meter.add(torch.FloatTensor([score]),
                      torch.LongTensor([similarity]))
        metrics_list.append(meter.value(0.05))

    return {"validation": metrics_list[0], "testing": metrics_list[1]}
Example #25
def test_step(nn_model, batch):
    questions_title_batch = batch[TITLE_VEC]
    questions_body_batch = batch[BODY_VEC]

    candidate_questions_title_batch = batch[CAND_TITLE_VECS]
    candidate_questions_body_batch = batch[CAND_BODY_VECS]

    similarity_vector_batch = batch[SIMILARITY_VEC].numpy()

    question_vector_batch = nn_model.evaluate(
        questions_title_batch, questions_body_batch).data.cpu().numpy()
    candidate_vector_batch = evaluate_multi_questions(
        nn_model, candidate_questions_title_batch,
        candidate_questions_body_batch).data.cpu().numpy()

    candidate_questions_vec = candidate_vector_batch[0]
    similarity_vector = similarity_vector_batch[0]
    question_vec = question_vector_batch[0].repeat(
        len(candidate_questions_vec[0]), axis=1).swapaxes(1, 0)

    cosines = 1 - cosine(question_vec, candidate_questions_vec.swapaxes(1, 0))
    return cosines, similarity_vector
Example #26
    def __getSumVal(self, query, answer):
        # sum the word vectors of the query and the answer, then compare the two sums
        terms1 = self.nlp(query)
        terms2 = self.nlp(answer)
        vector1 = []
        vector2 = []
        for index in range(300):
            vector1.append(0)
            vector2.append(0)
        for term in terms1:
            if term.has_vector:
                vector1 = [
                    term.vector[i] + vector1[i]
                    for i in range(len(term.vector))
                ]
        for term in terms2:
            if term.has_vector:
                vector2 = [
                    term.vector[i] + vector2[i]
                    for i in range(len(term.vector))
                ]
        vector1 = array(vector1).reshape(1, -1)
        vector2 = array(vector2).reshape(1, -1)
        # cosine similarity between the two summed vectors
        return cosine(vector1, vector2)[0][0]
Example #27
def get_excel_graph_model(datafeatures, opts):
    # datafeatures = np.loadtxt(opts['output']+'features_for_{}.txt'.format(opts['tradeday']), delimiter = ',')
    stock_num = np.shape(datafeatures)[0]
    with open(
            opts['output'] +
            'index_stocks_for_{}.txt'.format(opts['tradeday']), 'r') as f:
        stock_index = f.readlines()
    stock_name = []
    for i in range(len(stock_index)):
        find_code = False
        sl = re.split(r'[\/\\]+', stock_index[i])
        for j in range(len(sl)):
            if re.match(r'\d{6}', sl[j]):
                stock_name.append(sl[j])
                find_code = True
        assert find_code
    assert len(stock_name) == stock_num

    cos_similarity = cosine(datafeatures)
    # np.savetxt('./output/cosine_similarity_for_{}.txt'.format(opts['tradeday']), cos_similarity, delimiter=',')
    feature_ungraph = pd.DataFrame(columns=['vertex1', 'vertex2', 'weights'])
    vertex1 = []
    vertex2 = []
    weights = []
    for i in list(range(stock_num))[0:-1]:
        for j in list(range(stock_num))[i + 1:]:
            vertex1.append(stock_name[i])
            vertex2.append(stock_name[j])
            weights.append(cos_similarity[i, j])
    feature_ungraph['vertex1'] = vertex1
    feature_ungraph['vertex2'] = vertex2
    feature_ungraph['weights'] = weights
    feature_ungraph.to_excel(
        opts['output'] + 'feature_graph_for_{}.xls'.format(opts['tradeday']),
        sheet_name='sheet1')

    return feature_ungraph
Example #28
    def generate_clusters(self, to_cluster):

        instruments = list(to_cluster.keys())
        similarity_measures = dict(
        )  # key is 2 instruments, value is cosine similarity
        for i in range(0, len(instruments) - 1):
            for j in range(i + 1, len(instruments)):
                similarity_measures[tuple([
                    instruments[i], instruments[j]
                ])] = cosine(to_cluster[instruments[i]],
                             to_cluster[instruments[j]])[0][0]
        G = nx.Graph()
        G.add_nodes_from(instruments)

        for edge in similarity_measures.keys():
            edge_tuple = list(edge)
            # edge_tuple.append({'weight' : similarity_measures[edge]})
            G.add_edge(edge_tuple[0],
                       edge_tuple[1],
                       weight=similarity_measures[edge])

        print(G)
        communities = community.best_partition(G)
        return communities
Example #29
def my_cos_similarity(word1, word2):
    sim = cosine(Wt[vocab[word1], ].reshape(1, -1),
                 Wt[vocab[word2], ].reshape(1, -1))
    return round(float(sim), 4)
Example #30
import pandas as pd
import numpy as np
from collections import defaultdict

data = defaultdict(lambda: defaultdict(int))

dd = [2, 32, 53, 64]

for a in dd[1:3]:
    print(a)

user1 = np.array([2, 3, 3, 4, 4, 3, 5, 0, 0, 0, 0, 0, 0])
user2 = np.array([0, 0, 0, 0, 0, 0, 2, 3, 3, 4, 4, 3, 5])

#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity as cosine
print(cosine(user1.reshape(1, -1), user2.reshape(1, -1)))

#correlation
#print(np.corrcoef(user1, user2))

user1 = np.array([2, 3, 3, 4, 4, 3, 5, 0, 0, 0, 0, 0, 0])
user2 = np.array([5, 4, 5, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
#cosine similarity
from sklearn.metrics.pairwise import cosine_similarity as cosine
print(cosine(user1.reshape(1, -1), user2.reshape(1, -1)))
#correlation
#print(np.corrcoef(user1, user2))

user1 = np.array([
    2, 3, 3, 4, 4, 3, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0
def cosine_similarity(s_1, s_2):
    # reshape the 1-D sentence vectors to (1, d) rows, then extract the scalar similarity
    s_1 = np.reshape(s_1, (1, -1))
    s_2 = np.reshape(s_2, (1, -1))
    return round(float(cosine(s_1, s_2)[0][0]), 5)
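A hypothetical call with two dense sentence vectors:

v1 = np.random.rand(300)
v2 = np.random.rand(300)
print(cosine_similarity(v1, v2))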