def create_coherency_features(sentences_index, ref_doc, input_query, model, key):
    query = input_query + key
    ref_doc_sentences = sentences_index[query][ref_doc]
    for top_doc in sentences_index[query]:
        if top_doc==ref_doc:
            continue
        top_doc_sentences = sentences_index[query][top_doc]
        for i,top_doc_sentence in enumerate(top_doc_sentences,start=1):
            sentence_vec = get_sentence_vector(top_doc_sentence,model)
            for j,ref_sentence in enumerate(ref_doc_sentences):
                row={}
                comb = top_doc+"_"+str(i)+"_"+str(j+1)
                window = []
                if j == 0:
                    # no previous sentence: reuse the next sentence for both window slots
                    # (assumes the reference document has at least two sentences)
                    window.append(get_sentence_vector(ref_doc_sentences[1], model))
                    window.append(get_sentence_vector(ref_doc_sentences[1], model))

                elif j+1 == len(ref_doc_sentences):
                    window.append(get_sentence_vector(ref_doc_sentences[j - 1], model))
                    window.append(get_sentence_vector(ref_doc_sentences[j - 1], model))
                else:
                    window.append(get_sentence_vector(ref_doc_sentences[j - 1], model))
                    window.append(get_sentence_vector(ref_doc_sentences[j + 1], model))
                ref_vector = get_sentence_vector(ref_sentence, model)
                row["docSimilarityToPrev"] = cosine_similarity(sentence_vec, window[0])
                row["docSimilarityToRefSentence"] = cosine_similarity(ref_vector, sentence_vec)
                row["docSimilarityToPred"] = cosine_similarity(sentence_vec, window[1])
                row["docSimilarityToPrevRef"] = cosine_similarity(ref_vector, window[0])
                row["docSimilarityToPredRef"] = cosine_similarity(ref_vector, window[1])
                write_files(row,query,comb)
def feature_values(centroid,s_in,s_out,past_winner_centroid):
    result={}
    result["docCosineToCentroidInVec"]= cosine_similarity(centroid,s_in)
    result["docCosineToCentroidOutVec"]= cosine_similarity(centroid,s_out)
    result["docCosineToWinnerCentroidInVec"]=cosine_similarity(past_winner_centroid,s_in)
    result["docCosineToWinnerCentroidOutVec"]=cosine_similarity(past_winner_centroid,s_out)

    return result
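Note: the snippets in this listing rely on various cosine_similarity helpers (project utilities, scikit-learn's pairwise version, etc.) whose definitions are mostly not shown. For the plain 1-D vector case, a minimal NumPy sketch (an assumption, not the implementation used by any particular example) could look like this:

import numpy as np

def cosine_similarity(a, b):
    # cosine of the angle between two 1-D vectors; 0.0 if either vector is all zeros
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0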
Example No. 3
def get_memory_based_user_recommendation():
    request_data = json.loads(request.data)["user"]

    # The User-Location entity is not implemented,
    # so we use a CSV file for this API.
    header = ['user_id', 'location_id', 'frequency']
    user_location_table = pd.read_csv('./data/preprocessed_data2.csv',
                                      sep='\t',
                                      names=header)
    n_users = user_location_table.user_id.unique().shape[0]
    n_locations = user_location_table.location_id.unique().shape[0]
    user_location_table = user_location_table.to_numpy()
    user_location_frequency_matrix = np.zeros((n_users, n_locations))
    # User x Location matrix
    # user_location_frequency_matrix could be replaced by a user-location table JOIN.
    for checkin in user_location_table:
        user_location_frequency_matrix[checkin[0], checkin[1]] = checkin[2]

    # User x User matrix
    user_similarity = cosine_similarity(user_location_frequency_matrix)
    selected_user_similarity = user_similarity[request_data]

    # return top-10 similarity user's id
    # Ref : https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array
    top10_id = selected_user_similarity.argsort()[-11:][::-1]  # indices of the 11 largest similarities
    top10_id = top10_id[1:11]  # drop the first entry (the user itself)
    top10_sim = selected_user_similarity[top10_id]
    top10 = {'user_ids': top10_id.tolist(), 'user_sims': top10_sim.tolist()}
    return jsonify({'success': True, 'top10': top10}), HTTPStatus.OK
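For orientation, a toy run of the same idea on hand-made data, assuming scikit-learn's pairwise cosine_similarity (which the snippet above appears to apply to the whole user x location matrix):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# 3 users x 4 locations, check-in frequencies
freq = np.array([[5, 0, 1, 0],
                 [4, 0, 2, 0],
                 [0, 3, 0, 7]])
user_similarity = cosine_similarity(freq)             # 3 x 3 user-user matrix
neighbours = user_similarity[0].argsort()[::-1][1:]   # users most similar to user 0, excluding user 0
print(neighbours, user_similarity[0][neighbours])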
Example No. 4
    def __init__(self, name, dimensions=128, alpha=0.1, beta=1, times=1, clusters=5, decay=0.1, order=3, max_iteration=300):
        print("Model initialization started.\n")
        self.input = u"../data/{}.csv".format(name)
        self.name = name
        self.lambd = math.pow(10, 8)
        self.dimensions = dimensions
        self.alpha = alpha
        self.beta = beta
        self.times = times
        self.clusters = clusters
        self.order = order
        self.decay = decay
        self.max_iteration = max_iteration
        self.converge_threshold = math.pow(10, -3)
        self.lower_control = math.pow(10, -8)

        self.G = graph_reader(self.input)
        self.number_of_nodes = len(nx.nodes(self.G))

        self.S = cosine_similarity(self.G)  # cosine similarity
        self.Adj = np.array(nx.adjacency_matrix(self.G).todense())  # adjacency Matrix
        self.P = self.high_order_proximity()  # high-order proximity

        self.current_loss = math.pow(10, 10)
        self.round = 0
        self.V = self.matrix_random_initialization(self.number_of_nodes, self.dimensions)
        self.U = self.matrix_random_initialization(self.number_of_nodes, self.dimensions)
        self.H = self.matrix_random_initialization(self.number_of_nodes, self.clusters)
        self.W = self.matrix_random_initialization(self.clusters, self.dimensions)
    def predict(self, Xtest):
        # Compute distances (or similarities) between the training and test points
        N, D = self.X.shape
        T, D = Xtest.shape

        y_pred = np.zeros((T, self.y.shape[1]))
        if self.method == "L2":
            distance = utils.euclidean_dist_squared(self.X, Xtest)
        elif self.method == "cosine":
            distance =  utils.cosine_similarity(self.X, Xtest)
            #print(distance.shape)
        elif self.method == "pearson":
            distance = utils.pearson_corr(self.X, Xtest)
            #print(distance.shape)
        for t in range(T):
            sorted_distance_k =  np.argsort(distance[:, t])[:self.k]
            #print(sorted_distance_k)
            for l in range(self.labels):
                #calculate the conditional probability that P(y_j = 1|x)
                p = (1/self.k)*np.sum(self.y[:,l][sorted_distance_k])
                #print(p)
                if p>0.5:
                    y_pred[t,l] = 1
                else:
                    y_pred[t, l] = 0
            # y_pred[t] = utils.mode(self.y[sorted_distance_k])
        	
        return y_pred
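One detail worth checking in the predictor above: np.argsort picks the smallest values, which is correct when the matrix holds L2 distances but would pick the least similar training points if it holds raw cosine or Pearson similarities (unless the utils functions already convert those to distances). A tiny self-contained illustration:

import numpy as np

similarity = np.array([0.9, 0.1, 0.7, 0.4])  # similarity of 4 training rows to one test point
k = 2
print(np.argsort(similarity)[:k])    # [1 3] -- nearest only if these were distances
print(np.argsort(-similarity)[:k])   # [0 2] -- the k most similar rows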
Example No. 6
def sentiment_similarity_trg(train_line,
                             trg_idxs,
                             word_vectors,
                             sentiment_vectors,
                             method='cosine',
                             lower=True):
    '''
    Returns a sentiment sequence consisting of the similarity of each word
    to the pretrained sentiment vectors.

    TODO: it is possible to add a mask for targets here (exclude them and
    replace by '2.0') so we know where the sentiment sits relative to the target.
    '''
    seq = []
    words = train_line.strip().split(' ')
    for w in words:
        if lower:
            w = w.lower()
        if w in word_vectors:
            #print("found word", w)
            sim_vec = np.array([
                cosine_similarity(sentiment_vectors[label], word_vectors[w])
                for label in sentiment_vectors
            ])
            #print(sim_vec)
            seq.append(sim_vec)
        else:
            seq.append(np.array([0.0, 0.0, 0.0]))

    return seq
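A hedged usage sketch with toy two-dimensional vectors and invented sentiment labels (the real pretrained vectors are not part of the snippet; it also assumes the 1-D cosine_similarity helper sketched near the top of this listing):

import numpy as np

word_vectors = {"good": np.array([0.9, 0.1]), "bad": np.array([0.1, 0.9])}
sentiment_vectors = {"pos": np.array([1.0, 0.0]),
                     "neu": np.array([0.7, 0.7]),
                     "neg": np.array([0.0, 1.0])}

seq = sentiment_similarity_trg("good but not bad", trg_idxs=None,
                               word_vectors=word_vectors,
                               sentiment_vectors=sentiment_vectors)
print(len(seq), seq[0])  # one 3-vector per token; zeros for OOV words ("but", "not")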
Example No. 7
 def _find_similar(self, roll_no, x, threshold, verbose):
     if roll_no != x:
         cs = cosine_similarity(self.df.loc[roll_no], self.df.loc[x])
         if cs < threshold:
             if verbose:
                 print(f"{roll_no} and {x} have a cosine similarity of {cs}")
             return True
Example No. 8
    def softmax_embedding_loss(self, x, y, proxies):
        idx = torch.from_numpy(np.arange(len(x), dtype=np.int64)).cuda()
        diff_iZ = cosine_similarity(x, proxies)

        numerator_ip = torch.exp(diff_iZ[idx, y] / self.temperature)
        denominator_ip = torch.exp(diff_iZ / self.temperature).sum(1) + 1e-8
        return -torch.log(numerator_ip / denominator_ip)
Example No. 9
def similarity_correlation(pairs, similarity_scores, embeddings, word2index):
    """
    Returns the Pearson and Spearman correlation coefficients between human
    judgements and the ranking defined by cosine similarity of the embeddings

    pairs: list of pairs of words
    similarity_scores: the corresponding human similarity judgement scores of pairs
    embeddings: the embedding matrix
    word2index: a dictionary mapping words to (row) indices of the embedding matrix
    """

    scores = []
    not_in_embedds_count = 0
    cosine_sims = []
    for idx, pair in enumerate(pairs):
        w1, w2 = pair

        try:
            cosine_sim = cosine_similarity(embeddings[word2index[w1]],
                                           embeddings[word2index[w2]])
            cosine_sims.append(cosine_sim)
            # keep the human score only for pairs that are in the vocabulary;
            # popping a copy by the running index would shift positions after the first miss
            scores.append(similarity_scores[idx])
        except KeyError:
            not_in_embedds_count += 1

    # correlations
    pearson_r, p_pearson = pearsonr(cosine_sims, scores)
    spearman_r, p_spearman = spearmanr(cosine_sims, scores)

    return pearson_r, spearman_r, p_pearson, p_spearman, not_in_embedds_count
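A toy invocation showing the expected shapes (random embeddings and invented word pairs; assumes scipy's pearsonr/spearmanr and a vector-level cosine_similarity are already in scope, as in the function above):

import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(4, 8))
word2index = {"cat": 0, "dog": 1, "car": 2, "truck": 3}
pairs = [("cat", "dog"), ("car", "truck"), ("cat", "plane")]  # "plane" has no embedding
human_scores = [8.5, 7.9, 1.2]

pearson_r, spearman_r, p_p, p_s, oov = similarity_correlation(
    pairs, human_scores, embeddings, word2index)
print(oov)  # 1 -- the ("cat", "plane") pair was skipped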
Example No. 10
    def subgraph_to_leaf_vector(self, pred_subgraph_vector, strategy, redundancy_removal=False):

        weights = self.get_node_weights(redundancy_removal=redundancy_removal)
        leaf_node_vec = np.zeros(shape=[self.num_leafs])
        subgraph_mat = np.zeros(shape=[self.num_leafs, len(weights)])

        for subgraph in self.subgraphs.values():
            subgraph_idx = self.get_subgraph_vector_index(subgraph["wd_id"], redundancy_removal=redundancy_removal)
            subgraph_vector = self.get_subgraph_vector(subgraph["wd_id"], redundancy_removal=redundancy_removal)

            leaf_node_vec[subgraph["class_idx"]] = pred_subgraph_vector[subgraph_idx]
            subgraph_mat[subgraph["class_idx"], :] = subgraph_vector

        if strategy == "leafprob":
            return leaf_node_vec

        if "cossim" in strategy:
            cos_sim = cosine_similarity(gt=subgraph_mat, prediction=pred_subgraph_vector, weights=weights)

            if strategy == "cossim":
                return cos_sim
            elif strategy == "leafprob*cossim":
                return leaf_node_vec * cos_sim

        logging.error("Unknown ont2cls strategy. Exiting!")
        return None
Example No. 11
File: nlp.py Project: ertosns/aipy
def get_country(city1, country1, city2, embeddings):
    # store the city1, country 1, and city 2 in a set called group
    group = set((city1, country1, city2))
    # get embeddings of city 1
    city1_emb = embeddings[city1]
    # get embedding of country 1
    country1_emb = embeddings[country1]
    # get embedding of city 2
    city2_emb = embeddings[city2]
    # get embedding of country 2 (it's a combination of the embeddings of country 1, city 1 and city 2)
    # Remember: King - Man + Woman = Queen
    vec = city2_emb - city1_emb + country1_emb
    # Initialize the similarity to -1 (it will be replaced by similarities closer to +1)
    similarity = -1
    # initialize country to an empty string
    country = ''
    # loop through all words in the embeddings dictionary
    for word in embeddings.keys():
        # first check that the word is not already in the 'group'
        if word not in group:
            # get the word embedding
            word_emb = embeddings[word]
            # calculate cosine similarity between embedding of country 2 and the word in the embeddings dictionary
            cur_similarity = cosine_similarity(word_emb, vec)
            # if the cosine similarity is more similar than the previously best similarity...
            if cur_similarity > similarity:
                # update the similarity to the new, better similarity
                similarity = cur_similarity
                # store the country as a tuple, which contains the word and the similarity
                country = (word, similarity)
    return country
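A toy run of the analogy with tiny hand-made embeddings (again assuming the 1-D cosine_similarity sketch from the top of this listing):

import numpy as np

embeddings = {
    "Athens": np.array([1.0, 0.0, 0.2]),
    "Greece": np.array([1.0, 1.0, 0.2]),
    "Cairo":  np.array([0.0, 0.0, 1.0]),
    "Egypt":  np.array([0.0, 1.0, 1.0]),
    "banana": np.array([0.3, 0.1, 0.1]),
}
print(get_country("Athens", "Greece", "Cairo", embeddings))  # -> ('Egypt', 1.0) for these vectors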
Example No. 12
    def HeadUpdate(self, ControlState, PrevWeights, Memory, IsWrite=False):
        """
        For one head, takes the control state, previous weight and memory, and outputs
        the new weight, and for write-heads, the erase and add vectors as well.
        """
        KeyVector = tf.tanh(
            Linear(ControlState, self.Params.MemoryDepth, 'KeyVector'))
        KeyStrength = tf.nn.softplus(Linear(ControlState, 1, 'KeyStrength'))
        Gate = tf.sigmoid(Linear(ControlState, 1, 'Gate'))
        ShiftWeights = tf.nn.softmax(
            Linear(ControlState, len(self.Params.ShiftOffsets),
                   'ShiftWeights'))
        Sharpen = tf.nn.softplus(Linear(ControlState, 1, 'Sharpen')) + 1.

        Weights = tf.exp(KeyStrength * cosine_similarity(KeyVector, Memory))
        Weights /= tf.reduce_sum(Weights, 1)
        Weights = Gate * Weights + (1.0 - Gate) * PrevWeights
        Weights = circular_convolution(Weights, ShiftWeights,
                                       self.Params.ShiftOffsets)
        Weights = tf.pow(Weights, Sharpen)
        Weights /= tf.reduce_sum(Weights, 1)

        if IsWrite:
            Erase = tf.sigmoid(
                Linear(ControlState, self.Params.MemoryDepth, 'Erase'))
            Add = tf.tanh(Linear(ControlState, self.Params.MemoryDepth, 'Add'))
            return Weights, Erase, Add
        else:
            return Weights
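For reference, the content-addressing step inside HeadUpdate (focusing attention over memory by key similarity and key strength) can be sketched in plain NumPy; this is an illustration of the formula only, not the project's TensorFlow graph:

import numpy as np

def content_addressing(key, memory, beta):
    # key: (depth,), memory: (slots, depth), beta: scalar key strength
    sims = memory @ key / (np.linalg.norm(memory, axis=1) * np.linalg.norm(key) + 1e-8)
    weights = np.exp(beta * sims)
    return weights / weights.sum()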
Example No. 13
def retrieve(K, query, tfidf_docs, text_docs):
    distances = {}
    for doc in tfidf_docs:
        distances[doc] = cosine_similarity(query, tfidf_docs[doc])
    for k in range(K):
        key = max(distances, key=distances.get)
        print(str((k+1)) + ". " + key + " - " + text_docs[key] + " Score: " + str(distances[key]))
        distances.pop(key)
Example No. 14
    def classify(self, x, proxies):
        idx = torch.from_numpy(np.arange(len(x), dtype=np.int64)).cuda()
        diff_iZ = cosine_similarity(x, proxies)

        numerator_ip = torch.exp(diff_iZ[idx, :] / self.temperature)
        denominator_ip = torch.exp(diff_iZ / self.temperature).sum(1) + 1e-8

        probs = numerator_ip / denominator_ip[:, None]
        return probs
Example No. 15
    def get_similarity(self, data):
        print("Computing Item similarity...")
        similarity = utils.cosine_similarity(data,
                                             alpha=self.alpha,
                                             asym=self.asym,
                                             h=self.h,
                                             dtype=np.float32)

        # ARTIST
        similarity += utils.cosine_similarity(
            self.artists_mat, alpha=0.5, asym=True, h=0,
            dtype=np.float32) * self.artist_w
        # ALBUM
        similarity += utils.cosine_similarity(
            self.albums_mat, alpha=0.5, asym=True, h=0,
            dtype=np.float32) * self.album_w

        similarity = utils.knn(similarity, self.knn)
        return similarity
Example No. 16
def sentence_similarity(target, pred, similarity_model):
    """Calculates the cosine similarity between the sentence embeddings of a target and predicted pair 
        using the embedding model specified"""
    try:
        cosine_sim = utils.cosine_similarity(
            similarity_model.sentence_embedding(target).view(1, -1),
            similarity_model.sentence_embedding(pred).view(1, -1))
        return np.around(cosine_sim.item(), 4)
    except Exception:
        print('similarity rejected sentence: ', pred)
        return 0.01
Example No. 17
 def get_diversity(self, tweet, index, ranking):
     is_in = tweet.id in {id for _, id in ranking}
     count = (len(ranking) - (1 if is_in else 0))
     if count == 0:
         return 0
     v1 = index.get_tf_idf_vector(tweet.id)
     total = 0
     for _, t in ranking:
         total += cosine_similarity(v1, index.get_tf_idf_vector(t))
     return 1 - (
         total / count
     )  # if tweet is in ranking, we take mean over len(ranking)-1, otherwise, over all len(ranking)
Example No. 18
def find_similar_items(fv, threshold_similar=0.6, num_similar_vectors=3):
    if len(_labels) <= 0:
        return []

    results = []
    for k, v in _labels.items():
        score = utils.cosine_similarity(
            fv, np.array(v['featureVectors'], dtype='float32'))
        if score >= threshold_similar:
            results.append((k, score))

    return sorted(results, key=lambda x: x[1],
                  reverse=True)[:num_similar_vectors]
Example No. 19
 def score(self, tweet: Tweet, query, normalize=True):
     tweet_v = []
     query_v = []
     tweet_tf = self.index.get_tf(query, tweet.id)
     query_tf = query.get_tf()
     idfs = self.index.get_idf(query)
     for i in range(len(query_tf)):
         tweet_v.append(tweet_tf[i] * idfs[i])
         query_v.append(query_tf[i] * idfs[i])
     return cosine_similarity(tweet_v,
                              query_v,
                              norm_a=normalize,
                              norm_b=normalize)  # assume get_tf normalizes
def predictSentence(sentence, vectorSentimentValues):
    # build a vector representing the sentence's sentiment weights
    inp = vectorSentimentValues.values.reshape((len(dbFeatures), ))
    G = inp
    P = (G > 0) * G  # keep only the positive (praise) weights
    N = (G < 0) * G  # keep only the negative (criticism) weights

    # print(G, P, N)
    if (np.count_nonzero(G) == 0):
        return "câu chưa biết"  # "unknown sentence"
    elif (np.count_nonzero(P) == 0):
        return "câu chê"  # "negative sentence"
    elif (np.count_nonzero(N) == 0):
        return "câu khen"  # "positive sentence"
    else:
        positiveCos = ut.cosine_similarity(P, G)
        negativeCos = ut.cosine_similarity(N, G)
        if positiveCos > negativeCos:
            return "câu khen"  # "positive sentence"
        elif positiveCos == negativeCos:
            return "câu bình thường"  # "neutral sentence"
        else:
            return "câu chê"  # "negative sentence"
Example No. 21
def get_top_k_suggestions(data, intent_utils, search_postings, k=2):
    query = data["message"]
    query_tokens = utils.lemmatize_text(query.lower())
    query_tokens_set = set(query_tokens)
    stemmed_query_tokens = utils.stem_text(query.lower())
    for token in stemmed_query_tokens:
        if token not in query_tokens_set:
            query_tokens = query_tokens + [token]
    print "lemmatized, stemmed and stripped query tokens: " + json.dumps(query_tokens)
    # remove stop words
    # query_tokens = utils.remove_stop_words(query_tokens, input_type="list")
    results = []

    # trigger
    # the trigger shall control whether to use cosine similarity or just a sum of scores
    trigger = True

    if trigger:
        # initializations
        unique_q_tokens_with_frequencies = dict()
        postings_vocab = search_postings.get_vocabulary()
        postings_word_mapping = search_postings.get_vocabulary(return_type="dict")
        query_vector = [0] * len(postings_vocab)
        doc_set = set()

        # get tf in query
        # and get a doc set
        for q_token in query_tokens:
            freq = unique_q_tokens_with_frequencies.get(q_token, 0)
            unique_q_tokens_with_frequencies[q_token] = freq + 1
            if search_postings.get_token(q_token):
                doc_set = doc_set.union(set(map(lambda x: x["id"], search_postings.get_token(q_token).doc_list)))

        for q_token in query_tokens:
            # for this token, get the idf
            token_obj = search_postings.get_token(q_token)
            if token_obj:
                # compute tf-idf
                idf = token_obj.features["idf"]
                q_tf_idf = unique_q_tokens_with_frequencies[q_token] * idf
                # store in query vector
                query_vector[postings_word_mapping[q_token]] = q_tf_idf

        # compute cosine similarity for each doc
        for doc_id in list(doc_set):
            results.append([doc_id, utils.cosine_similarity(search_postings.doc_term_tf_idf[doc_id], query_vector)])

        # return the top k results
        sorted_results = sorted(results, key=lambda x:x[1], reverse=True)[:k]
        return map(lambda x: x[0], sorted_results)
Example No. 22
def get_test_results(sample_predictions, OntReader):
    node_results = {}

    for sample in sample_predictions.values():
        # get sample ground truth
        gt_class_idx = sample["gt_leaf_class_idx"]
        gt_subgraph_vector = OntReader.get_subgraph_vector(
            sample["gt_leaf_wd_id"])
        gt_subgraph_nodes = OntReader.get_subgraph_nodes(
            sample["gt_leaf_wd_id"])

        # get sample prediction
        pred_leaf_node_vector = sample["leaf_node_vector"]
        pred_subgraph_vector = sample["subgraph_vector"]

        # calculate metrics
        accuracy = top_k_accuracy(gt_class_idx,
                                  pred_leaf_node_vector,
                                  kvals=[1, 3, 5])
        jaccard = jaccard_similarity(gt_subgraph_vector, pred_subgraph_vector)
        cosine = cosine_similarity(gt_subgraph_vector, pred_subgraph_vector)

        # set results for each node in the subgraph of the gt leaf event node
        for node in gt_subgraph_nodes:
            if node["wd_id"] not in node_results:
                node_results[node["wd_id"]] = {
                    "wd_id": node["wd_id"],
                    "wd_label": node["wd_label"],
                    "num_test_images": 0,
                    "metrics": {
                        "accuracy-top1": 0,
                        "accuracy-top3": 0,
                        "accuracy-top5": 0,
                        "jaccard": 0,
                        "cosine": 0,
                    },
                }

            node_results[node["wd_id"]]["num_test_images"] += 1
            node_results[
                node["wd_id"]]["metrics"]["accuracy-top1"] += accuracy[0]
            node_results[
                node["wd_id"]]["metrics"]["accuracy-top3"] += accuracy[1]
            node_results[
                node["wd_id"]]["metrics"]["accuracy-top5"] += accuracy[2]
            node_results[node["wd_id"]]["metrics"]["jaccard"] += jaccard
            node_results[node["wd_id"]]["metrics"]["cosine"] += cosine

    return node_results
def sentiment_similarity(train_line, word_vectors, sentiment_vectors, method='cosine', lower = True):
    '''
    Returns a sentiment sequence consisting of the similarity of each word to the pretrained sentiment vectors
    '''
    seq = []
    words = train_line.strip().split(' ')
    for w in words:
        if lower:
            w = w.lower()
        if w in word_vectors:
            sim_vec = np.array([cosine_similarity(sentiment_vectors[label], word_vectors[w]) for label in sentiment_vectors])
            seq.append(sim_vec)
        else:
            seq.append(np.array([0.0, 0.0, 0.0]))
    return seq
Example No. 24
def compute_men_spearman(dm_dict, annotation_file):
    pairs, humans = readMEN(annotation_file)
    system_actual = []
    human_actual = []
    count = 0
    for i in range(len(pairs)):
        human = humans[i]
        a, b = pairs[i]
        if a in dm_dict and b in dm_dict:
            cos = utils.cosine_similarity(dm_dict[a], dm_dict[b])
            system_actual.append(cos)
            human_actual.append(human)
            count += 1
    sp = spearman(human_actual, system_actual)
    return sp, count
Example No. 25
    def _init_similarity(self, item, another_item):
        """
        Description
            A function which computes and returns a similarity
            between a pair of items.

        Arguments
            :param item: The first item.
            :type item: int
            :param another_item: The second item.
            :type another_item: int
        """
        return cosine_similarity(
            self.intersections_between(item, another_item),
            self.l1_norm_of(item), self.l1_norm_of(another_item))
Example No. 26
def compile_similarity_lists(dm_dict, annotation_file):
    pairs, humans = readMEN(annotation_file)
    system_actual = []
    human_actual = []
    eval_pairs = []

    for i in range(len(pairs)):
        human = humans[i]
        a, b = pairs[i]
        if a in dm_dict and b in dm_dict:
            cos = utils.cosine_similarity(dm_dict[a], dm_dict[b])
            system_actual.append(cos)
            human_actual.append(human)
            eval_pairs.append(pairs[i])

    return eval_pairs, human_actual, system_actual
 def push(self, tweet: Tweet):
     tid = tweet.id
     vector = self.scorer.get_tweet_vector(tweet)
     self.vectors[tid] = vector
     score = self.scorer.score(tweet, self.query)
     self.scores[tid] = score
     total = 0
     div = {}
     for t, tdiv in self.div.items():
         d = cosine_similarity(vector, self.vectors[t])
         div[t] = d
         tdiv[tid] = d
         total += d
     self.div[tid] = div
     self.total_div[tid] = total
     heappush(self.ranking, (score, tid))
def runScript(query_dist, dm_dict, pears_ids):
    best_pears = []

    #############################################################
    #Calculate score for each pear in relation to the user query
    #############################################################

    if len(query_dist) > 0:
        pears_scores = {}
        for pear_name, v in pears_ids.items():
            scoreSIM = 0.0  #Initialise score for similarity
            score = cosine_similarity(np.array(v), query_dist)
            if not isnan(score):
                pears_scores[pear_name] = score
                print pear_name, score
        best_pears = outputBestPears(pears_scores)
    return best_pears
Example No. 29
 def test_cosine_similarity(self):
     sim = cosine_similarity(0, 5, 5)
     sim1 = cosine_similarity(1, 5, 5)
     sim2 = cosine_similarity(2, 5, 5)
     sim3 = cosine_similarity(3, 5, 5)
     sim4 = cosine_similarity(4, 5, 5)
     sim5 = cosine_similarity(5, 5, 5)
     self.assertAlmostEqual(sim, 0, delta=0.01)
     self.assertAlmostEqual(sim1, 0.2, delta=0.01)
     self.assertAlmostEqual(sim2, 0.4, delta=0.01)
     self.assertAlmostEqual(sim3, 0.6, delta=0.01)
     self.assertAlmostEqual(sim4, 0.8, delta=0.01)
     self.assertAlmostEqual(sim5, 1.0, delta=0.01)
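The assertions above imply a three-argument variant, cosine_similarity(intersections, l1_norm_a, l1_norm_b), i.e. the cosine of two binary vectors, |A ∩ B| / sqrt(|A| * |B|), which also matches the item-similarity call in Example No. 25. A minimal sketch consistent with those tests (an assumption, since the tested implementation isn't shown):

import math

def cosine_similarity(intersections, l1_norm_a, l1_norm_b):
    # cosine similarity of two binary vectors: |A ∩ B| / sqrt(|A| * |B|)
    denom = math.sqrt(l1_norm_a * l1_norm_b)
    return intersections / denom if denom else 0.0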
Example No. 30
 def evaluate_auc(self, X_test, Y_test):
     test_size = len(X_test)
     Y_true = [int(i) for i in Y_test]
     Y_score = []
     for i in range(test_size):
         start_node_emb = np.array(self.embeddings[X_test[i][0]]).reshape(
             -1, 1)
         end_node_emb = np.array(self.embeddings[X_test[i][1]]).reshape(
             -1, 1)
         score = cosine_similarity(start_node_emb,
                                   end_node_emb)  # ranging from [-1, +1]
         Y_score.append(score)
     if len(Y_true) == 0:
         print(
             '------- NOTE: two graphs do not have any change -> no testing data -> set result to 1......'
         )
         auc = 1.0
     else:
         auc = auc_score(y_true=Y_true, y_score=Y_score)
     print("cos sim; auc=", "{:.9f}".format(auc))
Example No. 31
def scoreDS(query_dist, url_dict):
    DS_scores = {}
    for url, doc_dist in url_dict.items():
        score = cosine_similarity(doc_dist, query_dist)
        DS_scores[url] = score
    return DS_scores
Example No. 32
 def test_cosine_similarity(self):
     self.assertEqual(1.0, ut.cosine_similarity([0,1], [0,1]))
     self.assertEqual(0.0, ut.cosine_similarity([1,0], [0,1]))
     self.should_raise_for_diff_len_args(ut.cosine_similarity, [0, 1], [0])
     self.should_raise_for_diff_len_args(ut.cosine_similarity, [0], [])