Example #1
def contextual_full_lijing(s, T, delta=0.9, lamb=0.1):
    assert not s.cascade
    theta = np.zeros(s.d)
    beta = 0
    V = lamb * np.eye(s.d)
    b = np.zeros(s.d)
    reward = [0]
    regret = [0]
    for t in range(1, T):
        x, params = s.new()
        U = {arm: theta.dot(x[arm]) + beta * x[arm].dot(np.linalg.inv(V)).dot(x[arm]) for arm in x}
        recc = s.oracle(U, *params)
        r, ctr, z = s.play(recc)
        V += sum([(s.gamma) ** (2 * k) * np.outer(x[recc[k]], x[recc[k]]) for k in range(len(recc))])
        b += sum([ctr[k] * x[recc[k]] for k in range(len(recc))])
        theta = np.linalg.inv(V).dot(b)
        beta = np.sqrt(s.d * np.log((1 + t * s.L) / delta)) + np.sqrt(lamb)
        reward.append(reward[-1] + r)
        regret.append(regret[-1] + z)
    logger.info("Lijing play reward {0}/{1}".format(reward[-1], T))
    if s.theta is not None:
        logger.info("theta cosine similarity {0}".format(1 - cosine(s.theta, theta)))
        similarity = 1 - cosine(s.theta, theta)
    else:
        similarity = None
    if s.regret is True:
        logger.info("regret {0}/{1}".format(regret[-1], T))
    return reward, regret, similarity
Example #2
def similarity(arg1, test_infile):
    with smart_open(test_infile, "r") as f:
        test = json.load(f)
    gold = np.array([float(x[2]) for x in test])
    # we're given a tuple: matrix-vector composition
    if isinstance(arg1,tuple):
        if len(arg1) == 2:
            lf = arg1[0]
            emb = arg1[1]
            # augmented matrices
            if lf.A.shape[2] == lf.A.shape[1]+1:
                ours = np.array([1-cosine(
                    np.dot(lf.word(x[0][0]),np.hstack((emb.word(x[0][1]),[1]))),
                    np.dot(lf.word(x[1][0]),np.hstack((emb.word(x[1][1]),[1])))) for x in test])
            # standard matrices
            else:
                ours = np.array([1-cosine(
                    np.dot(lf.word(x[0][0]),emb.word(x[0][1])),
                    np.dot(lf.word(x[1][0]),emb.word(x[1][1]))) for x in test])
            return spearmanr(gold,ours)
        raise TypeError("Invalid input format")
    # we're only given embeddings: do cosine similarity of vectors
    elif isinstance(arg1,Embeddings):
        ours = np.array([1-cosine(arg1.word(x[0]),arg1.word(x[1])) for x in test])
        return spearmanr(gold,ours)
    raise TypeError("Invalid input format")
Example #3
def compute_distance(query_channel, channel, mean_vec, distance_type = 'eucos'):
    """ Compute the specified distance type between chanels of mean vector and query image.
    In caffe library, FC8 layer consists of 10 channels. Here, we compute distance
    of distance of each channel (from query image) with respective channel of
    Mean Activation Vector. In the paper, we considered a hybrid distance eucos which
    combines euclidean and cosine distance for bouding open space. Alternatively,
    other distances such as euclidean or cosine can also be used. 
    
    Input:
    --------
    query_channel: Particular FC8 channel of query image
    channel: channel number under consideration
    mean_vec: mean activation vector

    Output:
    --------
    query_distance : Distance between respective channels

    """

    if distance_type == 'eucos':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel)/200. + spd.cosine(mean_vec[channel, :], query_channel)
    elif distance_type == 'euclidean':
        query_distance = spd.euclidean(mean_vec[channel, :], query_channel)/200.
    elif distance_type == 'cosine':
        query_distance = spd.cosine(mean_vec[channel, :], query_channel)
    else:
        raise ValueError("distance type not known: enter either of eucos, euclidean or cosine")
    return query_distance
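A minimal usage sketch for the function above (the array shapes and variable names here are illustrative assumptions, not part of the original snippet): with a Mean Activation Vector of shape (num_channels, dim) and a single FC8 channel of the query image, the hybrid eucos distance for one channel could be computed like this.

import numpy as np
import scipy.spatial.distance as spd

# Hypothetical data: 10 channels of dimension 4096, plus one query channel vector.
mean_activation_vec = np.random.rand(10, 4096)
query_channel_vec = np.random.rand(4096)

# eucos distance between channel 0 of the MAV and the query channel,
# mirroring compute_distance above.
d = compute_distance(query_channel_vec, 0, mean_activation_vec, distance_type='eucos')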
Example #4
def main(directory, input_filename, output_filename):
    semitones = []
    noteList = [[0 for x in range(12)] for x in range(6)]
    collect = audio.AudioQuantumList()
    final = audio.AudioQuantumList()
    #createAllNotes()
    initNoteList(noteList)
    print len(noteList)
    print noteList[0][0].analysis.segments.timbre
    audiofile = audio.LocalAudioFile(input_filename)
    songSegments = audiofile.analysis.segments
    bmp = 10000.0
    bmpi = 0
    bmt = 10000.0
    bmti = 0
    #print len(songSegments)
    for i in range(len(songSegments)):
        for j in range(12):
            noteSegments = noteList[0][j].analysis.segments
            pDist = distFinder.cosine(songSegments[i].pitches, noteSegments[len(noteSegments) / 2].pitches)
            if pDist < bmp:
                bmp = pDist
                bmpi = j
        for k in range(6):
            noteSegments = noteList[k][bmpi].analysis.segments
            tDist = distFinder.cosine(songSegments[i].timbre[1], noteSegments[len(noteSegments) / 2].timbre[1])
            if tDist < bmt:
                bmt = tDist
                bmti = k 
        print str(100 * i / len(songSegments)) + '%'
        matchDuration(noteList[bmti][bmpi].analysis.segments, songSegments[i], collect)
        bmp = 10000.0
        bmt = 10000.0
    out = audio.assemble(collect)
    out.encode(output_filename)
Example #5
def contextual_cascading_gsherry(s, T, delta=0.9, lamb=0.1, gamma=None):
    assert gamma
    assert s.cascade
    theta = np.zeros(s.d)
    beta = 0
    V = lamb * np.eye(s.d)
    ldV = np.linalg.slogdet(V)[1]
    X = np.zeros((1, s.d))
    Y = np.zeros(1)
    reward = [0]
    regret = [0]
    timestamp = time.time()
    for t in range(1, T):
        x, params = s.new()
        U = {arm: theta.dot(x[arm]) + beta * x[arm].dot(np.linalg.inv(V)).dot(x[arm]) for arm in x}
        recc = s.oracle(U, *params[:-1], gamma)
        r, c, z = s.play(recc)
        V += sum([gamma ** (2 * k) * np.outer(x[recc[k]], x[recc[k]]) for k in range(min(len(recc), c + 1))])
        X = np.concatenate([X] + [gamma ** k * x[recc[k]].reshape(1, s.d) for k in range(min(len(recc), c + 1))])
        Y = np.concatenate([Y] + [gamma ** k * ((k == c) == s.disj) * np.ones(1) for k in range(min(len(recc), c + 1))])
        theta = np.linalg.inv(X.T.dot(X) + lamb * np.eye(s.d)).dot(X.T.dot(Y))
        beta = np.sqrt(np.linalg.slogdet(V)[1] - ldV - 2 * np.log(delta)) + np.sqrt(lamb)
        reward.append(reward[-1] + r)
        regret.append(regret[-1] + z)
    logger.info("Sherry play reward {0}/{1}, gamma={2}".format(reward[-1], T, gamma))
    if s.theta is not None:
        logger.info("theta cosine similarity {0}".format(1 - cosine(s.theta, theta)))
        similarity = 1 - cosine(s.theta, theta)
    else:
        similarity = None
    if s.regret is True:
        logger.info("regret {0}/{1}".format(regret[-1], T))
    return reward, regret, similarity
Example #6
 def doc_similarity(self):
     sim_matrix = [[0 for _ in xrange(self.D.shape[0])] for _ in xrange(self.D.shape[0])]
     for row, vec1 in enumerate(self.D):
         for col, vec2 in enumerate(self.D):
             print cosine(vec1, vec2)
             sim_matrix[row][col] = cosine(vec1, vec2)
     return sim_matrix
Example #7
def print_transform_similarities(dictionary, transform_mtx, mtx_from, mtx_to):
    tr_tr = transform_mtx.transpose()
    for i, word in enumerate(dictionary):
        src = mtx_from[i]
        tgt = mtx_to[i]
        dist1 = 1 - cosine(transform_mtx.dot(src), tgt)
        dist2 = 1 - cosine(tr_tr.dot(tgt), src)
        print(u'{0}\t{1}\t{2}'.format(word, dist1, dist2).encode('utf8'))
Example #8
def least_cos_dist(sent_emb, mark2ee):
    pred = 'marker_0'
    dist = 1.
    for key, value in mark2ee.iteritems():
        d = cosine(sent_emb, value)
        if d < dist:
            pred = key
            dist = d
    return pred
Example #9
 def weight_cost_func_bow_topics(self, s, t):
     a1 = numpy.array(s['bow'].todense()).ravel()
     a2 = numpy.array(t['bow'].todense()).ravel()
     if not a1.any() or not a2.any():
         d = 1
     else:
         d = cosine(a1, a2)
     return (0.2 * cosine(s['topics'], t['topics']) +
             0.8 * d)
Example #10
 def testCosDistance(self):
   n1 = numpy.array([[1., 2., 3., 4.], [1., 1., 1., 1.]], dtype=numpy.float32)
   n2 = numpy.array([[5., 6., 7., -8.], [1., 1., 1., 1.]], dtype=numpy.float32)
   out = self.Run(functions.cos_distance(n1, n2))
   testing.assert_allclose(
       out[0],
       numpy.array(
           [distance.cosine(n1[0], n2[0]), distance.cosine(n1[1], n2[1])]),
       rtol=TOLERANCE)
Example #11
 def compute_similarity(self,user1, user2):
     '''
     given two users and compute their distance
     '''
     h1_sim=1-cosine(user1['h1'],user2['h1'])
     h2_sim=1-cosine(user1['h2'],user2['h2'])
     h3_sim=1-cosine(user1['h3'],user2['h3'])
     return h1_sim,h2_sim,h3_sim
     pass
Example #12
 def testCosDistanceWithBroadcast(self):
   n1 = numpy.array([[[1., 2., 3., 4.], [1., 1., 1., 1.]], [[5., 6., 7., 8.],
                                                            [1., 1., 1., 2.]]],
                    dtype=numpy.float32)
   n2 = numpy.array([[5., 6., 7., -8.], [1., 1., 1., 1.]], dtype=numpy.float32)
   out = self.Run(functions.cos_distance(n1, n2))
   expected = numpy.array(
       [[distance.cosine(n1[0, 0], n2[0]), distance.cosine(n1[0, 1], n2[1])],
        [distance.cosine(n1[1, 0], n2[0]), distance.cosine(n1[1, 1], n2[1])]])
   testing.assert_allclose(expected, out[0], atol=TOLERANCE)
Example #13
def triplet(w1, w2, w3):
    try:
        sim12 = 1 - cosine(myembed[word2idx[w1], :], myembed[word2idx[w2], :])
        sim13 = 1 - cosine(myembed[word2idx[w1], :], myembed[word2idx[w3], :])
    except KeyError:
        print "Word %s is not present in Vocablury" % sys.exc_value
        return

    print "Similarity between " + w1 + " and " + w2 + " --> " + str(sim12)
    print "Similarity between " + w1 + " and " + w3 + " --> " + str(sim13)
Example #14
 def append_que(que, p_xws, p_d, n_xws, n_ds, xf, xt, debug, model):
     for xw, d in zip(n_xws, n_ds):
         que.append((p_xws + [xw], p_d + d))
         if debug:
             for x in p_xws + [xw]:
                 print(u'{} '.format(x), end='')
                 df = ssd.cosine(xf, model[xw])
                 dt = ssd.cosine(xt, model[xw])
                 print(u'\t d:{0:.3f} (di:{1:.3f}, dt:{2:.3f})'.
                       format(p_d + d, df, dt))
     return que
Example #15
def simplePredictFor2(review, lexicons):
	feature0 = loadFeature(FEATURE0)
	feature4 = loadFeature(FEATURE4)
	feature = feature0+feature4
	vector = getReviewVector(review, feature, lexicons)
	distances = {}
	d0 = distance.cosine(vectors[0],vector)
	distances[d0] = 0
	d4 = distance.cosine(vectors[4],vector)
	distances[d4] = 1
	d = min(d0,d4)
	return distances[d]
Example #16
def compTriplet(w1, w2, w3):
    embed = loadmat('embedding.txt')
    vocab = load_word2idx('word2idx.json')

    try:
        sim12 = 1 - cosine(embed[vocab[w1], :], embed[vocab[w2], :])
        sim13 = 1 - cosine(embed[vocab[w1], :], embed[vocab[w3], :])
    except KeyError:
        print "Word %s is not present in Vocablury" % sys.exc_value
        exit()

    print "Similarity between " + w1 + " and " + w2 + " --> " + str(sim12)
    print "Similarity between " + w1 + " and " + w3 + " --> " + str(sim13)
Example #17
    def test_multidimensional_dtw_cosine(self):

        a = np.array([[1,2,3, np.nan], [7,8,9,np.nan]]).T
        b = np.array([[10,12,14], [13,15,17]]).T

        # DTW should match the points:
        # (1,7) to (10,13)
        # (2,8) to (12,15)
        # (3,9) to (14,17)

        #
        cosine_distance = cosine(a[0], b[0]) + cosine(a[1], b[1]) + cosine(a[2], b[2])
        self.assertAlmostEqual(cosine_distance, dtw_std(a, b, metric='cosine'))
Example #18
def add(tIdx, sIdx, contexts):
    cos = cosine(wvecs[tIdx], wvecs[sIdx])

    ncontexts = 0
    for context in contexts:
        if context in c2vec:
            cIdx = c2vec[context]
            cos += cosine(cvecs[cIdx], wvecs[sIdx])
            ncontexts += 1

    cos = (cos/(ncontexts + 1.0))

    return cos
Example #19
def np_loss_cos_cos2(x1, x2, y):
    assert x1.shape[0] == x2.shape[0] == y.shape[0]
    losses = []
    for i in xrange(x1.shape[0]):
        if y[i] == 1:
            # Data points are the same, use cosine distance
            loss = distance.cosine(x1[i], x2[i]) / 2.
            losses.append(loss)
        elif y[i] == 0:
            # Data points are different, use cosine similarity squared
            loss = (distance.cosine(x1[i], x2[i]) - 1)**2
            losses.append(loss)
        else:
            assert False
    return numpy.mean(losses)
Example #20
	def rerank(self, cands, t):
		vt = None
		try:
			vt = self.model[t]
			vt = self.selector.transform(vt)
		except Exception:
			return cands
	
		fcands = []
		vcands = []
		for cand in cands:
			try:
				vec = self.model[cand]
				vec = self.selector.transform(vec)
				dist = cosine(vt, vec)
				fcands.append(cand)
				vcands.append(dist)
			except Exception:
				pass
		
		distmap = {}
		for i in range(0, len(fcands)):
			c = fcands[i]
			d = vcands[i]
			distmap[c] = d
	
		ranking_data = sorted(distmap.keys(), key=distmap.__getitem__)
	
		return ranking_data
Example #21
def similarity(word_list, id2row):
    test_file = 'word-test.v1.txt'
    test_list = []
    word_dict = {}
    for word, vector in zip(id2row, word_list):
        word_dict[word] = vector
    with open(os.path.join('../corpus', test_file)) as f:
        for line in f:
            abcd = line.strip().split()
            test_list.append(abcd)

    total = len(test_list)
    right = 0
    for test in test_list:
        word1, word2, word3, word4 = test[0], test[1], test[2], test[3]
        # analogy: vector(word1) - vector(word2) + vector(word3) should be closest to vector(word4)
        vec_real = word_dict[word1] - word_dict[word2] + word_dict[word3]
        dist = sys.maxsize
        word_cand = None
        for word in id2row:
            vector = word_dict[word]
            curr_dist = cosine(vec_real, vector)
            if curr_dist < dist:
                dist = curr_dist
                word_cand = word
        if word_cand == word4:
            right += 1
    print(right)
Example #22
 def get_dis_corr(self):
     if len(self.oracle_sim) != len(self.gen_sim):
         raise ArithmeticError
     corr = 0
     for index in range(len(self.oracle_sim)):
         corr += (1 - cosine(np.array(self.oracle_sim[index]), np.array(self.gen_sim[index])))
     return np.log10(corr / len(self.oracle_sim))
Example #23
def average_cosine_distance(user_hash, coupon_vector, train_coupon_hash_to_vector_dict, user_hash_to_train_coupon_list, user_buy_and_view, user_hash_to_day_probability):
    if user_hash not in user_hash_to_train_coupon_list:
        train_coupon_list = []
    else:
        train_coupon_list = user_hash_to_train_coupon_list[user_hash]

    if len(train_coupon_list) == 0:
        return 1.0

    sum_cosine_distance = 0.0
    train_coupon_list = user_hash_to_train_coupon_list[user_hash]

    arr = numpy.array([0] * len(train_coupon_hash_to_vector_dict[train_coupon_list[0]]))

    # i = 0
    # while i < len(train_coupon_list):
    #     vec = train_coupon_hash_to_vector_dict[train_coupon_list[i]]
    #     i += 1
    #     arr = arr + log10(1 + train_coupon_list[i]) * numpy.array(vec)
    #     i += 1
    for i in range(0, len(train_coupon_list)):
        if user_hash in user_buy_and_view and train_coupon_list[i] in user_buy_and_view[user_hash]:
            view = user_buy_and_view[user_hash][train_coupon_list[i]]['view']
        #     buy = user_buy_and_view[user_hash][train_coupon_list[i]]['buy']
            arr = arr + user_hash_to_day_probability[user_hash][train_coupon_list[i]] * view * numpy.array(train_coupon_hash_to_vector_dict[train_coupon_list[i]])
        # else:
        #     arr = arr + numpy.array(train_coupon_hash_to_vector_dict[train_coupon_list[i]])
        # arr = arr + numpy.array(train_coupon_hash_to_vector_dict[train_coupon_list[i]])



    arr = arr / float(len(train_coupon_list))


    return cosine(arr, coupon_vector)
Example #24
 def similarity_matrix(self,transpose=False):
     """
     computes the similarity matrix using cosine similarity
     IN
         transpose (bool) default=False, whether to transpose the adjacency matrix
                         determines similarity between customers or products
     OUT
         matrix of similarity scores
     """
     if transpose:
         # take the transpose of A (see numpy documentation)
         A = self.A.T
         # dimension of the matrix D
         dim = self.n
     else:
         # dimension of the matrix D
         A = self.A.copy()
         dim = self.m
     print('start time')
     print(time.ctime())
     print('computing...')
     start = time.time()
     # lil_matrix allows us to efficiently store data in memory (see scipy documentation)
     D = lil_matrix((dim,dim))
     for i in xrange(dim):
         if i % 1000 == 0:
             print(float(i)/dim)
         for j in xrange(dim):
             # compute cosine similarity between row i and column j
             D[i,j] = -1*(cosine(A[i],A[j]) - 1)
     # convert D to a dense (rather than sparse) matrix
     self.D = D.todense()
     end = time.time()
     t = end-start
     print('finished in %d seconds') % t
Example #25
 def __init__(self, data, card_names):
     all_dists = []
     dists_per_card = []
     N = len(data)
     for i in range(N):
         dists_for_i = []
         for j in range(N):
             if i != j:
                 dist = distance.cosine(data[i], data[j])
                 dists_for_i.append((dist, card_names[j]))
                 all_dists.append(dist)
         dists_for_i.sort()
         dists_per_card.append(dists_for_i)
     # print card_names[i], ':', ', '.join(
     # [n for (d, n) in dists_for_i][:10])
     all_dists.sort()
     self.interesting_quantiles = [0.005, 0.01, 0.02, 1.0]
     self.interesting_dist_cutoffs = [all_dists[int(q * N)] for q in self.interesting_quantiles]
     self.dists_per_card = dists_per_card
     self.card_names = card_names
     self.partitions_by_card = {}
     for card_name, dists_list in zip(self.card_names, self.dists_per_card):
         card_partitions = [list() for i in xrange(len(self.interesting_quantiles))]
         for dist, ocard in dists_list:
             for idx, quant in enumerate(self.interesting_quantiles):
                 if dist < self.interesting_dist_cutoffs[idx]:
                     card_partitions[idx].append(ocard)
         self.partitions_by_card[card_name] = card_partitions
Example #26
 def calc(self):
     from numpy import append
     from scipy.spatial.distance import cosine

     if self.sim_type == 'lda_cosine':
         for i in range(len(self.key_list)-1):
             for j in range(i+1, len(self.key_list)):
                 d = cosine(self.corpus[self.key_list[i]], self.corpus[self.key_list[j]])
                 out_tag = self.key_list[i] + '_' + self.key_list[j]

                 self.calculated[out_tag] = d
                 self.raw[out_tag] = append(self.corpus[self.key_list[i]], self.corpus[self.key_list[j]])

     elif self.sim_type == 'word2vec_cosine':
         pass  # not implemented

     elif self.sim_type == 'kernel':
         from PyML import sequenceData

         docs = [self.corpus[key] for key in sorted(self.key_list)]

         kernel = sequenceData.spectrum_data(docs, k)
         mat = kernel.getKernelMatrix()

         for i in range(len(docs)):
             for j in range(i+1, len(docs)):
                 tag = self.key_list[i] + '_' + self.key_list[j]
                 self.calculated[tag] = mat[i][j]

     else:
         raise KeyError('Please check your similarity type!')
Example #27
def calc_dist(e1,e2,mode=1):
    if mode == 1:
        return ssd.euclidean(e1,e2)
    elif mode == 2:
        return ssd.cityblock(e1,e2)
    elif mode == 3:
        return ssd.cosine(e1,e2)
Example #28
    def _cosine(self, word, n=10):
        """
        Test method for cosine distance using `scipy.distance.cosine`

        Note: This method is **a lot** slower than `self.cosine`
        and results are the almost the same, you should be using `self.cosine`

        Requires: `__init__(..., save_memory=False)`

        Parameters
        ----------
        word : string
            word in the vocabulary to calculate the vectors
        n : int, optional (default 10)
            number of neighbors to return
        """
        from scipy.spatial import distance

        target_vec = self[word]
        metric = np.empty(self.vocab.shape)
        for idx, vector in enumerate(self.vectors):
            metric[idx] = distance.cosine(target_vec, vector)
        best = metric.argsort()[1:n + 1]

        return self.generate_response(best, metric)
Example #29
    def get(self, height, weight, city, state):
        # combined = pd.read_csv("wonderfullyamazingdata.csv", encoding='ISO-8859-1')
        combined = pd.read_csv("newamazingdata.csv", encoding='ISO-8859-1')
        location =  str(city) + ' ' + str(state)
        geolocator = Nominatim()
        place = geolocator.geocode(location)
        latitude = place.latitude
        longitude = place.longitude


        users = [float(height), float(weight), latitude, longitude ]
        players = combined[["height", "weight", "latitude", "longitude"]]

        result = []
        for index in range(0,len(players)):

            result.append(1-distance.cosine(users, players.iloc[index]))

        result = sorted(range(len(result)), key=lambda i: result[i])[-5:]   
        result.reverse()

        ids = []
        for index in result:
            ids.append( combined.ID.iloc[index] )


        ids = str(ids)


        with open('reply.json', 'w') as outfile:
            json_stuff = json.dumps(ids)
            json.dump(json_stuff, outfile)

        return json_stuff
Example #30
    def get_nearest_words(target_vec, k=20):
        """Summary

        Parameters
        ----------
        target_vec : TYPE
            Description
        k : int, optional
            Description

        Returns
        -------
        TYPE
            Description
        """
        # Get distances to target vector
        dists = []
        for vec_i in wordvecs:
            dists.append(distance.cosine(target_vec, vec_i))
        # Get top nearest words
        idxs = np.argsort(dists)
        res = []
        for idx_i in idxs[:k]:
            res.append((words[idx_i], dists[idx_i]))
        return res
Example #31
def create_user_version(tA, tB):
    user_A_ratings = random.randrange(
        0, 101)  # The number of movies this version of user A rated
    user_B_ratings = random.randrange(
        0, 101)  # The number of movies this version of user B rated

    A_indexes = []  # The indexes of the movies for A
    B_indexes = []  # The indexes of the movies for B
    A_version = []  # This version of A
    B_version = []  # This version of B

    for m in range(user_A_ratings):  # Get a random rating from A
        temp_index = random.randrange(0, len(tA))
        while (temp_index in A_indexes):
            temp_index = random.randrange(0, len(tA))
        A_indexes.append(temp_index)
        A_version.append(tA[temp_index])  # Fill the list of this version

    for m in range(user_B_ratings):  # Get a random rating from B
        temp_index = random.randrange(0, len(tB))
        while (temp_index in B_indexes):
            temp_index = random.randrange(0, len(tB))
        B_indexes.append(temp_index)
        B_version.append(tB[temp_index])  # Fill the list of this version

    intersection_count = list(
        set(A_indexes) & set(B_indexes)
    )  # How many items belong to the intersection = number of common indexes
    intersection_elements = set(A_indexes) & set(
        B_indexes)  # Common element indexes
    intersections.append(len(intersection_count))
    A_common = []
    B_common = []

    for x in range(
            len(intersection_elements)):  # Create the lists of common elements
        tp = intersection_elements.pop()
        A_common.append(tA[tp])
        B_common.append(tB[tp])

    # Here we compute  Pearson Correlation
    A_version_adj = list(A_common)
    B_version_adj = list(B_common)

    Pearsons.append(
        pearsonr(A_common, B_common)
        [0])  # this uses only the common elements without the mean subtraction

    # Here we compute the Adjusted Cosine Similarity
    full_tA = []
    full_tB = []
    for y in range(len(tA)):  # Create full tables
        if ((y in A_indexes) and (y in B_indexes)):
            full_tA.append(tA[y])
            full_tB.append(tB[y])
        if ((y in A_indexes) and (y not in B_indexes)):
            full_tA.append(tA[y])
            full_tB.append(0)
        if ((y not in A_indexes) and (y in B_indexes)):
            full_tA.append(0)
            full_tB.append(tB[y])

    mean_tA = float(mean(full_tA))
    mean_tB = float(mean(full_tB))
    for i in range(
            len(full_tA)):  # Adjusting vectors by subtracting their means
        full_tA[i] = float(full_tA[i]) - mean_tA
    for i in range(
            len(full_tB)):  # Adjusting vectors by subtracting their means
        full_tB[i] = float(full_tB[i]) - mean_tB

    Cosines.append(1 - cosine(full_tA, full_tB))
Example #32
def cosine((x, y)):
    return distance.cosine(x, y)
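Tuple parameter unpacking in the signature above is Python 2 only syntax; a minimal Python 3 equivalent (a sketch keeping the same behaviour) unpacks the pair inside the body:

def cosine(pair):
    x, y = pair
    return distance.cosine(x, y)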
Example #33
    # Drop any column named "user"
    data_item_base = data.drop('user', 1)
    data = data.drop('user', 1)
    print(data_item_base.ix[:, 0:10])
    # store DataFrame
    data_item_base_frame = pd.DataFrame(index=data_item_base.columns,
                                        columns=data_item_base.columns)

    print(data_item_base_frame.head(6).ix[:, 0:6])
    # Calculate similarily
    for i in range(0, len(data_item_base_frame.columns)):
        # Loop through the columns for each column
        for j in range(0, len(data_item_base_frame.columns)):
            # Calculate similarity
            # print (i , " and ", j)
            data_item_base_frame.ix[i, j] = 1 - cosine(data.ix[:, i],
                                                       data.ix[:, j])

    data_item_base_frame.to_csv('data_item_base_frame.csv')
    # data_item_base_frame = pd.read_csv('data_item_base_frame.csv')
    print(data_item_base_frame.ix[:, 0:5])

    # Initial a frame for save closes neighbors to an item
    data_neighbors = pd.DataFrame(index=data_item_base_frame.columns,
                                  columns=range(1, 6))

    # Order by similarity
    for i in range(0, len(data_item_base_frame.columns)):
        data_neighbors.ix[i, :5] = data_item_base_frame.ix[0:, i].sort_values(
            ascending=False)[:5].index

    data_neighbors.ix[:, 0:5].to_csv('dataresult.csv')
Example #34
def explore_embedding_space(embedding_fn: str,
                            out_fn: str,
                            num_samples=1000) -> None:
    """
	Calculate the following statistics for each layer of the model:
	1. mean cosine similarity between a sentence and its words
	2. mean dot product between a sentence and its words
	3. mean word embedding norm
	4. mean cosine similarity between randomly sampled words
	5. mean dot product between randomly sampled words
	6. mean variance explained by first principal component for a random sample of words

	num_samples sentences/words are used to estimate each of these metrics. We randomly sample words
	by first uniformly randomly sampling sentences and then uniformly randomly sampling a single word
	from each sampled sentence. This is because:
		- 	When we say we are interested in the similarity between random words, what we really 
			mean is the similarity between random _word occurrences_ (since each word has a unique 
			vector based on its context).
		- 	By explicitly sampling from different contexts, we avoid running into cases where two
			words are similar due to sharing the same context.

	Create a dictionary mapping each layer to a dictionary of the statistics and write it to out_fn.
	"""
    f = h5py.File(embedding_fn, 'r')
    num_layers = f["0"].shape[0]
    num_sentences = len(f)

    sentence_indices = random.sample(list(range(num_sentences)), num_samples)

    mean_cos_sim_between_sent_and_words = {
        f'layer_{layer}': []
        for layer in range(num_layers)
    }
    mean_cos_sim_across_words = {
        f'layer_{layer}': -1
        for layer in range(num_layers)
    }
    word_norm_std = {f'layer_{layer}': -1 for layer in range(num_layers)}
    word_norm_mean = {f'layer_{layer}': -1 for layer in range(num_layers)}
    variance_explained_random = {
        f'layer_{layer}': -1
        for layer in range(num_layers)
    }

    for layer in Tqdm.tqdm(range(num_layers)):
        word_vectors = []
        word_norms = []
        mean_cos_sims = []
        mean_dot_products = []

        for sent_index in sentence_indices:
            # average word vectors to get sentence vector
            sentence_vector = f[str(sent_index)][layer].mean(axis=0)
            num_words = f[str(sent_index)].shape[1]

            # randomly add a word vector (not all of them, because that would bias towards longer sentences)
            word_vectors.append(
                f[str(sent_index)][layer,
                                   random.choice(list(range(num_words)))])

            # what is the mean cosine similarity between the sentence and its words?
            mean_cos_sim = np.nanmean([
                1 - cosine(f[str(sent_index)][layer, i], sentence_vector)
                for i in range(num_words)
                if f[str(sent_index)][layer, i].shape != ()
            ])
            mean_cos_sims.append(round(mean_cos_sim, 3))

            # what is the mean embedding norm across words?
            word_norms.extend([
                np.linalg.norm(f[str(sent_index)][layer, i])
                for i in range(num_words)
            ])

        mean_cos_sim_between_sent_and_words[f'layer_{layer}'] = round(
            float(np.mean(mean_cos_sims)), 3)
        mean_cos_sim_across_words[f'layer_{layer}'] = round(
            np.nanmean([
                1 - cosine(random.choice(word_vectors),
                           random.choice(word_vectors))
                for _ in range(num_samples)
            ]), 3)
        word_norm_std[f'layer_{layer}'] = round(float(np.std(word_norms)), 3)
        word_norm_mean[f'layer_{layer}'] = round(float(np.mean(word_norms)), 3)

        # how much of the variance in randomly chosen words can be explained by their first principal component?
        pca = TruncatedSVD(n_components=100)
        pca.fit(word_vectors)
        variance_explained_random[f'layer_{layer}'] = min(
            1.0, round(float(pca.explained_variance_ratio_[0]), 3))

    json.dump(
        {
            'mean cosine similarity between sentence and words':
            mean_cos_sim_between_sent_and_words,
            'mean cosine similarity across words': mean_cos_sim_across_words,
            'word norm std': word_norm_std,
            'word norm mean': word_norm_mean,
            'variance explained for random words': variance_explained_random
        },
        open(out_fn, 'w'),
        indent=1)
Example #35
def embedding_average(ref, hypo):
    ref_avg = np.sum(ref, axis=0) * 1.0 / ref.shape[1]
    hypo_avg = np.sum(hypo, axis=0) * 1.0 / hypo.shape[1]
    return 1 - cosine(ref_avg, hypo_avg)
Example #36
def strangeness_cosine(x, samples):
    omega_hist = representative_sample(samples)
    return cosine(x, omega_hist)
Example #37
    def cos_distance(self, X, Y):
        assert (len(X.shape) == 1)
        assert (len(X) == len(Y[0]))

        return [cosine(X, v) for v in Y]
Example #38
    def __calc_distances__(self, v1s, v2s, is_sparse=True):
        if is_sparse:
            dcosine = np.array([
                cosine(x.toarray(), y.toarray()) for (x, y) in zip(v1s, v2s)
            ]).reshape((-1, 1))
            dcityblock = np.array([
                cityblock(x.toarray(), y.toarray())
                for (x, y) in zip(v1s, v2s)
            ]).reshape((-1, 1))
            dcanberra = np.array([
                canberra(x.toarray(), y.toarray()) for (x, y) in zip(v1s, v2s)
            ]).reshape((-1, 1))
            deuclidean = np.array([
                euclidean(x.toarray(), y.toarray())
                for (x, y) in zip(v1s, v2s)
            ]).reshape((-1, 1))
            dminkowski = np.array([
                minkowski(x.toarray(), y.toarray(), 3)
                for (x, y) in zip(v1s, v2s)
            ]).reshape((-1, 1))
            dbraycurtis = np.array([
                braycurtis(x.toarray(), y.toarray())
                for (x, y) in zip(v1s, v2s)
            ]).reshape((-1, 1))

            dskew_q1 = [skew(x.toarray().ravel()) for x in v1s]
            dskew_q2 = [skew(x.toarray().ravel()) for x in v2s]
            dkur_q1 = [kurtosis(x.toarray().ravel()) for x in v1s]
            dkur_q2 = [kurtosis(x.toarray().ravel()) for x in v2s]

            dskew_diff = np.abs(np.array(dskew_q1) -
                                np.array(dskew_q2)).reshape((-1, 1))
            dkur_diff = np.abs(np.array(dkur_q1) - np.array(dkur_q2)).reshape(
                (-1, 1))
        else:
            dcosine = np.array([cosine(x, y)
                                for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
            dcityblock = np.array(
                [cityblock(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
            dcanberra = np.array([canberra(x, y)
                                  for (x, y) in zip(v1s, v2s)]).reshape(
                                      (-1, 1))
            deuclidean = np.array(
                [euclidean(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1, 1))
            dminkowski = np.array(
                [minkowski(x, y, 3) for (x, y) in zip(v1s, v2s)]).reshape(
                    (-1, 1))
            dbraycurtis = np.array(
                [braycurtis(x, y) for (x, y) in zip(v1s, v2s)]).reshape(
                    (-1, 1))

            dskew_q1 = [skew(x) for x in v1s]
            dskew_q2 = [skew(x) for x in v2s]
            dkur_q1 = [kurtosis(x) for x in v1s]
            dkur_q2 = [kurtosis(x) for x in v2s]

            dskew_diff = np.abs(np.array(dskew_q1) -
                                np.array(dskew_q2)).reshape((-1, 1))
            dkur_diff = np.abs(np.array(dkur_q1) - np.array(dkur_q2)).reshape(
                (-1, 1))
        return np.hstack((dcosine, dcityblock, dcanberra, deuclidean,
                          dminkowski, dbraycurtis, dskew_diff, dkur_diff))
Example #39
     bK1 = bK.split('\t')
     for x in bK1:
         bKs.append(
             float(x))  #list of all the 'k'mer counts for bacteria
     # d1.append(float(distance.euclidean(vKs,bKs)))
     # d2.append(float(distance.braycurtis(vKs,bKs)))
     # d3.append(float(distance.correlation(vKs,bKs)))
     # d4.append(float(distance.cityblock(vKs,bKs)))
     # d5.append(float(distance.chebyshev(vKs,bKs)))
     # d6.append(float(distance.cosine(vKs,bKs)))
     d1.append(distance.euclidean(vKs, bKs))
     d2.append(distance.braycurtis(vKs, bKs))
     d3.append(distance.correlation(vKs, bKs))
     d4.append(distance.cityblock(vKs, bKs))
     d5.append(distance.chebyshev(vKs, bKs))
     d6.append(distance.cosine(vKs, bKs))
     #hamming distance is only useful if we make them boolean
     #d7.append(float(distance.hamming(vKs1,bKs1)))
     #Z = numpy.vstack([vKs,bKs])
     #A=numpy.cov(Z, rowvar=False)
     #B=inv(A)
     #d7.append(float(distance.mahalanobis(vKs,bKs,B)))
     bKs = []
     vKs = []
     #bKs1=[]
     #vKs1=[]
 nexd1 = '\t'.join(str(v) for v in d1)
 nexd2 = '\t'.join(str(v) for v in d2)
 nexd3 = '\t'.join(str(v) for v in d3)
 nexd4 = '\t'.join(str(v) for v in d4)
 nexd5 = '\t'.join(str(v) for v in d5)
Example #40
#testing cosine similarity , use a random normal distribution
cosSimTest = np.zeros((numSamplesToDraw,1))
for i in range(numSamplesToDraw):
    cosSimTest[i] = 1-dist.cosine(np.random.normal(size=X_hat.shape[2]), np.random.normal(size=X_hat.shape[2]))
    
#Cos Sim Within Same Objs
numSamplesToDraw=200
distWithinObjs = np.zeros((numSamplesToDraw,1))
sampleObjIDs_1 = np.random.choice(200,size=(numSamplesToDraw,1),replace=True)
sampleTransIDs_1 = np.random.choice((nt-5),size=(numSamplesToDraw,1),replace=True)
sampleTransIDs_2 = np.random.choice((nt-5),size=(numSamplesToDraw,1),replace=True)
sampleTransIDs_1 = sampleTransIDs_1+5
sampleTransIDs_2 = sampleTransIDs_2+5
for i in range(numSamplesToDraw):
    distWithinObjs[i] = 1-dist.cosine(X_hat[sampleObjIDs_1[i],sampleTransIDs_1[i],:],X_hat[sampleObjIDs_1[i],sampleTransIDs_2[i],:])
sampledMean_within = np.mean(distWithinObjs)
 
#Cosine Similarity Across Objects
distBetweenObjs = np.zeros((numSamplesToDraw,1))
sampleObjIDs_1 = np.random.choice(200,size=(numSamplesToDraw,1),replace=True)
sampleObjIDs_2 = np.random.choice(200,size=(numSamplesToDraw,1),replace=True)
sampleTransIDs_1 = np.random.choice((nt-5),size=(numSamplesToDraw,1),replace=True)
sampleTransIDs_2 = np.random.choice((nt-5),size=(numSamplesToDraw,1),replace=True)
sampleTransIDs_1 = sampleTransIDs_1+5
sampleTransIDs_2 = sampleTransIDs_2+5
for i in range(numSamplesToDraw):
    distBetweenObjs[i] = 1-dist.cosine(X_hat[sampleObjIDs_1[i],sampleTransIDs_1[i],:],X_hat[sampleObjIDs_2[i],sampleTransIDs_2[i],:])
sampledMean_between=np.mean(distBetweenObjs)

 
Example #41
import numpy

lignes = eval(open('lines.txt', 'r').read())
colonnes = eval(open('colonnes.txt', 'r').read())

nblignes = len(lignes.values())
nbcolonnes = 1471

matrix = numpy.zeros((nblignes, nbcolonnes))

for fichier, x in lignes.items():

    f = open("CorpusPreproc/" + fichier, encoding="utf-8").read()
    for mot, y in colonnes.items():
        nb = f.count(mot)
        matrix[int(x) - 1][int(y) - 1] = nb

from scipy.spatial import distance
print(distance.cosine(matrix[5, :], matrix[6, :]))
Example #42
def get_similarity(repr1, repr2):
    return 1 - cosine(repr1, repr2)
Example #43
import pickle
from scipy.spatial.distance import cosine

# Loads the dictionary of prepositions and their vectors from file, as saved in the Jupyter Notebook.
with open('11d_vecs.pkl', 'rb') as loadfile:
    prep_bbox_11d_vecs = pickle.load(loadfile)

prepositions = ['over', 'above', 'below', 'under']

# Gets all cosine distance scores for each preposition and saves the list of scores to file.
# This was applied to the three first prepositions (above, below, over), but the last (under) was too large and required its own workaround script.
for p in prep_bbox_11d_vecs:
    vecs = prep_bbox_11d_vecs[p]

    prep_cos = [
        cosine(u, v) for i, u in enumerate(vecs) for j, v in enumerate(vecs)
        if i > j
    ]

    lst_filename = p + "_list.pkl"

    with open(lst_filename, 'wb') as outfile:
        pickle.dump(prep_cos, outfile, pickle.HIGHEST_PROTOCOL)

    # Progress indication print, with example results.
    print("Done with:", p)
Example #44
def cosine_distance(x1, x2):
    return distance.cosine(x1, x2)
Example #45
def get_sentence_cos(original_text, title):
    word2vec = get_word2vec(word2vec_path)
    stop_words = get_stop_words(stopwords_path)
    sentences = split_sentence(original_text)
    if sentences == []:
        raise NameError
    sentences_cut = [cut(''.join(token(n))) for n in sentences]
    sentences_cut_del_stopwords = []

    is_title = False
    # Process the title
    if title:
        title_cut = [cut(''.join(token(title)))]
        words = title_cut[0].split()
        title_cut_del_stopwords = list(set(words) - set(stop_words))
        if title_cut_del_stopwords != []:
            is_title = True

    for s in sentences_cut:
        words = s.split()
        sentence_cut_del_stopwords = list(set(words) - set(stop_words))
        if sentence_cut_del_stopwords != []:
            sentences_cut_del_stopwords.append(sentence_cut_del_stopwords)

    if sentences_cut_del_stopwords == []:
        raise NameError

    # Build the document (text) vector and the sentence vectors
    sentences_vec = []
    additional_wordvec = {}
    text_vec = np.zeros_like(word2vec.wv['测试'])
    for i, sentence in enumerate(sentences_cut_del_stopwords):
        sentence_vec = np.zeros_like(word2vec.wv['测试'])
        for word in sentence:
            if word in word2vec.wv.vocab:
                sentence_vec += word2vec.wv[word]
            elif word in additional_wordvec:
                sentence_vec += additional_wordvec[word]
            else:
                additional_wordvec[word] = np.random.random(
                    word2vec.wv['测试'].shape)
        sentence_vec = sentence_vec / len(sentence)
        # The first sentence is more important, so count it three times
        if i == 0:
            text_vec += sentence_vec * 3
        else:
            text_vec += sentence_vec
        sentences_vec.append(sentence_vec)

    if is_title:
        title_vec = np.zeros_like(word2vec.wv['测试'])
        for word in title_cut_del_stopwords:
            if word in word2vec.wv.vocab:
                title_vec += word2vec.wv[word]
            elif word in additional_wordvec:
                title_vec += additional_wordvec[word]
            else:
                additional_wordvec[word] = np.random.random(
                    word2vec.wv['测试'].shape)

    if is_title:
        text_vec += title_vec * 3
    # the first sentence was counted two extra times and the title three times
    text_vec /= len(sentences) + 5

    # cosine between each sentence vector and the text vector
    sentences_cos = {}
    for i, sentence_vec in enumerate(sentences_vec):
        sentences_cos[i] = cosine(sentence_vec, text_vec)
    return sentences, sentences_cos
Example #46
ch_n.head()


# In[28]:


# Chebyshev distance, standardized_dataset
ch_s = standardized_dataset.apply(lambda x: chebyshev(x, P_std), axis=1)
ch_s.head()


# In[29]:


# Cosine Distance, raw_dataset
cos_r = raw_dataset.apply(lambda x: cosine(x, P_raw), axis=1)
cos_r.head()


# In[30]:


# Cosine Distance, normalized_dataset
cos_n = normalized_dataset.apply(lambda x: cosine(x, P_norm), axis=1)
cos_n.head()


# In[31]:


# Cosine Distance, standardized_dataset
Example #47
def dist(hash_1: np.ndarray, hash_2: np.ndarray) -> float:
    return distance.cosine(hash_1, hash_2)
Example #48
    return np.asarray(X), lmbda


if __name__ == '__main__':
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    nodes = data.load_digits()
    true_clusters = data.load_digits_clusters()
    report = []
    num_nodes = nodes.shape[0]
    tm = []

    print "Create edges from cosine similarity..."
    if not os.path.exists("outputs/q7/intermediates/cosine.npy"):
        for idx1 in range(num_nodes):
            for idx2 in range(idx1 + 1, num_nodes):
                tm.append([idx1, idx2, cosine(nodes[idx1], nodes[idx2])])
        tm = np.asarray(tm)
        np.save('outputs/q7/intermediates/cosine.npy', tm)
    else:
        tm = np.load('outputs/q7/intermediates/cosine.npy')
    print "Done"

    print "Finding best lambda..."
    X, lmbda = calculate_lambda_edges(nodes, tm)
    report += [
        "Saving output lambda finder graph to outputs/q7/lambda-finder.png..."
    ]
    print report[-1]
    report += [
        "Lambda vs #isolate nodes --> " + np.array2string(X, separator=",")
    ]
Example #49
# Step 4: PPMI matrix
p_wc = M1 / M1.sum()
p_w = M1.sum(axis=0) / M1.sum()
p_c = M1.sum(axis=1) / M1.sum()
M1_plus = np.nan_to_num(threshold(np.log(p_wc / np.outer(p_w, p_c)), threshmin=0))


# Step 5: SVD
U, s, V = svd(M1_plus)
M2_10 = U[:,:10]
M2_50 = U[:,:50]
M2_100 = U[:,:100]


# Step 6: human scores
S = rg65

# Step 7: model scores
S_hat = {}
models = ['M1', 'M1_plus', 'M2_10', 'M2_50', 'M2_100']

for model in models:
	M = eval(model)
	S_hat[model] = np.nan_to_num([ 1 - cosine(M[word_index[a],:], M[word_index[b],:]) for i,(a,b,_) in S.iterrows() ])


# Step 8: pearson R
for model in models:
	print model, np.nan_to_num(pearsonr(S['score'].tolist(), S_hat[model])[0])	
Example #50
def cosine_distance(u, v):
    return distance.cosine(u, v)
Example #51
 def cosine(self):
     a = self.target
     b = self.library
     return 1 - distance.cosine(a, b)
Example #52
metrics = Metrics(args.input)
words = list(metrics.encoding_word_indices.keys())

alpha = .02
theta = .2

if args.dispersion:
    correct = 0
    sum = 0
    for i in tqdm(range(len(wbless.pairs))):
        w1, w2 = wbless.pairs[i]
        true_label = wbless.truth_vals[i]
        if w1 not in words or w2 not in words:
            continue
        entails = 0
        cossim = 1 - cosine(metrics.mean(w1), metrics.mean(w2))
        if cossim > theta:
            s = 1 - ((metrics.dispersion(w1) + alpha) / metrics.dispersion(w2))
            if s > 0:
                entails = 1
        correct += int(entails == true_label)
        sum += 1
    print('Dispersion: {0}'.format(correct / float(sum)))

if args.centroid:
    correct = 0
    sum = 0
    for i in tqdm(range(len(wbless.pairs))):
        w1, w2 = wbless.pairs[i]
        true_label = wbless.truth_vals[i]
        if w1 not in words or w2 not in words:
Example #53
def test_cosine_regression_loss():
    logging.debug('Testing cosine regression loss')

    np.random.seed(1234)
    model = CosineRegressionDirectionGetter(3)

    logging.debug("  - Identical vectors x: expecting -1")
    a = np.array([1, 0, 0])
    b = np.array([1, 0, 0])
    expected = np.array(-1)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug("  - Identical vectors y: expecting -1")
    a = np.array([0, 1, 0])
    b = np.array([0, 1, 0])
    expected = np.array(-1)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug("  - Identical vectors z: expecting -1")
    a = np.array([0, 0, 1])
    b = np.array([0, 0, 1])
    expected = np.array(-1)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug("  - Vectors with same angle: expecting -1")
    scales = np.random.random(20) * 20
    for s in scales:
        a = np.array([1, 0, 0])
        b = a * s
        expected = np.array(-1)
        value = _compute_loss_tensor(a, b, model)
        assert_equal(value, expected)

    logging.debug("  - Vectors with at 90 degrees 1: expecting 0")
    a = np.array([1, 0, 0])
    b = np.array([0, 1, 0])
    expected = np.array(0)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug("  - Vectors with at 90 degrees 2: expecting 0")
    a = np.array([1, 0, 0])
    b = np.array([0, 0, 1])
    expected = np.array(0)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug("  - Vectors with at 90 degrees random: expecting 0")
    for _ in range(20):
        a = _get_random_vector(3)
        b = _get_random_vector(3)
        c = np.cross(a, b)
        expected = np.array(0)

        value = _compute_loss_tensor(a, c, model)
        assert np.allclose(value, expected, atol=tol), \
            "Failed; got: {}; expected: {}".format(value, expected)

        value = _compute_loss_tensor(b, c, model)
        assert np.allclose(value, expected, atol=tol), \
            "Failed; got: {}; expected: {}".format(value, expected)

    logging.debug("  - Vectors with at 180 degrees random: expecting 1")
    for _ in range(20):
        a = _get_random_vector(3)
        b = np.array(-a * (np.random.random() + 1e-3) *
                     np.random.randint(1, 10),
                     dtype=np.float32)
        expected = np.array(1)

        value = _compute_loss_tensor(a, b, model)
        assert np.allclose(value, expected, atol=tol), \
            "Failed; got: {}; expected: {}".format(value, expected)

    logging.debug("  - Random vectors: comparing with cosine.")
    for _ in range(200):
        a = _get_random_vector(3)
        b = _get_random_vector(3)
        # model outputs -cos(a,b), but cosine computes 1-cos(a,b)
        expected = cosine(a, b) - 1

        value = _compute_loss_tensor(a, b, model)
        assert np.allclose(value, expected, atol=tol), \
            "Failed; got: {}; expected: {}".format(value, expected)
Example #54
 def similarity(self, emb1, emb2):
     return cosine(emb1, emb2)
Example #55
    def compute_metrics(self, rendition_frame, next_rendition_frame,
                        reference_frame, next_reference_frame):
        rendition_metrics = {}

        if self.profiling:

            self.evaluate_cross_correlation_instant = self.cpu_profiler(
                self.evaluate_cross_correlation_instant)
            self.evaluate_dct_instant = self.cpu_profiler(
                self.evaluate_dct_instant)
            self.evaluate_entropy_instant = self.cpu_profiler(
                self.evaluate_entropy_instant)
            self.evaluate_lbp_instant = self.cpu_profiler(
                self.evaluate_lbp_instant)
            self.evaluate_difference_canny_instant = self.cpu_profiler(
                self.evaluate_difference_canny_instant)
            self.evaluate_difference_instant = self.cpu_profiler(
                self.evaluate_difference_instant)
            self.evaluate_spatial_complexity = self.cpu_profiler(
                self.evaluate_spatial_complexity)
            self.evaluate_gaussian_instant = self.cpu_profiler(
                self.evaluate_gaussian_instant)
            self.evaluate_gaussian_difference_instant = self.cpu_profiler(
                self.evaluate_gaussian_difference_instant)
            self.evaluate_mse_instant = self.cpu_profiler(
                self.evaluate_mse_instant)
            self.evaluate_psnr_instant = self.cpu_profiler(
                self.evaluate_psnr_instant)
            self.evaluate_ssim_instant = self.cpu_profiler(
                self.evaluate_ssim_instant)
            self.rescale_pair = self.cpu_profiler(self.rescale_pair)

        # Some metrics only need the luminance channel
        reference_frame_gray = reference_frame
        rendition_frame_gray = rendition_frame
        next_rendition_frame_gray = next_rendition_frame

        for metric in self.metrics_list:

            if metric == 'temporal_histogram_distance':
                rendition_metrics[metric] = self.histogram_distance(
                    reference_frame, rendition_frame)

            if metric == 'temporal_difference':
                # Compute the temporal inter frame difference
                rendition_metrics[metric] = self.evaluate_difference_instant(
                    rendition_frame_gray, next_rendition_frame_gray)

            if metric == 'temporal_psnr':
                # Compute the temporal inter frame psnr
                rendition_metrics[metric] = self.evaluate_psnr_instant(
                    reference_frame_gray, rendition_frame_gray)

            if metric == 'temporal_ssim':
                # Compute the temporal inter frame ssim
                rendition_metrics[metric] = self.evaluate_ssim_instant(
                    reference_frame_gray, rendition_frame_gray)

            if metric == 'temporal_mse':
                # Compute the temporal inter frame mse
                rendition_metrics[metric] = self.evaluate_mse_instant(
                    reference_frame_gray, rendition_frame_gray)

            if metric == 'temporal_canny':
                # Compute the temporal inter frame difference of the canny version of the frame
                rendition_metrics[
                    metric] = self.evaluate_difference_canny_instant(
                        reference_frame_gray, rendition_frame_gray)

            if metric == 'temporal_cross_correlation':
                rendition_metrics[
                    metric] = self.evaluate_cross_correlation_instant(
                        reference_frame_gray, rendition_frame_gray)

            if metric == 'temporal_dct':
                rendition_metrics[metric] = self.evaluate_dct_instant(
                    reference_frame_gray, rendition_frame_gray)

            if metric == 'temporal_gaussian':
                rendition_metrics[metric] = self.evaluate_gaussian_instant(
                    reference_frame_gray, rendition_frame_gray)

            if metric == 'temporal_gaussian_difference':
                rendition_metrics[
                    metric] = self.evaluate_gaussian_difference_instant(
                        reference_frame_gray, rendition_frame_gray)

            if metric == 'temporal_spatial_complexity':
                rendition_metrics[metric] = self.evaluate_spatial_complexity(
                    reference_frame_gray)

            if metric == 'temporal_entropy':
                rendition_metrics[metric] = self.evaluate_entropy_instant(
                    reference_frame_gray, rendition_frame_gray)

            if metric == 'temporal_lbp':
                rendition_metrics[metric] = self.evaluate_lbp_instant(
                    reference_frame_gray, rendition_frame_gray)

            # Compute the hash of the target frame
            rendition_hash = self.dhash(rendition_frame)
            # Extract the dhash for the reference frame
            reference_hash = self.dhash(reference_frame)

            # Compute different distances with the hash
            if metric == 'hash_euclidean':
                rendition_metrics['hash_euclidean'] = distance.euclidean(
                    reference_hash, rendition_hash)
            if metric == 'hash_hamming':
                rendition_metrics['hash_hamming'] = distance.hamming(
                    reference_hash, rendition_hash)
            if metric == 'hash_cosine':
                rendition_metrics['hash_cosine'] = distance.cosine(
                    reference_hash, rendition_hash)

        return rendition_metrics
Example #56
def build_features(data):

    char_model = gensim.models.KeyedVectors.load_word2vec_format(
        '../data/char_w2v.txt')
    word_model = gensim.models.KeyedVectors.load_word2vec_format(
        '../data/word_w2v.txt')
    X = pd.DataFrame()
    X['word_wmd'] = data.apply(
        lambda x: wmd(x['word1'], x['word2'], word_model), axis=1)
    X['char_wmd'] = data.apply(
        lambda x: wmd(x['char1'], x['char2'], char_model), axis=1)
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.word1.values)):
        question1_vectors[i, :] = sent2vec(q, word_model)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.word2.values)):
        question2_vectors[i, :] = sent2vec(q, word_model)

    char_question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.char1.values)):
        char_question1_vectors[i, :] = sent2vec(q, char_model)

    char_question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.char2.values)):
        char_question2_vectors[i, :] = sent2vec(q, char_model)
    #
    X['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                       np.nan_to_num(question2_vectors))
    ]

    X['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
    ]

    X['jaccard_distance'] = [
        jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                        np.nan_to_num(question2_vectors))
    ]

    X['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                         np.nan_to_num(question2_vectors))
    ]

    X['euclidean_distance'] = [
        euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
    ]

    X['minkowski_distance'] = [
        minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                             np.nan_to_num(question2_vectors))
    ]

    X['braycurtis_distance'] = [
        braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                           np.nan_to_num(question2_vectors))
    ]

    X['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
    X['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
    X['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
    X['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

    X['char_skew_q1vec'] = [
        skew(x) for x in np.nan_to_num(char_question1_vectors)
    ]
    X['char_skew_q2vec'] = [
        skew(x) for x in np.nan_to_num(char_question2_vectors)
    ]
    X['char_kur_q1vec'] = [
        kurtosis(x) for x in np.nan_to_num(char_question1_vectors)
    ]
    X['char_kur_q2vec'] = [
        kurtosis(x) for x in np.nan_to_num(char_question2_vectors)
    ]

    return X
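The `wmd` and `sent2vec` helpers used above are not shown; a minimal sketch of what they typically do in this kind of feature recipe (an assumption, not the original code):

import numpy as np

def wmd(s1, s2, model):
    # Word Mover's Distance between two whitespace-tokenised strings.
    return model.wmdistance(str(s1).split(), str(s2).split())

def sent2vec(s, model, dim=300):
    # Average the word vectors of the in-vocabulary tokens; zeros if none are found.
    words = [w for w in str(s).split() if w in model]
    if not words:
        return np.zeros(dim)
    return np.mean([model[w] for w in words], axis=0)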
Example No. 57
0
def get_cosine_distance(self, document_1, document_2):
    # Despite the name, this returns cosine similarity (1 - cosine distance).
    return 1 - distance.cosine(document_1, document_2)
def calc_cosine(source_representation, test_representation):
    # Inputs are expected as 2-D arrays (e.g. shape (1, d)), hence the [0] indexing.
    result = 1 - distance.cosine(source_representation[0], test_representation[0])
    print("Cosine Similarity : {}".format(result))
    return result
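A small usage sketch for calc_cosine, assuming the representations arrive as shape-(1, d) arrays (e.g. the output of a model's predict call):

import numpy as np

source = np.random.rand(1, 128)
test = np.random.rand(1, 128)
calc_cosine(source, test)  # prints and returns the cosine similarity of the two vectors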
Example No. 59
0
# Imports this Django view relies on.
import json
from sys import stdout

import pandas as pd
from django.http import JsonResponse
from scipy.spatial.distance import cosine


def post(request):
    def getSimilarityScore(history, similarities):
        # Similarity-weighted average of the user's attendance history.
        return sum(history * similarities) / sum(similarities)

    # json.loads() no longer accepts an `encoding` argument; decode the request body first.
    received_json_data = json.loads(request.body.decode("utf-8-sig"))
    received_json_data = received_json_data['data']

    df = pd.DataFrame(received_json_data)
    df = df.fillna(1)
    df = pd.DataFrame(df).astype(int)

    df = pd.pivot_table(df,
                        values='attending',
                        index=['userId'],
                        columns=['eventId'])

    df = df.reset_index()
    df = df.fillna(1)
    data_file = df.drop('userId', axis=1)
    # Dataframe for item vs. item similarity scores
    data_item_based_similarity = pd.DataFrame(index=data_file.columns,
                                              columns=data_file.columns)

    data_item_based_similarity.reset_index()
    # Lets fill in those empty spaces with cosine similarities
    # Loop through the columns
    for column in range(0, len(data_item_based_similarity.columns)):
        # Loop through the columns for each column
        for row in range(0, len(data_item_based_similarity.columns)):
            # Fill in placeholder with cosine similarities
            data_item_based_similarity.iloc[column, row] = 1 - cosine(
                data_file.iloc[:, column], data_file.iloc[:, row])
    # Create placeholder items for closest neighbours to an item
    data_neighbours = pd.DataFrame(index=data_item_based_similarity.columns,
                                   columns=range(1, 11))
    # Loop through our similarity dataframe and fill in neighbouring item names
    for column in range(0, len(data_item_based_similarity.columns)):
        data_neighbours.iloc[column, :10] = data_item_based_similarity.iloc[
            0:, column].sort_values(ascending=False)[:10].index

    # Create a place holder matrix for similarities, and fill in the user name column.
    data_similarity = pd.DataFrame(index=df.index, columns=df.columns)
    data_similarity.iloc[:, :1] = df.iloc[:, :1]
    print(len(data_similarity.index))
    # Loop through all rows, skipping the user column, and fill with similarity scores.
    for column in range(1, len(data_similarity.index)):
        stdout.write("\r%d" % column)
        stdout.flush()
        for row in range(1, len(data_similarity.columns)):
            user = data_similarity.index[column]
            event = data_similarity.columns[row]
            # If an event has already been attended, do not recommend it.
            if df.iloc[column][row] == 1:
                data_similarity.iloc[column][row] = 0
            else:
                event_top_names = data_neighbours.loc[event][1:10]
                event_top_sims = data_item_based_similarity.loc[event].sort_values(
                    ascending=False)[1:10]
                user_purchases = data_file.loc[user, event_top_names]
                data_similarity.iloc[column][row] = getSimilarityScore(
                    user_purchases, event_top_sims)
    # Get the top 6 events for each user and store in a DateFrame.
    data_recommend = pd.DataFrame(
        index=data_similarity.index,
        columns=['userId', '1', '2', '3', '4', '5', '6'])
    data_recommend.iloc[0:, 0] = data_similarity.iloc[:, 0]
    # Instead of top event scores, we want to see eventId numbers so they can be passed back to the app.
    for column in range(0, len(data_similarity.index)):
        data_recommend.iloc[column,
                            1:] = data_similarity.iloc[column, :].sort_values(
                                ascending=False).iloc[1:7].index.transpose()
    # Return all recommendations in response to HTTP post to be parsed on the client side.
    print(data_recommend.to_string())
    json_recommend = data_recommend.to_json(orient='index')
    print(json_recommend)
    if json_recommend is not None:
        return JsonResponse(json_recommend, content_type='json', safe=False)
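The per-user score above is a similarity-weighted average of the user's attendance history over an event's nearest neighbours; a toy illustration with made-up values:

import numpy as np

history = np.array([1, 0, 1])              # attended neighbours 1 and 3, not 2
similarities = np.array([0.9, 0.5, 0.2])   # similarity of each neighbour to the candidate event
score = sum(history * similarities) / sum(similarities)
print(score)  # (0.9 + 0.2) / 1.6 = 0.6875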
Example No. 60
0
# Imports this snippet relies on; `cv` is assumed to be a module-level
# sklearn CountVectorizer defined elsewhere (see the sketch after this example).
import re

import pandas as pd
from scipy.spatial.distance import cosine


def quotation_spec_match(quote_limits, spec_limits):
    quotes_dict = {}
    spec_dict = {}
    for quote in quote_limits:
        # Quote limits are expected to look like "<description> <amount>".
        data_splits = re.findall(r'(.*)\s(\d+)', quote)[0]
        if data_splits[0] != '':
            quotes_dict.update({data_splits[0]: data_splits[1]})

    for spec in spec_limits:
        data_splits = re.findall(r'(\d+)\s(.*)', spec)[0]
        if data_splits[1] != '':
            spec_dict.update({data_splits[1]: data_splits[0]})

    quotes_df = pd.DataFrame({
        'Description': list(quotes_dict.keys()),
        'Limit_in_Dollars': list(quotes_dict.values())
    })
    spec_df = pd.DataFrame({
        'Description': list(spec_dict.keys()),
        'Limit_in_Dollars': list(spec_dict.values())
    })

    quotes_df_X = quotes_df['Description'].values
    spec_df_X = spec_df['Description'].values

    quotes_df_X_cv = cv.fit_transform(quotes_df_X).toarray()
    spec_df_X_cv = cv.transform(spec_df_X).toarray()

    quotes_final_df = pd.DataFrame(quotes_df_X_cv,
                                   columns=cv.get_feature_names_out())
    quotes_final_df['Limit_in_Dollars'] = quotes_df['Limit_in_Dollars'].apply(
        pd.to_numeric)
    spec_final_df = pd.DataFrame(spec_df_X_cv, columns=cv.get_feature_names_out())
    spec_final_df['Limit_in_Dollars'] = spec_df['Limit_in_Dollars'].apply(
        pd.to_numeric)

    matched_quotes = []
    matched_spec = []
    matched_limit = []
    for row_in_quote, i in enumerate(quotes_final_df['Limit_in_Dollars']):
        for row_in_spec, j in enumerate(spec_final_df['Limit_in_Dollars']):
            if i == j:
                cos_val = cosine(quotes_final_df.iloc[row_in_quote, :-1],
                                 spec_final_df.iloc[row_in_spec, :-1])
                if cos_val < 0.55:
                    print("Matched Quotes and Specs are below")
                    print("-" * 40)
                    print(quote_limits[row_in_quote], '|||||',
                          spec_limits[row_in_spec])
                    matched_quotes.append(quote_limits[row_in_quote])
                    matched_spec.append(spec_limits[row_in_spec])
                    matched_limit.append("MATCHED")
                    print("=" * 40)

    unmatched_quotes = [i for i in quote_limits if i not in matched_quotes]
    print("\n\nUnmatched Quotes are")
    print('-' * 40)
    print(unmatched_quotes)

    matched_df = pd.DataFrame({
        "Quotations": matched_quotes,
        "Limits": matched_limit,
        "Specifications": matched_spec
    })
    unmatched_df = pd.DataFrame({
        "Quotations": unmatched_quotes,
        "Limits": ["NOT MATCHED" for i in range(len(unmatched_quotes))]
    })
    df = pd.concat([matched_df, unmatched_df], axis=0)

    return df
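A minimal sketch of the module-level `cv` this function relies on, together with a hypothetical call using inputs in the expected "<description> <amount>" and "<amount> <description>" formats:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()  # assumed global; fitted on the quote descriptions inside the function

quotes = ["bodily injury liability 1000000", "property damage 500000"]
specs = ["1000000 bodily injury", "500000 property damage liability"]
result = quotation_spec_match(quotes, specs)
print(result)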