def contextual_full_lijing(s, T, delta=0.9, lamb=0.1):
    """Run the non-cascading linear bandit loop for rounds 1 .. T - 1.

    Each round draws arm features from ``s.new()``, scores every arm with the
    current estimate plus an exploration bonus, asks ``s.oracle`` for a
    recommendation list, plays it with ``s.play`` and updates the ridge
    regression statistics with position weights ``s.gamma ** (2 * k)``.

    Parameters
    ----------
    s : environment object
        Must expose ``cascade`` (falsy), ``d``, ``gamma``, ``L``, ``theta``,
        ``regret`` and the methods ``new()``, ``oracle(U, *params)`` and
        ``play(recc)``.
    T : int
        Upper bound (exclusive) of the round counter.
    delta : float
        Confidence parameter used in the exploration radius.
    lamb : float
        Ridge regularisation strength.

    Returns
    -------
    (reward, regret, similarity)
        Cumulative reward and regret lists (both start with 0) and the cosine
        similarity between ``s.theta`` and the learned ``theta`` (``None``
        when ``s.theta`` is ``None``).
    """
    assert not s.cascade

    theta = np.zeros(s.d)
    beta = 0
    V = lamb * np.eye(s.d)
    b = np.zeros(s.d)
    reward = [0]
    regret = [0]

    for t in range(1, T):
        x, params = s.new()

        # V does not change while the arms are scored, so invert it once per
        # round instead of once per arm.
        V_inv = np.linalg.inv(V)
        U = {arm: theta.dot(x[arm]) + beta * x[arm].dot(V_inv).dot(x[arm])
             for arm in x}

        recc = s.oracle(U, *params)
        r, ctr, z = s.play(recc)

        V += sum([(s.gamma) ** (2 * k) * np.outer(x[recc[k]], x[recc[k]])
                  for k in range(len(recc))])
        b += sum([ctr[k] * x[recc[k]] for k in range(len(recc))])
        theta = np.linalg.inv(V).dot(b)
        beta = np.sqrt(s.d * np.log((1 + t * s.L) / delta)) + np.sqrt(lamb)

        reward.append(reward[-1] + r)
        regret.append(regret[-1] + z)

    logger.info("Lijing play reward {0}/{1}".format(reward[-1], T))
    if s.theta is not None:
        # compute once and reuse for both the log line and the return value
        similarity = 1 - cosine(s.theta, theta)
        logger.info("theta cosine similarity {0}".format(similarity))
    else:
        similarity = None
    if s.regret is True:
        logger.info("regret {0}/{1}".format(regret[-1], T))
    return reward, regret, similarity
def similarity(arg1, test_infile):
    """Correlate model similarities with gold scores from a JSON test file.

    ``arg1`` is either an ``Embeddings`` instance (plain word vectors) or a
    2-tuple ``(lf, emb)`` for matrix-vector composition, where each test item
    side is a pair ``(matrix_word, vector_word)``.  Every test entry ``x``
    supplies the two items in ``x[0]`` / ``x[1]`` and the gold score in
    ``x[2]``.

    Returns the result of ``spearmanr(gold, ours)``.

    Raises
    ------
    TypeError
        If ``arg1`` is neither an ``Embeddings`` nor a 2-tuple.  (The previous
        code *returned* the exception object instead of raising it.)
    """
    # Validate the input before touching the file so bad calls fail fast.
    if isinstance(arg1, tuple):
        if len(arg1) != 2:
            raise TypeError("Invalid input format")
    elif not isinstance(arg1, Embeddings):
        raise TypeError("Invalid input format")

    with smart_open(test_infile, "r") as f:
        test = json.load(f)
    gold = np.array([float(x[2]) for x in test])

    if isinstance(arg1, tuple):
        # we're given a tuple: matrix-vector composition
        lf, emb = arg1
        # augmented matrices carry one extra column for a constant 1 input
        augmented = lf.A.shape[2] == lf.A.shape[1] + 1

        def compose(pair):
            vec = emb.word(pair[1])
            if augmented:
                vec = np.hstack((vec, [1]))
            return np.dot(lf.word(pair[0]), vec)

        ours = np.array([1 - cosine(compose(x[0]), compose(x[1]))
                         for x in test])
    else:
        # we're only given embeddings: do cosine similarity of vectors
        ours = np.array([1 - cosine(arg1.word(x[0]), arg1.word(x[1]))
                         for x in test])
    return spearmanr(gold, ours)
def compute_distance(query_channel, channel, mean_vec, distance_type='eucos'):
    """
    Compute the specified distance type between channels of the mean vector
    and the query image.

    In the caffe library, the FC8 layer consists of 10 channels. Here, we
    compute the distance of each channel (from the query image) with the
    respective channel of the Mean Activation Vector. In the paper, we
    considered a hybrid distance eucos which combines euclidean and cosine
    distance for bounding open space. Alternatively, other distances such as
    euclidean or cosine can also be used.

    Input:
    --------
    query_channel: Particular FC8 channel of query image
    channel: channel number under consideration
    mean_vec: mean activation vector
    distance_type: one of 'eucos', 'euclidean' or 'cosine'

    Output:
    --------
    query_distance : Distance between respective channels

    Raises:
    --------
    ValueError : if distance_type is not one of the supported names
    """
    mean_channel = mean_vec[channel, :]

    if distance_type == 'eucos':
        # the euclidean part is scaled by 1/200 before adding the cosine part
        query_distance = (spd.euclidean(mean_channel, query_channel) / 200.
                          + spd.cosine(mean_channel, query_channel))
    elif distance_type == 'euclidean':
        query_distance = spd.euclidean(mean_channel, query_channel) / 200.
    elif distance_type == 'cosine':
        query_distance = spd.cosine(mean_channel, query_channel)
    else:
        # Previously this only printed a message and then crashed on the
        # unbound return value; fail explicitly instead.
        raise ValueError(
            "distance type not known: enter either of eucos, euclidean or cosine")
    return query_distance
def main(directory, inputfile_name, output_filename):
    """Rebuild a song from pre-recorded notes and encode the result.

    For every segment of the input file, the note (0-11, searched in row 0 of
    the note table) with the smallest cosine distance on ``pitches`` is
    chosen, then the row (0-5) of that note with the smallest cosine distance
    on ``timbre[1]``.  The chosen note is passed to ``matchDuration`` together
    with the song segment and the output collection, which is finally
    assembled and written to ``output_filename``.

    ``directory`` is accepted for interface compatibility but is not used
    here.
    """
    noteList = [[0 for _ in range(12)] for _ in range(6)]
    collect = audio.AudioQuantumList()
    # createAllNotes()
    initNoteList(noteList)
    print(len(noteList))
    print(noteList[0][0].analysis.segments.timbre)

    # was `input_filename`, a name that does not exist in this function
    audiofile = audio.LocalAudioFile(inputfile_name)
    songSegments = audiofile.analysis.segments

    bmp = 10000.0
    bmpi = 0
    bmt = 10000.0
    bmti = 0
    total = len(songSegments)
    for i in range(total):
        # best pitch match among the 12 notes of the first row
        for j in range(12):
            noteSegments = noteList[0][j].analysis.segments
            # integer division: the middle segment index must be an int
            middle = noteSegments[len(noteSegments) // 2]
            pDist = distFinder.cosine(songSegments[i].pitches, middle.pitches)
            if pDist < bmp:
                bmp = pDist
                bmpi = j
        # best timbre match among the 6 rows of the chosen note
        for k in range(6):
            noteSegments = noteList[k][bmpi].analysis.segments
            middle = noteSegments[len(noteSegments) // 2]
            tDist = distFinder.cosine(songSegments[i].timbre[1],
                                      middle.timbre[1])
            if tDist < bmt:
                bmt = tDist
                bmti = k
        # progress as an integer percentage (the old integer division always
        # printed 0)
        print(str(100 * i // total) + '%')
        matchDuration(noteList[bmti][bmpi].analysis.segments,
                      songSegments[i], collect)
        # only the thresholds are reset; the best indexes carry over to the
        # next segment, as before
        bmp = 10000.0
        bmt = 10000.0

    out = audio.assemble(collect)
    out.encode(output_filename)
def contextual_cascading_gsherry(s, T, delta=0.9, lamb=0.1, gamma=None):
    """Run the cascading linear bandit loop with position discount ``gamma``.

    Each round scores the arms, asks ``s.oracle`` for a list, plays it and
    receives a stop position ``c`` from ``s.play``.  Only positions
    ``0 .. min(len(recc), c + 1) - 1`` contribute to the update, each weighted
    by ``gamma ** k``; the regression target of position ``k`` is
    ``(k == c) == s.disj``.

    Parameters
    ----------
    s : environment object
        Must expose ``cascade`` (truthy), ``d``, ``disj``, ``theta``,
        ``regret`` and the methods ``new()``, ``oracle(U, *params, gamma)``
        and ``play(recc)``.
    T : int
        Upper bound (exclusive) of the round counter.
    delta : float
        Confidence parameter used in the exploration radius.
    lamb : float
        Ridge regularisation strength.
    gamma : float
        Position discount; must be truthy.

    Returns
    -------
    (reward, regret, similarity)
        Cumulative reward and regret lists (both start with 0) and the cosine
        similarity between ``s.theta`` and the learned ``theta`` (``None``
        when ``s.theta`` is ``None``).
    """
    assert gamma
    assert s.cascade

    theta = np.zeros(s.d)
    beta = 0
    V = lamb * np.eye(s.d)
    ldV = np.linalg.slogdet(V)[1]
    X = np.zeros((1, s.d))
    Y = np.zeros(1)
    reward = [0]
    regret = [0]

    for t in range(1, T):
        x, params = s.new()

        # V is constant while scoring, so invert it once per round
        V_inv = np.linalg.inv(V)
        U = {arm: theta.dot(x[arm]) + beta * x[arm].dot(V_inv).dot(x[arm])
             for arm in x}

        # the last entry of params is replaced by this run's gamma
        recc = s.oracle(U, *params[:-1], gamma)
        r, c, z = s.play(recc)

        observed = range(min(len(recc), c + 1))
        V += sum([gamma ** (2 * k) * np.outer(x[recc[k]], x[recc[k]])
                  for k in observed])
        X = np.concatenate(
            [X] + [gamma ** k * x[recc[k]].reshape(1, s.d) for k in observed])
        Y = np.concatenate(
            [Y] + [gamma ** k * ((k == c) == s.disj) * np.ones(1)
                   for k in observed])
        theta = np.linalg.inv(X.T.dot(X) + lamb * np.eye(s.d)).dot(X.T.dot(Y))
        beta = (np.sqrt(np.linalg.slogdet(V)[1] - ldV - 2 * np.log(delta))
                + np.sqrt(lamb))

        reward.append(reward[-1] + r)
        regret.append(regret[-1] + z)

    logger.info("Sherry play reward {0}/{1}, gamma={2}".format(reward[-1], T, gamma))
    if s.theta is not None:
        # compute once and reuse for both the log line and the return value
        similarity = 1 - cosine(s.theta, theta)
        logger.info("theta cosine similarity {0}".format(similarity))
    else:
        similarity = None
    if s.regret is True:
        logger.info("regret {0}/{1}".format(regret[-1], T))
    return reward, regret, similarity
def doc_similarity(self):
    """Return the pairwise cosine-distance matrix of the rows of ``self.D``.

    Entry ``[row][col]`` is ``cosine(self.D[row], self.D[col])`` (a distance:
    0 for identical directions).  Each value is also printed as it is
    computed, as before.

    The result is now sized by the number of rows that are actually iterated.
    It used to be sized by ``self.D.shape[1]``, which raised ``IndexError``
    whenever ``self.D`` had more rows than columns.
    NOTE(review): if the documents are meant to be the *columns* of ``D``,
    the loops should iterate ``self.D.T`` instead - confirm with the caller.
    """
    n_rows = len(self.D)
    sim_matrix = [[0] * n_rows for _ in range(n_rows)]
    for row, vec1 in enumerate(self.D):
        for col, vec2 in enumerate(self.D):
            dist = cosine(vec1, vec2)  # computed once, printed and stored
            print(dist)
            sim_matrix[row][col] = dist
    return sim_matrix
def print_transform_similarities(dictionary, transform_mtx, mtx_from, mtx_to): tr_tr = transform_mtx.transpose() for i, word in enumerate(dictionary): src = mtx_from[i] tgt = mtx_to[i] dist1 = 1 - cosine(transform_mtx.dot(src), tgt) dist2 = 1 - cosine(tr_tr.dot(tgt), src) print(u'{0}\t{1}\t{2}'.format(word, dist1, dist2).encode('utf8'))
def least_cos_dist(sent_emb, mark2ee):
    """Return the key of ``mark2ee`` whose vector is closest to ``sent_emb``.

    Closeness is cosine distance.  A candidate only wins when its distance is
    strictly below the current best, which starts at 1.0; if nothing beats
    that threshold (or ``mark2ee`` is empty) the default ``'marker_0'`` is
    returned.
    """
    pred = 'marker_0'
    dist = 1.
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
    for key, value in mark2ee.items():
        candidate = cosine(sent_emb, value)  # evaluate once per candidate
        if candidate < dist:
            pred = key
            dist = candidate
    return pred
def weight_cost_func_bow_topics(self, s, t):
    """Blend bag-of-words and topic cosine distances of two nodes.

    The bag-of-words distance (weight 0.8) is taken as 1 when either
    ``'bow'`` vector is entirely zero; the topic distance (weight 0.2) is the
    cosine distance of the two ``'topics'`` entries.
    """
    bow_s = numpy.array(s['bow'].todense()).ravel()
    bow_t = numpy.array(t['bow'].todense()).ravel()

    if bow_s.any() and bow_t.any():
        bow_dist = cosine(bow_s, bow_t)
    else:
        bow_dist = 1

    topic_dist = cosine(s['topics'], t['topics'])
    return (0.2 * topic_dist + 0.8 * bow_dist)
def testCosDistance(self):
    """cos_distance of two equally shaped batches matches SciPy row by row."""
    lhs = numpy.array([[1., 2., 3., 4.], [1., 1., 1., 1.]],
                      dtype=numpy.float32)
    rhs = numpy.array([[5., 6., 7., -8.], [1., 1., 1., 1.]],
                      dtype=numpy.float32)
    actual = self.Run(functions.cos_distance(lhs, rhs))
    # reference values: one SciPy cosine distance per row pair
    expected = numpy.array(
        [distance.cosine(u, v) for u, v in zip(lhs, rhs)])
    testing.assert_allclose(actual[0], expected, rtol=TOLERANCE)
def compute_similarity(self, user1, user2):
    """Return the cosine similarities of two users' ``h1``/``h2``/``h3`` vectors.

    The result is the tuple ``(h1_sim, h2_sim, h3_sim)`` where each entry is
    ``1 - cosine_distance`` of the corresponding vectors (so these are
    similarities, not distances).
    """
    # the unreachable `pass` that followed the return has been removed
    return tuple(1 - cosine(user1[key], user2[key])
                 for key in ('h1', 'h2', 'h3'))
def testCosDistanceWithBroadcast(self):
    """cos_distance pairs each row of every n1 batch with the same n2 row."""
    n1 = numpy.array([[[1., 2., 3., 4.], [1., 1., 1., 1.]],
                      [[5., 6., 7., 8.], [1., 1., 1., 2.]]],
                     dtype=numpy.float32)
    n2 = numpy.array([[5., 6., 7., -8.], [1., 1., 1., 1.]],
                     dtype=numpy.float32)
    out = self.Run(functions.cos_distance(n1, n2))
    # reference: for every batch of n1, row k is compared with n2[k]
    expected = numpy.array(
        [[distance.cosine(row, ref) for row, ref in zip(batch, n2)]
         for batch in n1])
    testing.assert_allclose(expected, out[0], atol=TOLERANCE)
def triplet(w1, w2, w3):
    """Print the cosine similarity of ``w1`` with ``w2`` and with ``w3``.

    Vectors are read from ``myembed`` via ``word2idx``.  If any word is
    missing from ``word2idx`` a message is printed and nothing else happens.
    """
    try:
        sim12 = 1 - cosine(myembed[word2idx[w1], :], myembed[word2idx[w2], :])
        sim13 = 1 - cosine(myembed[word2idx[w1], :], myembed[word2idx[w3], :])
    except KeyError as err:
        # `as err` replaces sys.exc_value, which only exists on Python 2
        print("Word %s is not present in Vocablury" % err)
        return
    print("Similarity between " + w1 + " and " + w2 + " --> " + str(sim12))
    print("Similarity between " + w1 + " and " + w3 + " --> " + str(sim13))
def append_que(que, p_xws, p_d, n_xws, n_ds, xf, xt, debug, model):
    """Extend ``que`` with one ``(path, distance)`` entry per next word.

    Each entry is ``(p_xws + [word], p_d + step)`` for the paired items of
    ``n_xws`` and ``n_ds``.  With ``debug`` set, the path is printed followed
    by the accumulated distance and the cosine distances of ``model[word]``
    to ``xf`` and ``xt``.  The (mutated) ``que`` is returned.
    """
    for word, step in zip(n_xws, n_ds):
        path = p_xws + [word]
        total = p_d + step
        que.append((path, total))
        if not debug:
            continue
        for token in path:
            print(u'{} '.format(token), end='')
        dist_from = ssd.cosine(xf, model[word])
        dist_to = ssd.cosine(xt, model[word])
        print(u'\t d:{0:.3f} (di:{1:.3f}, dt:{2:.3f})'.
              format(total, dist_from, dist_to))
    return que
def simplePredictFor2(review, lexicons):
    """Predict label 0 or 1 for ``review`` by nearest reference vector.

    The review is vectorised over the concatenation of the FEATURE0 and
    FEATURE4 feature lists and compared (cosine distance) with
    ``vectors[0]`` and ``vectors[4]``.  Label 0 is returned when
    ``vectors[0]`` is strictly closer; a tie or a closer ``vectors[4]``
    yields 1.
    """
    combined_features = loadFeature(FEATURE0) + loadFeature(FEATURE4)
    review_vector = getReviewVector(review, combined_features, lexicons)
    dist_to_0 = distance.cosine(vectors[0], review_vector)
    dist_to_4 = distance.cosine(vectors[4], review_vector)
    # equal distances resolve to label 1, matching the original lookup table
    # in which the second entry overwrote the first on a tie
    if dist_to_4 <= dist_to_0:
        return 1
    return 0
def compTriplet(w1, w2, w3):
    """Load the embedding and vocabulary, then print two cosine similarities.

    Reads ``embedding.txt`` via ``loadmat`` and ``word2idx.json`` via
    ``load_word2idx`` and prints the similarity of ``w1`` with ``w2`` and
    with ``w3``.  If a word is missing from the vocabulary a message is
    printed and the interpreter exits.
    """
    embed = loadmat('embedding.txt')
    vocab = load_word2idx('word2idx.json')
    try:
        sim12 = 1 - cosine(embed[vocab[w1], :], embed[vocab[w2], :])
        sim13 = 1 - cosine(embed[vocab[w1], :], embed[vocab[w3], :])
    except KeyError as err:
        # `as err` replaces sys.exc_value, which only exists on Python 2
        print("Word %s is not present in Vocablury" % err)
        # sys.exit() is always available; the bare exit() helper is injected
        # by the site module and is missing under `python -S`
        sys.exit()
    print("Similarity between " + w1 + " and " + w2 + " --> " + str(sim12))
    print("Similarity between " + w1 + " and " + w3 + " --> " + str(sim13))
def test_multidimensional_dtw_cosine(self):
    """Two-dimensional DTW with the cosine metric sums the matched distances."""
    a = np.array([[1, 2, 3, np.nan],
                  [7, 8, 9, np.nan]]).T
    b = np.array([[10, 12, 14],
                  [13, 15, 17]]).T
    # DTW should match the points:
    # (1,7) to (10,13)
    # (2,8) to (12,15)
    # (3,9) to (14,17)
    #
    expected_distance = sum(cosine(a[k], b[k]) for k in range(3))
    self.assertAlmostEqual(expected_distance, dtw_std(a, b, metric='cosine'))
def add(tIdx, sIdx, contexts):
    """Average cosine distance of a substitute to a target and its contexts.

    The first term is the distance between ``wvecs[tIdx]`` and
    ``wvecs[sIdx]``; every context found in ``c2vec`` adds the distance
    between its ``cvecs`` row and ``wvecs[sIdx]``.  The mean over all terms
    is returned.
    """
    terms = [cosine(wvecs[tIdx], wvecs[sIdx])]
    for context in contexts:
        if context not in c2vec:
            continue
        terms.append(cosine(cvecs[c2vec[context]], wvecs[sIdx]))
    return sum(terms) / float(len(terms))
def np_loss_cos_cos2(x1, x2, y):
    """Reference (NumPy/SciPy) implementation of the cos / cos^2 pair loss.

    For each row ``i``:

    * ``y[i] == 1`` (same class): half the cosine distance of the pair;
    * ``y[i] == 0`` (different class): the squared cosine similarity,
      i.e. ``(cosine_distance - 1) ** 2``.

    Returns the mean of the per-row losses.

    Raises ``AssertionError`` if the first dimensions differ or a label is
    neither 0 nor 1.
    """
    assert x1.shape[0] == x2.shape[0] == y.shape[0]
    losses = []
    # range works on Python 2 and 3 (xrange is Python 2 only)
    for i in range(x1.shape[0]):
        if y[i] == 1:
            # Data points are the same, use cosine distance
            losses.append(distance.cosine(x1[i], x2[i]) / 2.)
        elif y[i] == 0:
            # Data points are different, use cosine similarity squared
            losses.append((distance.cosine(x1[i], x2[i]) - 1) ** 2)
        else:
            # explicit raise keeps the original exception type but is not
            # stripped when Python runs with -O (unlike `assert False`)
            raise AssertionError("label must be 0 or 1, got %r" % (y[i],))
    return numpy.mean(losses)
def rerank(self, cands, t):
    """Order candidates by cosine distance to the target ``t``.

    Vectors come from ``self.model`` and are passed through
    ``self.selector.transform``.  If the target cannot be vectorised the
    input ``cands`` is returned unchanged; candidates that fail are silently
    dropped.  Duplicated candidates appear once in the result.
    """
    try:
        target_vec = self.selector.transform(self.model[t])
    except Exception:
        return cands

    dist_by_cand = {}
    for cand in cands:
        try:
            cand_vec = self.selector.transform(self.model[cand])
            dist = cosine(target_vec, cand_vec)
        except Exception:
            continue
        dist_by_cand[cand] = dist

    return sorted(dist_by_cand, key=dist_by_cand.get)
def similarity(word_list, id2row):
    """Count correctly solved analogies from ``../corpus/word-test.v1.txt``.

    Each usable line of the test file holds four words ``w1 w2 w3 w4``.  The
    query vector ``vec(w1) - vec(w2) + vec(w3)`` is compared (cosine
    distance) with every known word vector, and the line counts as right when
    the nearest word is ``w4``.  Lines that do not have exactly four tokens,
    or that mention a word without a vector, are skipped.

    NOTE: despite the parameter names, ``id2row`` supplies the words and
    ``word_list`` the matching vectors (they are zipped in that order).

    The number of right answers is printed and, as a convenience, returned.

    NOTE(review): the previous body could not run - it subtracted the word
    strings themselves, relied on a ``word_dict`` that was never created in
    this function, looked vectors up as dictionary keys and compared the
    winning vector with the query instead of the expected answer.  The
    behaviour above is the reconstruction of the apparent intent; confirm the
    analogy formula against the test-file format.
    """
    test_file = 'word-test.v1.txt'
    word_dict = {word: np.asarray(vector, dtype=float)
                 for word, vector in zip(id2row, word_list)}

    with open(os.path.join('../corpus', test_file)) as f:
        test_list = [line.strip().split() for line in f]

    right = 0
    for test in test_list:
        if len(test) != 4 or any(word not in word_dict for word in test):
            continue
        w1, w2, w3, w4 = test
        vec_real = word_dict[w1] - word_dict[w2] + word_dict[w3]

        best_word = None
        best_dist = float('inf')
        for word, vector in word_dict.items():
            curr_dist = cosine(vec_real, vector)
            if curr_dist < best_dist:
                best_dist = curr_dist
                best_word = word

        if best_word == w4:
            right += 1

    print(right)
    return right
def get_dis_corr(self):
    """Return log10 of the mean cosine similarity between paired rows.

    Row ``i`` of ``self.oracle_sim`` is compared with row ``i`` of
    ``self.gen_sim``.  Raises ``ArithmeticError`` when the two collections
    differ in length.
    """
    n_rows = len(self.oracle_sim)
    if n_rows != len(self.gen_sim):
        raise ArithmeticError

    total = 0
    for idx in range(n_rows):
        oracle_row = np.array(self.oracle_sim[idx])
        gen_row = np.array(self.gen_sim[idx])
        total += (1 - cosine(oracle_row, gen_row))
    return np.log10(total / n_rows)
def average_cosine_distance(user_hash, coupon_vector, train_coupon_hash_to_vector_dict, user_hash_to_train_coupon_list, user_buy_and_view, user_hash_to_day_probability):
    """Cosine distance between a coupon and a user's weighted coupon profile.

    The profile is the sum of the user's training coupon vectors, each scaled
    by ``user_hash_to_day_probability[user][coupon] * view`` where ``view``
    comes from ``user_buy_and_view[user][coupon]['view']``; coupons without a
    buy/view record contribute nothing.  The sum is divided by the number of
    training coupons.  Users without training coupons get distance 1.0.
    """
    if user_hash in user_hash_to_train_coupon_list:
        coupons = user_hash_to_train_coupon_list[user_hash]
    else:
        coupons = []
    if len(coupons) == 0:
        return 1.0

    dimension = len(train_coupon_hash_to_vector_dict[coupons[0]])
    profile = numpy.array([0] * dimension)
    for coupon in coupons:
        has_record = (user_hash in user_buy_and_view
                      and coupon in user_buy_and_view[user_hash])
        if not has_record:
            continue
        view = user_buy_and_view[user_hash][coupon]['view']
        weight = user_hash_to_day_probability[user_hash][coupon] * view
        profile = profile + weight * numpy.array(
            train_coupon_hash_to_vector_dict[coupon])

    profile = profile / float(len(coupons))
    return cosine(profile, coupon_vector)
def similarity_matrix(self, transpose=False):
    """
    computes the similarity matrix using cosine similarity

    IN
    transpose (bool) default=False, whether to transpose the adjacency
    matrix; determines similarity between customers or products

    OUT
    matrix of similarity scores (also stored on self.D)
    """
    if transpose:
        # take the transpose of A (see numpy documentation)
        A = self.A.T
        # dimension of the matrix D
        dim = self.n
    else:
        A = self.A.copy()
        # dimension of the matrix D
        dim = self.m

    print('start time')
    print(time.ctime())
    print('computing...')
    start = time.time()

    # lil_matrix allows us to efficiently store data in memory
    # (see scipy documentation)
    D = lil_matrix((dim, dim))
    # range works on Python 2 and 3 (xrange is Python 2 only)
    for i in range(dim):
        if i % 1000 == 0:
            print(float(i) / dim)  # progress as a fraction of rows done
        for j in range(dim):
            # cosine() is a distance; 1 - distance is the similarity
            D[i, j] = 1 - cosine(A[i], A[j])

    # convert D to a dense (rather than sparse) matrix
    self.D = D.todense()

    elapsed = time.time() - start
    # format inside the call: `print(...) % t` fails on Python 3 because
    # print() returns None
    print('finished in %d seconds' % elapsed)
    return self.D
def __init__(self, data, card_names):
    """Pre-compute pairwise cosine distances between cards.

    For every card ``i`` a sorted list of ``(distance, other_card_name)``
    tuples is stored in ``self.dists_per_card``.  ``self.partitions_by_card``
    maps each card name to one list per entry of
    ``self.interesting_quantiles``; list ``k`` holds the other cards whose
    distance is below ``interesting_quantiles[k]``.
    """
    all_dists = []
    dists_per_card = []
    N = len(data)
    for i in range(N):
        dists_for_i = []
        for j in range(N):
            if i != j:
                dist = distance.cosine(data[i], data[j])
                dists_for_i.append((dist, card_names[j]))
                all_dists.append(dist)
        dists_for_i.sort()
        dists_per_card.append(dists_for_i)
    all_dists.sort()

    self.interesting_quantiles = [0.005, 0.01, 0.02, 1.0]
    # NOTE(review): the cut-off index is q * N although all_dists holds
    # N * (N - 1) values, so these are not quantiles of all_dists - confirm
    # whether len(all_dists) was intended.
    self.interesting_dist_cutoffs = [all_dists[int(q * N)]
                                     for q in self.interesting_quantiles]
    self.dists_per_card = dists_per_card
    self.card_names = card_names
    self.partitions_by_card = {}

    n_partitions = len(self.interesting_quantiles)
    for card_name, dists_list in zip(self.card_names, self.dists_per_card):
        # range works on Python 2 and 3 (xrange is Python 2 only)
        card_partitions = [list() for _ in range(n_partitions)]
        for dist, ocard in dists_list:
            for idx, quant in enumerate(self.interesting_quantiles):
                # NOTE(review): distances are compared with the quantile
                # fraction itself, not with interesting_dist_cutoffs[idx] -
                # kept as is, but verify this is intended.
                if dist < quant:
                    card_partitions[idx].append(ocard)
        self.partitions_by_card[card_name] = card_partitions
def calc(self):
    """Fill ``self.calculated`` with pairwise document scores.

    Keys are ``'<key_i>_<key_j>'`` for every pair ``i < j`` in the order of
    ``self.key_list``.

    * ``'lda_cosine'``: cosine distance of the two corpus vectors; the
      concatenated vectors are also stored in ``self.raw``.
    * ``'word2vec_cosine'``: not implemented (the branch was empty, which
      made the whole function a syntax error).
    * ``'kernel'``: entries of the PyML spectrum-kernel matrix.

    Raises ``KeyError`` for any other ``self.sim_type`` and
    ``NotImplementedError`` for ``'word2vec_cosine'``.
    """
    from numpy import append
    from scipy.spatial.distance import cosine

    keys = self.key_list
    if self.sim_type == 'lda_cosine':
        for i in range(len(keys) - 1):
            for j in range(i + 1, len(keys)):
                first = self.corpus[keys[i]]
                second = self.corpus[keys[j]]
                out_tag = keys[i] + '_' + keys[j]
                self.calculated[out_tag] = cosine(first, second)
                self.raw[out_tag] = append(first, second)
    elif self.sim_type == 'word2vec_cosine':
        raise NotImplementedError(
            "similarity type 'word2vec_cosine' is not implemented")
    elif self.sim_type == 'kernel':  # was the bare, undefined name sim_type
        from PyML import sequenceData
        # Documents are taken in key_list order so that row/column i of the
        # kernel matrix belongs to keys[i]; previously the documents were
        # sorted while the tags were not, which mislabelled the scores.
        docs = [self.corpus[key] for key in keys]
        # NOTE(review): `k` is not defined in this method; it has to come
        # from the enclosing module - confirm where it is set.
        kernel = sequenceData.spectrum_data(docs, k)
        mat = kernel.getKernelMatrix()
        for i in range(len(docs)):
            for j in range(i + 1, len(docs)):
                tag = keys[i] + '_' + keys[j]
                self.calculated[tag] = mat[i][j]
    else:
        raise KeyError('Please check your similarity type!')
def calc_dist(e1, e2, mode=1):
    """Return the distance between ``e1`` and ``e2`` selected by ``mode``.

    ``1`` is euclidean, ``2`` is city-block and ``3`` is cosine distance.
    Any other mode yields ``None``.
    """
    metrics_by_mode = (
        (1, ssd.euclidean),
        (2, ssd.cityblock),
        (3, ssd.cosine),
    )
    for code, metric in metrics_by_mode:
        if mode == code:
            return metric(e1, e2)
    return None
def _cosine(self, word, n=10):
    """
    Test method for cosine distance using `scipy.spatial.distance.cosine`

    Note: This method is **a lot** slower than `self.cosine` and the results
    are almost the same, you should be using `self.cosine`

    Requires: `__init__(..., save_memory=False)`

    Parameters
    ----------
    word : string
        word in the vocabulary to calculate the vectors
    n : int, optional (default 10)
        number of neighbors to return
    """
    from scipy.spatial import distance

    query = self[word]
    distances = np.empty(self.vocab.shape)
    for position, candidate in enumerate(self.vectors):
        distances[position] = distance.cosine(query, candidate)
    # the first entry of the ordering is skipped; the next n are kept
    ordering = distances.argsort()
    nearest = ordering[1:n + 1]
    return self.generate_response(nearest, distances)
def get(self, height, weight, city, state):
    """Return the IDs of the five players most similar to the given user.

    The user is described by height, weight and the geocoded latitude and
    longitude of ``"<city> <state>"``; similarity is the cosine similarity
    with the same four columns of ``newamazingdata.csv``.  The stringified ID
    list (most similar first) is JSON-encoded, written to ``reply.json`` and
    returned.

    Raises ``ValueError`` when the location cannot be geocoded.
    """
    combined = pd.read_csv("newamazingdata.csv", encoding='ISO-8859-1')

    location = str(city) + ' ' + str(state)
    geolocator = Nominatim()
    # Geocode the whole string; the previous code passed location[0], i.e.
    # only the first character of the city name.
    place = geolocator.geocode(location)
    if place is None:
        raise ValueError("could not geocode location: {0!r}".format(location))

    users = [float(height), float(weight), place.latitude, place.longitude]
    players = combined[["height", "weight", "latitude", "longitude"]]

    result = [1 - distance.cosine(users, row) for row in players.values]

    # indexes of the five highest similarities, best first
    top = sorted(range(len(result)), key=lambda i: result[i])[-5:]
    top.reverse()
    ids = str([combined.ID.iloc[index] for index in top])

    with open('reply.json', 'w') as outfile:
        # the payload is encoded twice (dumps, then dump), as before, so the
        # file content and the returned value keep their existing format
        json_stuff = json.dumps(ids)
        json.dump(json_stuff, outfile)
    return json_stuff
def get_nearest_words(target_vec, k=20):
    """Return the ``k`` words whose vectors are closest to ``target_vec``.

    Parameters
    ----------
    target_vec : array-like
        Query vector, compared with every entry of ``wordvecs``.
    k : int, optional
        Number of neighbours to return.

    Returns
    -------
    list of (word, distance)
        Pairs taken from ``words``, ordered by increasing cosine distance.
    """
    # Get distances to target vector
    dists = [distance.cosine(target_vec, vec) for vec in wordvecs]

    # Get top nearest words
    nearest = np.argsort(dists)[:k]
    return [(words[idx], dists[idx]) for idx in nearest]
def create_user_version(tA, tB):
    """Simulate one random "version" of two users and record three statistics.

    A random number (0-100, capped at the number of available ratings) of
    distinct rating indexes is drawn for each user.  The function then
    appends to the module-level lists:

    * ``intersections``: how many indexes both users rated;
    * ``Pearsons``: the Pearson correlation over the commonly rated items;
    * ``Cosines``: the cosine similarity of the mean-centred vectors built
      from every item rated by at least one of the two users (0 where a
      user has no rating).
    """
    # The number of movies this version of each user rated
    user_A_ratings = random.randrange(0, 101)
    user_B_ratings = random.randrange(0, 101)

    # random.sample draws distinct indexes directly.  The previous rejection
    # loop never terminated when more ratings were requested than exist.
    A_indexes = random.sample(range(len(tA)), min(user_A_ratings, len(tA)))
    B_indexes = random.sample(range(len(tB)), min(user_B_ratings, len(tB)))
    rated_by_A = set(A_indexes)
    rated_by_B = set(B_indexes)

    # Common element indexes (sorted only to make the pairing deterministic)
    common = sorted(rated_by_A & rated_by_B)
    intersections.append(len(common))
    A_common = [tA[idx] for idx in common]
    B_common = [tB[idx] for idx in common]

    # Pearson correlation uses only the common elements, without the mean
    # subtraction
    Pearsons.append(pearsonr(A_common, B_common)[0])

    # Adjusted cosine similarity: build the full tables first
    full_tA = []
    full_tB = []
    for y in range(len(tA)):
        in_A = y in rated_by_A
        in_B = y in rated_by_B
        if not (in_A or in_B):
            continue
        full_tA.append(tA[y] if in_A else 0)
        full_tB.append(tB[y] if in_B else 0)

    # Centre each vector on its own mean.  The mean is taken once, before any
    # element changes; the old loop recomputed it on the half-updated list,
    # so every element was shifted by a different amount.
    if full_tA:
        mean_A = float(mean(full_tA))
        full_tA = [float(value) - mean_A for value in full_tA]
    if full_tB:
        mean_B = float(mean(full_tB))
        full_tB = [float(value) - mean_B for value in full_tB]

    Cosines.append(1 - cosine(full_tA, full_tB))
def cosine(xy):
    """Return the cosine distance of the pair ``xy == (x, y)``.

    Takes a single 2-item argument so it can be mapped over a sequence of
    pairs.  The pair is unpacked in the body because tuple parameters in the
    signature are a syntax error on Python 3.
    """
    x, y = xy
    return distance.cosine(x, y)
# Drop any column named "user"
data_item_base = data.drop('user', axis=1)
data = data.drop('user', axis=1)
# `.ix` was removed from pandas; `.iloc` is the positional equivalent used
# below (assumes the item columns have non-integer labels, for which `.ix`
# with integers was positional as well - TODO confirm).
print(data_item_base.iloc[:, 0:10])

# Item-by-item frame that will store the similarities
data_item_base_frame = pd.DataFrame(index=data_item_base.columns,
                                    columns=data_item_base.columns)
print(data_item_base_frame.head(6).iloc[:, 0:6])

# Calculate similarity
n_items = len(data_item_base_frame.columns)
for i in range(0, n_items):
    # Loop through the columns for each column
    for j in range(0, n_items):
        # Cosine similarity of item columns i and j
        data_item_base_frame.iloc[i, j] = 1 - cosine(data.iloc[:, i],
                                                     data.iloc[:, j])

data_item_base_frame.to_csv('data_item_base_frame.csv')
# data_item_base_frame = pd.read_csv('data_item_base_frame.csv')
print(data_item_base_frame.iloc[:, 0:5])

# Frame that stores the closest neighbours of each item (columns 1..5)
data_neighbors = pd.DataFrame(index=data_item_base_frame.columns,
                              columns=range(1, 6))

# Order by similarity and keep the five most similar item names
for i in range(0, n_items):
    data_neighbors.iloc[i, :5] = data_item_base_frame.iloc[:, i].sort_values(
        ascending=False).iloc[:5].index

# label slice 0:5 on the integer column labels selects columns 1..5
data_neighbors.loc[:, 0:5].to_csv('dataresult.csv')
def explore_embedding_space(embedding_fn: str, out_fn: str, num_samples=1000) -> None:
    """
    Calculate the following statistics for each layer of the model:

    1. mean cosine similarity between a sentence and its words
    2. mean cosine similarity between randomly sampled words
    3. mean and standard deviation of the word embedding norm
    4. variance explained by the first principal component for a random
       sample of words

    num_samples sentences/words are used to estimate each of these metrics.
    We randomly sample words by first uniformly randomly sampling sentences
    and then uniformly randomly sampling a single word from each sampled
    sentence. This is because:

    - When we say we are interested in the similarity between random words,
      what we really mean is the similarity between random _word
      occurrences_ (since each word has a unique vector based on its
      context).
    - By explicitly sampling from different contexts, we avoid running into
      cases where two words are similar due to sharing the same context.

    Create a dictionary mapping each statistic to a dictionary of per-layer
    values and write it to out_fn as JSON.
    """
    # the context manager closes the HDF5 file even if a layer fails
    with h5py.File(embedding_fn, 'r') as f:
        num_layers = f["0"].shape[0]
        num_sentences = len(f)
        sentence_indices = random.sample(list(range(num_sentences)), num_samples)

        mean_cos_sim_between_sent_and_words = {
            f'layer_{layer}': [] for layer in range(num_layers)
        }
        mean_cos_sim_across_words = {
            f'layer_{layer}': -1 for layer in range(num_layers)
        }
        word_norm_std = {f'layer_{layer}': -1 for layer in range(num_layers)}
        word_norm_mean = {f'layer_{layer}': -1 for layer in range(num_layers)}
        variance_explained_random = {
            f'layer_{layer}': -1 for layer in range(num_layers)
        }

        for layer in Tqdm.tqdm(range(num_layers)):
            word_vectors = []
            word_norms = []
            mean_cos_sims = []

            for sent_index in sentence_indices:
                dataset = f[str(sent_index)]
                # read this sentence's vectors for the layer once instead of
                # hitting the file again for every single word
                layer_vectors = dataset[layer]
                num_words = dataset.shape[1]

                # average word vectors to get sentence vector
                sentence_vector = layer_vectors.mean(axis=0)

                # randomly add a word vector (not all of them, because that
                # would bias towards longer sentences); copy it so the whole
                # sentence array is not kept alive by a view
                chosen = random.choice(list(range(num_words)))
                word_vectors.append(np.array(layer_vectors[chosen]))

                # what is the mean cosine similarity between the sentence and
                # its words?
                mean_cos_sim = np.nanmean([
                    1 - cosine(layer_vectors[i], sentence_vector)
                    for i in range(num_words)
                    if layer_vectors[i].shape != ()
                ])
                mean_cos_sims.append(round(mean_cos_sim, 3))

                # what is the mean embedding norm across words?
                word_norms.extend(
                    np.linalg.norm(layer_vectors[i]) for i in range(num_words))

            mean_cos_sim_between_sent_and_words[f'layer_{layer}'] = round(
                float(np.mean(mean_cos_sims)), 3)
            mean_cos_sim_across_words[f'layer_{layer}'] = round(
                float(np.nanmean([
                    1 - cosine(random.choice(word_vectors),
                               random.choice(word_vectors))
                    for _ in range(num_samples)
                ])), 3)
            word_norm_std[f'layer_{layer}'] = round(float(np.std(word_norms)), 3)
            word_norm_mean[f'layer_{layer}'] = round(float(np.mean(word_norms)), 3)

            # how much of the variance in randomly chosen words can be
            # explained by their first principal component?
            pca = TruncatedSVD(n_components=100)
            pca.fit(word_vectors)
            variance_explained_random[f'layer_{layer}'] = min(
                1.0, round(float(pca.explained_variance_ratio_[0]), 3))

    results = {
        'mean cosine similarity between sentence and words':
        mean_cos_sim_between_sent_and_words,
        'mean cosine similarity across words': mean_cos_sim_across_words,
        'word norm std': word_norm_std,
        'word norm mean': word_norm_mean,
        'variance explained for random words': variance_explained_random
    }
    # close (and flush) the output file deterministically
    with open(out_fn, 'w') as out_file:
        json.dump(results, out_file, indent=1)
def embedding_average(ref, hypo):
    """Return the cosine similarity of the mean rows of ``ref`` and ``hypo``.

    Both inputs are 2-D arrays whose rows are averaged (axis 0).  The old
    code summed over axis 0 but divided by ``shape[1]``, so the intermediate
    vectors were not averages; cosine similarity ignores scale, so the score
    itself is unchanged up to rounding.
    """
    ref_avg = np.mean(ref, axis=0)
    hypo_avg = np.mean(hypo, axis=0)
    return 1 - cosine(ref_avg, hypo_avg)
def strangeness_cosine(x, samples):
    """Cosine distance between ``x`` and ``representative_sample(samples)``."""
    return cosine(x, representative_sample(samples))
def cos_distance(self, X, Y):
    """Return the cosine distance from vector ``X`` to every row of ``Y``.

    ``X`` must be one-dimensional and as long as the first row of ``Y``.
    The result is a plain list with one distance per row.
    """
    assert (len(X.shape) == 1)
    assert (len(X) == len(Y[0]))
    distances = []
    for row in Y:
        distances.append(cosine(X, row))
    return distances
def __calc_distances__(self, v1s, v2s, is_sparse=True):
    """Build pairwise distance/shape features for two aligned vector sets.

    Row ``i`` of the result describes the pair ``(v1s[i], v2s[i])`` with the
    columns: cosine, city-block, Canberra, euclidean, Minkowski (p=3) and
    Bray-Curtis distances, followed by the absolute differences of the two
    vectors' skewness and kurtosis.

    With ``is_sparse`` the rows are sparse matrices; they are converted to
    flat dense arrays once (instead of once per metric) so that the SciPy
    distance functions receive the 1-D input they require.
    """
    if is_sparse:
        rows1 = [x.toarray().ravel() for x in v1s]
        rows2 = [x.toarray().ravel() for x in v2s]
    else:
        rows1 = list(v1s)
        rows2 = list(v2s)
    pairs = list(zip(rows1, rows2))

    def column(values):
        # one feature as an (n, 1) column
        return np.array(values).reshape((-1, 1))

    dcosine = column([cosine(x, y) for (x, y) in pairs])
    dcityblock = column([cityblock(x, y) for (x, y) in pairs])
    dcanberra = column([canberra(x, y) for (x, y) in pairs])
    deuclidean = column([euclidean(x, y) for (x, y) in pairs])
    dminkowski = column([minkowski(x, y, 3) for (x, y) in pairs])
    dbraycurtis = column([braycurtis(x, y) for (x, y) in pairs])

    dskew_q1 = np.array([skew(x) for x in rows1])
    dskew_q2 = np.array([skew(x) for x in rows2])
    dkur_q1 = np.array([kurtosis(x) for x in rows1])
    dkur_q2 = np.array([kurtosis(x) for x in rows2])
    dskew_diff = column(np.abs(dskew_q1 - dskew_q2))
    dkur_diff = column(np.abs(dkur_q1 - dkur_q2))

    return np.hstack((dcosine, dcityblock, dcanberra, deuclidean,
                      dminkowski, dbraycurtis, dskew_diff, dkur_diff))
bK1 = bK.split('\t') for x in bK1: bKs.append( float(x)) #list of all the 'k'mer counts for bacteria # d1.append(float(distance.euclidean(vKs,bKs))) # d2.append(float(distance.braycurtis(vKs,bKs))) # d3.append(float(distance.correlation(vKs,bKs))) # d4.append(float(distance.cityblock(vKs,bKs))) # d5.append(float(distance.chebyshev(vKs,bKs))) # d6.append(float(distance.cosine(vKs,bKs))) d1.append(distance.euclidean(vKs, bKs)) d2.append(distance.braycurtis(vKs, bKs)) d3.append(distance.correlation(vKs, bKs)) d4.append(distance.cityblock(vKs, bKs)) d5.append(distance.chebyshev(vKs, bKs)) d6.append(distance.cosine(vKs, bKs)) #hamming distance is only useful if we make them boolean #d7.append(float(distance.hamming(vKs1,bKs1))) #Z = numpy.vstack([vKs,bKs]) #A=numpy.cov(Z, rowvar=False) #B=inv(A) #d7.append(float(distance.mahalanobis(vKs,bKs,B))) bKs = [] vKs = [] #bKs1=[] #vKs1=[] nexd1 = '\t'.join(str(v) for v in d1) nexd2 = '\t'.join(str(v) for v in d2) nexd3 = '\t'.join(str(v) for v in d3) nexd4 = '\t'.join(str(v) for v in d4) nexd5 = '\t'.join(str(v) for v in d5)
#testing cosine similarity , use a random normal distribution cosSimTest = np.zeros((numSamplesToDraw,1)) for i in range(numSamplesToDraw): cosSimTest[i] = 1-dist.cosine(np.random.normal(size=X_hat.shape[2]), np.random.normal(size=X_hat.shape[2])) ''' #Cos Sim Within Same Objs numSamplesToDraw=200 distWithinObjs = np.zeros((numSamplesToDraw,1)) sampleObjIDs_1 = np.random.choice(200,size=(numSamplesToDraw,1),replace=True) sampleTransIDs_1 = np.random.choice((nt-5),size=(numSamplesToDraw,1),replace=True) sampleTransIDs_2 = np.random.choice((nt-5),size=(numSamplesToDraw,1),replace=True) sampleTransIDs_1 = sampleTransIDs_1+5 sampleTransIDs_2 = sampleTransIDs_2+5 for i in range(numSamplesToDraw): distWithinObjs[i] = 1-dist.cosine(X_hat[sampleObjIDs_1[i],sampleTransIDs_1[i],:],X_hat[sampleObjIDs_1[i],sampleTransIDs_2[i],:]) sampledMean_within = np.mean(distWithinObjs) #Cosine Similarity Across Objects distBetweenObjs = np.zeros((numSamplesToDraw,1)) sampleObjIDs_1 = np.random.choice(200,size=(numSamplesToDraw,1),replace=True) sampleObjIDs_2 = np.random.choice(200,size=(numSamplesToDraw,1),replace=True) sampleTransIDs_1 = np.random.choice((nt-5),size=(numSamplesToDraw,1),replace=True) sampleTransIDs_2 = np.random.choice((nt-5),size=(numSamplesToDraw,1),replace=True) sampleTransIDs_1 = sampleTransIDs_1+5 sampleTransIDs_2 = sampleTransIDs_2+5 for i in range(numSamplesToDraw): distBetweenObjs[i] = 1-dist.cosine(X_hat[sampleObjIDs_1[i],sampleTransIDs_1[i],:],X_hat[sampleObjIDs_2[i],sampleTransIDs_2[i],:]) sampledMean_between=np.mean(distBetweenObjs)
import numpy
from scipy.spatial import distance

# Build a document-by-term count matrix and print the cosine distance between
# two of its rows.
#
# NOTE(review): eval() executes whatever lines.txt / colonnes.txt contain.
# This is only acceptable for trusted, locally generated files; if they hold
# plain dict literals, ast.literal_eval is the safe replacement.
with open('lines.txt', 'r') as fh:
    lignes = eval(fh.read())
with open('colonnes.txt', 'r') as fh:
    colonnes = eval(fh.read())

nblignes = len(lignes)
nbcolonnes = 1471
matrix = numpy.zeros((nblignes, nbcolonnes))

# x and y are the 1-based row / column numbers stored in the two mappings
for fichier, x in lignes.items():
    # read each document once; the handle is closed right away
    with open("CorpusPreproc/" + fichier, encoding="utf-8") as fh:
        f = fh.read()
    for mot, y in colonnes.items():
        nb = f.count(mot)
        matrix[int(x) - 1][int(y) - 1] = nb

print(distance.cosine(matrix[5, :], matrix[6, :]))
def get_similarity(repr1, repr2):
    """Return the cosine similarity (1 - cosine distance) of two vectors."""
    cosine_distance = cosine(repr1, repr2)
    return 1 - cosine_distance
import pickle
from scipy.spatial.distance import cosine

# Loads the dictionary of prepositions and their vectors from file, as saved
# in the Jupyter Notebook.
with open('11d_vecs.pkl', 'rb') as loadfile:
    prep_bbox_11d_vecs = pickle.load(loadfile)

prepositions = ['over', 'above', 'below', 'under']

# Gets all cosine distance scores for each preposition and saves the list of
# scores to file.
# This was applied to the three first prepositions (above, below, over), but
# the last (under) was too large and required its own workaround script.
for p in prep_bbox_11d_vecs:
    vecs = prep_bbox_11d_vecs[p]

    # every unordered pair once: vector i against all earlier vectors j < i
    prep_cos = []
    for i, u in enumerate(vecs):
        for j, v in enumerate(vecs):
            if j >= i:
                break
            prep_cos.append(cosine(u, v))

    lst_filename = p + "_list.pkl"
    with open(lst_filename, 'wb') as outfile:
        pickle.dump(prep_cos, outfile, pickle.HIGHEST_PROTOCOL)

    # Progress indication print, with example results.
    print("Done with:", p)
def cosine_distance(x1, x2):
    """Return SciPy's cosine distance between ``x1`` and ``x2``."""
    result = distance.cosine(x1, x2)
    return result
def _mean_word_vector(words, word2vec, oov_vectors):
    """Average the embeddings of *words*.

    A word missing from the model's vocabulary gets a random vector that is
    cached in *oov_vectors*, so every later occurrence reuses the same one.
    """
    template = word2vec.wv['测试']
    total = np.zeros_like(template)
    for word in words:
        if word in word2vec.wv.vocab:
            total += word2vec.wv[word]
        else:
            if word not in oov_vectors:
                oov_vectors[word] = np.random.random(template.shape)
            # The original skipped this addition the first time an unknown
            # word was seen, silently dropping it from that sentence.
            total += oov_vectors[word]
    return total / len(words)


def get_sentence_cos(original_text, title):
    """Score each sentence of a text against a vector for the whole text.

    The text vector is the sum of the sentence vectors, with the first
    sentence counted three times and, when a title with at least one
    non-stop-word is given, the title vector counted three times as well.

    Args:
        original_text: Text to split into sentences.
        title: Optional title; falsy values are ignored.

    Returns:
        ``(sentences, sentences_cos)``: the sentences that still contain a
        word after stop-word removal, and a dict mapping each sentence's
        index in that list to ``cosine(sentence_vec, text_vec)``.

    Raises:
        NameError: if the text yields no sentence, or no sentence keeps any
            word after stop-word removal.
    """
    word2vec = get_word2vec(word2vec_path)
    stop_words = set(get_stop_words(stopwords_path))
    sentences = split_sentence(original_text)
    if not sentences:
        raise NameError

    # Process the title.
    title_words = []
    if title:
        title_words = list(set(cut(''.join(token(title))).split()) - stop_words)

    # Keep each surviving sentence together with its word list, so the
    # returned sentences and the score indices cannot drift apart when a
    # sentence consists only of stop words.
    kept_sentences = []
    kept_words = []
    for sentence in sentences:
        words = list(set(cut(''.join(token(sentence))).split()) - stop_words)
        if words:
            kept_sentences.append(sentence)
            kept_words.append(words)
    # The original tested the *last* sentence's word list here, not the list
    # of kept sentences.
    if not kept_words:
        raise NameError

    # Build the sentence vectors and the text vector.
    oov_vectors = {}
    sentences_vec = []
    text_vec = np.zeros_like(word2vec.wv['测试'])
    for i, words in enumerate(kept_words):
        sentence_vec = _mean_word_vector(words, word2vec, oov_vectors)
        # The first sentence matters most, so it is counted three times.
        if i == 0:
            text_vec += sentence_vec * 3
        else:
            text_vec += sentence_vec
        sentences_vec.append(sentence_vec)

    if title_words:
        # The original added `text_vec * 3` here, which only rescales the
        # text vector and leaves the title without any influence.
        # NOTE(review): the title is averaged like a sentence before being
        # weighted; the original never normalised it -- confirm intent.
        text_vec += _mean_word_vector(title_words, word2vec, oov_vectors) * 3

    # Two extra copies of the first sentence plus three of the title. The
    # scale has no effect on the cosine values computed below.
    text_vec /= len(sentences) + 5

    # Cosine between every sentence vector and the text vector.
    sentences_cos = {}
    for i, sentence_vec in enumerate(sentences_vec):
        sentences_cos[i] = cosine(sentence_vec, text_vec)
    return kept_sentences, sentences_cos
ch_n.head()


# In[28]:


# Chebyshev distance from every row of standardized_dataset to P_std
ch_s = standardized_dataset.apply(chebyshev, axis=1, args=(P_std,))
ch_s.head()


# In[29]:


# Cosine distance from every row of raw_dataset to P_raw
cos_r = raw_dataset.apply(cosine, axis=1, args=(P_raw,))
cos_r.head()


# In[30]:


# Cosine distance from every row of normalized_dataset to P_norm
cos_n = normalized_dataset.apply(cosine, axis=1, args=(P_norm,))
cos_n.head()


# In[31]:


# Cosine Distance, standardized_dataset
def dist(hash_1: np.ndarray, hash_2: np.ndarray) -> float:
    """Return ``distance.cosine`` between the two hash arrays."""
    cosine_dist = distance.cosine(hash_1, hash_2)
    return cosine_dist
return np.asarray(X), lmbda if __name__ == '__main__': np.set_printoptions(formatter={'float': '{: 0.3f}'.format}) nodes = data.load_digits() true_clusters = data.load_digits_clusters() report = [] num_nodes = nodes.shape[0] tm = [] print "Create edges from cosine similarity..." if not os.path.exists("outputs/q7/intermediates/cosine.npy"): for idx1 in range(num_nodes): for idx2 in range(idx1 + 1, num_nodes): tm.append([idx1, idx2, cosine(nodes[idx1], nodes[idx2])]) tm = np.asarray(tm) np.save('outputs/q7/intermediates/cosine.npy', tm) else: tm = np.load('outputs/q7/intermediates/cosine.npy') print "Done" print "Finding best lambda..." X, lmbda = calculate_lambda_edges(nodes, tm) report += [ "Saving output lambda finder graph to outputs/q7/lambda-finder.png..." ] print report[-1] report += [ "Lambda vs #isolate nodes --> " + np.array2string(X, separator=",") ]
# Step 4: PPMI matrix
# NOTE(review): p_w uses axis=0 sums and p_c axis=1 sums, yet np.outer(p_w, p_c)
# pairs p_w with the row index of p_wc. That only matches the usual PMI
# definition if M1 is symmetric -- confirm.
total = M1.sum()
p_wc = M1 / total
p_w = M1.sum(axis=0) / total
p_c = M1.sum(axis=1) / total
log_ratio = np.log(p_wc / np.outer(p_w, p_c))
# Clip negative values to zero, then replace nan/inf left by the log/division.
M1_plus = np.nan_to_num(threshold(log_ratio, threshmin=0))

# Step 5: SVD, keeping the first 10 / 50 / 100 columns of U
U, s, V = svd(M1_plus)
M2_10 = U[:,:10]
M2_50 = U[:,:50]
M2_100 = U[:,:100]

# Step 6: human scores
S = rg65

# Step 7: model scores -- one minus the cosine of the two word rows, per pair
S_hat = {}
models = ['M1', 'M1_plus', 'M2_10', 'M2_50', 'M2_100']
matrices = {
    'M1': M1,
    'M1_plus': M1_plus,
    'M2_10': M2_10,
    'M2_50': M2_50,
    'M2_100': M2_100,
}
for model in models:
    M = matrices[model]
    scores = []
    for i, (a, b, _) in S.iterrows():
        scores.append(1 - cosine(M[word_index[a],:], M[word_index[b],:]))
    S_hat[model] = np.nan_to_num(scores)

# Step 8: pearson R between the human scores and each model's scores
for model in models:
    r = np.nan_to_num(pearsonr(S['score'].tolist(), S_hat[model])[0])
    print("%s %s" % (model, r))
def cosine_distance(u, v):
    """Return ``distance.cosine(u, v)`` for the two vectors."""
    value = distance.cosine(u, v)
    return value
def cosine(self):
    """Return one minus ``distance.cosine`` of ``self.target`` and ``self.library``."""
    return 1 - distance.cosine(self.target, self.library)
metrics = Metrics(args.input) words = list(metrics.encoding_word_indices.keys()) alpha = .02 theta = .2 if args.dispersion: correct = 0 sum = 0 for i in tqdm(range(len(wbless.pairs))): w1, w2 = wbless.pairs[i] true_label = wbless.truth_vals[i] if w1 not in words or w2 not in words: continue entails = 0 cossim = 1 - cosine(metrics.mean(w1), metrics.mean(w2)) if cossim > theta: s = 1 - ((metrics.dispersion(w1) + alpha) / metrics.dispersion(w2)) if s > 0: entails = 1 correct += int(entails == true_label) sum += 1 print('Dispersion: {0}'.format(correct / float(sum))) if args.centroid: correct = 0 sum = 0 for i in tqdm(range(len(wbless.pairs))): w1, w2 = wbless.pairs[i] true_label = wbless.truth_vals[i] if w1 not in words or w2 not in words:
def test_cosine_regression_loss():
    """Check the loss of a ``CosineRegressionDirectionGetter(3)`` on known cases.

    Every case feeds two 3-d vectors to ``_compute_loss_tensor`` and compares
    the result with the expected value: -1 for vectors pointing the same way,
    0 for perpendicular vectors, 1 for opposite vectors, and
    ``cosine(a, b) - 1`` for random pairs.

    NumPy's RNG is seeded once at the top, so the random cases depend on the
    order in which the statements below draw from it.
    """
    logging.debug('Testing cosine regression loss')

    np.random.seed(1234)
    model = CosineRegressionDirectionGetter(3)

    logging.debug(" - Identical vectors x: expecting -1")
    a = np.array([1, 0, 0])
    b = np.array([1, 0, 0])
    expected = np.array(-1)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug(" - Identical vectors y: expecting -1")
    a = np.array([0, 1, 0])
    b = np.array([0, 1, 0])
    expected = np.array(-1)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug(" - Identical vectors z: expecting -1")
    a = np.array([0, 0, 1])
    b = np.array([0, 0, 1])
    expected = np.array(-1)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug(" - Vectors with same angle: expecting -1")
    # 20 random positive scale factors in [0, 20): b is a scaled copy of a.
    scales = np.random.random(20) * 20
    for s in scales:
        a = np.array([1, 0, 0])
        b = a * s
        expected = np.array(-1)
        value = _compute_loss_tensor(a, b, model)
        assert_equal(value, expected)

    logging.debug(" - Vectors with at 90 degrees 1: expecting 0")
    a = np.array([1, 0, 0])
    b = np.array([0, 1, 0])
    expected = np.array(0)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug(" - Vectors with at 90 degrees 2: expecting 0")
    a = np.array([1, 0, 0])
    b = np.array([0, 0, 1])
    expected = np.array(0)
    value = _compute_loss_tensor(a, b, model)
    assert_equal(value, expected)

    logging.debug(" - Vectors with at 90 degrees random: expecting 0")
    for _ in range(20):
        a = _get_random_vector(3)
        b = _get_random_vector(3)
        # The cross product is perpendicular to both a and b.
        c = np.cross(a, b)
        expected = np.array(0)

        value = _compute_loss_tensor(a, c, model)
        assert np.allclose(value, expected, atol=tol), \
            "Failed; got: {}; expected: {}".format(value, expected)

        value = _compute_loss_tensor(b, c, model)
        assert np.allclose(value, expected, atol=tol), \
            "Failed; got: {}; expected: {}".format(value, expected)

    logging.debug(" - Vectors with at 180 degrees random: expecting 1")
    for _ in range(20):
        a = _get_random_vector(3)
        # b is -a times a strictly positive random factor, so it points the
        # opposite way.
        b = np.array(-a * (np.random.random() + 1e-3) * np.random.randint(1, 10), dtype=np.float32)
        expected = np.array(1)
        value = _compute_loss_tensor(a, b, model)
        assert np.allclose(value, expected, atol=tol), \
            "Failed; got: {}; expected: {}".format(value, expected)

    logging.debug(" - Random vectors: comparing with cosine.")
    for _ in range(200):
        a = _get_random_vector(3)
        b = _get_random_vector(3)
        # model outputs -cos(a,b), but cosine computes 1-cos(a,b)
        expected = cosine(a, b) - 1
        value = _compute_loss_tensor(a, b, model)
        assert np.allclose(value, expected, atol=tol), \
            "Failed; got: {}; expected: {}".format(value, expected)
def similarity(self, emb1, emb2):
    """Return ``cosine(emb1, emb2)`` for the two embeddings.

    NOTE(review): if ``cosine`` is ``scipy.spatial.distance.cosine`` this is a
    distance (0 for identical directions), not a similarity as the method
    name suggests -- confirm against the import and the callers.
    """
    score = cosine(emb1, emb2)
    return score
def compute_metrics(self, rendition_frame, next_rendition_frame,
                    reference_frame, next_reference_frame):
    """Compute every metric named in ``self.metrics_list`` for one frame.

    Args:
        rendition_frame: Frame of the rendition under evaluation.
        next_rendition_frame: Following rendition frame; only used by
            ``'temporal_difference'``.
        reference_frame: Matching frame of the reference.
        next_reference_frame: Accepted for interface compatibility; unused.

    Returns:
        dict mapping each recognised metric name to its value. Names that are
        not recognised are ignored.
    """
    rendition_metrics = {}

    if self.profiling:
        # NOTE(review): this runs on every call, so each call adds another
        # profiler layer around the same methods -- confirm that is intended.
        for name in ('evaluate_cross_correlation_instant',
                     'evaluate_dct_instant',
                     'evaluate_entropy_instant',
                     'evaluate_lbp_instant',
                     'evaluate_difference_canny_instant',
                     'evaluate_difference_instant',
                     'evaluate_spatial_complexity',
                     'evaluate_gaussian_instant',
                     'evaluate_gaussian_difference_instant',
                     'evaluate_mse_instant',
                     'evaluate_psnr_instant',
                     'evaluate_ssim_instant',
                     'rescale_pair'):
            setattr(self, name, self.cpu_profiler(getattr(self, name)))

    # Metrics evaluated as method(reference_frame, rendition_frame). Methods
    # are looked up by name at call time so profiled wrappers are honoured.
    pairwise_evaluators = {
        'temporal_histogram_distance': 'histogram_distance',
        'temporal_psnr': 'evaluate_psnr_instant',
        'temporal_ssim': 'evaluate_ssim_instant',
        'temporal_mse': 'evaluate_mse_instant',
        'temporal_canny': 'evaluate_difference_canny_instant',
        'temporal_cross_correlation': 'evaluate_cross_correlation_instant',
        'temporal_dct': 'evaluate_dct_instant',
        'temporal_gaussian': 'evaluate_gaussian_instant',
        'temporal_gaussian_difference': 'evaluate_gaussian_difference_instant',
        'temporal_entropy': 'evaluate_entropy_instant',
        'temporal_lbp': 'evaluate_lbp_instant',
    }
    # Metrics evaluated as distance(reference_hash, rendition_hash).
    hash_distances = {
        'hash_euclidean': distance.euclidean,
        'hash_hamming': distance.hamming,
        'hash_cosine': distance.cosine,
    }

    # The dhashes depend only on the frames, so they are computed at most
    # once, and only when a hash metric is requested. The original recomputed
    # both on every iteration.
    hashes = None

    for metric in self.metrics_list:
        if metric == 'temporal_difference':
            # Inter-frame difference between consecutive rendition frames.
            rendition_metrics[metric] = self.evaluate_difference_instant(
                rendition_frame, next_rendition_frame)
        elif metric == 'temporal_spatial_complexity':
            rendition_metrics[metric] = self.evaluate_spatial_complexity(
                reference_frame)
        elif metric in pairwise_evaluators:
            evaluate = getattr(self, pairwise_evaluators[metric])
            rendition_metrics[metric] = evaluate(reference_frame,
                                                 rendition_frame)
        elif metric in hash_distances:
            if hashes is None:
                rendition_hash = self.dhash(rendition_frame)
                reference_hash = self.dhash(reference_frame)
                hashes = (reference_hash, rendition_hash)
            rendition_metrics[metric] = hash_distances[metric](*hashes)

    return rendition_metrics
def build_features(data):
    """Build a feature frame from the paired text columns of *data*.

    Reads the ``word1``/``word2`` and ``char1``/``char2`` columns, embeds them
    with the word- and char-level word2vec models loaded from ``../data`` and
    returns a DataFrame holding two WMD columns, seven distances between the
    word-level sentence vectors, and skew/kurtosis of the word- and
    char-level sentence vectors.
    """
    char_model = gensim.models.KeyedVectors.load_word2vec_format(
        '../data/char_w2v.txt')
    word_model = gensim.models.KeyedVectors.load_word2vec_format(
        '../data/word_w2v.txt')

    def embed(texts, model):
        # One 300-wide sent2vec row per text.
        vectors = np.zeros((data.shape[0], 300))
        for row, text in tqdm(enumerate(texts)):
            vectors[row, :] = sent2vec(text, model)
        return vectors

    X = pd.DataFrame()
    X['word_wmd'] = data.apply(
        lambda x: wmd(x['word1'], x['word2'], word_model), axis=1)
    X['char_wmd'] = data.apply(
        lambda x: wmd(x['char1'], x['char2'], char_model), axis=1)

    question1_vectors = embed(data.word1.values, word_model)
    question2_vectors = embed(data.word2.values, word_model)
    char_question1_vectors = embed(data.char1.values, char_model)
    char_question2_vectors = embed(data.char2.values, char_model)

    # NaN/inf-free copies shared by every feature below.
    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)
    char_q1 = np.nan_to_num(char_question1_vectors)
    char_q2 = np.nan_to_num(char_question2_vectors)

    # Row-wise distances between the two word-level sentence vectors.
    pairwise = [
        ('cosine_distance', cosine),
        ('cityblock_distance', cityblock),
        ('jaccard_distance', jaccard),
        ('canberra_distance', canberra),
        ('euclidean_distance', euclidean),
        ('minkowski_distance', lambda u, v: minkowski(u, v, 3)),
        ('braycurtis_distance', braycurtis),
    ]
    for column, metric in pairwise:
        X[column] = [metric(u, v) for u, v in zip(q1, q2)]

    # Per-row distribution statistics of each sentence vector.
    moments = [
        ('skew_q1vec', skew, q1),
        ('skew_q2vec', skew, q2),
        ('kur_q1vec', kurtosis, q1),
        ('kur_q2vec', kurtosis, q2),
        ('char_skew_q1vec', skew, char_q1),
        ('char_skew_q2vec', skew, char_q2),
        ('char_kur_q1vec', kurtosis, char_q1),
        ('char_kur_q2vec', kurtosis, char_q2),
    ]
    for column, statistic, vectors in moments:
        X[column] = [statistic(vec) for vec in vectors]

    return X
def get_cosine_distance(self, document_1, document_2):
    """Return one minus ``distance.cosine`` of the two document vectors.

    NOTE(review): if ``distance`` is ``scipy.spatial.distance`` the result is
    the cosine *similarity*, despite the method name -- confirm what callers
    expect before renaming or changing it.
    """
    cosine_dist = distance.cosine(document_1, document_2)
    return 1 - cosine_dist
def calc_cosine(source_representation, test_representation):
    """Print and return one minus ``distance.cosine`` of the two first rows.

    Only element ``[0]` of each argument is compared.
    """
    source_vec = source_representation[0]
    test_vec = test_representation[0]
    result = 1 - distance.cosine(source_vec, test_vec)
    print("Cosine Similarity : {}".format(result))
    return result
def post(request):
    """Turn posted attendance records into per-user event recommendations.

    Parses the JSON request body, pivots its ``'data'`` records into a
    userId x eventId table of ``'attending'`` values, builds an event-to-event
    similarity table, scores events per user from each event's most similar
    events, and responds with a ``JsonResponse`` holding the recommendation
    frame serialised with ``to_json(orient='index')``.
    """
    def getSimilarityScore(history, similarities):
        # Similarity-weighted average of the user's history values.
        return sum(history * similarities) / sum(similarities)

    # NOTE(review): the `encoding` keyword of json.loads is rejected by recent
    # Python versions -- confirm the interpreter this runs on.
    received_json_data = json.loads(request.body, encoding="utf-8-sig")
    received_json_data = received_json_data['data']
    df = pd.DataFrame(received_json_data)
    df = df.fillna(1)
    df = pd.DataFrame(df).astype(int)
    df = pd.pivot_table(df, values='attending', index=['userId'], columns=['eventId'])
    df = df.reset_index()
    # user/event pairs missing from the posted data are filled with 1
    df = df.fillna(1)
    data_file = df.drop('userId', 1)

    # Dataframe for item vs. item similarity scores
    data_item_based_similarity = pd.DataFrame(index=data_file.columns, columns=data_file.columns)
    # NOTE(review): the result of reset_index() is not assigned, so this line
    # appears to have no effect.
    data_item_based_similarity.reset_index()

    # Lets fill in those empty spaces with cosine similarities
    # Loop through the columns
    for column in range(0, len(data_item_based_similarity.columns)):
        # Loop through the columns for each column
        for row in range(0, len(data_item_based_similarity.columns)):
            # Fill in placeholder with cosine similarities
            data_item_based_similarity.iloc[column, row] = 1 - cosine(
                data_file.iloc[:, column], data_file.iloc[:, row])

    # Create placeholder items for closest neighbours to an item
    data_neighbours = pd.DataFrame(index=data_item_based_similarity.columns, columns=[range(1, 11)])

    # Loop through our similarity dataframe and fill in neighbouring item names
    # NOTE(review): Series.order (here and below) and the .ix indexer used
    # further down are legacy pandas APIs that newer releases no longer
    # provide -- confirm the pinned pandas version.
    for column in range(0, len(data_item_based_similarity.columns)):
        data_neighbours.iloc[column, :10] = data_item_based_similarity.iloc[
            0:, column].order(ascending=False)[:10].index

    # Create a place holder matrix for similarities, and fill in the user name column.
    data_similarity = pd.DataFrame(index=df.index, columns=df.columns)
    data_similarity.iloc[:, :1] = df.iloc[:, :1]
    print(len(data_similarity.index))

    # Loop through all rows, skipping the user column, and fill with similarity scores.
    # NOTE(review): the outer range starts at 1, so row 0 is never scored, and
    # the loop variable called `column` is used as a row position.
    for column in range(1, len(data_similarity.index)):
        stdout.write("\r%d" % column)
        stdout.flush()
        for row in range(1, len(data_similarity.columns)):
            user = data_similarity.index[column]
            event = data_similarity.columns[row]
            # If an event has already been attended, do not recommend it.
            if df.iloc[column][row] == 1:
                # NOTE(review): chained `.iloc[column][row] = ...` may assign
                # into a temporary copy instead of data_similarity -- verify.
                data_similarity.iloc[column][row] = 0
            else:
                event_top_names = data_neighbours.loc[event][1:10]
                event_top_sims = data_item_based_similarity.ix[event].order(
                    ascending=False)[1:10]
                user_purchases = data_file.ix[user, event_top_names]
                data_similarity.iloc[column][row] = getSimilarityScore(
                    user_purchases, event_top_sims)

    # Get the top 6 events for each user and store in a DataFrame.
    data_recommend = pd.DataFrame(
        index=data_similarity.index,
        columns=['userId', '1', '2', '3', '4', '5', '6'])
    data_recommend.iloc[0:, 0] = data_similarity.iloc[:, 0]

    # Instead of top event scores, we want to see eventId numbers so they can be passed back to the app.
    for column in range(0, len(data_similarity.index)):
        data_recommend.iloc[column, 1:] = data_similarity.iloc[column, :].order(
            ascending=False).iloc[1:7, ].index.transpose()

    # Return all recommendations in response to HTTP post to be parsed on the client side.
    # NOTE(review): `to_string` is referenced without being called, so this
    # prints the bound method rather than the table.
    print(data_recommend.to_string)
    json_recommend = data_recommend.to_json(orient='index')
    print(json_recommend)
    if json_recommend is not None:
        return JsonResponse(json_recommend, content_type='json', safe=False)
def quotation_spec_match(quote_limits, spec_limits, threshold=0.55):
    """Pair quotation lines with specification lines that describe the same limit.

    A quotation is parsed as ``"<description> <digits>"`` and a specification
    as ``"<digits> <description>"``. Two lines match when their numeric limits
    are equal and the cosine distance between the bag-of-words vectors of
    their descriptions is below *threshold*. The vectors come from the
    module-level vectorizer ``cv``, which is re-fitted on the quotation
    descriptions as a side effect.

    Args:
        quote_limits: Quotation strings.
        spec_limits: Specification strings.
        threshold: Exclusive upper bound on the cosine distance for a match.

    Returns:
        DataFrame with one ``"MATCHED"`` row per matching pair (columns
        ``Quotations``, ``Limits``, ``Specifications``) followed by one
        ``"NOT MATCHED"`` row for every quotation that matched nothing.
    """
    # description -> (limit digits, original line). The original line is kept
    # with the parsed values so the matching rows below can never be paired
    # with the wrong input string when a line is skipped or a description
    # repeats (a repeated description keeps only its last line).
    quotes_dict = {}
    spec_dict = {}
    for quote in quote_limits:
        found = re.search(r'(.*)\s(\d+)', quote)
        if found is None:
            # Unparsable lines are skipped and end up reported as unmatched.
            continue
        description, limit = found.groups()
        if description != '':
            quotes_dict[description] = (limit, quote)
    for spec in spec_limits:
        found = re.search(r'(\d+)\s(.*)', spec)
        if found is None:
            continue
        limit, description = found.groups()
        if description != '':
            spec_dict[description] = (limit, spec)

    matched_quotes = []
    matched_spec = []
    matched_limit = []

    # With nothing parsable on either side there is nothing to vectorize;
    # every quotation is then reported as unmatched.
    if quotes_dict and spec_dict:
        quote_texts = [text for _, text in quotes_dict.values()]
        spec_texts = [text for _, text in spec_dict.values()]
        quotes_df = pd.DataFrame({
            'Description': list(quotes_dict.keys()),
            'Limit_in_Dollars': [limit for limit, _ in quotes_dict.values()]
        })
        spec_df = pd.DataFrame({
            'Description': list(spec_dict.keys()),
            'Limit_in_Dollars': [limit for limit, _ in spec_dict.values()]
        })

        quotes_df_X_cv = cv.fit_transform(quotes_df['Description'].values).toarray()
        spec_df_X_cv = cv.transform(spec_df['Description'].values).toarray()

        # Newer scikit-learn releases only offer get_feature_names_out.
        get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
        feature_names = list(get_names())

        quotes_final_df = pd.DataFrame(quotes_df_X_cv, columns=feature_names)
        quotes_final_df['Limit_in_Dollars'] = quotes_df['Limit_in_Dollars'].apply(
            pd.to_numeric)
        spec_final_df = pd.DataFrame(spec_df_X_cv, columns=feature_names)
        spec_final_df['Limit_in_Dollars'] = spec_df['Limit_in_Dollars'].apply(
            pd.to_numeric)

        for row_in_quote, quote_limit in enumerate(quotes_final_df['Limit_in_Dollars']):
            for row_in_spec, spec_limit in enumerate(spec_final_df['Limit_in_Dollars']):
                if quote_limit != spec_limit:
                    continue
                # Compare the word counts only; the last column is the limit.
                cos_val = cosine(quotes_final_df.iloc[row_in_quote, :-1],
                                 spec_final_df.iloc[row_in_spec, :-1])
                if cos_val < threshold:
                    print("Matched Quotes and Specs are below")
                    print("-" * 40)
                    print(quote_texts[row_in_quote], '|||||',
                          spec_texts[row_in_spec])
                    matched_quotes.append(quote_texts[row_in_quote])
                    matched_spec.append(spec_texts[row_in_spec])
                    matched_limit.append("MATCHED")
                    print("=" * 40)

    unmatched_quotes = [i for i in quote_limits if i not in matched_quotes]
    print("\n\nUnmatched Quotes are")
    print('-' * 40)
    print(unmatched_quotes)

    matched_df = pd.DataFrame({
        "Quotations": matched_quotes,
        "Limits": matched_limit,
        "Specifications": matched_spec
    })
    unmatched_df = pd.DataFrame({
        "Quotations": unmatched_quotes,
        "Limits": ["NOT MATCHED" for i in range(len(unmatched_quotes))]
    })
    df = pd.concat([matched_df, unmatched_df], axis=0)
    return df