def get_vector_cosine_distances(self):
    vectors = self.get_skip_thought_features()
    # There is no i-1 for the first timestep, so pad with the first pairwise
    # distance; this keeps one entry per timestep (the first two are equal).
    sims = [cosine_distance(vectors[0], vectors[1])]
    for i in range(1, len(vectors)):
        sims.append(cosine_distance(vectors[i], vectors[i - 1]))
    return sims
def suggest(text=None):
    ffms = read_ffms()
    t = request.form['text']
    if not t:
        return render_template('cards.html', text=None)
    text_score = model[t]

    # Map each cosine distance to the index of its FFM entry. Entries with
    # identical distances would overwrite each other in this dict.
    distances = {}
    for i, ffm_text in enumerate(ffms["faerdiheder"]):
        ffm_score = model[ffm_text]
        distance = cosine_distance(text_score, ffm_score)
        distances[float(distance)] = i

    # Sort by distance so the closest entries come first.
    distances = OrderedDict(sorted(distances.items(), key=lambda item: item[0]))

    rendered = []
    for dist, index in distances.items():
        data = {
            "first": (dist == next(iter(distances))),
            "score": "{0:.1f}%".format((1 - dist) * 100),
            "faerdighed": ffms["faerdiheder"][index],
            "viden": ffms["viden"][index]
        }
        rendered.append(render_template('card.html', data=data))
    return render_template('cards.html', text=' '.join(rendered[0:5]))
def calculate_weight(a1, a2, nonnegative=False, distance_metric='fm2011'):
    '''
    Calculate the connection weight between two agents (Equation [1]).
    '''
    o1 = a1.opinions
    o2 = a2.opinions

    if distance_metric == 'fm2011':
        if o1.shape != o2.shape:
            raise RuntimeError("Agents' opinion vectors have different shapes")
        K = len(o1)
        diff = abs(o2 - o1)
        numerator = np.sum(diff)
        # With nonnegative=True the denominator doubles, scaling the weight
        # into [0, 1] instead of [-1, 1] for opinions in [-1, 1].
        nonneg_fac = 2.0 if nonnegative else 1.0
        return 1 - (numerator / (nonneg_fac * K))
    elif distance_metric == 'cosine_distance':
        # Weight is 1 - distance. Cosine distance ranges from 0 to 2.
        return 1.0 - cosine_distance(o1, o2)
    else:
        raise RuntimeError('Distance metric not recognized')
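# A minimal usage sketch, assuming `cosine_distance` is scipy's
# `scipy.spatial.distance.cosine` and using a hypothetical stand-in for the
# real agent class (only the `opinions` attribute is assumed here).
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance

class _FakeAgent:
    def __init__(self, opinions):
        self.opinions = np.asarray(opinions, dtype=float)

a1 = _FakeAgent([1.0, -0.5, 0.0])
a2 = _FakeAgent([0.5, -0.5, 1.0])
# fm2011: 1 - (sum of absolute opinion differences) / K  ->  0.5 here.
print(calculate_weight(a1, a2, distance_metric='fm2011'))
# Cosine variant: 1 - cosine distance, i.e. the cosine similarity.
print(calculate_weight(a1, a2, distance_metric='cosine_distance'))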
def calculate(self):
    v1 = self.get_vector(self.keypair[0])
    v2 = self.get_vector(self.keypair[1])
    # We store these as 1 = congruent, so convert the distance to a similarity.
    distance = 1 - cosine_distance(v1, v2)
    if isnan(distance):
        distance = 0
    return distance
def test_cosine(self):
    vec_a = np.random.rand(3)
    vec_b = np.random.rand(3)
    lhs = Helper.cosine(vec_a, vec_b)
    rhs = 1 - cosine_distance(vec_a, vec_b)
    eps = 1e-6
    self.assertAlmostEqual(lhs, rhs, delta=eps)
def scaled_cosine_similarity(vector1, vector2):
    """
    Returns a number between 0 and 1. Two equal vectors return similarity 1;
    two opposite vectors return 0.

    Open question: how should we treat vectors with an angle greater than
    90 degrees between them? Should we ignore the direction and just return
    the absolute value of the cosine?
    """
    assert vector1.shape == vector2.shape
    assert len(vector1.shape) == 1
    # Cosine distance lies in [0, 2], so (2 - distance) / 2 lies in [0, 1].
    return (2.0 - cosine_distance(vector1, vector2)) / 2
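# A quick sanity check of the scaling, assuming `cosine_distance` is scipy's
# `scipy.spatial.distance.cosine`.
import numpy as np

v = np.array([1.0, 0.0])
print(scaled_cosine_similarity(v, v))                     # equal      -> 1.0
print(scaled_cosine_similarity(v, -v))                    # opposite   -> 0.0
print(scaled_cosine_similarity(v, np.array([0.0, 1.0])))  # orthogonal -> 0.5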
def update_expert_effort(self, u):
    self.queried_ids.append(u)
    if len(self.queried_ids) < 2:
        self.efforts.append(0)
    else:
        # Cumulative effort: add the distance between the two most recently
        # queried items.
        i, j = self.queried_ids[-1], self.queried_ids[-2]
        e = cosine_distance(self.X[i], self.X[j])
        self.efforts.append(self.efforts[-1] + e)
def get_similarity(self, feature1, feature2, method='intersection'):
    if method == 'cosine':
        similarity = 1 - cosine_distance(feature1, feature2)
    elif method == 'intersection':
        # Histogram intersection over union (a Jaccard-style overlap).
        minima = np.minimum(feature1, feature2)
        maxima = np.maximum(feature1, feature2)
        similarity = np.true_divide(np.sum(minima), np.sum(maxima))
    else:
        # Without this branch, an unknown method would leave `similarity`
        # unbound and raise a confusing NameError below.
        raise ValueError("Unknown similarity method: {}".format(method))
    return similarity
def perform(self, v1: np.ndarray, v2: np.ndarray):
    """
    Calculates the cosine similarity between v1 and v2.
    """
    if not v1.any() or not v2.any():
        # Cosine similarity is undefined for all-zero vectors; treat it as 0.
        return 0
    else:
        # cosine_distance is defined in the scipy library as 1 - cosine_similarity, so:
        # 1 - cosine_distance = 1 - (1 - cosine_similarity) = cosine_similarity
        return 1 - cosine_distance(v1, v2)
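# A quick check of the identity the comment above relies on: scipy's
# `scipy.spatial.distance.cosine` is defined as one minus the cosine similarity.
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance

v1 = np.array([1.0, 0.0])
v2 = np.array([1.0, 1.0])
# cos(45 degrees) is about 0.7071, so the distance is about 0.2929.
print(cosine_distance(v1, v2))      # ~0.2929
print(1 - cosine_distance(v1, v2))  # ~0.7071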
def parse_id(self, inference):
    # Checks whether the person's embedding matches a stored identity and
    # returns its label.
    arr = inference['658'][0, :, 0, 0]
    if len(self.hashes) == 0:
        self.hashes.append(arr)
        return 0
    # Return the index of the first stored embedding within the threshold.
    for i in range(len(self.hashes)):
        if cosine_distance(arr, self.hashes[i]) < self.theta:
            return i
    # No match: register a new identity.
    self.hashes.append(arr)
    return len(self.hashes) - 1
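# The gallery-matching decision in isolation: a sketch with a made-up
# threshold of 0.3 and random 256-dim embeddings (the real `theta` and
# feature size come from the surrounding class).
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance

theta = 0.3
gallery = [np.random.rand(256)]                  # previously seen identities
probe = gallery[0] + 0.01 * np.random.rand(256)  # near-duplicate of identity 0

matches = [i for i, h in enumerate(gallery) if cosine_distance(probe, h) < theta]
print(matches[0] if matches else len(gallery))   # -> 0: matched identity 0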
def cosine_variance(space):
    n_vectors = len(space)
    # Compute the cosine distances between all pairs of vectors.
    cos_dis = []
    for i in range(n_vectors):
        for j in range(i + 1, n_vectors):
            v1 = space[i]
            v2 = space[j]
            distance = cosine_distance(v1, v2)
            cos_dis.append(distance)
    cos_avg = numpy.average(cos_dis)
    cos_var = numpy.var(cos_dis)
    return cos_avg, cos_var
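# The pairwise loop above can be replaced by `scipy.spatial.distance.pdist`,
# which returns exactly the upper-triangle (i < j) distances the loop collects.
# A sketch of the equivalent computation:
import numpy
from scipy.spatial.distance import pdist

def cosine_variance_vectorized(space):
    # Condensed distance vector over all unordered pairs, with metric='cosine'.
    cos_dis = pdist(numpy.asarray(space), metric='cosine')
    return numpy.average(cos_dis), numpy.var(cos_dis)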
def cosine_similarity(X: pd.DataFrame, y: pd.DataFrame = None):
    """
    Cosine similarity between the rows of X and the rows of y.
    """
    if y is None:
        # Default to the self-similarity matrix. Keep y as a DataFrame;
        # converting with np.array would break the .iloc calls below.
        y = X
    sim_matrix = pd.DataFrame(
        np.array([[1 - cosine_distance(X.iloc[i], y.iloc[j])
                   for j in range(len(y))]
                  for i in range(len(X))]))
    if hasattr(X, 'index'):
        sim_matrix.index = X.index
    if hasattr(y, 'index'):
        sim_matrix.columns = y.index
    return sim_matrix
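# For anything but small frames the double loop is slow;
# `sklearn.metrics.pairwise.cosine_similarity` computes the same matrix in a
# single vectorized call. A sketch of the equivalent (note: sklearn maps
# all-zero rows to similarity 0 where scipy would produce nan):
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

def cosine_similarity_fast(X: pd.DataFrame, y: pd.DataFrame = None):
    y = X if y is None else y
    return pd.DataFrame(sk_cosine_similarity(X.values, y.values),
                        index=X.index, columns=y.index)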
def test_cosine_many_to_many(self):
    xs = np.random.rand(100, 3)
    ys = np.random.rand(120, 3)
    cosines1 = Helper.cosine_many_to_many(xs, ys)
    # Brute-force reference: one scipy call per pair.
    cosines2 = np.zeros_like(cosines1)
    for i in range(cosines2.shape[0]):
        for j in range(cosines2.shape[1]):
            cosines2[i, j] = 1 - cosine_distance(xs[i, :], ys[j, :])
    errors = cosines1 - cosines2
    max_error = np.max(np.abs(errors))
    tolerance = 1e-6
    self.assertAlmostEqual(max_error, 0.0, delta=tolerance)
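# `Helper.cosine_many_to_many` is not shown here; a plausible vectorized
# equivalent (a sketch under that assumption, not the project's actual
# implementation) is a row-normalized matrix product:
import numpy as np

def cosine_many_to_many(xs: np.ndarray, ys: np.ndarray) -> np.ndarray:
    # Normalize rows to unit length; the matrix product then yields all
    # pairwise cosine similarities at once.
    xs_norm = xs / np.linalg.norm(xs, axis=1, keepdims=True)
    ys_norm = ys / np.linalg.norm(ys, axis=1, keepdims=True)
    return xs_norm @ ys_norm.T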
def __init__(self,
             paragraph_data: List[Dict[str, Argument]],
             question_tokens: List[Token],
             embedding_for_token_similarity: Dict[str, numpy.ndarray],
             distance_threshold: float) -> None:
    self.paragraph_data = paragraph_data
    self.question_tokens = question_tokens
    self._paragraph_strings: Dict[str, List[str]] = defaultdict(list)
    self._paragraph_lemmas: Dict[str, List[str]] = defaultdict(list)
    for structure in paragraph_data:
        for relation, argument in structure.items():
            self._paragraph_strings[argument.argument_string].append(relation)
            self._paragraph_lemmas["_".join(argument.argument_lemmas)].append(relation)
    self._knowledge_graph: KnowledgeGraph = None
    self.paragraph_tokens_to_keep: List[Tuple[str, List[str]]] = []
    if embedding_for_token_similarity is not None:
        # We'll use this word embedding to measure similarity between paragraph tokens and
        # question tokens to decide if we should extract paragraph tokens as entities in the
        # context.
        # Tuples of paragraph tokens, their relation names, and token embeddings.
        paragraph_token_embedding: List[Tuple[str, List[str], numpy.ndarray]] = []
        for paragraph_string, relation_names in self._paragraph_strings.items():
            # The paragraph string is already tokenized but has underscores for spaces.
            for token in paragraph_string.split("_"):
                if token in STOP_WORDS:
                    continue
                if token not in embedding_for_token_similarity:
                    continue
                token_embedding = embedding_for_token_similarity[token]
                paragraph_token_embedding.append((token, relation_names, token_embedding))
        # We keep the embeddings of tokens in the question that are not stop words.
        question_token_embedding: List[numpy.ndarray] = []
        for question_token in question_tokens:
            token_text = question_token.text
            if token_text not in STOP_WORDS and token_text in embedding_for_token_similarity:
                question_token_embedding.append(embedding_for_token_similarity[token_text])
        if question_token_embedding:
            for paragraph_token, relation_names, token_embedding in paragraph_token_embedding:
                min_distance = min([cosine_distance(token_embedding, question_embedding)
                                    for question_embedding in question_token_embedding])
                if 0.0 < min_distance < distance_threshold:
                    # If min_distance is 0.0, it means it is the exact word, and our exact
                    # string match will get it anyway.
                    self.paragraph_tokens_to_keep.append((paragraph_token, relation_names))
def weight_graph(graph, embeddings):
    """
    Weight graph edges by the similarity between their words.
    """
    for edge in graph:
        # Fall back to the 'UNK' embedding for out-of-vocabulary words.
        try:
            embedding_0 = embeddings[edge.words[0].lower()]
        except KeyError:
            embedding_0 = embeddings['UNK']
        try:
            embedding_1 = embeddings[edge.words[1].lower()]
        except KeyError:
            embedding_1 = embeddings['UNK']
        edge.similarity = 1.0 - cosine_distance(embedding_0, embedding_1)
    return graph
def main(): """ Main function """ # Building a k-nearest neighbor graph using annoy and cosine distance annoy = AnnoyIndex(len(DATA.columns), metric="angular") annoy_graph = [] for i, v in enumerate(DATA.values): annoy.add_item(i, v) annoy.build(10) for i in range(len(DATA)): for j in annoy.get_nns_by_item(i, 10): annoy_graph.append( (i, j, cosine_distance(DATA.values[i], DATA.values[j]))) # Creating the tmap layout x, y, s, t, _ = tm.layout_from_edge_list(len(DATA), annoy_graph) faerun = Faerun(view="front", coords=False) faerun.add_scatter( "MINIBOONE", { "x": x, "y": y, "c": LABELS, "labels": LABELS }, shader="smoothCircle", colormap="Set1", point_scale=2.0, max_point_size=20, has_legend=True, categorical=True, legend_labels={(0, "Noise"), (1, "Signal")}, ) faerun.add_tree( "MINIBOONE_tree", { "from": s, "to": t }, point_helper="MINIBOONE", color="#666666", ) faerun.plot("miniboone", template="default")
def correlate_distances(frames, do_plot=False):
    dist_matrix = []
    dist_nonself_matrix = {}
    dist_dense_matrix = []
    index = 0
    for anchor_frame in frames:
        dist_matrix.append([])
        dist_nonself_matrix[anchor_frame] = {}
        dist_dense_matrix.append([])
        for compare_frame in frames:
            af = Util.features[anchor_frame]
            bf = Util.features[compare_frame]
            dist = cosine_distance(af, bf)
            dist_matrix[index].append(dist)
            # print('{0:s} <-> {1:s} = {2:.2f}'.format(anchor_frame, compare_frame, dist))
            if anchor_frame != compare_frame:
                dist_dense_matrix[index].append(dist)
                dist_nonself_matrix[anchor_frame][compare_frame] = dist
        index += 1

    if do_plot:
        fig, axes = plt.subplots()
        heatmap = axes.pcolor(dist_matrix, cmap=plt.cm.Blues, alpha=0.8)
        fig = plt.gcf()
        fig.set_size_inches(8, 11)
        axes.set_yticks(np.arange(len(dist_matrix)) + 0.5, minor=False)
        axes.set_xticks(np.arange(len(dist_matrix)) + 0.5, minor=False)
        axes.set_xticklabels(frames, minor=False)
        axes.set_yticklabels(frames, minor=False)
        plt.xticks(rotation=90)
        plt.title('Keyframes Cosine Distance Correlation')

    return dist_dense_matrix, dist_nonself_matrix
def compute_distance(self):
    # Cosine distance between the features of the previous and current frame.
    si_0 = Util.stringify(self.current_frame - 1)
    si_1 = Util.stringify(self.current_frame)
    return cosine_distance(self.features[si_0], self.features[si_1])
def distance(self, point):
    return cosine_distance(self.vec, point.vec)
print("text: '", text, "' does not have a correct FFM. Skipped.") continue # text_vectors = list(map(lambda word: model[word], clean(text).split())) texts[text] = { "text": text, "skill": ffm_skill, "knowledge": ffm_knowledge, "vector": model[clean(text)] #np.sum(text_vectors, axis=0) } for text, text_value in texts.items(): text_value["distance"] = {} for ffm_text, ffm_value in ffms.items(): distance = cosine_distance(text_value["vector"], ffm_value["vector"]) text_value["distance"][str(distance)] = { "ffm": ffm_text, "match": (normalize(text_value["skill"]) == normalize(ffm_text)) } offline_accuracy = [] for i in range(len(ffms)): a = 0 for k, v in texts.items(): distances = OrderedDict( sorted(v["distance"].items(), key=lambda item: item[0])) matches = [m["match"] for m in [v for v in distances.values()]] a += np.sum(matches[0:(i + 1)]) offline_accuracy.append(a / len(texts))
@classmethod
def cosine(cls, x: np.ndarray, y: np.ndarray) -> float:
    return 1 - cosine_distance(x, y)
def main(): """ Compute local neighborhood distance for target pairs from two vector spaces. """ # Get the arguments args = docopt( """Compute local neighborhood distance for target pairs from two vector spaces. Usage: lnd.py [(-f | -s)] <testset> <matrixPath1> <matrixPath2> <outPath> <k> <testset> = path to file with tab-separated word pairs <matrixPath1> = path to matrix1 <matrixPath2> = path to matrix2 <outPath> = output path for result file <k> = parameter k (k nearest neighbors) Options: -f, --fst write only first target in output file -s, --scd write only second target in output file """) is_fst = args['--fst'] is_scd = args['--scd'] testset = args['<testset>'] matrixPath1 = args['<matrixPath1>'] matrixPath2 = args['<matrixPath2>'] outPath = args['<outPath>'] k = int(args['<k>']) #logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True,}) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load matrices and rows try: space1 = Space(matrixPath1, format='npz') except ValueError: space1 = Space(matrixPath1, format='w2v') try: space2 = Space(matrixPath2, format='npz') except ValueError: space2 = Space(matrixPath2, format='w2v') matrix1 = space1.matrix row2id1 = space1.row2id id2row1 = space1.id2row matrix2 = space2.matrix row2id2 = space2.row2id id2row2 = space2.id2row # Load targets with open(testset, 'r', encoding='utf-8') as f_in: targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1]) for line in f_in] nbrs1 = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute').fit(matrix1) nbrs2 = NearestNeighbors(n_neighbors=k, metric='cosine', algorithm='brute').fit(matrix2) scores = {} neighborUnionSizes = {} for (t1, t2) in targets: # Get nearest neighbors try: index1 = row2id1[t1] index2 = row2id2[t2] except KeyError: scores[(t1, t2)] = 'nan' neighborUnionSizes[(t1, t2)] = 'nan' continue v1 = matrix1[index1].toarray().flatten() v2 = matrix2[index2].toarray().flatten() distances1, indices1 = nbrs1.kneighbors(matrix1[index1]) distances2, indices2 = nbrs2.kneighbors(matrix2[index2]) neighbors1 = list( zip([id2row1[i] for i in indices1.flatten().tolist()], distances1.flatten().tolist())) neighbors2 = list( zip([id2row2[i] for i in indices2.flatten().tolist()], distances2.flatten().tolist())) neighborUnion = sorted( list( set([ a for (a, b) in neighbors1 + neighbors2 if (a in row2id1 and a in row2id2 and not a in [t1, t2]) ]))) # Filter out vectors with 0-length in either matrix neighborUnion = [ a for a in neighborUnion if (len(matrix1[row2id1[a]].data) > 0 and len(matrix2[row2id2[a]].data) > 0) ] simVec1 = [ 1.0 - cosine_distance(matrix1[index1].toarray().flatten(), matrix1[row2id1[n]].toarray().flatten()) for n in neighborUnion ] simVec2 = [ 1.0 - cosine_distance(matrix2[index2].toarray().flatten(), matrix2[row2id2[n]].toarray().flatten()) for n in neighborUnion ] # Compute cosine distance of vectors distance = cosine_distance(simVec1, simVec2) scores[(t1, t2)] = distance neighborUnionSizes[(t1, t2)] = len(neighborUnion) with open(outPath, 'w', encoding='utf-8') as f_out: for (t1, t2) in targets: if is_fst: # output only first target string f_out.write('\t'.join( (t1, str(scores[(t1, t2)]), str(neighborUnionSizes[(t1, t2)]) + '\n'))) elif is_scd: # output only second target string f_out.write('\t'.join( (t2, str(scores[(t1, t2)]), str(neighborUnionSizes[(t1, t2)]) + '\n'))) else: # standard outputs both target strings f_out.write('\t'.join( ('%s,%s' % 
(t1, t2), str(scores[(t1, t2)]), str(neighborUnionSizes[(t1, t2)]) + '\n'))) logging.info("--- %s seconds ---" % (time.time() - start_time))
def main(): """ Compute cosine distance for targets in two matrices. """ # Get the arguments args = docopt("""Compute cosine distance for targets in two matrices. Usage: cd.py [(-f | -s)] <testset> <matrixPath1> <matrixPath2> <outPath> <testset> = path to file with tab-separated word pairs <matrixPath1> = path to matrix1 <matrixPath2> = path to matrix2 <outPath> = output path for result file Options: -f, --fst write only first target in output file -s, --scd write only second target in output file Note: Important: spaces must be already aligned (columns in same order)! Targets in first/second column of testset are computed from matrix1/matrix2. """) is_fst = args['--fst'] is_scd = args['--scd'] testset = args['<testset>'] matrixPath1 = args['<matrixPath1>'] matrixPath2 = args['<matrixPath2>'] outPath = args['<outPath>'] logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info(__file__.upper()) start_time = time.time() # Load matrices and rows try: space1 = Space(matrixPath1, format='npz') except ValueError: space1 = Space(matrixPath1, format='w2v') try: space2 = Space(matrixPath2, format='npz') except ValueError: space2 = Space(matrixPath2, format='w2v') matrix1 = space1.matrix row2id1 = space1.row2id matrix2 = space2.matrix row2id2 = space2.row2id # Load targets with open(testset, 'r', encoding='utf-8') as f_in: targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1]) for line in f_in] scores = {} for (t1, t2) in targets: # Get row vectors try: v1 = matrix1[row2id1[t1]].toarray().flatten() v2 = matrix2[row2id2[t2]].toarray().flatten() except KeyError: scores[(t1, t2)] = 'nan' continue # Compute cosine distance of vectors distance = cosine_distance(v1, v2) scores[(t1, t2)] = distance with open(outPath, 'w', encoding='utf-8') as f_out: for (t1, t2) in targets: if is_fst: # output only first target string f_out.write('\t'.join((t1, str(scores[(t1, t2)]) + '\n'))) elif is_scd: # output only second target string f_out.write('\t'.join((t2, str(scores[(t1, t2)]) + '\n'))) else: # standard outputs both target strings f_out.write('\t'.join( ('%s,%s' % (t1, t2), str(scores[(t1, t2)]) + '\n'))) logging.info("--- %s seconds ---" % (time.time() - start_time))
argparser = get_argparser()
args = argparser.parse_args()
modelname = args.model

print('Loading BERT model...')
encoder = WrappedBERTEncoder(model=modelname, tokenizer=modelname)

print('BERT Contextual Similarities')
sentence1 = input('Sentence 1? ')
sentence2 = input('Sentence 2? ')

sentences_embeddings, embeddings, tokenized_texts = encoder.encode_sentences(
    [sentence1, sentence2])

# Sentence similarity
sentence_similarity = 1 - cosine_distance(sentences_embeddings[0],
                                          sentences_embeddings[1])
print('Cosine similarity between two sentences: {}'.format(sentence_similarity))

# Token similarities
simmatrix = np.zeros((len(tokenized_texts[0]), len(tokenized_texts[1])))
for i, j in product(range(len(tokenized_texts[0])),
                    range(len(tokenized_texts[1]))):
    simmatrix[i, j] = 1 - cosine_distance(embeddings[0, i], embeddings[1, j])

simdf = pd.DataFrame(simmatrix)
simdf.columns = tokenized_texts[1]
simdf.index = tokenized_texts[0]
print(simdf)
def cosine_similarity(a, b):
    return 1.0 - cosine_distance(a, b)
def cosine_similarity(v1, v2):
    return 1 - cosine_distance(v1, v2)
def main(): """ Main function """ data = [] time = [] for path in PATHS: sample = fk.Sample(path) data.append(load_data(sample)) time.append(load_time(sample)) sources = [] for i, e in enumerate(data): sources.extend([i] * len(e)) data = np.concatenate(data, axis=0) time = np.concatenate(time, axis=0) d = len(data[0]) # Initialize a new Annoy object and index it using 10 trees annoy = AnnoyIndex(d, metric="angular") for i, v in enumerate(data): annoy.add_item(i, v) annoy.build(10) # Create the k-nearest neighbor graph (k = 10) edge_list = [] for i in range(len(data)): for j in annoy.get_nns_by_item(i, 10): edge_list.append((i, j, cosine_distance(data[i], data[j]))) # Compute the layout from the edge list x, y, s, t, _ = tm.layout_from_edge_list(len(data), edge_list) legend_labels = [(0, "No Target Probe Negative Control"), (1, "Stained Sample")] # Create the plot faerun = Faerun( view="front", coords=False, legend_title= "RNA Flow Cytometry: evaluation of detection sensitivity in low abundant intracellular RNA ", ) faerun.add_scatter( "CYTO", { "x": x, "y": y, "c": sources, "labels": sources }, point_scale=1.0, max_point_size=10, shader="smoothCircle", colormap="Set1", has_legend=True, categorical=True, legend_labels=legend_labels, legend_title="Cell Types", ) faerun.add_tree("CYTO_tree", { "from": s, "to": t }, point_helper="CYTO", color="#222222") faerun.plot("cyto")
def text_only_main(image_text_string, ings_dict, definitions_dict):
    cleaned_ings_list = create_ingredients_list(image_text_string)
    print(f'cleaned_ings_list: {cleaned_ings_list}')
    print()

    # Look up each cleaned ingredient in the ingredient dictionary and collect
    # its (possibly enriched) entry.
    ings_dict_list = []
    for ing in cleaned_ings_list:
        print(f'Working on ingredient: {ing}')
        ing_entry: Dict = ings_dict.get(ing)

        if ing_entry:
            # Check whether the document vectors exist; create them if needed.
            if ings_dict[ing].get('ewg_vector') is None:
                try:
                    # Use the chemical_about phrase to create a document vector.
                    phrase = ings_dict[ing].get('chemical_about')
                    ewg_docvec = phrase_to_docvec(phrase, doc_embedding) if phrase else None
                    ings_dict = add_to_embedding_dict(ing, ings_dict, 'ewg', ewg_docvec)
                    ings_dict[ing]['ewg_vector'] = ewg_docvec
                except Exception:
                    ewg_docvec = None
            else:
                # Get the existing document vector.
                ewg_docvec: Union[None, np.ndarray] = ings_dict[ing].get('ewg_vector')
            ing_entry['ewg_vector'] = ewg_docvec

            if ings_dict[ing].get('wiki_vector') is None:
                # The ingredient is in the dict but has no wiki_vector yet.
                try:
                    # Use the Wikipedia entry to create a document vector.
                    phrase = get_wiki_phrase(ing)
                    wiki_docvec = phrase_to_docvec(phrase, doc_embedding) if phrase else None
                except Exception:
                    wiki_docvec = None
                ings_dict = add_to_embedding_dict(ing, ings_dict, 'wiki', wiki_docvec)
                ings_dict[ing]['wiki_vector'] = wiki_docvec
            else:
                # Get the existing document vector for this ingredient.
                wiki_docvec: Union[None, np.ndarray] = ings_dict[ing].get('wiki_vector')
            ing_entry['wiki_vector'] = wiki_docvec

            # Average whichever document vectors are available.
            available = [v for v in (ing_entry.get('wiki_vector'),
                                     ing_entry.get('ewg_vector')) if v is not None]
            vector = np.mean(available, axis=0) if available else None

            # Compare against all reference vectors and pick the closest class.
            # Skip classification when no document vector could be built.
            if vector is not None:
                min_key = min(definitions_dict,
                              key=lambda key: cosine_distance(definitions_dict[key], vector))
                # The definition keys are named '<type>_...', so the prefix is
                # the ingredient type.
                ing_entry['type'] = min_key.split('_')[0]
            ings_dict_list.append(ing_entry)
        else:
            # The entry does not exist in the dict for whatever reason.
            ings_dict_list.append(None)

    return cleaned_ings_list, ings_dict_list