Code example #1
    def get_vector_cosine_distances(self):
        vectors = self.get_skip_thought_features()
        # No i-1 exists for the first timestep, so seed with the first pair's distance
        sims = [cosine_distance(vectors[0], vectors[1])]
        for i in range(1, len(vectors)):
            sims.append(cosine_distance(vectors[i], vectors[i - 1]))
        return sims
Code example #2
File: techdemo.py Project: eugene/gyldendal
def suggest(text=None):
    ffms = read_ffms()
    t = request.form['text']
    if not t:
        return render_template('cards.html', text=None)

    text_score = model[t]
    distances = {}
    for (i, text) in enumerate(ffms["faerdiheder"]):
        ffm_score = model[text]
        distance = cosine_distance(text_score, ffm_score)
        distances[float(distance)] = i

    distances = OrderedDict(sorted(distances.items(),
                                   key=lambda item: item[0]))
    rendered = []
    for i in distances:
        index = distances[i]
        data = {
            "first": (i == list(distances.keys())[0]),
            "score": "{0:.1f}%".format((1 - i) * 100),
            "faerdighed": ffms["faerdiheder"][index],
            "viden": ffms["viden"][index]
        }
        rendered.append(render_template('card.html', data=data))

    return render_template('cards.html', text=(' '.join(rendered[0:5])))
Code example #3
def calculate_weight(a1, a2, nonnegative=False, distance_metric='fm2011'):
    '''
    Calculate connection weight between two agents (Equation [1])
    '''
    o1 = a1.opinions
    o2 = a2.opinions

    if distance_metric == 'fm2011':
        if o1.shape != o2.shape:
            raise RuntimeError("Agent's opinion vectors have different shapes")
        K = len(o1)

        diff = abs(o2 - o1)
        numerator = np.sum(diff)

        if nonnegative:
            nonneg_fac = 2.0
        else:
            nonneg_fac = 1.0

        return 1 - (numerator / (nonneg_fac * K))

    elif distance_metric == 'cosine_distance':
        # Weight is 1 - distance. Cosine distance ranges from 0 to 2.
        return 1.0 - cosine_distance(o1, o2)

    else:
        raise RuntimeError('Distance metric not recognized')
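A quick numeric check of Equation [1] under the 'fm2011' metric. This is a minimal sketch: the Agent stub below is hypothetical (the real Agent class is not shown in the snippet), and opinions are assumed to lie in [-1, 1] so that the nonnegative factor of 2 keeps the weight in [0, 1].

import numpy as np

class Agent:
    # Hypothetical stand-in for the project's Agent class
    def __init__(self, opinions):
        self.opinions = np.asarray(opinions, dtype=float)

a1 = Agent([1.0, -1.0])
a2 = Agent([-1.0, -1.0])
# |o2 - o1| = [2, 0], so the numerator is 2 and K = 2
print(calculate_weight(a1, a2))                    # 1 - 2/(1*2) = 0.0
print(calculate_weight(a1, a2, nonnegative=True))  # 1 - 2/(2*2) = 0.5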
Code example #4
    def calculate(self):
        v1 = self.get_vector(self.keypair[0])
        v2 = self.get_vector(self.keypair[1])
        # we store these as 1=congruent
        distance = 1 - cosine_distance(v1, v2)
        if isnan(distance):
            distance = 0
        return distance
Code example #5
File: test.py Project: hsiaofongw/cube
    def test_cosine(self):

        vec_a = np.random.rand(3)
        vec_b = np.random.rand(3)

        lhs = Helper.cosine(vec_a, vec_b)
        rhs = 1 - cosine_distance(vec_a, vec_b)
        eps = 1e-6
        self.assertAlmostEqual(lhs, rhs, delta=eps)
Code example #6
def scaled_cosine_similarity(vector1, vector2):
    """
    Returns number between 0 and 1. Two equal vectors return similarity 1. Two oposite vectors return 0.
    How should we treat vectors with an angle between them bigger than 90 degree?
    Should we ignore the direction and just return abs. value of the cos.?
    """
    assert vector1.shape == vector2.shape
    assert len(vector1.shape) == 1
    return (2.0 - cosine_distance(vector1, vector2)) / 2
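A minimal check of the endpoints the docstring describes, assuming cosine_distance is scipy.spatial.distance.cosine: equal vectors map to 1, opposite vectors to 0, and orthogonal vectors land halfway at 0.5.

import numpy as np
from scipy.spatial.distance import cosine as cosine_distance

v = np.array([1.0, 2.0, 3.0])
print(scaled_cosine_similarity(v, v))   # 1.0 (cosine distance 0)
print(scaled_cosine_similarity(v, -v))  # 0.0 (cosine distance 2)
print(scaled_cosine_similarity(np.array([1.0, 0.0]),
                               np.array([0.0, 1.0])))  # 0.5 (cosine distance 1)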
Code example #7
    def update_expert_effort(self, u):
        self.queried_ids.append(u)

        if len(self.queried_ids) < 2:
            self.efforts.append(0)
        else:
            i, j = self.queried_ids[-1], self.queried_ids[-2]
            e = cosine_distance(self.X[i], self.X[j])
            self.efforts.append(self.efforts[-1] + e)
Code example #8
    def get_similarity(self, feature1, feature2, method='intersection'):
        if method == 'cosine':
            similarity = 1 - cosine_distance(feature1, feature2)

        elif method == 'intersection':
            minima = np.minimum(feature1, feature2)
            maxima = np.maximum(feature1, feature2)
            similarity = np.true_divide(np.sum(minima), np.sum(maxima))
        else:
            raise ValueError('Unknown similarity method: %s' % method)
        return similarity
Code example #9
    def perform(self, v1: np.ndarray, v2: np.ndarray):
        """
        Calculates the cosine similarity between v1 and v2
        """

        if not v1.any() or not v2.any():
            return 0
        else:
            # Cosine_distance is defined in the scipy library as 1 - cosine_similarity, so:
            # 1 - cosine_distance = 1 - (1 - cosine_similarity) = cosine_similarity
            return 1 - cosine_distance(v1, v2)
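The zero-vector guard matters because the cosine of a zero vector is undefined (the ratio is 0/0), so the method returns 0 instead. A usage sketch, with a hypothetical name for the class that defines perform() and scipy.spatial.distance.cosine assumed as cosine_distance:

import numpy as np

sim = VectorSimilarity()  # hypothetical name for the class above
a = np.array([1.0, 0.0, 1.0])
b = np.array([0.0, 1.0, 1.0])
print(sim.perform(a, b))            # 0.5: dot(a, b) / (|a| * |b|) = 1 / 2
print(sim.perform(np.zeros(3), b))  # 0, via the zero-vector guard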
Code example #10
    def parse_id(self, inference):
        # Checks whether a person's ID is already in the list and returns its label
        arr = inference['658'][0, :, 0, 0]
        if len(self.hashes) == 0:
            self.hashes.append(arr)
            return 0
        for i in range(len(self.hashes)):
            if cosine_distance(arr, self.hashes[i]) < self.theta:
                return i
        self.hashes.append(arr)
        return len(self.hashes) - 1
Code example #11
def cosine_variance(space):
    vectors = len(space)
    # Compute the cosine distances between all pairs of vectors
    cos_dis = []
    for i in range(vectors):
        for j in range(i + 1, vectors):
            v1 = space[i]
            v2 = space[j]
            distance = cosine_distance(v1, v2)
            cos_dis.append(distance)
    cos_avg = numpy.average(cos_dis)
    cos_var = numpy.var(cos_dis)
    return cos_avg, cos_var
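The double loop enumerates all n*(n-1)/2 unordered pairs; assuming cosine_distance matches SciPy's definition, scipy.spatial.distance.pdist produces the same condensed list of pairwise distances in one vectorized call:

import numpy
from scipy.spatial.distance import pdist

def cosine_variance_vectorized(space):
    # pdist with metric='cosine' yields the same pairwise distances as the loop above
    cos_dis = pdist(numpy.asarray(space), metric='cosine')
    return numpy.average(cos_dis), numpy.var(cos_dis)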
Code example #12
File: calculate.py Project: trunganhvu/Do_An
def cosine_similarity(X: pd.DataFrame, y: pd.DataFrame = None):
    """Cosine similarity between the rows of mt1 and the rows of mt2."""
    if y is None:
        y = X  # keep a DataFrame so .iloc and .index below work
    sim_matrix = pd.DataFrame(
        np.array([[1 - cosine_distance(X.iloc[i], y.iloc[j]) for j in range(len(y))] for i in range(len(X))]))

    if hasattr(X, 'index'):
        sim_matrix.index = X.index
    if hasattr(y, 'index'):
        sim_matrix.columns = y.index

    return sim_matrix
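The nested comprehension issues one cosine_distance call per cell; a vectorized sketch of the same matrix using sklearn.metrics.pairwise.cosine_similarity, assuming both inputs are DataFrames:

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine

def cosine_similarity_fast(X: pd.DataFrame, y: pd.DataFrame = None) -> pd.DataFrame:
    y = X if y is None else y
    # One BLAS-backed call instead of len(X) * len(y) per-pair scipy calls
    sim_matrix = pd.DataFrame(sk_cosine(X.values, y.values))
    sim_matrix.index = X.index
    sim_matrix.columns = y.index
    return sim_matrix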
Code example #13
File: test.py Project: hsiaofongw/cube
    def test_cosine_many_to_many(self):

        xs = np.random.rand(100, 3)
        ys = np.random.rand(120, 3)
        cosines1 = Helper.cosine_many_to_many(xs, ys)

        cosines2 = np.zeros_like(cosines1)
        for i in range(cosines2.shape[0]):
            for j in range(cosines2.shape[1]):
                cosines2[i, j] = 1 - cosine_distance(xs[i, :], ys[j, :])

        errors = cosines1 - cosines2
        max_error = np.max(np.abs(errors))
        tolerance = 1e-6
        self.assertAlmostEqual(max_error, 0.0, delta=tolerance)
Code example #14
    def __init__(self,
                 paragraph_data: List[Dict[str, Argument]],
                 question_tokens: List[Token],
                 embedding_for_token_similarity: Dict[str, numpy.ndarray],
                 distance_threshold: float) -> None:
        self.paragraph_data = paragraph_data
        self.question_tokens = question_tokens
        self._paragraph_strings: Dict[str, List[str]] = defaultdict(list)
        self._paragraph_lemmas: Dict[str, List[str]] = defaultdict(list)
        for structure in paragraph_data:
            for relation, argument in structure.items():
                self._paragraph_strings[argument.argument_string].append(relation)
                self._paragraph_lemmas["_".join(argument.argument_lemmas)].append(relation)
        self._knowledge_graph: KnowledgeGraph = None
        self.paragraph_tokens_to_keep: List[Tuple[str, List[str]]] = []
        if embedding_for_token_similarity is not None:
            # We'll use this word embedding to measure similarity between paragraph tokens and
            # question tokens to decide if we should extract paragraph tokens as entities in the
            # context.
            # Tuples of paragraph strings, list of relation names, and embeddings of paragraph strings.
            paragraph_token_embedding: List[Tuple[str, List[str], numpy.ndarray]] = []
            for paragraph_string, relation_names in self._paragraph_strings.items():
                # paragraph string is already tokenized but has underscores for spaces.
                for token in paragraph_string.split("_"):
                    if token in STOP_WORDS:
                        continue
                    if token not in embedding_for_token_similarity:
                        continue
                    token_embedding = embedding_for_token_similarity[token]
                    paragraph_token_embedding.append((token, relation_names, token_embedding))

            # We keep the embeddings of tokens in the question that are not stop words.
            question_token_embedding: List[numpy.ndarray] = []
            for question_token in question_tokens:
                token_text = question_token.text
                if token_text not in STOP_WORDS and token_text in embedding_for_token_similarity:
                    question_token_embedding.append(embedding_for_token_similarity[token_text])

            if question_token_embedding:
                for paragraph_token, relation_names, token_embedding in paragraph_token_embedding:
                    min_distance = min([cosine_distance(token_embedding, question_embedding) for
                                        question_embedding in question_token_embedding])
                    if 0.0 < min_distance < distance_threshold:
                        # If min_distance is 0.0, it means it is the exact word, and our exact string
                        # match will get it anyway.
                        self.paragraph_tokens_to_keep.append((paragraph_token, relation_names))
Code example #15
def weight_graph(graph, embeddings):
    """
    Weight graph edges by similarity between words
    """

    for edge in graph:
        try:
            embedding_0 = embeddings[edge.words[0].lower()]
        except KeyError:
            embedding_0 = embeddings['UNK']
        try:
            embedding_1 = embeddings[edge.words[1].lower()]
        except KeyError:
            embedding_1 = embeddings['UNK']
        edge.similarity = 1.0 - cosine_distance(embedding_0, embedding_1)

    return graph
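A minimal usage sketch with a hypothetical Edge stub and a toy embedding table, showing the 'UNK' fallback the function relies on:

import numpy as np

class Edge:
    # Hypothetical stand-in for the project's edge type
    def __init__(self, w0, w1):
        self.words = (w0, w1)
        self.similarity = None

embeddings = {
    'cat': np.array([1.0, 0.0]),
    'dog': np.array([0.8, 0.6]),
    'UNK': np.array([0.0, 1.0]),  # fallback for out-of-vocabulary words
}
graph = weight_graph([Edge('Cat', 'dog'), Edge('cat', 'xylophone')], embeddings)
print([round(e.similarity, 2) for e in graph])  # [0.8, 0.0]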
Code example #16
def main():
    """ Main function """

    # Building a k-nearest neighbor graph using annoy and cosine distance
    annoy = AnnoyIndex(len(DATA.columns), metric="angular")
    annoy_graph = []

    for i, v in enumerate(DATA.values):
        annoy.add_item(i, v)
    annoy.build(10)

    for i in range(len(DATA)):
        for j in annoy.get_nns_by_item(i, 10):
            annoy_graph.append(
                (i, j, cosine_distance(DATA.values[i], DATA.values[j])))

    # Creating the tmap layout
    x, y, s, t, _ = tm.layout_from_edge_list(len(DATA), annoy_graph)

    faerun = Faerun(view="front", coords=False)
    faerun.add_scatter(
        "MINIBOONE",
        {
            "x": x,
            "y": y,
            "c": LABELS,
            "labels": LABELS
        },
        shader="smoothCircle",
        colormap="Set1",
        point_scale=2.0,
        max_point_size=20,
        has_legend=True,
        categorical=True,
        legend_labels={(0, "Noise"), (1, "Signal")},
    )
    faerun.add_tree(
        "MINIBOONE_tree",
        {
            "from": s,
            "to": t
        },
        point_helper="MINIBOONE",
        color="#666666",
    )
    faerun.plot("miniboone", template="default")
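Note that Annoy's "angular" metric is sqrt(2 * (1 - cosine_similarity)), not the cosine distance itself, which is why the loop above recomputes cosine_distance for the edge weights. If you wanted to reuse Annoy's own distances instead, the conversion would be (a sketch against the same index):

# Annoy's angular distance d satisfies d**2 / 2 == cosine distance
for i in range(len(DATA)):
    for j in annoy.get_nns_by_item(i, 10):
        annoy_graph.append((i, j, annoy.get_distance(i, j) ** 2 / 2))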
Code example #17
    def correlate_distances(frames, do_plot=False):
        dist_matrix = []
        dist_nonself_matrix = {}
        dist_dense_matrix = []
        index = 0

        for anchor_frame in frames:
            dist_matrix.append([])
            dist_nonself_matrix[anchor_frame] = {}
            dist_dense_matrix.append([])
            for compare_frame in frames:
                af = Util.features[anchor_frame]
                bf = Util.features[compare_frame]
                dist = cosine_distance(af, bf)
                dist_matrix[index].append(dist)

                # print('{0:s} <-> {1:s} = {2:.2f}'.format(anchor_frame, compare_frame, dist))

                if anchor_frame != compare_frame:
                    dist_dense_matrix[index].append(dist)
                    dist_nonself_matrix[anchor_frame][compare_frame] = dist

            index = index + 1

        if do_plot:
            column_labels = frames
            row_labels = frames
            fig, axes = plt.subplots()
            heatmap = axes.pcolor(dist_matrix, cmap=plt.cm.Blues, alpha=0.8)
            fig = plt.gcf()
            fig.set_size_inches(8, 11)
            axes.set_yticks(np.arange(len(dist_matrix)) + 0.5, minor=False)
            axes.set_xticks(np.arange(len(dist_matrix)) + 0.5, minor=False)

            axes.set_xticklabels(frames, minor=False)
            axes.set_yticklabels(frames, minor=False)
            plt.xticks(rotation=90)
            plt.title('Keyframes Cosine Distance Correlation')

        return dist_dense_matrix, dist_nonself_matrix
Code example #18
    def compute_distance(self):
        si_0 = Util.stringify(self.current_frame - 1)
        si_1 = Util.stringify(self.current_frame)
        return cosine_distance(self.features[si_0], self.features[si_1])
Code example #19
File: optics.py Project: ginayuan/deeptracker
    def distance(self, point):
        return cosine_distance(self.vec, point.vec)
Code example #20
File: fasttext_sim.py Project: eugene/gyldendal
        print("text: '", text, "' does not have a correct FFM. Skipped.")
        continue

    # text_vectors  = list(map(lambda word: model[word], clean(text).split()))
    texts[text] = {
        "text": text,
        "skill": ffm_skill,
        "knowledge": ffm_knowledge,
        "vector": model[clean(text)]  #np.sum(text_vectors, axis=0)
    }

for text, text_value in texts.items():
    text_value["distance"] = {}

    for ffm_text, ffm_value in ffms.items():
        distance = cosine_distance(text_value["vector"], ffm_value["vector"])
        text_value["distance"][str(distance)] = {
            "ffm": ffm_text,
            "match": (normalize(text_value["skill"]) == normalize(ffm_text))
        }

offline_accuracy = []
for i in range(len(ffms)):
    a = 0
    for k, v in texts.items():
        distances = OrderedDict(
            sorted(v["distance"].items(), key=lambda item: float(item[0])))
        matches = [m["match"] for m in distances.values()]
        a += np.sum(matches[0:(i + 1)])
    offline_accuracy.append(a / len(texts))
Code example #21
File: helper.py Project: hsiaofongw/cube
    def cosine(cls, x: np.ndarray, y: np.ndarray) -> float:
        return 1 - cosine_distance(x, y)
Code example #22
File: lnd.py Project: strategist922/LSCDetection
def main():
    """
    Compute local neighborhood distance for target pairs from two vector spaces.
    """

    # Get the arguments
    args = docopt(
        """Compute local neighborhood distance for target pairs from two vector spaces.

    Usage:
        lnd.py [(-f | -s)] <testset> <matrixPath1> <matrixPath2> <outPath> <k>

        <testset> = path to file with tab-separated word pairs
        <matrixPath1> = path to matrix1
        <matrixPath2> = path to matrix2
        <outPath> = output path for result file
        <k> = parameter k (k nearest neighbors)

    Options:
        -f, --fst   write only first target in output file
        -s, --scd   write only second target in output file
        
    """)

    is_fst = args['--fst']
    is_scd = args['--scd']
    testset = args['<testset>']
    matrixPath1 = args['<matrixPath1>']
    matrixPath2 = args['<matrixPath2>']
    outPath = args['<outPath>']
    k = int(args['<k>'])

    #logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True,})
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices and rows
    try:
        space1 = Space(matrixPath1, format='npz')
    except ValueError:
        space1 = Space(matrixPath1, format='w2v')
    try:
        space2 = Space(matrixPath2, format='npz')
    except ValueError:
        space2 = Space(matrixPath2, format='w2v')

    matrix1 = space1.matrix
    row2id1 = space1.row2id
    id2row1 = space1.id2row
    matrix2 = space2.matrix
    row2id2 = space2.row2id
    id2row2 = space2.id2row

    # Load targets
    with open(testset, 'r', encoding='utf-8') as f_in:
        targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1])
                   for line in f_in]

    nbrs1 = NearestNeighbors(n_neighbors=k, metric='cosine',
                             algorithm='brute').fit(matrix1)
    nbrs2 = NearestNeighbors(n_neighbors=k, metric='cosine',
                             algorithm='brute').fit(matrix2)

    scores = {}
    neighborUnionSizes = {}
    for (t1, t2) in targets:

        # Get nearest neighbors
        try:
            index1 = row2id1[t1]
            index2 = row2id2[t2]
        except KeyError:
            scores[(t1, t2)] = 'nan'
            neighborUnionSizes[(t1, t2)] = 'nan'
            continue

        v1 = matrix1[index1].toarray().flatten()
        v2 = matrix2[index2].toarray().flatten()

        distances1, indices1 = nbrs1.kneighbors(matrix1[index1])
        distances2, indices2 = nbrs2.kneighbors(matrix2[index2])

        neighbors1 = list(
            zip([id2row1[i] for i in indices1.flatten().tolist()],
                distances1.flatten().tolist()))
        neighbors2 = list(
            zip([id2row2[i] for i in indices2.flatten().tolist()],
                distances2.flatten().tolist()))

        neighborUnion = sorted(
            list(
                set([
                    a for (a, b) in neighbors1 + neighbors2
                    if (a in row2id1 and a in row2id2 and not a in [t1, t2])
                ])))

        # Filter out vectors with 0-length in either matrix
        neighborUnion = [
            a for a in neighborUnion if (len(matrix1[row2id1[a]].data) > 0
                                         and len(matrix2[row2id2[a]].data) > 0)
        ]

        simVec1 = [
            1.0 - cosine_distance(matrix1[index1].toarray().flatten(),
                                  matrix1[row2id1[n]].toarray().flatten())
            for n in neighborUnion
        ]
        simVec2 = [
            1.0 - cosine_distance(matrix2[index2].toarray().flatten(),
                                  matrix2[row2id2[n]].toarray().flatten())
            for n in neighborUnion
        ]

        # Compute cosine distance of vectors
        distance = cosine_distance(simVec1, simVec2)
        scores[(t1, t2)] = distance
        neighborUnionSizes[(t1, t2)] = len(neighborUnion)

    with open(outPath, 'w', encoding='utf-8') as f_out:
        for (t1, t2) in targets:
            if is_fst:  # output only first target string
                f_out.write('\t'.join(
                    (t1, str(scores[(t1, t2)]),
                     str(neighborUnionSizes[(t1, t2)]) + '\n')))
            elif is_scd:  # output only second target string
                f_out.write('\t'.join(
                    (t2, str(scores[(t1, t2)]),
                     str(neighborUnionSizes[(t1, t2)]) + '\n')))
            else:  # standard outputs both target strings
                f_out.write('\t'.join(
                    ('%s,%s' % (t1, t2), str(scores[(t1, t2)]),
                     str(neighborUnionSizes[(t1, t2)]) + '\n')))

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Code example #23
File: cd.py Project: LingshuHu/TRIPY
def main():
    """
    Compute cosine distance for targets in two matrices.
    """

    # Get the arguments
    args = docopt("""Compute cosine distance for targets in two matrices.

    Usage:
        cd.py [(-f | -s)] <testset> <matrixPath1> <matrixPath2> <outPath>

        <testset> = path to file with tab-separated word pairs
        <matrixPath1> = path to matrix1
        <matrixPath2> = path to matrix2
        <outPath> = output path for result file

    Options:
        -f, --fst   write only first target in output file
        -s, --scd   write only second target in output file

     Note:
         Important: spaces must be already aligned (columns in same order)! Targets in first/second column of testset are computed from matrix1/matrix2.
        
    """)

    is_fst = args['--fst']
    is_scd = args['--scd']
    testset = args['<testset>']
    matrixPath1 = args['<matrixPath1>']
    matrixPath2 = args['<matrixPath2>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load matrices and rows
    try:
        space1 = Space(matrixPath1, format='npz')
    except ValueError:
        space1 = Space(matrixPath1, format='w2v')
    try:
        space2 = Space(matrixPath2, format='npz')
    except ValueError:
        space2 = Space(matrixPath2, format='w2v')

    matrix1 = space1.matrix
    row2id1 = space1.row2id
    matrix2 = space2.matrix
    row2id2 = space2.row2id

    # Load targets
    with open(testset, 'r', encoding='utf-8') as f_in:
        targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1])
                   for line in f_in]

    scores = {}
    for (t1, t2) in targets:

        # Get row vectors
        try:
            v1 = matrix1[row2id1[t1]].toarray().flatten()
            v2 = matrix2[row2id2[t2]].toarray().flatten()
        except KeyError:
            scores[(t1, t2)] = 'nan'
            continue

        # Compute cosine distance of vectors
        distance = cosine_distance(v1, v2)
        scores[(t1, t2)] = distance

    with open(outPath, 'w', encoding='utf-8') as f_out:
        for (t1, t2) in targets:
            if is_fst:  # output only first target string
                f_out.write('\t'.join((t1, str(scores[(t1, t2)]) + '\n')))
            elif is_scd:  # output only second target string
                f_out.write('\t'.join((t2, str(scores[(t1, t2)]) + '\n')))
            else:  # standard outputs both target strings
                f_out.write('\t'.join(
                    ('%s,%s' % (t1, t2), str(scores[(t1, t2)]) + '\n')))

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Code example #24
    argparser = get_argparser()
    args = argparser.parse_args()
    modelname = args.model

    print('Loading BERT model...')
    encoder = WrappedBERTEncoder(model=modelname, tokenizer=modelname)

    print('BERT Contextual Similarities')
    sentence1 = input('Sentence 1? ')
    sentence2 = input('Sentence 2? ')

    sentences_embeddings, embeddings, tokenized_texts = encoder.encode_sentences(
        [sentence1, sentence2])

    # sentence similarity
    sentence_similarity = 1 - cosine_distance(sentences_embeddings[0],
                                              sentences_embeddings[1])
    print('Cosine similarity between two sentences: {}'.format(
        sentence_similarity))

    # token similarities
    simmatrix = np.zeros((len(tokenized_texts[0]), len(tokenized_texts[1])))
    for i, j in product(range(len(tokenized_texts[0])),
                        range(len(tokenized_texts[1]))):
        simmatrix[i,
                  j] = 1 - cosine_distance(embeddings[0, i], embeddings[1, j])
    simdf = pd.DataFrame(simmatrix)
    simdf.columns = tokenized_texts[1]
    simdf.index = tokenized_texts[0]
    print(simdf)
Code example #25
def cosine_similarity(a, b):
    return 1.0 - cosine_distance(a, b)
Code example #26
def cosine_similarity(v1, v2):
    return 1 - cosine_distance(v1, v2)
Code example #27
def main():
    """ Main function """
    data = []
    time = []
    for path in PATHS:
        sample = fk.Sample(path)
        data.append(load_data(sample))
        time.append(load_time(sample))

    sources = []
    for i, e in enumerate(data):
        sources.extend([i] * len(e))

    data = np.concatenate(data, axis=0)
    time = np.concatenate(time, axis=0)

    d = len(data[0])

    # Initialize a new Annoy object and index it using 10 trees
    annoy = AnnoyIndex(d, metric="angular")
    for i, v in enumerate(data):
        annoy.add_item(i, v)
    annoy.build(10)

    # Create the k-nearest neighbor graph (k = 10)
    edge_list = []
    for i in range(len(data)):
        for j in annoy.get_nns_by_item(i, 10):
            edge_list.append((i, j, cosine_distance(data[i], data[j])))

    # Compute the layout from the edge list
    x, y, s, t, _ = tm.layout_from_edge_list(len(data), edge_list)

    legend_labels = [(0, "No Target Probe Negative Control"),
                     (1, "Stained Sample")]

    # Create the plot
    faerun = Faerun(
        view="front",
        coords=False,
        legend_title=
        "RNA Flow Cytometry: evaluation of detection sensitivity in low abundant intracellular RNA ",
    )
    faerun.add_scatter(
        "CYTO",
        {
            "x": x,
            "y": y,
            "c": sources,
            "labels": sources
        },
        point_scale=1.0,
        max_point_size=10,
        shader="smoothCircle",
        colormap="Set1",
        has_legend=True,
        categorical=True,
        legend_labels=legend_labels,
        legend_title="Cell Types",
    )
    faerun.add_tree("CYTO_tree", {
        "from": s,
        "to": t
    },
                    point_helper="CYTO",
                    color="#222222")

    faerun.plot("cyto")
Code example #28
def text_only_main(image_text_string, ings_dict, definitions_dict):

    # # define ingredient embeddings file
    # ings_dict_fp = '/Volumes/ja2/vegan/vegan_parser/data_source/ingredient_dictionary.p'
    #
    # try:
    #     ings_dict = pickle.load(open(ings_dict_fp, "rb"))
    # except FileNotFoundError:
    #     "Ingredients data not found!"

    cleaned_ings_list = create_ingredients_list(image_text_string)
    print(f'cleaned_ings_list: {cleaned_ings_list}')
    print()

    # Look up the data and pull out the relevant ingredients.
    # how should that be stored... list of dictionaries?

    ings_dict_list = []

    for ing in cleaned_ings_list:
        print(f'Working on ingredient: {ing}')

        # look up ing in the dict
        ing_entry: Dict = ings_dict.get(ing)


        # if entry exists
        if ing_entry:

            # check if word vectors exist & create if needed, or just get them

            # print(ing_entry)

            if ings_dict[ing].get('ewg_vector') is None:
                try:
                    # use the chemical_about phrase to create a document vector
                    phrase = ings_dict[ing].get('chemical_about')
                    if phrase:
                        ewg_docvec = phrase_to_docvec(phrase, doc_embedding)
                    else:
                        ewg_docvec = None

                    ings_dict = add_to_embedding_dict(ing, ings_dict, 'ewg', ewg_docvec)
                    ings_dict[ing]['ewg_vector'] = ewg_docvec
                except Exception:
                    ewg_docvec = None
            else:
                # get the document vector if exists
                ewg_docvec: Union[None, np.array] = ings_dict[ing].get('ewg_vector')

            ing_entry['ewg_vector'] = ewg_docvec

            if ings_dict[ing].get('wiki_vector') is None:
                # ing is in dict but does not have wiki_vector
                try:
                    # use the wikipedia entry to create a document vector
                    phrase = get_wiki_phrase(ing)
                    if phrase:
                        wiki_docvec = phrase_to_docvec(phrase, doc_embedding)
                    else:
                        wiki_docvec = None
                except Exception:
                    wiki_docvec = None

                ings_dict = add_to_embedding_dict(ing, ings_dict, 'wiki', wiki_docvec)
                ings_dict[ing]['wiki_vector'] = wiki_docvec

            else:
                # wiki_vector exists
                wiki_docvec: Union[None, np.array] = ings_dict[ing].get('wiki_vector')

            ing_entry['wiki_vector'] = wiki_docvec

            # Now compare the ingredient against the vectors and find closest definition
            if (ing_entry.get('wiki_vector') is None) and (ing_entry.get('ewg_vector') is not None):
                vector = ing_entry.get('ewg_vector')
            elif (ing_entry.get('wiki_vector') is not None) and (ing_entry.get('ewg_vector') is None):
                vector = ing_entry.get('wiki_vector')
            elif (ing_entry.get('wiki_vector') is not None) and (ing_entry.get('ewg_vector') is not None):
                vector = np.mean([ing_entry.get('wiki_vector'), ing_entry.get('ewg_vector')], axis=0)
            else:
                vector = None

            # Then take the cosine_similarity with all 4 reference vectors to get class

            prev_min_key = None
            prev_min_distance = None

            for key in definitions_dict.keys():
                print(key)
                print(cosine_distance(definitions_dict[key], vector))

                if prev_min_key is None:
                    prev_min_key = key
                    min_key = key
                    prev_min_distance = cosine_distance(definitions_dict[key], vector)

                else:
                    if cosine_distance(definitions_dict[key], vector) < prev_min_distance:
                        min_key = key
                        min_dist = cosine_distance(definitions_dict[key], vector)
                        prev_min_distance = min_dist
                        # prev_min_key = key

            # And add the type to the ingredient
            ing_entry['type'] = min_key.split('_')[0]  # because of the way I designed the names of the dictionary

            # add to entry for this ing
            ings_dict_list.append(ing_entry)

        # entry does not exist in dict for whatever reason
        else:
            # ings_dict_list.append('DID NOT FIND')
            ings_dict_list.append(None)

    # ings_dict_list is ready, now need to combine the data...somehow
    # print(f'ings_dict_list: {ings_dict_list}')


    return cleaned_ings_list, ings_dict_list