Example No. 1
def url_similarity(url1, url2):
    """Score the similarity of two URLs, with special handling for github.com URLs."""
    result = 0
    if "://github.com" not in url1 and "://github.com" not in url2:
        url1_main_part = extract_url_main_part(url1)
        url2_main_part = extract_url_main_part(url2)
        result = textdistance.jaccard(url1_main_part, url2_main_part)
    elif "://github.com" in url1 and "://github.com" not in url2:
        url1_main_part = get_name_by_github_url(url1)
        #url2_main_part = extract_url_main_part(url2)
        # print "url1: ", url1, " url1_main: ", url1_main_part, " url2: ", url2, " url2_main: ", url2
        lcsstr = textdistance.lcsstr(url1_main_part, url2)
        min_len = min(len(url1_main_part), len(url2))
        if min_len == 0:
            min_len = 1
        result = len(lcsstr) * 1.0 / min_len
    elif "://github.com" not in url1 and "://github.com" in url2:
        #url1_main_part = extract_url_main_part(url1)
        url2_main_part = get_name_by_github_url(url2)
        # print "url1: ", url1, " url1_main: ", url1, " url2: ", url2, " url2_main: ", url2_main_part
        lcsstr = textdistance.lcsstr(url1, url2_main_part)
        min_len = min(len(url1), len(url2_main_part))
        if min_len == 0:
            min_len = 1
        result = len(lcsstr) * 1.0 / min_len
    else:
        url1_main_part = get_name_by_github_url(url1)
        url2_main_part = get_name_by_github_url(url2)
        result = textdistance.jaccard(url1_main_part, url2_main_part)
    return result
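A minimal usage sketch, assuming textdistance is importable and that extract_url_main_part and get_name_by_github_url are project helpers defined elsewhere (both names come from the snippet above):

import textdistance

# two non-GitHub URLs take the first branch (Jaccard over the extracted main parts)
print(url_similarity("https://example.org/projects/foo", "https://example.net/foo"))
# a GitHub URL on one side is first reduced to its repository name
print(url_similarity("https://github.com/someuser/foo", "https://example.org/foo"))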
Example No. 2
def jaccard_euclidean(df, sample, num=15):
    vectors = np.zeros((len(df), len(sample)))
    start = time()
    checkpoint = time()
    for idx, row in df.iterrows():
        if idx % 10000 == 0:
            print('Extracting sample ' + str(idx) + '/' + str(len(df)))
            print('Time passed: ' + '{:.1f}'.format(time() - start) +
                  '. Time left: ' + '{:.1f}'.format((time() - checkpoint) *
                                                    (len(df) - idx) / 10000))
            checkpoint = time()
        vectors[idx] = np.asarray(
            [tdc.jaccard(row[key], sample[key]) for key in sample.keys()])

    param1 = np.asarray([
        7 / 66, 5 / 66, 11 / 66, 1 / 66, 3 / 66, 4 / 66, 6 / 66, 8 / 66,
        9 / 66, 10 / 66, 2 / 66
    ])
    param2 = np.asarray([
        10 / 64, 5 / 64, 16 / 64, 1 / 64, 3 / 64, 4 / 64, 6 / 64, 3 / 64,
        4 / 64, 10 / 64, 2 / 64
    ])

    vectors = vectors * param1
    score = np.linalg.norm(vectors, axis=1) / np.linalg.norm(param1)
    #    score /= np.max(score)
    score_sorted = np.flip(np.sort(score)[-num:], axis=0)
    indices = np.flip(np.argsort(score)[-num:], axis=0)
    most_similar = df.loc[indices, :]
    most_similar['SCORE'] = score_sorted
    return most_similar
Example No. 3
def add_query_features(df, inc, exc, k1list, k2list):
    """
    Return a copy of a dataframe with summary features added for
    the named text files defining the query
    """
    df_new = df.copy()
    k1lens = list(map(len, k1list))
    k2lens = list(map(len, k2list))
    k1max = max(k1lens)
    k2max = max(k2lens)
    k1count = len(k1list)
    k2count = len(k2list)
    df_new['k1_count'] = k1count
    df_new['k2_count'] = k2count
    df_new['k1_max'] = k1max
    df_new['k2_max'] = k2max
    jaro_dist = jellyfish.jaro_distance(inc, exc)
    lev_dist = jellyfish.levenshtein_distance(inc, exc)
    ji = textdistance.jaccard(inc, exc)
    sd = textdistance.sorensen(inc, exc)
    ro = textdistance.ratcliff_obershelp(inc, exc)
    #jellyfish.damerau_levenshtein_distance(inc,exc)
    #jellyfish.jaro_winkler(inc,exc)
    df_new['inc_jaro_exc'] = jaro_dist
    df_new['inc_lev_exc'] = lev_dist
    df_new['inc_ji_exc'] = ji
    df_new['inc_sd_exc'] = sd
    df_new['inc_ro_exc'] = ro
    return df_new
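A hedged usage sketch, assuming pandas, textdistance, and a jellyfish release that still exposes jaro_distance are installed; the frame and query strings below are invented for illustration:

import jellyfish      # assumption: a release exposing jaro_distance
import textdistance
import pandas as pd

df = pd.DataFrame({"doc": ["a.txt", "b.txt"]})
out = add_query_features(df, "include terms", "exclude terms",
                         ["alpha", "beta"], ["gamma"])
print(out[["k1_count", "k2_max", "inc_ji_exc"]])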
Example No. 4
def compare_lines2headings(lines, headings):
    if headings.shape[0] == 0:
        print('Headings are empty')
        return np.zeros(len(lines)), np.zeros(len(lines)), np.zeros(len(lines))
    max_similarities = []
    for line in lines:
        ln_similarities = []
        ln_words = line.lower().split()
        # save whether the best comparison is to a heading or a subheading
        for i, heading in headings.iterrows():
            hd = heading.Text.lower()
            hd, _ = heading_id_toc.split_pagenum(hd)
            hd_words = hd.split()
            # compare words
            # similarity defined by intersection/union; see "Jaccard index"
            similarity = textdistance.jaccard(ln_words, hd_words)
            ln_similarities.append([similarity, heading.Heading, i])
        best = np.array(ln_similarities)[:, 0].argmax()  # avoid shadowing the max() builtin
        bestsim = ln_similarities[best]
        if bestsim[0] == 0:  # essentially no similarity was found
            bestsim = np.array([0, 0, 0])
        max_similarities.append(bestsim)
    max_similarities = np.array(max_similarities)
    # return similarity, type matched, and index of the heading matched
    return max_similarities[:, 0], max_similarities[:, 1], max_similarities[:, 2]
Example No. 5
    def search(self, word, limit=0.7):
        """[Pesquisa palavra próxima no vocabulário utilizando um valor de aceitação de distancia.]

        Arguments:
            word {[string]} -- [Palavra de entrada]

        Keyword Arguments:
            limit {float} -- [Valor de distancia para aceitação de palavra (0,1)] (default: {0.7})

        Returns:
            [new_embed] -- []
        """
        dist = list()
        for index, token in enumerate(self.words):
            if self.algorithm == "levenshtein":
                distance = textdistance.levenshtein.normalized_similarity(
                    word, token)
                if distance > limit:
                    return self.embeds[index]
                dist.append(distance)
            elif self.algorithm == "jaccard":
                distance = textdistance.jaccard(word, token)
                if distance > limit:
                    return self.embeds[index]
                dist.append(distance)
            elif self.algorithm == "ratcliff_obershelp":
                distance = textdistance.ratcliff_obershelp(word, token)
                if distance > limit:
                    return self.embeds[index]
                dist.append(distance)
        index = dist.index(max(dist))
        return self.embeds[index]
Example No. 6
def similarity(line1, line2):
    """
    return a number
    """
    distance = textdistance.jaccard(line1, line2)
    closeness = 1 - 1 / (1 + math.exp(-30 * (distance - 0.5)))  # Sigmoid mapping (0,1) to (0,1)
    return closeness
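Note that textdistance.jaccard returns a similarity (1.0 for identical inputs), so the sigmoid maps identical lines to a closeness near 0 and disjoint lines to a closeness near 1. A quick check:

import math
import textdistance

print(similarity("kitten", "kitten"))  # jaccard = 1.0 -> closeness ~ 3e-07
print(similarity("abc", "xyz"))        # jaccard = 0.0 -> closeness ~ 1.0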
Example No. 7
    def knn_search(self, word):
        """[Pesquisa uma palavra próxima no vocabulário utilizado a lógica do KNN]

        Arguments:
            word {[string]} -- [Palavra de entrada]

        Returns:
            [int] -- [Índice da palava mais próxima no vocabulário]
        """
        dist = list()
        for token in self.vocabulary:
            if self.algorithm == "levenshtein":
                dist.append(
                    textdistance.levenshtein.normalized_similarity(
                        word, token))

            elif self.algorithm == "jaccard":
                dist.append(textdistance.jaccard(word, token))

            elif self.algorithm == "ratcliff_obershelp":
                dist.append(textdistance.ratcliff_obershelp(word, token))

        index = dist.index(max(dist))

        return self.embeds[index]
Example No. 8
def calculate_similarity_matrix(selected_data_list, vector_map, weight,
                                current_index, total_length):
    full_result = []
    lack_result = []
    if selected_data_list is not None:
        # print("there are total " + str(len(selected_data_list)) + " data.")
        for i in range(0, len(selected_data_list)):
            # print("execute the NO." + str(current_index) + " list now, there are total " + str(total_length) + " list.")
            # print("calculate the NO." + str(i) + " data's similarity, " + "there are total " + str(len(selected_data_list)) + " data.")
            first = str(selected_data_list[i]["paragraph_id"])
            first_noun_phrase = selected_data_list[i]["noun_phrase"]
            full_temp = []
            lack_temp = []
            for j in range(0, len(selected_data_list)):
                second = str(selected_data_list[j]["paragraph_id"])
                second_noun_phrase = selected_data_list[j]["noun_phrase"]
                if i == j:
                    similarity = 0
                else:
                    similarity = ((vector_map.similarity(first, second) + 1) * weight / 2 +
                                  textdistance.jaccard(first_noun_phrase,
                                                       second_noun_phrase) * (1 - weight))
                    similarity = round(similarity, 6) - 1
                    lack_temp.append(similarity)
                full_temp.append(similarity)
            full_result.append(full_temp)
            lack_result.append(lack_temp)
    return full_result, lack_result
Example No. 9
def Jaccard_Index(Cell1, Cell2):
    # lowercase and split each string into a list of words
    # (note: .split() does not remove punctuation)
    Cell1 = Cell1.lower().split()
    Cell2 = Cell2.lower().split()

    return jaccard(Cell1, Cell2)  # word-level Jaccard index of the two strings
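A usage sketch; jaccard here must be textdistance's function imported by name:

from textdistance import jaccard

# word-level comparison: 3 shared words out of 5 distinct words -> 0.6
print(Jaccard_Index("The quick brown fox", "the quick red fox"))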
Example No. 10
    def extract_by_similarity(self,
                              concepts,
                              sentences,
                              alpha=0.75,
                              top_k=3,
                              min_sim=0.5):
        alias2concept = {}
        for concept in concepts:
            for alias in concept.aliases:
                alias2concept[alias.lower()] = concept
        terms = set(alias2concept.keys())

        term2sents = dict()
        for sent in sentences:
            rs = sent.find_spans(*terms, ignore_case=True)
            for term in rs.keys():
                if term not in term2sents:
                    term2sents[term] = set()
                term2sents[term].add(sent)

        term2emb = {
            term: sum(sent.emb() for sent in sents) / len(sents)
            for term, sents in term2sents.items()
        }
        concept2emb = {
            concept: sum(term2emb[alias.lower()] for alias in concept.aliases)
            for concept in concepts
        }

        def __cosine(vector1, vector2):
            norm = (np.linalg.norm(vector1) * np.linalg.norm(vector2))
            if norm == 0:
                return 0
            cos = np.dot(vector1, vector2) / norm
            return 0.5 + 0.5 * cos

        start2scores = dict()
        for start, end in itertools.combinations(concepts, 2):
            cos = __cosine(concept2emb[start], concept2emb[end])
            start_words = set()
            for alias in start.aliases:
                start_words.update(self.name_handler.normalize(alias).split())
            end_words = set()
            for alias in end.aliases:
                end_words.update(self.name_handler.normalize(alias).split())
            jaccard = textdistance.jaccard(start_words, end_words)
            score = alpha * cos + (1 - alpha) * jaccard
            if start not in start2scores:
                start2scores[start] = set()
            start2scores[start].add((end, score))

        relations = set()
        for start, pairs in start2scores.items():
            pairs = list(sorted(pairs, key=lambda item: item[1], reverse=True))
            for end, score in pairs[:top_k]:
                if score < min_sim:
                    break
                relations.add(Relation(start, end, RelType.RELATED_TO))
        return relations
Example No. 11
 def criteria_features(x, col):
     raw_text = x[col].lower()
     jd = jellyfish.jaro_distance(raw_text, crit)
     ld = jellyfish.levenshtein_distance(raw_text, crit)
     ji = textdistance.jaccard(raw_text, crit)
     sd = textdistance.sorensen(raw_text, crit)
     ro = textdistance.ratcliff_obershelp(raw_text, crit)
     return jd, ld, ji, sd, ro
Example No. 12
def similarity(phrase1, phrase2):
    s1 = ""
    s2 = ""
    for note in phrase1:
        s1 += chr(note)
    for note in phrase2:
        s2 += chr(note)
    return jaccard(s1, s2)
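A usage sketch, assuming jaccard is textdistance's function imported by name and that each phrase is an iterable of small integers (hypothetical MIDI note numbers), so chr() can turn every note into a character:

from textdistance import jaccard

phrase_a = [60, 62, 64, 65]
phrase_b = [60, 62, 64, 67]
print(similarity(phrase_a, phrase_b))  # Jaccard over the character-encoded notes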
Example No. 13
def prova():
    l = sorted(rf.keys())
    dm = get_entry(l)
    joint = {}
    for word in dm:
        joint[word] = [
            td.jaccard(rf[word].split(), dm[word].split()), rf[word], dm[word]
        ]
    return joint
Example No. 14
    def getSimilarity(self, desired, given):
        # If the given string contains desired as a substring
        if desired in given:
            return 0.9
        # Strings have to be split up into character arrays for this algorithm
        desired_attr = [char for char in desired]
        given_attr = [char for char in given]

        # Returns a number representing how similar the two strings are
        return textdistance.jaccard(desired_attr, given_attr)
Example No. 15
def map_marie(input_data,
              target_data,
              input_bert_weights,
              target_bert_weights,
              string_match='edit',
              alpha=0.8,
              bert_layers=1,
              top_n=5):
    inp_txt2idx, inp_idx2txt = _create_txt2idx(input_data)
    tgt_txt2idx, tgt_idx2txt = _create_txt2idx(target_data)

    inp_bert_vectors = _get_bert_vectors(input_bert_weights, input_data,
                                         inp_txt2idx, bert_layers)
    tgt_bert_vectors = _get_bert_vectors(target_bert_weights, target_data,
                                         tgt_txt2idx, bert_layers)

    mapper = dict()

    for cnt, (inp_txt, inp_idx) in enumerate(inp_txt2idx.items()):
        inp_bert_vector = inp_bert_vectors[inp_idx]
        cal_dist = []

        cos_dist = [
            alpha * (_cal_cosine(inp_bert_vector, tgt_bert_vectors[tgt_idx]))
            for tgt_txt, tgt_idx in tgt_txt2idx.items()
        ]

        if string_match == 'edit':
            str_match = [
                (1 - alpha) * (1 - editdistance.eval(inp_txt, tgt_txt) /
                               max(len(inp_txt), len(tgt_txt)))
                for tgt_txt, tgt_idx in tgt_txt2idx.items()
            ]
        elif string_match == 'jaccard':
            str_match = [(1 - alpha) * textdistance.jaccard(inp_txt, tgt_txt)
                         for tgt_txt, tgt_idx in tgt_txt2idx.items()]
        elif string_match == 'ob':
            str_match = [(1 - alpha) *
                         textdistance.ratcliff_obershelp(inp_txt, tgt_txt)
                         for tgt_txt, tgt_idx in tgt_txt2idx.items()]

        ord2idx = [tgt_idx for _, tgt_idx in tgt_txt2idx.items()]

        cal_dist = np.add(cos_dist, str_match)
        topn_ord_idx = cal_dist.argsort()[::-1][:top_n]

        mapper[inp_idx] = [(ord2idx[idx], cal_dist[idx])
                           for idx in topn_ord_idx]

        if cnt % 100 == 0:
            print("...Processed %i mappings" % (cnt))

    return mapper, inp_idx2txt, tgt_idx2txt
Example No. 16
 def answer(self, msg):
     if not self.active or self.correct:
         return
     if not self.correct:
         for ans in self.a:
             ans = " ".join(ans.split()).strip().lower()
             guess = " ".join(msg.args[1].split()).strip().lower()
             if guess == ans:
                 self.correct = True
                 break
             elif not self.correct:
                 answer = self.clean(ans)
                 guess = self.clean(guess)
             if not self.correct and guess == answer:
                 self.correct = True
                 break
             elif (not self.correct and self.flexibility < 1
                   and self.flexibility > 0.5):
                 dist = textdistance.jaro_winkler(guess, answer)
                 log.debug(
                     "Jeopardy: guess: {0}, answer: {1}, length: {2}, "
                     "distance: {3}, flexibility: {4}".format(
                         guess, answer, len(answer), dist,
                         self.flexibility))
                 if dist >= self.flexibility:
                     self.correct = True
                     break
                 elif (dist < self.flexibility and "," in self.a[0]
                       or "&" in self.a[0]):
                     dist = textdistance.jaccard(guess, answer)
                     if dist >= self.flexibility:
                         self.correct = True
                         break
         if self.correct:
             if not msg.nick in self.scores:
                 self.scores[msg.nick] = 0
             self.scores[msg.nick] += self.p
             if not msg.nick in self.roundscores:
                 self.roundscores[msg.nick] = 0
             self.roundscores[msg.nick] += self.p
             self.unanswered = 0
             reply = self.correct_template.render(
                 nick=msg.nick,
                 answer=self.a[0],
                 points=self.p,
                 round=self.roundscores[msg.nick],
                 total=self.scores[msg.nick],
             )
             self.reply(reply)
             self.correct = True
             self.answered += 1
             self.clear()
             self.newquestion()
Example No. 17
def check_match(data):
    gr_row_no = data["gr_row_no"]
    gr_row = data["gr_row"]
    lib_rows = data["lib_rows"]
    score = int(data["score"])
    match_mode = data["match_mode"].upper().strip()

    matches = []
    for idx, lr in enumerate(lib_rows):
        auth_score = int(TD.jaccard(gr_row["auth_tok"], lr["auth_tok"]) * 100)
        title_score = int(
            TD.jaccard(gr_row["title_tok"], lr["title_tok"]) * 100)
        gr_comb = gr_row["auth_tok"] + gr_row["title_tok"]
        lr_comb = lr["auth_tok"] + lr["title_tok"]
        total_score = int(TD.jaccard(gr_comb, lr_comb) * 100)
        cond = False

        if match_mode == "TA":
            cond = total_score > score
        elif match_mode == "T":
            cond = title_score > score
        elif match_mode == "TTA":
            cond = title_score > score or total_score > score
        elif match_mode == "A":
            cond = auth_score > score
        else:
            raise Exception("Unsupported matching condition: " + match_mode)

        if cond:
            row = dict(
                zip(OUT_CSV_HEADER, [
                    gr_row_no, gr_row["author"], gr_row["title"], idx,
                    lr['author'], lr['title'], auth_score, title_score,
                    total_score
                ]))
            matches.append(row)

    return matches
Example No. 18
    def test_similarity_calculation(self):
        str1 = "AbstractInputMethodService provides a abstract base class for inut methods."
        str2 = "The default implementation in this abstract class returns 1.0 for all components."

        vector_map = EntityVectorModel.load(
            "mean_vector_api_paragraph.plain.txt", binary=False)

        vector1 = vector_map.compute_mean_vector(str1)
        vector2 = vector_map.compute_mean_vector(str2)
        semantic_similarity = dot(matutils.unitvec(vector1),
                                  matutils.unitvec(vector2))
        print("semantic similarity is " + semantic_similarity)
        structure_similarity = textdistance.jaccard(str1, str2)
        print("structure similarity is " + structure_similarity)
Example No. 19
 def sm_features(x, col1, col2):
     if (x[col1] != x[col1]) or (x[col2] != x[col2]):  # NaN != NaN flags missing values
         jd = np.nan
         ld = np.nan
         ji = np.nan
         sd = np.nan
     else:
         raw_text1 = x[col1].lower()
         raw_text2 = x[col2].lower()
         jd = jellyfish.jaro_distance(raw_text1, raw_text2)
         ld = jellyfish.levenshtein_distance(raw_text1, raw_text2)
         ji = textdistance.jaccard(raw_text1, raw_text2)
         sd = textdistance.sorensen(raw_text1, raw_text2)
     return jd, ld, ji, sd
Example No. 20
def match_jaccard(ee, platforms):
    for index, row in ee.iterrows():
        if row.possible_stops == '':
            subset = platforms[platforms.routes_wkd.str.contains(row.line)]
            if subset.shape[0] > 0:
                subset_stop_names = pd.DataFrame(subset.stop_name.unique(),columns=['stop_name'])
                name_dist = [textdistance.jaccard(row.station_name,y) for y in subset_stop_names.stop_name]

                matched_station_name = subset_stop_names.iloc[np.argmax(name_dist),0]
                matched_stop_ids = subset[subset.stop_name == matched_station_name][['stop_id']]
                score = max(name_dist)
                if score >=0.8:
                    ee.loc[index,'possible_stops'] = ', '.join(matched_stop_ids.stop_id)
    return ee
Example No. 21
def similarity(str1, str2, type):
    """
    Similarity score calculated by either Jaccard or Sorensen methods.
    :param str1: string
    :param str2: string
    :param type: ['jaccard'|'sorensen']
    :return: a float number between 0 (inclusive) and 1 (inclusive)
    """
    tokens_1 = tokenize(str1)
    tokens_2 = tokenize(str2)
    if type == 'jaccard':
        return textdistance.jaccard(tokens_1, tokens_2)
    elif type == 'sorensen':
        return textdistance.sorensen_dice(tokens_1, tokens_2)
    return 0
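A usage sketch; tokenize is not shown in the snippet, so a hypothetical whitespace tokenizer stands in for it here:

import textdistance

def tokenize(s):  # hypothetical stand-in for the project's tokenizer
    return s.lower().split()

print(similarity("deep learning", "deep reinforcement learning", "jaccard"))   # 2 shared / 3 distinct tokens
print(similarity("deep learning", "deep reinforcement learning", "sorensen"))  # Sorensen-Dice on the same tokens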
Example No. 22
def best_match_bert(orig_title, names_pool):
    '''
    Best matching using bert, some simple text distance measures
    :param title: searched occupation name
    :param names_pool: where are we searching
    :return: best match occupation name from pool
    '''
    scores1 = []
    scores2 = []
    scores3 = []
    scores4 = []
    scores5 = []
    scores6 = []
    scores7 = []
    # 1,2, 8 - scores > 1
    title = pre_process(orig_title)
    bb = BertEncoder()
    l1 = bb.bert_encoder(title)
    for name in names_pool:
        proc_name = pre_process(name)
        l2 = bb.bert_encoder(proc_name)
        l1 = np.array(l1).reshape(-1, 1)
        l2 = np.array(l2).reshape(-1, 1)
        scores1.append(np.sum(euclidean_distances(l1, l2)))
        scores2.append(np.sum(paired_distances(l1, l2)))
        scores3.append(np.sum(cosine(l1, l2)))
        scores4.append(np.sum(naive_metric(title, proc_name)))
        scores5.append(np.sum(textdistance.jaccard(title, proc_name)))
        scores6.append(np.sum(textdistance.sorensen_dice(title, proc_name)))
        scores7.append(
            np.sum(textdistance.damerau_levenshtein(title, proc_name)))

    chosen_idx = [
        names_pool[np.argmin(scores1)], names_pool[np.argmin(scores2)],
        names_pool[np.argmax(scores3)], names_pool[np.argmax(scores4)],
        names_pool[np.argmax(scores5)], names_pool[np.argmax(scores6)],
        names_pool[np.argmin(scores7)]
    ]
    print(
        "-----------------------------------------------------------------------------------------"
    )
    print("All title similarity candidates \n", chosen_idx)
    c = Counter(chosen_idx)
    frequency = c.most_common(1)[0][1]
    if frequency < 3:
        return names_pool[np.argmax(scores3)]
    return c.most_common(1)[0][0]
Example No. 23
    def compare(self, str1, str2):

        if self.debug:
            self.log("jaccard comparison")

        self.start_time()

        self.result.distance = jaccard(str1, str2)  # note: textdistance's jaccard returns a similarity, not a distance

        self.end_time()

        self.result.nos = max(len(str1), len(str2))
        self.result.threshold = 90
        self.result.similarity = self.result.distance * 100

        return self.result
Example No. 24
def single_compare(bsl, our, j, label, count, id2vocab, id2response_dict):
    # golden data
    golden_context = [[id2vocab[w.item()] for w in sent if w != 0]
                      for sent in our[0]['context'][j] if not all(sent == 0)]
    golden_query = [id2vocab[w.item()] for w in our[0]['query'][0] if w != 0]
    golden_response = id2response_dict[our[0]['response_id'][j].item()]
    golden_profile = vector2profile(our[0]['profile'][j])
    golden_incomplete_profile = vector2profile(our[0]['incomplete_profile'][j])
    # prediction
    our_pred_response = id2response_dict[our[1]['pred_response_id'][j].item()]
    # our_pred_profile = vector2profile(our[1]['pred_profile'][j])
    our_pred_profile = vector2profile(
        pred_profile_to_onehot_vec(
            our[1]['pred_profile_prob'][j].unsqueeze(0)).squeeze(0))
    our_pred_profile_prob = vector2probdict(our[1]['pred_profile_prob'][j])
    bsl_pred_response = id2response_dict[bsl[1]['pred_response_id'][j].item()]
    # similarity
    # similarity = round(textdistance.hamming.normalized_similarity(tokenizer(bsl_pred_response, type=None), tokenizer(our_pred_response, type=None)), 3)
    similarity = round(
        textdistance.jaccard(tokenizer(bsl_pred_response, type='word'),
                             tokenizer(our_pred_response, type='word')), 3)
    # similarity = round(textdistance.cosine(bsl_pred_response, our_pred_response), 3)
    print_str = ''
    if len(golden_incomplete_profile) < len(golden_profile) and len(
            golden_incomplete_profile) != 0:
        print_str = '%s %s %s\n' % (count, '=' * 50, label)
        print_str += 'CONTEXT:\n'
        for sent in golden_context:  #[-5:]:  # avoid too much context
            print_str += '%s: %s\n' % (''.join(sent[-2:]), ' '.join(sent[:-2]))
            # if sent[-2] == '$kb' and (sent[0] in our_pred_response or sent[0] in bsl_pred_response): # if restaurant mentioned in the response
            #         pass
            # else:
            #     print_str += '%s: %s\n' % (''.join(sent[-2:]), ' '.join(sent[:-2]))
        print_str += 'QUERY: %s\n' % ' '.join(golden_query)
        print_str += 'INCOMPLETE:%s\n COMPLETE:%s\n PRED:%s\n PRED_PROB:%s\n' % (
            golden_incomplete_profile, golden_profile, our_pred_profile,
            our_pred_profile_prob)
        print_str += 'GOLD: %s\n BSL: %s\n OUR: %s\n' % (
            golden_response, bsl_pred_response, our_pred_response)
        print_str += 'SIM: %s\n' % similarity
        print(print_str)
        # if label == 'GOOD':
        #     fw_good.write(print_str)
        # elif label == 'BAD':
        #     fw_bad.write(print_str)
    return similarity, print_str
Example No. 25
def jaccardDist(name1, name2):
    """
    info: calculate the jaccard distance between the two strings 
        name1, name2
    input: name1:String, name2:String
    output: distance value dist (real number within [0, 1])
    """
    """
    dist = 0
    
    for i in range(len(name1)):
            if not (name1[i] == name2[i]):
                    dist += 1 
    #still have to calculate the real Jaccard distance
    """

    dist = 1 - td.jaccard(name1, name2)
    return dist
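A quick usage sketch, assuming textdistance is imported as td (as the snippet expects):

import textdistance as td

print(jaccardDist("johnson", "jonson"))  # small distance for near-identical names
print(jaccardDist("johnson", "baker"))   # larger distance for unrelated names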
Example No. 26
def compare(s_inp, s_out):
	'''nlp = spacy.load("en_core_web_sm")
	str_inp = nlp(s_inp)
	srt_inp = " ".join([token.lemma_ for token in str_inp])
	#print(str_inp)
	#inp_lower = str_inp.lower()
	print("Lower String Input: {}".format(str_inp))
	str_out = nlp(s_out)
	srt_out = " ".join([token.lemma_ for token in str_out])
	#print(str_out)
	#out_lower = str_out.lower()
	print("Lower String Output: {}".format(str_out))
	#print("Inside Compare")
	#print("Str1: ", s_inp)
	#print("Str2: ", s_out)'''
	

	#Jaccard Index
	jacc = textdistance.jaccard(s_inp, s_out)
	print("jaccard: ", jacc)
	
	# Sorensen coefficient
	soren = textdistance.sorensen(s_inp, s_out)
	print("Sorensen: ", soren)
	
	# Tversky index
	tvr = textdistance.tversky(s_inp, s_out)
	print("Tversky: ", tvr)
	
	# Overlap coefficient
	overlap = textdistance.overlap(s_inp, s_out)
	print("overlap_coefficient: ", overlap)
	
	#Tanimoto Distance
	#tanimoto_distance = textdistance.tanimoto(str_inp, str_out)
	#print("Tanimoto: ", tanimoto_distance)

	res = (jacc+soren+tvr+overlap)/4
	if res == 0:
		pass
	else:
		lst.append(res)
	print("Result: {}".format(res))
Example No. 27
def fuzzy_match(term, term_list):
    best_match = 0
    if term == 'nan':
        return False
    if 'unnamed' in term:
        return False
    for t in term_list:
        match = textdistance.jaccard(term, t)
        #match = textdistance.damerau_levenshtein.normalized_similarity(term, t)
        if match > best_match:
            best_match = match
            if best_match == 1:
                print("Found match for: ", term, " with score of: ",
                      best_match)
                return True  # don't need to keep searching once find exact match
    if best_match >= 0.8:  # if best match is above a threshold for similarity - can modify this number
        print("Found match for: ", term, " with score of: ", best_match)
        return True
    if best_match >= 0.6:
        print("Didn't find match for: ", term, " with score of: ", best_match)
    return False
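A usage sketch with invented terms, assuming textdistance is importable; with the 0.8 threshold, a one-character typo in a long term still matches:

import textdistance

terms = ["population density", "median income", "land area"]
print(fuzzy_match("population densty", terms))  # True: character-level Jaccard ~ 0.94
print(fuzzy_match("rainfall", terms))           # False: best score stays below 0.8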
Example No. 28
def get_similarity_score(title, videos, movie_description, keywords=None):
    '''
    main method that takes the list of video_data from a YouTube query
    and sorts the queries based on a set of similarities as defined
    by the Jaccard index and description keywords

    Parameters
    ==========
    title:
        the title of the movie
    videos:
        list of youtube video objects
    keywords:
        list of top-5 cast, top-5 characters, and director(s)
    movie_description:
        string of description from tmdb

    Return
    ==========
    list of YouTubeVideo objects with similarity score
    '''

    youtube_videos = []
    for video in videos:
        # first, all items will be converted to lowercase and tokenized
        lowercase_title = title.lower()
        movie_desc = strip_characters(movie_description.lower()).split()
        trailer_desc = strip_characters(video.description.lower()).split()
        movie_title = (strip_characters(lowercase_title) +
                       " official trailer").split()
        trailer_title = strip_characters(video.title.lower()).split()

        similarity_score = is_clip(
            trailer_title, lowercase_title.split()) * 0.5 * (
                textdistance.sorensen_dice(movie_desc, trailer_desc) +
                textdistance.jaccard(movie_title, trailer_title))
        video.set_similarity_score(similarity_score)
        youtube_videos.append(video)

    return youtube_videos
Example No. 29
def super_similiar(es1, es2, sim_factor=0.8, sim_box=0.6):
    """Check if two elements are super similiar by text (Jaccad) and visually (compare bbox).
    """
    text1 = only_text(es1)
    text2 = only_text(es2)

    points1 = only_points(es1)
    points2 = only_points(es2)

    if min(len(points1), len(points2)) < 4:
        return False

    logger.debug("points")
    logger.debug(points1)
    logger.debug(points2)

    j_sim = jaccard(text1, text2)
    b_sim = sim_bbox(points1, points2)

    logger.debug(f"footer/header sims {j_sim} {b_sim}")

    return j_sim > sim_factor and b_sim > sim_box
Example No. 30
def calculate_similarity_matrix(selected_data_list, vector_map, weight):
    full_result = []
    id_list = []
    if selected_data_list is not None:
        vector_matrix = calculate_matrix(selected_data_list, vector_map).tolist()
        for i in range(0, len(selected_data_list)):
            first_noun_phrase = selected_data_list[i]["noun_phrase"]
            id_list.append(selected_data_list[i]["id"])
            full_temp = []
            for j in range(0, len(selected_data_list)):
                second_noun_phrase = selected_data_list[j]["noun_phrase"]
                if i == j:
                    similarity = 0
                else:
                    similarity = ((vector_matrix[i][j] + 1) * weight / 2 +
                                  textdistance.jaccard(first_noun_phrase,
                                                       second_noun_phrase) * (1 - weight))
                    similarity = -round(similarity, 6) + 1
                full_temp.append(similarity)
            full_result.append(full_temp)
    return full_result, id_list