Example #1
def no_longer_match_that_contains_m_in_same_span(self, matches, m, row):
    curr_len = len(m['umls_match'].split(" "))
    for other_m in matches:
        if other_m['curr_occurence_offset'] is None:
            print("other_m['curr_occurence_offset'] is None")
            continue
        other_len = len(other_m['umls_match'].split(" "))
        if other_len > curr_len:
            other_words = other_m['umls_match'].split(" ")
            # Compare m against the prefix of the longer match.
            other_m_prefix = " ".join(other_words[:curr_len])
            if len(other_words[curr_len]) >= 3 and words_similarity(
                    other_m_prefix, m['umls_match']) > 0.88:
                if self.does_other_span_contains_curr(m, other_m, row['file_name']):
                    return False
            # Compare m against the suffix of the longer match.
            other_m_suffix = " ".join(other_words[curr_len:])
            if len(other_words[curr_len]) >= 3 and words_similarity(
                    other_m_suffix, m['umls_match']) > 0.88:
                if self.does_other_span_contains_curr(m, other_m, row['file_name']):
                    return False

    return True
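Every example on this page leans on a words_similarity helper whose implementation is not shown. A minimal sketch of the assumed behavior (a 0-to-1 string-similarity ratio, here via difflib; the real project may normalize or lemmatize first):

from difflib import SequenceMatcher

def words_similarity(a, b):
    # Assumed: ratio of matching characters between the two strings,
    # in [0, 1]; thresholds such as 0.88 above compare against this.
    return SequenceMatcher(None, a, b).ratio()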
Example #2
def get_best_match(relevant_labeled_df, row, final_label_col):
    best_match = None
    best_match_sim = 0
    row_start_offset = row['curr_occurence_offset']
    row_end_offset = row_start_offset + len(row['cand_match'])
    for match_data in relevant_labeled_df[final_label_col]:
        term = match_data['term']
        sim1 = words_similarity(term, row['cand_match'])
        sim2 = words_similarity(term, row['umls_match'])
        higher_sim = max(sim1, sim2)
        # Keep the most similar span-matching candidate, not just the last one.
        if (higher_sim > LOW_SIMILARITY_THRESHOLD and higher_sim > best_match_sim
                and spans_match_lemma(match_data, row_start_offset, row_end_offset)):
            best_match_sim = higher_sim
            best_match = match_data
    return best_match, best_match_sim
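A hypothetical call, with the row fields inferred from what get_best_match reads; relevant_labeled_df and final_label_col stand for the caller's DataFrame and annotation column:

row = {'curr_occurence_offset': 12,
       'cand_match': 'chest pain',
       'umls_match': 'chest pain'}
best_match, best_sim = get_best_match(relevant_labeled_df, row, final_label_col)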
Example #3
def find_occs_offsets_in_txt(cand_match_occ_in_key_txt, cand_term, post_txt):
    txt_words = post_txt.split(" ")
    cand_match_occ_in_key_txt_len = len(cand_match_occ_in_key_txt.split(" "))
    all_match_occ = []
    search_start = 0  # advance past each hit so repeated terms get distinct offsets
    for i in range(len(txt_words)):
        relevant_words = txt_words[i:i + cand_match_occ_in_key_txt_len]
        # An empty token (double space) eats a slot; pull in the next word.
        if (len(relevant_words) > 1 and relevant_words[1] in ('', ' ')
                and len(txt_words) > i + cand_match_occ_in_key_txt_len):
            relevant_words.append(txt_words[i + cand_match_occ_in_key_txt_len])
        relevant_term = " ".join(relevant_words)
        if words_similarity(relevant_term, cand_term) > 0.82 or words_similarity(relevant_term, cand_match_occ_in_key_txt) > 0.82:
            occurence_offset = post_txt.index(relevant_term, search_start)
            all_match_occ.append(occurence_offset)
            search_start = occurence_offset + 1

    return all_match_occ
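A toy run showing why the search_start bookkeeping matters: repeated terms must yield distinct character offsets rather than the first offset twice.

offsets = find_occs_offsets_in_txt("pain", "pain", "chest pain and back pain")
# offsets == [6, 20], one per occurrence of "pain"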
Example #4
def are_all_instances_of_term_are_contained(self, m, other_m, row):
    txt_words = row['tokenized_text'].split(" ")
    indexes_with_term = []
    for w_idx, w in enumerate(txt_words):
        if words_similarity(w, m['umls_match']) > SIMILARITY_THRESHOLD:
            indexes_with_term.append(w_idx)
    first_container_w, second_container_w = other_m['umls_match'].split(" ")
    # Guard i + 1 so a term occurrence at the very end of the text cannot
    # raise an IndexError; such an occurrence is simply not contained.
    all_instances_of_term_are_contained = all(
        i + 1 < len(txt_words)
        and words_similarity(txt_words[i + 1], second_container_w) >
        SIMILARITY_THRESHOLD for i in indexes_with_term)
    # if all_instances_of_term_are_contained:
    #     print(f"all_instances_of_term_are_contained: {m['umls_match']}, {other_m['umls_match']}")
    return all_instances_of_term_are_contained
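A toy containment check (obj stands for an instance of the class this method is taken from):

row = {'tokenized_text': 'chest pain and more chest pain'}
m = {'umls_match': 'chest'}
other_m = {'umls_match': 'chest pain'}
obj.are_all_instances_of_term_are_contained(m, other_m, row)
# True: every occurrence of 'chest' is followed by 'pain'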
Example #5
def find_possible_cand_matches_occurence_in_txt_by_lemma(cand_term, row):
    match_occs_in_txt = []
    cand_term_len = len(cand_term.split(" "))
    if cand_term_len == 1:
        # Each words_and_lemmas entry is a "lemma word" string, lemma first.
        for l, w in [x.split(" ") for x in row['words_and_lemmas']]:
            if words_similarity(cand_term, l) > SIMILARITY_THRESHOLD:
                match_occs_in_txt.append(w)
    else:
        for i in range(len(row['words_and_lemmas']) - cand_term_len + 1):
            relevant_words_and_lemmas = row['words_and_lemmas'][i:i + cand_term_len]
            words_cand = " ".join([w.split(" ")[1] for w in relevant_words_and_lemmas])
            lemmas_cand = " ".join([w.split(" ")[0] for w in relevant_words_and_lemmas])
            if words_similarity(lemmas_cand, cand_term) > SIMILARITY_THRESHOLD:
                match_occs_in_txt.append(words_cand)
    return match_occs_in_txt
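A toy row illustrating the assumed "lemma word" layout of words_and_lemmas (lemma first, surface form second, as the indexing above implies):

row = {'words_and_lemmas': ['run running', 'fast fast']}
find_possible_cand_matches_occurence_in_txt_by_lemma('run', row)
# ['running']: the surface word whose lemma matched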
Example #6
def match_not_similar_to_short_match(self, match, all_short_matches):
    return not any(words_similarity(m, match) > SIMILARITY_THRESHOLD
                   for m in all_short_matches)
Example #7
def find_umls_match_fast(msg_txt, searcher, row, msg_key_lang):
    if msg_txt == "":
        return []

    ngrams = prepare_msg_ngrams(msg_txt)

    all_matches_found = []

    for gram in ngrams:
        for i in range(1, NUMBER_OF_GRAMS + 1):
            low_similarity_threshold = LOW_SINGLE_WORD_SIMILARITY_THRESHOLD if i == 1 else LOW_MULTI_WORD_SIMILARITY_THRESHOLD
            up_similarity_threshold = UP_SINGLE_WORD_SIMILARITY_THRESHOLD if i == 1 else UP_MULTI_WORD_SIMILARITY_THRESHOLD
            cand_term = " ".join(gram[:i])
            if term_is_exception(i, cand_term):
                continue
            search_result = searcher.ranked_search(cand_term, low_similarity_threshold)
            if search_result:
                # The searcher ranks by cosine similarity; the top hit is
                # re-scored with words_similarity before being accepted.
                cosine_sim, umls_match = search_result[0]
                sim = words_similarity(umls_match, cand_term)
                if is_good_match(sim, umls_match, i, up_similarity_threshold):
                    all_matches_found = add_match_data(all_matches_found, cand_term, msg_key_lang, row, sim, umls_match)

    all_matches_found_with_full_occs = get_matches_with_full_occs(all_matches_found)
    return all_matches_found_with_full_occs
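prepare_msg_ngrams is not included on this page. A sketch consistent with how the grams are consumed above, assuming tail padding in the same style as get_all_occurences_of_match_in_text further down (the real helper may instead filter PAD-containing candidates via term_is_exception):

def prepare_msg_ngrams(msg_txt, n=NUMBER_OF_GRAMS):
    # Sliding word windows of width n; 'PAD' keeps the last windows full
    # so terms near the end of the message are still searched.
    words = msg_txt.split(" ") + ['PAD'] * (n - 1)
    return list(zip(*[words[i:] for i in range(n)]))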
Example #8
def find_match_occ_for_cand_term_and_text(cand_term, match_occs_in_txt, tokenized_words_lst):
    cand_term_len = len(cand_term.split(" "))
    for i in range(len(tokenized_words_lst) - cand_term_len + 1):
        relevant_words = tokenized_words_lst[i:i + cand_term_len]
        words_cand = " ".join(relevant_words)
        if words_similarity(words_cand, cand_term) > SIMILARITY_THRESHOLD:
            match_occs_in_txt.append(words_cand)
    return match_occs_in_txt
Example #9
def try_to_find_with_next_term(all_possible_word_rows, cand_match, df, match_len):
    best_idx_with_next_word = None
    best_sim = 0
    for idx in all_possible_word_rows.index:
        term_with_next_word = " ".join(df['word'].loc[idx:idx + match_len - 1])
        sim = words_similarity(term_with_next_word, cand_match)
        if sim > SIMILARITY_THRESHOLD and sim > best_sim:
            best_sim = sim
            best_idx_with_next_word = idx
    return best_idx_with_next_word
Example #10
def get_correct_row_from_df(df, cand_match, umls_match, row):
    number_of_match = row['all_match_occ'].index(row['curr_occurence_offset'])
    cand_match_parts = cand_match.split(" ")
    match_len = len(cand_match_parts)
    cand_match_first_term = cand_match_parts[0]
    all_possible_word_rows = df[df['word'].apply(lambda x: words_similarity(
        x, cand_match_first_term) > SIMILARITY_THRESHOLD)]

    if len(row['all_match_occ']) == len(all_possible_word_rows):
        correct_row = all_possible_word_rows.iloc[number_of_match]
    else:
        all_possible_lemma_rows = df[
            df['lemma'].apply(lambda x: words_similarity(
                x, cand_match_first_term) > SIMILARITY_THRESHOLD)]
        if len(row['all_match_occ']) == len(all_possible_lemma_rows):
            correct_row = all_possible_lemma_rows.iloc[number_of_match]
        else:
            try:
                best_idx_with_prefix = try_to_find_with_prefix(
                    all_possible_word_rows, cand_match, df, match_len)
            except Exception:
                best_idx_with_prefix = None
            # Compare against None explicitly: index 0 is a valid result.
            if best_idx_with_prefix is not None:
                correct_row = df.loc[best_idx_with_prefix]
            else:
                best_idx_with_next_word = try_to_find_with_next_term(
                    all_possible_word_rows, cand_match, df, match_len)
                if best_idx_with_next_word is not None:
                    correct_row = df.loc[best_idx_with_next_word]
                else:
                    # Last resort: rank every word in the document by
                    # similarity and take the closest one.
                    global number_of_sorts
                    number_of_sorts += 1
                    df['word_sim_to_cand_match'] = df['word'].apply(
                        lambda x: words_similarity(x, cand_match_first_term))
                    df.sort_values(by='word_sim_to_cand_match',
                                   ascending=False,
                                   inplace=True)
                    correct_row = df.iloc[0]

    return correct_row
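try_to_find_with_prefix is referenced above but its body is not included here; a hypothetical reconstruction, assuming it parallels try_to_find_with_next_term but scores the window truncated to the candidate's length:

def try_to_find_with_prefix(all_possible_word_rows, cand_match, df, match_len):
    # Hypothetical: best-scoring row whose word window, cut to the length
    # of cand_match, is similar enough to cand_match.
    best_idx, best_sim = None, 0
    for idx in all_possible_word_rows.index:
        window = " ".join(df['word'].loc[idx:idx + match_len - 1])
        sim = words_similarity(window[:len(cand_match)], cand_match)
        if sim > SIMILARITY_THRESHOLD and sim > best_sim:
            best_sim, best_idx = sim, idx
    return best_idx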
Example #11
def get_yi_for_cand_match(label_row, row):
    row_start_offset = row['curr_occurence_offset']
    row_end_offset = row['curr_occurence_offset'] + len(row['cand_match'])
    found_label_for_row = False
    best_match = None
    for ann in label_row[FINAL_LABELS_COL]:
        if spans_match(ann, row_start_offset, row_end_offset):
            if words_similarity(ann['term'], row['cand_match']) >= LOW_SIMILARITY_THRESHOLD:
                found_label_for_row = True
                best_match = ann

    yi = 1 if found_label_for_row else 0
    return yi, best_match
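spans_match is not shown either; a sketch assuming it tolerates a small offset slack, in line with the abs(...) <= 2 checks in the list-comparison examples below:

def spans_match(ann, row_start_offset, row_end_offset):
    # Assumed: spans may drift by a couple of characters (tokenization)
    # and still count as the same span.
    return (abs(ann['start_offset'] - row_start_offset) <= 2
            and abs(ann['end_offset'] - row_end_offset) <= 2)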
Example #12
def assert_terms_in_place(row):
    text = row['text']
    annotations = row[FINAL_LABELS_COL]
    predictions = row['prediction_labels']
    local_sim_thresh = 0.72

    for ann in annotations:
        assert words_similarity(text[ann['start_offset']:ann['end_offset']],
                                ann['term']) > local_sim_thresh

    not_inplace_terms = []
    for p in predictions:
        sim = words_similarity(text[p['start_offset']:p['end_offset']],
                               p['term'])
        if sim < local_sim_thresh:
            not_inplace_terms.append(p)
    if not_inplace_terms:
        # print(f"not_inplace_terms: {not_inplace_terms}")
        # print(f"anns: {[t['term'] for t in annotations]}")
        predictions = [p for p in predictions if p not in not_inplace_terms]

    return predictions
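A toy row (FINAL_LABELS_COL is the module's annotation column; the similarity scores assume a difflib-style ratio):

row = {'text': 'severe chest pain',
       FINAL_LABELS_COL: [
           {'term': 'chest pain', 'start_offset': 7, 'end_offset': 17}],
       'prediction_labels': [
           {'term': 'chest pain', 'start_offset': 7, 'end_offset': 17},
           {'term': 'headache', 'start_offset': 0, 'end_offset': 5}]}
assert_terms_in_place(row)
# keeps 'chest pain'; drops 'headache', whose span reads 'sever'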
Example #13
def get_items_in_both_lsts(l1, l2):
    in_both_lsts = []
    for ann in l1:
        found_pred = False
        for pred in l2:
            if words_similarity(ann['term'], pred['term']) > 0.8 and abs(
                    ann['start_offset'] - pred['start_offset']) <= 2:
                if ann['label'] == pred['label']:
                    found_pred = True
                    break
        if found_pred:
            in_both_lsts.append(ann)

    return in_both_lsts
Example #14
def get_all_occurences_of_match_in_text(self, match, txt_words):
    match_indexes = []
    NUMBER_OF_GRAMS = 3
    len_match = len(match.split(" "))
    ngrams = list(zip(*[txt_words[i:] for i in range(NUMBER_OF_GRAMS)]))
    if len(ngrams) > 0:
        # Pad the tail with 'PAD' grams so matches in the last two words
        # still get a full window.
        last_gram = ngrams[-1]
        extra_gram = last_gram[1], last_gram[2], 'PAD'
        ngrams.append(extra_gram)
        extra_gram_2 = last_gram[2], 'PAD', 'PAD'
        ngrams.append(extra_gram_2)

    for gram_idx, gram in enumerate(ngrams):
        cand_term = " ".join(gram[:len_match])
        if words_similarity(cand_term, match) > SIMILARITY_THRESHOLD:
            matches_with_idx = " ".join(txt_words[gram_idx:gram_idx + len_match])
            assert words_similarity(matches_with_idx, match) > SIMILARITY_THRESHOLD
            match_indexes.append(gram_idx)

    return match_indexes
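A toy call showing why the PAD grams exist (obj stands for an instance of the unshown class):

txt_words = "no sign of fever".split(" ")
obj.get_all_occurences_of_match_in_text("fever", txt_words)
# [3]: without the PAD grams the final word would never start a window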
Example #15
def get_items_in_left_lst_but_not_in_right_lst(l1, l2):
    in_l1_but_not_in_l2 = []
    for ann in l1:
        found_pred = False
        for pred in l2:
            if words_similarity(ann['term'], pred['term']) > 0.8 and abs(
                    ann['start_offset'] - pred['start_offset']) <= 2:
                if ann['label'] == pred['label']:
                    found_pred = True
                    break
        if not found_pred:
            in_l1_but_not_in_l2.append(ann)

    return in_l1_but_not_in_l2
Example #16
def leave_only_unmatched_labels(matches_for_filename, post_labeled_terms, file_name):
    for m in matches_for_filename:
        if m['best_match'] not in post_labeled_terms:
            most_similar_t = None
            best_sim = -1
            for t in post_labeled_terms:
                sim = words_similarity(t['term'], m['best_match']['term'])
                if sim >= LOW_SIMILARITY_THRESHOLD and sim > best_sim:
                    most_similar_t = t
                    best_sim = sim
            if most_similar_t:
                post_labeled_terms.remove(most_similar_t)
            else:
                print(f"*** Couldn't remove {m}, file_name: {file_name}")
        else:
            post_labeled_terms.remove(m['best_match'])
    return post_labeled_terms
Example #17
def calculate_could_be_matched(could_be_matched, high_recall_matches_lst, label_row, unmatched_labels):
    for l in unmatched_labels:
        l_could_be_matched = False
        possible_matches = []
        for m in high_recall_matches_lst:
            if words_similarity(l['term'], m['term']) > LOW_SIMILARITY_THRESHOLD:
                l_could_be_matched = True
                m['type'] = 'high_rec'
                possible_matches.append(m)
                could_be_matched += 1
        if l_could_be_matched:
            print_could_be_matched = False  # debug toggle; set True to trace matches
            if print_could_be_matched:
                print(f'l_could_be_matched, file_name: {label_row["file_name"]}')
                print(l)
                for s in possible_matches:
                    print(s)
                print()
    return could_be_matched