def no_longer_match_that_contains_m_in_same_span(self, matches, m, row):
    curr_len = len(m['umls_match'].split(" "))
    for other_m in matches:
        if other_m['curr_occurence_offset'] is None:
            print("other_m['curr_occurence_offset'] is None")
            continue
        other_len = len(other_m['umls_match'].split(" "))
        if other_len > curr_len:
            # Compare the prefix of the longer match against the current match.
            other_m_with_same_len_as_curr = " ".join(other_m['umls_match'].split(" ")[:curr_len])
            if len(other_m['umls_match'].split(" ")[curr_len]) >= 3 and words_similarity(
                    other_m_with_same_len_as_curr, m['umls_match']) > 0.88:
                other_span_contains_curr = self.does_other_span_contains_curr(m, other_m, row['file_name'])
                if other_span_contains_curr:
                    return False
            # Compare the suffix of the longer match against the current match.
            other_m_with_same_len_as_curr = " ".join(other_m['umls_match'].split(" ")[curr_len:])
            if len(other_m['umls_match'].split(" ")[curr_len]) >= 3 and words_similarity(
                    other_m_with_same_len_as_curr, m['umls_match']) > 0.88:
                other_span_contains_curr = self.does_other_span_contains_curr(m, other_m, row['file_name'])
                if other_span_contains_curr:
                    return False
    return True
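# `words_similarity` is called throughout this module but defined elsewhere. As a point of
# reference only, a minimal sketch of the assumed behaviour (a normalized string-similarity
# score in [0, 1]) could look like the following; this is an illustration, not necessarily
# the project's actual implementation.
from difflib import SequenceMatcher

def words_similarity_sketch(a, b):
    # Ratio of matching characters between the two strings, in [0, 1].
    return SequenceMatcher(None, a, b).ratio()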
def get_best_match(relevant_labeled_df, row, final_label_col):
    best_match = None
    best_match_sim = 0
    row_start_offset = row['curr_occurence_offset']
    row_end_offset = row_start_offset + len(row['cand_match'])
    for match_data in relevant_labeled_df[final_label_col]:
        term = match_data['term']
        sim1 = words_similarity(term, row['cand_match'])
        sim2 = words_similarity(term, row['umls_match'])
        higher_sim = max(sim1, sim2)
        # Keep the highest-similarity annotation whose span matches the candidate's span.
        if higher_sim > LOW_SIMILARITY_THRESHOLD and higher_sim > best_match_sim \
                and spans_match_lemma(match_data, row_start_offset, row_end_offset):
            best_match_sim = higher_sim
            best_match = match_data
    return best_match, best_match_sim
def find_occs_offsets_in_txt(cand_match_occ_in_key_txt, cand_term, post_txt):
    txt_words = post_txt.split(" ")
    cand_match_occ_in_key_txt_len = len(cand_match_occ_in_key_txt.split(" "))
    all_match_occ = []
    for i in range(len(txt_words)):
        relevant_words = txt_words[i:i + cand_match_occ_in_key_txt_len]
        # If the window hit an empty token (double space), extend it by one extra word.
        if len(relevant_words) > 1 and (relevant_words[1] == '' or relevant_words[1] == ' ') \
                and len(txt_words) > i + cand_match_occ_in_key_txt_len:
            relevant_words.append(txt_words[i + cand_match_occ_in_key_txt_len])
        relevant_term = " ".join(relevant_words)
        if words_similarity(relevant_term, cand_term) > 0.82 \
                or words_similarity(relevant_term, cand_match_occ_in_key_txt) > 0.82:
            # Character offset of the first occurrence of the matched term in the text.
            occurence_offset = post_txt.index(relevant_term)
            all_match_occ.append(occurence_offset)
    return all_match_occ
def are_all_instances_of_term_are_contained(self, m, other_m, row):
    txt_words = row['tokenized_text'].split(" ")
    indexes_with_term = []
    for w_idx, w in enumerate(txt_words):
        if words_similarity(w, m['umls_match']) > SIMILARITY_THRESHOLD:
            indexes_with_term.append(w_idx)
    # other_m is expected to be a two-word match; check whether every occurrence of the
    # single-word term is immediately followed by the container's second word.
    first_container_w, second_container_w = other_m['umls_match'].split(" ")
    all_instances_of_term_are_contained = all(
        words_similarity(txt_words[i + 1], second_container_w) > SIMILARITY_THRESHOLD
        for i in indexes_with_term)
    # if all_instances_of_term_are_contained:
    #     print(f"all_instances_of_term_are_contained: {m['umls_match']}, {other_m['umls_match']}")
    return all_instances_of_term_are_contained
def find_possible_cand_matches_occurence_in_txt_by_lemma(cand_term, row):
    match_occs_in_txt = []
    cand_term_len = len(cand_term.split(" "))
    if cand_term_len == 1:
        # Single-word candidate: compare against each lemma and collect the surface word.
        for l, w in [x.split(" ") for x in row['words_and_lemmas']]:
            if words_similarity(cand_term, l) > SIMILARITY_THRESHOLD:
                match_occs_in_txt.append(w)
    else:
        # Multi-word candidate: slide a window over the lemma sequence and collect the
        # corresponding surface words when the lemma sequence matches.
        for i in range(len(row['words_and_lemmas']) - cand_term_len + 1):
            relevant_words_and_lemmas = row['words_and_lemmas'][i:i + cand_term_len]
            words_cand = " ".join([w.split(" ")[1] for w in relevant_words_and_lemmas])
            lemmas_cand = " ".join([w.split(" ")[0] for w in relevant_words_and_lemmas])
            if words_similarity(lemmas_cand, cand_term) > SIMILARITY_THRESHOLD:
                match_occs_in_txt.append(words_cand)
    return match_occs_in_txt
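# Illustrative note (assumed data layout): each entry of row['words_and_lemmas'] appears to be
# a single "lemma word" string, so split(" ")[0] is the lemma and split(" ")[1] is the surface
# word. A hypothetical example:
#   row['words_and_lemmas'] = ["walk walking", "dog dogs"]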
def match_not_similar_to_short_match(self, match, all_short_matches):
    similar = False
    for m in all_short_matches:
        if words_similarity(m, match) > SIMILARITY_THRESHOLD:
            similar = True
            break
    return not similar
def find_umls_match_fast(msg_txt, searcher, row, msg_key_lang):
    if msg_txt == "":
        return []
    ngrams = prepare_msg_ngrams(msg_txt)
    all_matches_found = []
    for gram in ngrams:
        for i in range(1, NUMBER_OF_GRAMS + 1):
            # Single-word candidates use stricter thresholds than multi-word ones.
            low_similarity_threshold = LOW_SINGLE_WORD_SIMILARITY_THRESHOLD if i == 1 else LOW_MULTI_WORD_SIMILARITY_THRESHOLD
            up_similarity_threshold = UP_SINGLE_WORD_SIMILARITY_THRESHOLD if i == 1 else UP_MULTI_WORD_SIMILARITY_THRESHOLD
            cand_term = " ".join(gram[:i])
            if term_is_exception(i, cand_term):
                continue
            search_result = searcher.ranked_search(cand_term, low_similarity_threshold)
            if search_result != []:
                cosine_sim, umls_match = search_result[0]  # Cosine-Sim. I can demand another sim
                sim = words_similarity(umls_match, cand_term)
                if is_good_match(sim, umls_match, i, up_similarity_threshold):
                    all_matches_found = add_match_data(all_matches_found, cand_term, msg_key_lang, row, sim, umls_match)
    all_matches_found_with_full_occs = get_matches_with_full_occs(all_matches_found)
    return all_matches_found_with_full_occs
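# `prepare_msg_ngrams` is defined elsewhere. Based on how its output is consumed above
# (each gram is sliced with gram[:i] for i up to NUMBER_OF_GRAMS), it is assumed to yield
# rolling NUMBER_OF_GRAMS-word windows over the message, padded at the tail, similar to the
# construction in get_all_occurences_of_match_in_text below. A hedged sketch:
def prepare_msg_ngrams_sketch(msg_txt, n=3):
    words = msg_txt.split(" ")
    # Pad so the last words still start a full-size window.
    padded = words + ['PAD'] * (n - 1)
    return [tuple(padded[i:i + n]) for i in range(len(words))]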
def find_match_occ_for_cand_term_and_text(cand_term, match_occs_in_txt, tokenized_words_lst):
    cand_term_len = len(cand_term.split(" "))
    for i in range(len(tokenized_words_lst) - cand_term_len + 1):
        relevant_words = tokenized_words_lst[i:i + cand_term_len]
        words_cand = " ".join(relevant_words)
        if words_similarity(words_cand, cand_term) > SIMILARITY_THRESHOLD:
            match_occs_in_txt.append(words_cand)
    return match_occs_in_txt
def try_to_find_with_next_term(all_possible_word_rows, cand_match, df, match_len):
    best_idx_with_next_word = None
    best_sim = 0
    for idx in all_possible_word_rows.index:
        term_with_next_word = " ".join(df['word'].loc[idx:idx + match_len - 1])
        sim = words_similarity(term_with_next_word, cand_match)
        if sim > SIMILARITY_THRESHOLD and sim > best_sim:
            best_sim = sim
            best_idx_with_next_word = idx
    return best_idx_with_next_word
def get_correct_row_from_df(df, cand_match, umls_match, row):
    number_of_match = row['all_match_occ'].index(row['curr_occurence_offset'])
    cand_match_parts = cand_match.split(" ")
    match_len = len(cand_match_parts)
    cand_match_first_term = cand_match_parts[0]
    all_possible_word_rows = df[df['word'].apply(
        lambda x: words_similarity(x, cand_match_first_term) > SIMILARITY_THRESHOLD)]
    if len(row['all_match_occ']) == len(all_possible_word_rows):
        # One token row per known occurrence: pick the row by occurrence index.
        correct_row = all_possible_word_rows.iloc[number_of_match]
    else:
        all_possible_lemma_rows = df[df['lemma'].apply(
            lambda x: words_similarity(x, cand_match_first_term) > SIMILARITY_THRESHOLD)]
        if len(row['all_match_occ']) == len(all_possible_lemma_rows):
            correct_row = all_possible_lemma_rows.iloc[number_of_match]
        else:
            # Fall back to prefix matching, then next-word matching, and finally to the
            # single token most similar to the candidate's first word.
            try:
                best_idx_with_prefix = try_to_find_with_prefix(all_possible_word_rows, cand_match, df, match_len)
            except Exception as ex:
                best_idx_with_prefix = None
            if best_idx_with_prefix:
                correct_row = df.loc[best_idx_with_prefix]
            else:
                best_idx_with_next_word = try_to_find_with_next_term(all_possible_word_rows, cand_match, df, match_len)
                if best_idx_with_next_word:
                    correct_row = df.loc[best_idx_with_next_word]
                else:
                    global number_of_sorts
                    number_of_sorts += 1
                    df['word_sim_to_cand_match'] = df['word'].apply(
                        lambda x: words_similarity(x, cand_match_first_term))
                    df.sort_values(by='word_sim_to_cand_match', ascending=False, inplace=True)
                    correct_row = df.iloc[0]
    return correct_row
def get_yi_for_cand_match(label_row, row):
    row_start_offset = row['curr_occurence_offset']
    row_end_offset = row['curr_occurence_offset'] + len(row['cand_match'])
    found_label_for_row = False
    best_match = None
    for ann in label_row[FINAL_LABELS_COL]:
        # A candidate is positive if some gold annotation matches its span and its term.
        if spans_match(ann, row_start_offset, row_end_offset):
            if words_similarity(ann['term'], row['cand_match']) >= LOW_SIMILARITY_THRESHOLD:
                found_label_for_row = True
                best_match = ann
    yi = 1 if found_label_for_row else 0
    return yi, best_match
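# `spans_match` is defined elsewhere. A plausible, assumed behaviour is a span-alignment test
# with a small offset tolerance (hypothetical sketch, not the project's actual rule):
def spans_match_sketch(ann, row_start_offset, row_end_offset, tolerance=2):
    return (abs(ann['start_offset'] - row_start_offset) <= tolerance
            and abs(ann['end_offset'] - row_end_offset) <= tolerance)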
def assert_terms_in_place(row):
    text = row['text']
    annotations = row[FINAL_LABELS_COL]
    predictions = row['prediction_labels']
    local_sim_thresh = 0.72
    # Gold annotations must point at the right span of the text.
    for ann in annotations:
        assert words_similarity(text[ann['start_offset']:ann['end_offset']], ann['term']) > local_sim_thresh
    # Drop predictions whose offsets do not line up with their term.
    not_inplace_terms = []
    for p in predictions:
        sim = words_similarity(text[p['start_offset']:p['end_offset']], p['term'])
        if sim < local_sim_thresh:
            not_inplace_terms.append(p)
    if len(not_inplace_terms):
        # print(f"not_inplace_terms: {not_inplace_terms}")
        # print(f"anns: {[t['term'] for t in annotations]}")
        predictions = [p for p in predictions if p not in not_inplace_terms]
    return predictions
def get_items_in_both_lsts(l1, l2):
    in_both_lsts = []
    for ann in l1:
        found_pred = False
        for pred in l2:
            if words_similarity(ann['term'], pred['term']) > 0.8 \
                    and abs(ann['start_offset'] - pred['start_offset']) <= 2:
                if ann['label'] == pred['label']:
                    found_pred = True
                    break
        if found_pred:
            in_both_lsts.append(ann)
    return in_both_lsts
def get_all_occurences_of_match_in_text(self, match, txt_words):
    match_indexes = []
    NUMBER_OF_GRAMS = 3
    len_match = len(match.split(" "))
    # Build rolling 3-word windows over the text, padding the tail with 'PAD' so the last
    # one or two words still start a full-size window.
    ngrams = list(zip(*[txt_words[i:] for i in range(NUMBER_OF_GRAMS)]))
    if len(ngrams) > 0:
        last_gram = ngrams[-1]
        extra_gram = last_gram[1], last_gram[2], 'PAD'
        ngrams.append(extra_gram)
        extra_gram_2 = last_gram[2], 'PAD', 'PAD'
        ngrams.append(extra_gram_2)
    for gram_idx, gram in enumerate(ngrams):
        cand_term = " ".join(gram[:len_match])
        if words_similarity(cand_term, match) > SIMILARITY_THRESHOLD:
            matches_with_idx = " ".join(txt_words[gram_idx:gram_idx + len_match])
            assert words_similarity(matches_with_idx, match) > SIMILARITY_THRESHOLD
            match_indexes.append(gram_idx)
    return match_indexes
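# Illustrative example of the padded 3-gram construction above: for the hypothetical input
#   txt_words = ["chronic", "back", "pain", "relief"]
# the zip produces ("chronic", "back", "pain") and ("back", "pain", "relief"), and the two
# extra padded grams are ("pain", "relief", "PAD") and ("relief", "PAD", "PAD"), so every
# word index can start a candidate window.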
def get_items_in_left_lst_but_not_in_right_lst(l1, l2):
    in_l1_but_not_in_l2 = []
    for ann in l1:
        found_pred = False
        for pred in l2:
            if words_similarity(ann['term'], pred['term']) > 0.8 \
                    and abs(ann['start_offset'] - pred['start_offset']) <= 2:
                if ann['label'] == pred['label']:
                    found_pred = True
                    break
        if not found_pred:
            in_l1_but_not_in_l2.append(ann)
    return in_l1_but_not_in_l2
def leave_only_unmatched_labels(matches_for_filename, post_labeled_terms, file_name):
    for m in matches_for_filename:
        if m['best_match'] not in post_labeled_terms:
            # The exact dict is not in the list; remove the most similar labeled term instead.
            most_similar_t = None
            best_sim = -1
            for t in post_labeled_terms:
                sim = words_similarity(t['term'], m['best_match']['term'])
                if sim >= LOW_SIMILARITY_THRESHOLD and sim > best_sim:
                    most_similar_t = t
                    best_sim = sim
            if most_similar_t:
                post_labeled_terms.remove(most_similar_t)
            else:
                print(f"*** Couldn't remove {m}, file_name: {file_name}")
        else:
            post_labeled_terms.remove(m['best_match'])
    return post_labeled_terms
def calculate_could_be_matched(could_be_matched, high_recall_matches_lst, label_row, unmatched_labels):
    for l in unmatched_labels:
        l_could_be_matched = False
        possible_matches = []
        for m in high_recall_matches_lst:
            if words_similarity(l['term'], m['term']) > LOW_SIMILARITY_THRESHOLD:
                l_could_be_matched = True
                m['type'] = 'high_rec'
                possible_matches.append(m)
                could_be_matched += 1
        if l_could_be_matched:
            # Debug flag; flip to True to inspect which high-recall matches could have
            # covered this unmatched label.
            print_could_be_matched = False
            if print_could_be_matched:
                print(f'l_could_be_matched, file_name: {label_row["file_name"]}')
                print(l)
                for s in possible_matches:
                    print(s)
                print()
    return could_be_matched