def LF_GiG_DOWNREGULATES(c):
    """
    Vote -1 when downregulation wording appears around the candidate pair.

    The pattern built from ``downregulates_identifiers`` is searched,
    ignoring case, in the right-hand token window (window=2) of c[0], then
    of c[1], and finally in the text between the two candidate members.

    keyword arguments:
    c - The candidate object to be labeled

    Returns -1 on the first match, otherwise 0.
    """
    pattern = ltp(downregulates_identifiers)
    # Each context is built lazily so later ones are only computed when the
    # earlier ones did not match.
    contexts = (
        lambda: " ".join(get_right_tokens(c[0], window=2)),
        lambda: " ".join(get_right_tokens(c[1], window=2)),
        lambda: get_text_between(c),
    )
    for build_context in contexts:
        if re.search(pattern, build_context(), flags=re.I):
            return -1
    return 0
def LF_GiG_ASSOCIATION(c):
    """
    Vote -1 when association wording appears around the candidate pair.

    The pattern built from ``association_identifiers`` is searched, ignoring
    case, in the right-hand token window (window=2) of c[0], then of c[1],
    and finally in the text between the two candidate members.

    keyword arguments:
    c - The candidate object to be labeled

    Returns -1 on the first match, otherwise 0.
    """
    pattern = ltp(association_identifiers)

    def _contexts():
        # Generator keeps the evaluation lazy and in the original order.
        yield " ".join(get_right_tokens(c[0], window=2))
        yield " ".join(get_right_tokens(c[1], window=2))
        yield get_text_between(c)

    matched = any(re.search(pattern, text, flags=re.I) for text in _contexts())
    return -1 if matched else 0
def LF_GiG_COMPOUND_IDENTIFICATIONS(c):
    """
    Vote -1 when compound-indication wording appears around the pair.

    The pattern built from ``compound_indications`` is searched, ignoring
    case, in the right-hand token window (window=2) of c[0], then of c[1],
    and finally in the text between the two candidate members.

    keyword arguments:
    c - The candidate object to be labeled

    Returns -1 on the first match, otherwise 0.
    """
    pattern = ltp(compound_indications)

    for member in (c[0], c[1]):
        right_text = " ".join(get_right_tokens(member, window=2))
        if re.search(pattern, right_text, flags=re.I):
            return -1

    if re.search(pattern, get_text_between(c), flags=re.I):
        return -1
    return 0
def LF_GiG_GENE_IDENTIFIERS(c):
    """
    Vote 1 when gene-identifier wording surrounds either candidate member.

    For c[0] and then c[1], the left and right token windows (window=5
    each) are concatenated into one space-separated string and searched,
    ignoring case, with the pattern built from ``gene_identifiers``.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(gene_identifiers)
    for member in (c[0], c[1]):
        surrounding = list(get_left_tokens(member, window=5))
        surrounding += list(get_right_tokens(member, window=5))
        if re.search(pattern, " ".join(surrounding), flags=re.I):
            return 1
    return 0
def LF_GiG_CELL_IDENTIFICATIONS(c):
    """
    Vote -1 when cell-indication wording surrounds either candidate member.

    For c[0] and then c[1], the left and right token windows (window=5
    each) are joined into one space-separated string and searched, ignoring
    case, with the pattern built from ``cell_indications``.

    keyword arguments:
    c - The candidate object to be labeled

    Returns -1 on the first match, otherwise 0.
    """
    pattern = ltp(cell_indications)
    # Both members must be inspected: the previous version built the second
    # window from c[0] again, so the context of c[1] was never checked.
    for gene in (c[0], c[1]):
        gene_tokens = list(get_left_tokens(gene, window=5))
        gene_tokens += list(get_right_tokens(gene, window=5))
        if re.search(pattern, " ".join(gene_tokens), flags=re.I):
            return -1
    return 0
def LF_CG_ANTIBODY(c):
    """
    This label function looks for the words "antibody" / "antibodies".

    Each keyword is checked first as a substring of the span text of c[1]
    and then (case-sensitively) in the right-hand token window (window=3)
    of c[1].

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first hit, otherwise 0.
    """
    for keyword in ("antibody", "antibodies"):
        if keyword in c[1].get_span():
            return 1
        if re.search(keyword, " ".join(get_right_tokens(c[1], window=3))):
            return 1
    return 0
def LF_GiG_BINDING_IDENTIFICATIONS(c):
    """
    Vote 1 when binding wording appears near or between the candidate pair.

    The pattern built from ``binding_identifiers`` is searched, ignoring
    case, in the combined left/right token windows (window=5 each) of c[0],
    then of c[1], and finally in the text between the two members.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(binding_identifiers)
    # Both members must be inspected: the previous version built the second
    # window from c[0] again, so the context of c[1] was never checked.
    for gene in (c[0], c[1]):
        gene_tokens = list(get_left_tokens(gene, window=5))
        gene_tokens += list(get_right_tokens(gene, window=5))
        if re.search(pattern, " ".join(gene_tokens), flags=re.I):
            return 1

    if re.search(pattern, get_text_between(c), flags=re.I):
        return 1
    return 0
def LF_other_verbs(c):
    """
    Vote 1 when a token from ``other_verbs`` is found and no negation is nearby.

    Token sources are tried in order: the tokens between the pair, the left
    windows (window=20) of c[0] and c[1], then the right windows
    (window=20) of c[0] and c[1]. A source only counts when
    ``neg_nearby(c)`` is falsy.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first qualifying source, otherwise 0.
    """
    # Lazily evaluated so sources are only fetched while nothing has matched.
    token_sources = (
        lambda: get_between_tokens(c),
        lambda: get_left_tokens(c[0], window=20),
        lambda: get_left_tokens(c[1], window=20),
        lambda: get_right_tokens(c[0], window=20),
        lambda: get_right_tokens(c[1], window=20),
    )
    for fetch_tokens in token_sources:
        if other_verbs.intersection(fetch_tokens()) and not neg_nearby(c):
            return 1
    return 0
def LF_isolate(c):
    """
    Vote 1 when a token from ``isolate`` is found and no negation is nearby.

    Token sources are tried in order: the tokens between the pair, the left
    windows (window=20) of c[0] and c[1], then the right windows
    (window=20) of c[0] and c[1]. A source only counts when
    ``neg_nearby(c)`` is falsy.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first qualifying source, otherwise 0.
    """
    def _token_sources():
        # Generator keeps the original order and laziness of the lookups.
        yield get_between_tokens(c)
        yield get_left_tokens(c[0], window=20)
        yield get_left_tokens(c[1], window=20)
        yield get_right_tokens(c[0], window=20)
        yield get_right_tokens(c[1], window=20)

    for tokens in _token_sources():
        if isolate.intersection(tokens) and not neg_nearby(c):
            return 1
    return 0
def LF_positive(c):
    """
    Vote 1 when a token from ``positive`` is found and no negation is nearby.

    Token sources are tried in order: the tokens between the pair, then for
    the left helper and the right helper in turn, the window (window=20)
    around c[0] and c[1]. A source only counts when ``neg_nearby(c)`` is
    falsy.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first qualifying source, otherwise 0.
    """
    if positive.intersection(get_between_tokens(c)) and not neg_nearby(c):
        return 1

    # Same order as before: left of c[0], left of c[1], right of c[0],
    # right of c[1].
    for fetch_tokens in (get_left_tokens, get_right_tokens):
        for member_index in (0, 1):
            nearby = fetch_tokens(c[member_index], window=20)
            if positive.intersection(nearby) and not neg_nearby(c):
                return 1
    return 0
def LF_DG_GENETIC_ABNORMALITIES(c):
    """
    This LF searches for key phrases that indicate a genetic abnormality.

    The pattern built from ``genetic_abnormalities`` is searched, ignoring
    case, in the text between the pair, then in the combined left windows
    (window=10) of c[0] and c[1], then in their combined right windows.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(genetic_abnormalities)
    if re.search(pattern, get_text_between(c), flags=re.I):
        return 1

    # Join the combined token list with single spaces. Concatenating two
    # already-joined strings glued the last token of the first window to the
    # first token of the second one.
    left_window = " ".join(
        list(get_left_tokens(c[0], window=10))
        + list(get_left_tokens(c[1], window=10))
    )
    if re.search(pattern, left_window, flags=re.I):
        return 1

    right_window = " ".join(
        list(get_right_tokens(c[0], window=10))
        + list(get_right_tokens(c[1], window=10))
    )
    if re.search(pattern, right_window, flags=re.I):
        return 1
    return 0
def LF_DaG_DISEASE_SAMPLE(c):
    """
    This LF is designed to look for key phrases that indicate a sentence
    talking about tissue samples, e.g. cell lines.

    The pattern built from ``disease_sample_indicators`` is searched,
    ignoring case, in the combined left windows (window=10) of c[0] and
    c[1], then in their combined right windows.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(disease_sample_indicators)

    # Join the combined token list with single spaces. Concatenating two
    # already-joined strings glued the last token of the first window to the
    # first token of the second one.
    left_window = " ".join(
        list(get_left_tokens(c[0], window=10))
        + list(get_left_tokens(c[1], window=10))
    )
    if re.search(pattern, left_window, flags=re.I):
        return 1

    right_window = " ".join(
        list(get_right_tokens(c[0], window=10))
        + list(get_right_tokens(c[1], window=10))
    )
    if re.search(pattern, right_window, flags=re.I):
        return 1
    return 0
def LF_DaG_CELLULAR_ACTIVITY(c):
    """
    This LF is designed to look for key phrases that indicate activity
    within a cell, e.g. positive immunostaining for an experiment.

    The pattern built from ``cellular_activity`` is searched, ignoring
    case, in the tagged text of the candidate, then in the combined left
    windows (window=10) of c[0] and c[1], then in their combined right
    windows.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(cellular_activity)
    if re.search(pattern, get_tagged_text(c), flags=re.I):
        return 1

    # Join the combined token list with single spaces. Concatenating two
    # already-joined strings glued the last token of the first window to the
    # first token of the second one.
    left_window = " ".join(
        list(get_left_tokens(c[0], window=10))
        + list(get_left_tokens(c[1], window=10))
    )
    if re.search(pattern, left_window, flags=re.I):
        return 1

    right_window = " ".join(
        list(get_right_tokens(c[0], window=10))
        + list(get_right_tokens(c[1], window=10))
    )
    if re.search(pattern, right_window, flags=re.I):
        return 1
    return 0
def LF_DEBUG(c):
    """
    Debugging aid: dump what the helper functions see for a candidate.

    Prints the candidate, its left/right/between tokens, the tagged text
    (plus the result of a sample "is a" regex on it), the text between the
    pair with its length, and the parent of the candidate.

    keyword arguments:
    c - The candidate object to be labeled

    Always returns 0.
    """
    def _heading(title):
        # Blank separator line followed by the section title.
        print()
        print(title)

    print(c)

    _heading("Left Tokens")
    print(list(get_left_tokens(c[0], window=5)))

    _heading("Right Tokens")
    print(list(get_right_tokens(c[0])))

    _heading("Between Tokens")
    print(list(get_between_tokens(c)))

    _heading("Tagged Text")
    print(get_tagged_text(c))
    print(re.search(r'{{B}} .* is a .* {{A}}', get_tagged_text(c)))

    _heading("Get between Text")
    print(get_text_between(c))
    print(len(get_text_between(c)))

    _heading("Parent Text")
    print(c.get_parent())
    print()
    return 0
def LF_gene(c):
    """
    Vote 1 if the token "gene" appears next to the second candidate member.

    Both the left and the right token windows of c[1] are consulted, using
    the helpers' default window size.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 when "gene" is found, otherwise 0.
    """
    near_gene_word = (
        "gene" in get_left_tokens(c[1])
        or "gene" in get_right_tokens(c[1])
    )
    return 1 if near_gene_word else 0
def LF_CD_CHECK_DEPRESSION_USAGE(c):
    """
    Vote -1 when a "depress..." span of c[1] sits next to wording from
    ``incorrect_depression_indication`` around c[0].

    Only applies when the span text of c[1] contains "depress". The pattern
    is then searched, ignoring case, in the left window and afterwards the
    right window (window=5 each) of c[0].

    keyword arguments:
    c - The candidate object to be labeled

    Returns -1 on the first match, otherwise 0.
    """
    if "depress" not in c[1].get_span():
        return 0

    pattern = ltp(incorrect_depression_indication)
    for fetch_tokens in (get_left_tokens, get_right_tokens):
        nearby_text = " ".join(fetch_tokens(c[0], window=5))
        if re.search(pattern, nearby_text, flags=re.I):
            return -1
    return 0
def LF_CD_TREATS(c):
    """
    Vote 1 when wording from ``treat_indication`` appears near the pair.

    The pattern is searched, ignoring case, in the text between the pair,
    then in the left window and the right window (window=5 each) of c[0].

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(treat_indication)
    # Built lazily so later contexts are only computed when needed.
    contexts = (
        lambda: get_text_between(c),
        lambda: " ".join(get_left_tokens(c[0], window=5)),
        lambda: " ".join(get_right_tokens(c[0], window=5)),
    )
    found = any(re.search(pattern, build(), flags=re.I) for build in contexts)
    return 1 if found else 0
def LF_neg_words(c):
    """
    Vote -1 if a token from ``neg_words`` appears just before or after the
    second candidate member.

    The left window and then the right window (window=3 each) of c[1] are
    intersected with ``neg_words``.

    keyword arguments:
    c - The candidate object to be labeled

    Returns -1 on the first non-empty intersection, otherwise 0.
    """
    for fetch_tokens in (get_left_tokens, get_right_tokens):
        if neg_words.intersection(fetch_tokens(c[1], window=3)):
            return -1
    return 0
def LF_DaG_NO_ASSOCIATION(c):
    """
    This LF is designed to test if there is a key phrase that suggests a
    d-g pair is not an association.

    Returns 0 straight away when ``LF_DG_METHOD_DESC(c)`` or
    ``LF_DG_TITLE(c)`` is truthy. Otherwise the pattern built from
    ``no_direct_association`` is searched, ignoring case, in the text
    between the pair, then in the combined left windows (window=10) of c[0]
    and c[1], then in their combined right windows.

    keyword arguments:
    c - The candidate object to be labeled

    Returns -1 on the first match, otherwise 0.
    """
    # Check the guard before building any window text.
    if LF_DG_METHOD_DESC(c) or LF_DG_TITLE(c):
        return 0

    pattern = ltp(no_direct_association)
    if re.search(pattern, get_text_between(c), flags=re.I):
        return -1

    # Join the combined token list with single spaces. Concatenating two
    # already-joined strings glued the last token of the first window to the
    # first token of the second one.
    left_window = " ".join(
        list(get_left_tokens(c[0], window=10))
        + list(get_left_tokens(c[1], window=10))
    )
    if re.search(pattern, left_window, flags=re.I):
        return -1

    right_window = " ".join(
        list(get_right_tokens(c[0], window=10))
        + list(get_right_tokens(c[1], window=10))
    )
    if re.search(pattern, right_window, flags=re.I):
        return -1
    return 0
def LF_DaG_WEAK_ASSOCIATION(c):
    """
    This label function is designed to search for phrases that indicate a
    weak association between the disease and gene.

    Returns 0 straight away when ``LF_DG_METHOD_DESC(c)`` or
    ``LF_DG_TITLE(c)`` is truthy. Otherwise the pattern built from
    ``weak_association`` is searched, ignoring case, in the text between
    the pair, then in the combined left windows (window=10) of c[0] and
    c[1], then in their combined right windows.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    # Check the guard before building any window text.
    if LF_DG_METHOD_DESC(c) or LF_DG_TITLE(c):
        return 0

    pattern = ltp(weak_association)
    if re.search(pattern, get_text_between(c), flags=re.I):
        return 1

    # Join the combined token list with single spaces. Concatenating two
    # already-joined strings glued the last token of the first window to the
    # first token of the second one.
    left_window = " ".join(
        list(get_left_tokens(c[0], window=10))
        + list(get_left_tokens(c[1], window=10))
    )
    if re.search(pattern, left_window, flags=re.I):
        return 1

    right_window = " ".join(
        list(get_right_tokens(c[0], window=10))
        + list(get_right_tokens(c[1], window=10))
    )
    if re.search(pattern, right_window, flags=re.I):
        return 1
    return 0
def LF_variation(c):
    """
    Vote 1 if a variation keyword is in close proximity to the second
    candidate member.

    The left window and then the right window of c[1] (helper default
    window size) are intersected with ``variation_words``.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first non-empty intersection, otherwise 0.
    """
    for fetch_tokens in (get_left_tokens, get_right_tokens):
        if variation_words.intersection(fetch_tokens(c[1])):
            return 1
    return 0
def LF_DaG_ASSOCIATION(c):
    """
    This LF is designed to test if there is a key phrase that suggests a
    d-g pair is an association.

    Returns 0 straight away when ``LF_DG_METHOD_DESC(c)`` or
    ``LF_DG_TITLE(c)`` is truthy, or when the combined left windows
    (window=10) of c[0] and c[1] contain the standalone word "not" or "no".
    Otherwise the pattern built from ``direct_association`` (rejected when
    directly preceded by "not " or "no ") is searched, ignoring case, in
    the text between the pair, the combined left windows and the combined
    right windows.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    if LF_DG_METHOD_DESC(c) or LF_DG_TITLE(c):
        return 0

    # Join the combined token list with single spaces. Concatenating two
    # already-joined strings could turn a trailing "not" followed by "the"
    # into "notthe", hiding the negation from the word-boundary check below.
    left_window = " ".join(
        list(get_left_tokens(c[0], window=10))
        + list(get_left_tokens(c[1], window=10))
    )
    right_window = " ".join(
        list(get_right_tokens(c[0], window=10))
        + list(get_right_tokens(c[1], window=10))
    )

    # A negation word anywhere in the left context vetoes every positive vote.
    if re.search(r'\b(not|no)\b', left_window, flags=re.I):
        return 0

    pattern = r'(?<!not )(?<!no )' + ltp(direct_association)
    for text in (get_text_between(c), left_window, right_window):
        if re.search(pattern, text, flags=re.I):
            return 1
    return 0
def LF_CG_DOWNREGULATES(c):
    """
    This label function looks for phrases that could imply a compound
    decreasing the activity of a gene/protein.

    ``downregulates`` is first searched as a pattern (ignoring case) in the
    text between the pair, and then intersected with the right-hand token
    window (window=2) of c[1].

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 when either check succeeds, otherwise 0.
    """
    phrase_between = re.search(ltp(downregulates), get_text_between(c), flags=re.I)
    if phrase_between:
        return 1

    tokens_after_gene = get_right_tokens(c[1], window=2)
    return 1 if downregulates.intersection(tokens_after_gene) else 0
def LF_CG_GENE_RECEIVERS(c):
    """
    This label function looks for phrases that imply a kinase or some sort
    of protein that receives a stimulus to function.

    The pattern built from ``gene_receivers`` is searched in the right and
    then the left token window (window=4) of c[1] without any regex flags,
    and finally in the span text of c[1] ignoring case.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(gene_receivers)
    gene = c[1]

    # The neighbouring windows are searched without re.I, unlike the span.
    for fetch_tokens in (get_right_tokens, get_left_tokens):
        if re.search(pattern, " ".join(fetch_tokens(gene, window=4))):
            return 1

    if re.search(pattern, gene.get_span(), flags=re.I):
        return 1
    return 0
def LF_neg_assertions(c):
    """
    Vote -1 when a token from ``negative`` appears around the candidate pair.

    Token sources are tried in order: the tokens between the pair, the left
    window of c[0] (window=10), the left window of c[1] (window=20) and the
    right window of c[0] (window=20).

    keyword arguments:
    c - The candidate object to be labeled

    Returns -1 on the first non-empty intersection, otherwise 0.
    """
    token_sources = (
        lambda: get_between_tokens(c),
        lambda: get_left_tokens(c[0], window=10),
        lambda: get_left_tokens(c[1], window=20),
        lambda: get_right_tokens(c[0], window=20),
        # The right window of c[1] is not consulted; that check was
        # commented out in the original implementation.
    )
    if any(negative.intersection(fetch_tokens()) for fetch_tokens in token_sources):
        return -1
    return 0
def LF_CD_COMPOUND_INDICATION(c):
    """
    Vote 1 when wording from ``compound_indications`` appears near the pair.

    The pattern is searched, ignoring case, in the text between the pair,
    then in the left window and the right window (window=5 each) of c[0].

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(compound_indications)

    if re.search(pattern, get_text_between(c), flags=re.I):
        return 1

    for fetch_tokens in (get_left_tokens, get_right_tokens):
        nearby_text = " ".join(fetch_tokens(c[0], window=5))
        if re.search(pattern, nearby_text, flags=re.I):
            return 1
    return 0
def LF_CD_PALLIATES(c):
    """
    Vote 1 when wording from ``palliates_indication`` appears near the pair.

    The pattern is searched, ignoring case, in the text between the pair,
    then in the left window and the right window (window=5 each) of c[0].

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(palliates_indication)

    def _contexts():
        # Generator keeps the evaluation lazy and in the original order.
        yield get_text_between(c)
        yield " ".join(get_left_tokens(c[0], window=5))
        yield " ".join(get_right_tokens(c[0], window=5))

    for text in _contexts():
        if re.search(pattern, text, flags=re.I):
            return 1
    return 0
def LF_CD_WEAKLY_TREATS(c):
    """
    Vote 1 when wording from ``weak_treatment_indications`` appears near
    the pair.

    The pattern is searched, ignoring case, in the text between the pair,
    then in the left window and the right window (window=5 each) of c[0].

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 on the first match, otherwise 0.
    """
    pattern = ltp(weak_treatment_indications)
    # Built lazily so later contexts are only computed when needed.
    contexts = (
        lambda: get_text_between(c),
        lambda: " ".join(get_left_tokens(c[0], window=5)),
        lambda: " ".join(get_right_tokens(c[0], window=5)),
    )
    matched = any(re.search(pattern, build(), flags=re.I) for build in contexts)
    return 1 if matched else 0
def LF_DEBUG(C):
    """
    Debugging aid: dump what the helper functions see for a candidate.

    Prints the left/right/between tokens, the tagged text (plus the result
    of a sample "is a" regex on it), the text between the pair with its
    length, and the parent of the candidate.

    keyword arguments:
    C - The candidate object to be labeled

    Always returns 0.
    """
    # The signature spells the parameter "C" while the body used "c", which
    # raised a NameError unless a global "c" happened to exist. Alias it
    # rather than renaming the public parameter.
    c = C

    # print("...") with exactly one argument behaves the same as a Python 2
    # print statement and as the Python 3 function; print("") stands in for
    # a bare blank-line print on both.
    print("Left Tokens")
    # list() materializes the tokens in case the helper returns a lazy iterable.
    print(list(get_left_tokens(c, window=3)))
    print("")
    print("Right Tokens")
    print(list(get_right_tokens(c)))
    print("")
    print("Between Tokens")
    print(list(get_between_tokens(c)))
    print("")
    print("Tagged Text")
    print(get_tagged_text(c))
    print(re.search(r'{{B}} .* is a .* {{A}}', get_tagged_text(c)))
    print("")
    print("Get between Text")
    print(get_text_between(c))
    print(len(get_text_between(c)))
    print("")
    print("Parent Text")
    print(c.get_parent())
    print("")
    return 0
def LF_DG_IS_BIOMARKER(c):
    """
    This label function examines a sentence to determine whether it is
    talking about a biomarker.

    Returns 0 straight away when ``LF_DG_METHOD_DESC(c)`` or
    ``LF_DG_TITLE(c)`` is truthy. Otherwise the pattern built from
    ``biomarker_indicators`` is searched, ignoring case, in the left window
    and then the right window (window=10 each) of c[1].

    keyword arguments:
    c - The candidate object being passed in

    Returns 1 on the first match, otherwise 0.
    """
    if LF_DG_METHOD_DESC(c) or LF_DG_TITLE(c):
        return 0

    pattern = ltp(biomarker_indicators)
    for fetch_tokens in (get_left_tokens, get_right_tokens):
        nearby_text = " ".join(fetch_tokens(c[1], window=10))
        if re.search(pattern, nearby_text, flags=re.I):
            return 1
    return 0
def _get_search_func(self, c):
    """
    Enumerate the token search space for pattern matching.

    The space is selected by ``self.search``:
      - "sentence": the ``self.attrib`` entry of the parent's ``__dict__``
      - "between":  the whitespace-split text between the two spans
      - "left":     left tokens of whichever span starts first
      - "right":    right tokens of whichever span starts last

    :param c: candidate whose first two members expose ``char_start``
    :return: the selected tokens, or None for any other ``self.search`` value
    """
    mode = self.search

    if mode == "sentence":
        return c.get_parent().__dict__[self.attrib]

    if mode == "between":
        return get_text_between(c).strip().split()

    if mode == "left":
        # Left-most span; on equal offsets c[1] is used.
        leftmost = c[0] if c[0].char_start < c[1].char_start else c[1]
        return get_left_tokens(leftmost, window=self.window, attrib=self.attrib)

    if mode == "right":
        # Right-most span; on equal offsets c[1] is used.
        rightmost = c[0] if c[0].char_start > c[1].char_start else c[1]
        return get_right_tokens(rightmost, window=self.window, attrib=self.attrib)

    return None
def LF_and_married(c):
    """
    Vote 1 when 'and' occurs between the pair and 'married' occurs in the
    right-hand tokens of the candidate.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 when both conditions hold, otherwise 0.
    """
    # Guard clause: the right-hand tokens are only consulted when 'and' is
    # present between the pair.
    if 'and' not in get_between_tokens(c):
        return 0
    if 'married' in get_right_tokens(c):
        return 1
    return 0