def LF_DEBUG(c):
    """
    Debugging aid: dump what the context helpers return for one candidate.

    This is not a real labeling rule. Feel free to ignore it.

    keyword arguments:
    c - The candidate object to be labeled

    Prints the candidate, its left/right/between tokens, its tagged text,
    the text between the mentions and the parent of the candidate, then
    always returns 0.
    """
    print(c, end="\n\n")

    print("Left Tokens")
    print(list(get_left_tokens(c[0], window=5)), end="\n\n")

    print("Right Tokens")
    print(list(get_right_tokens(c[0])), end="\n\n")

    print("Between Tokens")
    print(list(get_between_tokens(c)), end="\n\n")

    print("Tagged Text")
    print(get_tagged_text(c))
    # Sample pattern probe: shows the match object (or None) for "B ... is a ... A".
    print(re.search(r'{{B}} .* is a .* {{A}}', get_tagged_text(c)), end="\n\n")

    print("Get between Text")
    print(get_text_between(c))
    print(len(get_text_between(c)), end="\n\n")

    print("Parent Text")
    print(c.get_parent(), end="\n\n")

    return 0
def LF_DG_CONCLUSION_TITLE(c):
    """
    Flag sentences that look like the conclusion of an abstract.

    Some abstracts are written with an explicit "CONCLUSION:" heading.
    Returns 1 when the tagged text contains "CONCLUSION:" or "concluded"
    anywhere (both are case-sensitive substring checks), otherwise 0.
    """
    text = get_tagged_text(c)
    if "CONCLUSION:" in text:
        return 1
    if "concluded" in text:
        return 1
    return 0
def LF_CG_IN_SERIES(c):
    """
    Detect a mention that sits inside an enumeration of other genes or
    compounds.

    Returns -1 when the tagged text holds at least two commas and also the
    sequence ", and"; otherwise returns 0.
    """
    text = get_tagged_text(c)
    has_enough_commas = text.count(',') >= 2
    if has_enough_commas and re.search(', and', text):
        return -1
    return 0
def LF_DG_TITLE(c):
    """
    Look for phrases that indicate the sentence is a paper title.

    Returns -1 when a title-indication phrase appears at the very start or
    the very end of the tagged text (case-insensitive), otherwise 0.

    NOTE(review): a second LF_DG_TITLE is defined further down in this file;
    if both stay in the same module the later definition replaces this one.
    """
    text = get_tagged_text(c)
    title_pattern = ltp(title_indication)

    if re.search(r'^' + title_pattern, text, flags=re.I):
        return -1
    if re.search(title_pattern + r'$', text, flags=re.I):
        return -1
    return 0
def LF_DG_METHOD_DESC(c):
    """
    Look for phrases implying that the sentence describes an experimental
    design rather than a finding.

    Returns 0 immediately when the tagged text contains "we found"
    (case-sensitive); otherwise returns -1 if a method-indication phrase
    matches anywhere in the tagged text (case-insensitive), else 0.
    """
    text = get_tagged_text(c)

    # TODO FIX for words that change the sentence meaning from methods to results
    if "we found" in text:
        return 0

    return -1 if re.search(ltp(method_indication), text, flags=re.I) else 0
def LF_DG_TITLE(c):
    """
    Look for signals that the sentence is a paper title.

    Returns -1 when any of the following holds, checked in this order:
      * a title-indication phrase starts the tagged text, optionally
        preceded by an opening square bracket (with or without a space);
      * a title-indication phrase ends the tagged text;
      * the tagged text contains "(author's transl)";
      * a ":" token is among the tokens between the two mentions.
    Otherwise returns 0. The two phrase checks are case-insensitive.
    """
    text = get_tagged_text(c)
    title_pattern = ltp(title_indication)

    if re.search(r'^(\[|\[ )?' + title_pattern, text, flags=re.I):
        return -1
    if re.search(title_pattern + r'$', text, flags=re.I):
        return -1
    if "(author's transl)" in text:
        return -1
    if ":" in get_between_tokens(c):
        return -1
    return 0
def LF_DG_RISK(c):
    """
    Search for sentences that talk about risk of / risk for a disease.

    Returns 1 when the tagged text contains "risk of" or "risk for"
    (case-insensitive), otherwise 0.
    """
    risk_mention = re.search(r"risk (of|for)", get_tagged_text(c), flags=re.I)
    if risk_mention:
        return 1
    return 0
def LF_DG_NEGATIVE_DIRECTION(c):
    """
    Search for words that indicate a negative response or imply a
    down-regulating association.

    Returns 1 when either between-mention helper
    (rule_regex_search_btw_AB / rule_regex_search_btw_BA, defined elsewhere)
    reports a negative-direction phrase, or when such a phrase follows two
    mention placeholders in the tagged text. Otherwise returns 0.
    """
    between_pattern = r'.*' + ltp(negative_direction) + r'.*'

    # Both helpers are evaluated up front, as in the original list-based any().
    found_ab = rule_regex_search_btw_AB(c, between_pattern, 1)
    found_ba = rule_regex_search_btw_BA(c, between_pattern, 1)
    if found_ab or found_ba:
        return 1

    # Note: this last search is case-sensitive (no re.I flag).
    after_both = re.search(
        r'({{A}}|{{B}}).*({{A}}|{{B}}).*' + ltp(negative_direction),
        get_tagged_text(c))
    return 1 if after_both else 0
def LF_DG_IS_BIOMARKER(c):
    """
    Decide whether the sentence is talking about a biomarker (per the
    original author, a biomarker points towards a D-G association).

    c - The candidate object being passed in

    Returns 1 when a biomarker-indicator phrase appears before or after the
    {{B}} placeholder in the tagged text (case-insensitive), otherwise 0.
    """
    text = get_tagged_text(c)
    indicator = ltp(biomarker_indicators)

    candidate_patterns = (
        indicator + r".*{{B}}",   # indicator ... B
        r"{{B}}.*" + indicator,   # B ... indicator
    )
    for pattern in candidate_patterns:
        if re.search(pattern, text, flags=re.I):
            return 1
    return 0
def LF_DG_DIAGNOSIS(c):
    """
    Search for words that imply a patient diagnosis, which the original
    author treats as evidence for a possible disease-gene association.

    Returns 1 when either between-mention helper
    (rule_regex_search_btw_AB / rule_regex_search_btw_BA, defined elsewhere)
    reports a diagnosis-indicator phrase, or when such a phrase follows two
    mention placeholders in the tagged text. Otherwise returns 0.
    """
    between_pattern = r'.*' + ltp(diagnosis_indicators) + r".*"

    # Both helpers are evaluated up front, as in the original list-based any().
    found_ab = rule_regex_search_btw_AB(c, between_pattern, 1)
    found_ba = rule_regex_search_btw_BA(c, between_pattern, 1)
    if found_ab or found_ba:
        return 1

    # Note: this last search is case-sensitive (no re.I flag).
    after_both = re.search(
        r'({{A}}|{{B}}).*({{A}}|{{B}}).*' + ltp(diagnosis_indicators),
        get_tagged_text(c))
    return 1 if after_both else 0
def LF_DG_PATIENT_WITH(c):
    """
    Look for the phrase "patient(s) with" shortly before the {{A}} mention.

    Returns 1 when "patient with" or "patients with" is followed, within 1 to
    200 characters, by the {{A}} placeholder in the tagged text
    (case-insensitive), otherwise 0.
    """
    found = re.search(r"patient(s)? with.{1,200}{{A}}", get_tagged_text(c), flags=re.I)
    return int(found is not None)
def LF_DISEASE_SUFFIX(c):
    """
    Check whether the {{A}} mention is directly followed by a suffix phrase.

    Per the original author's note, this LF is meant to confirm that the
    entity labeled as gene is really a disease by looking for key
    phrases/words that suggest the tagged entity is a disease.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 when the tagged text contains the {{A}} placeholder, one space,
    and then one of the gene_suffix_indicators phrases (case-insensitive);
    otherwise returns 0.
    """
    suffix_pattern = r'{{A}} ' + ltp(gene_suffix_indicators)
    # The flag is passed by keyword, matching the other label functions; the
    # original call was missing the comma before re.I and did not parse.
    return 1 if re.search(suffix_pattern, get_tagged_text(c), flags=re.I) else 0
def LF_DG_NO_ASSOCIATION(c):
    """
    Test for a key phrase suggesting that a d-g pair is not an association.

    Returns -1 when a no-direct-association phrase is found (case-insensitive)
    in the text between the mentions, or before a mention placeholder, or
    after a mention placeholder in the tagged text. Otherwise returns 0.
    """
    phrase = ltp(no_direct_association)

    if re.search(phrase, get_text_between(c), flags=re.I):
        return -1

    text = get_tagged_text(c)
    if re.search(phrase + r".*({{B}}|{{A}})", text, flags=re.I):
        return -1
    if re.search(r"({{B}}|{{A}}).*" + phrase, text, flags=re.I):
        return -1
    return 0
def LF_DG_WEAK_ASSOCIATION(c):
    """
    Search for phrases that indicate only a weak association between the
    disease and the gene.

    Returns -1 when a weak-association phrase is found (case-insensitive) in
    the text between the mentions, or before a mention placeholder, or after
    a mention placeholder in the tagged text. Otherwise returns 0.
    """
    phrase = ltp(weak_association)

    in_between = re.search(phrase, get_text_between(c), flags=re.I)
    if in_between:
        return -1

    text = get_tagged_text(c)
    before_mention = re.search(phrase + r".*({{B}}|{{A}})", text, flags=re.I)
    if before_mention:
        return -1

    after_mention = re.search(r"({{B}}|{{A}}).*" + phrase, text, flags=re.I)
    return -1 if after_mention else 0
def LF_CD_METHOD_DESC(c):
    """
    Look for phrases implying that the sentence describes an experimental
    design.

    Returns -1 when a method-indication phrase matches anywhere in the tagged
    text (case-insensitive), otherwise 0.
    """
    describes_method = re.search(ltp(method_indication), get_tagged_text(c), flags=re.I)
    return -1 if describes_method else 0
def LF_DEBUG(C):
    """
    Debugging aid: print what the context helpers return for one candidate.

    keyword arguments:
    C - The candidate object to be inspected

    Prints the left/right/between tokens, the tagged text, the text between
    the mentions and the candidate's parent, then always returns 0.

    NOTE(review): an earlier LF_DEBUG in this file passes ``c[0]`` to the
    token helpers and wraps their results in ``list(...)``; this variant
    passes the candidate itself and prints the raw return value. Confirm
    which call shape the helpers expect.
    """
    # The parameter is spelled ``C`` but the body was written against ``c``;
    # bind it explicitly so the function no longer depends on a global ``c``.
    c = C

    # print(...) with a single argument behaves the same as the old print
    # statement, and print("") emits the blank separator line.
    print("Left Tokens")
    print(get_left_tokens(c, window=3))
    print("")

    print("Right Tokens")
    print(get_right_tokens(c))
    print("")

    print("Between Tokens")
    print(get_between_tokens(c))
    print("")

    print("Tagged Text")
    print(get_tagged_text(c))
    print(re.search(r'{{B}} .* is a .* {{A}}', get_tagged_text(c)))
    print("")

    print("Get between Text")
    print(get_text_between(c))
    print(len(get_text_between(c)))
    print("")

    print("Parent Text")
    print(c.get_parent())
    print("")

    return 0
def LF_DaG_ASSOCIATION(c):
    """
    Test for a key phrase suggesting that a d-g pair is an association.

    Returns 0 straight away when LF_DG_METHOD_DESC or LF_DG_TITLE fires for
    the candidate. Otherwise returns 1 when a direct-association phrase that
    is not immediately preceded by "not " or "no " is found
    (case-insensitive) in the text between the mentions, or before a mention
    placeholder, or after a mention placeholder in the tagged text.
    Returns 0 if none of these match.
    """
    if LF_DG_METHOD_DESC(c) or LF_DG_TITLE(c):
        return 0

    # Negative look-behinds reject phrases directly negated by "not "/"no ".
    unnegated_phrase = r'(?<!not )(?<!no )' + ltp(direct_association)

    if re.search(unnegated_phrase, get_text_between(c), flags=re.I):
        return 1

    text = get_tagged_text(c)
    if re.search(unnegated_phrase + r".*({{B}}|{{A}})", text, flags=re.I):
        return 1
    if re.search(r"({{B}}|{{A}}).*" + unnegated_phrase, text, flags=re.I):
        return 1
    return 0
def LF_DdG_DOWNREGULATES(c):
    """
    Search for words that indicate a negative response or imply a
    down-regulating association.

    Returns 0 straight away when LF_DG_METHOD_DESC or LF_DG_TITLE fires for
    the candidate. Otherwise returns 1 when either between-mention helper
    (rule_regex_search_btw_AB / rule_regex_search_btw_BA, defined elsewhere)
    reports a down-regulation phrase, or when such a phrase follows two
    mention placeholders in the tagged text; returns 0 if nothing matches.
    """
    if LF_DG_METHOD_DESC(c) or LF_DG_TITLE(c):
        return 0

    between_pattern = r'.*' + ltp(downregulates) + r'.*'
    if rule_regex_search_btw_AB(c, between_pattern, 1):
        return 1
    if rule_regex_search_btw_BA(c, between_pattern, 1):
        return 1

    # Note: this last search is case-sensitive (no re.I flag).
    after_both = r'({{A}}|{{B}}).*({{A}}|{{B}}).*' + ltp(downregulates)
    return 1 if re.search(after_both, get_tagged_text(c)) else 0
def LF_DaG_CELLULAR_ACTIVITY(c):
    """
    Look for key phrases that indicate activity within a cell,
    e.g. positive immunostaining in an experiment.

    keyword arguments:
    c - The candidate object to be labeled; c[0] and c[1] are passed to the
        token-window helpers

    Returns 1 when a cellular-activity phrase matches (case-insensitive) in
    the tagged text, in the combined 10-token left windows of both mentions,
    or in the combined 10-token right windows of both mentions.
    Otherwise returns 0.
    """
    # Join the two per-mention windows with a space. Concatenating them
    # directly would fuse the last token of the first window with the first
    # token of the second, which can hide a phrase or fabricate a word.
    left_window = " ".join([
        " ".join(get_left_tokens(c[0], window=10)),
        " ".join(get_left_tokens(c[1], window=10)),
    ])
    right_window = " ".join([
        " ".join(get_right_tokens(c[0], window=10)),
        " ".join(get_right_tokens(c[1], window=10)),
    ])

    activity_pattern = ltp(cellular_activity)
    for text in (get_tagged_text(c), left_window, right_window):
        if re.search(activity_pattern, text, flags=re.I):
            return 1
    return 0
def LF_CtD_TRIAL(c):
    """
    Look for phrases that indicate a trial.

    Returns 1 when a trial-indication phrase matches anywhere in the tagged
    text (case-insensitive), otherwise 0.
    """
    if re.search(ltp(trial_indications), get_tagged_text(c), flags=re.I):
        return 1
    return 0
def rule_regex_search_before_A(candidate, pattern, sign):
    """
    Check whether a regex occurs before expression A.

    keyword arguments:
    candidate - The candidate object whose tagged text is searched
    pattern - Regex source placed in front of the {{A}} placeholder
    sign - Value returned when the combined regex matches

    Returns sign on a case-insensitive match, otherwise 0.

    NOTE(review): the suffix appended to pattern is '*{{A}}', so the '*'
    quantifies whatever element ends pattern (making it optional and
    repeatable). If "anything, then A" was intended this would need '.*{{A}}'.
    Confirm against the callers before changing it.
    """
    combined = pattern + r'*{{A}}'
    if re.search(combined, get_tagged_text(candidate), flags=re.I):
        return sign
    return 0
def LF_DdG_METHYLATION(c):
    """
    Flag sentences that mention methylation.

    Returns 1 when the tagged text contains the lowercase substring
    "methylation" (case-sensitive), otherwise 0.
    """
    return 1 if "methylation" in get_tagged_text(c) else 0
def LF_GG_IN_SERIES(c):
    """
    Detect a mention that sits inside a comma-separated enumeration.

    Returns -1 when the tagged text holds at least two commas and also the
    sequence ", and"; otherwise returns 0.
    """
    text = get_tagged_text(c)

    # Fewer than two commas: not treated as a series at all.
    if len(re.findall(r',', text)) < 2:
        return 0

    return -1 if re.search(', and', text) else 0
def LF_positive2(c):
    """
    Look for a positive phrase between {{A}} and {{B}} that is not negated.

    Returns 1 when the tagged text matches "{{A}} ... <positive phrase> ...
    {{B}}" (each gap at most 100 characters, case-insensitive) and does not
    match the same shape with "not", "no" or "negative" appearing up to 20
    characters before the positive phrase. Otherwise returns 0.
    """
    text = get_tagged_text(c)

    affirmed = re.search(
        r'{{A}}.{0,100} ' + ltp(positive_l) + '.{0,100}{{B}}', text, re.I)
    if not affirmed:
        return 0

    # Only checked once the positive pattern has matched.
    negated = re.search(
        '{{A}}.{0,100}(not|no|negative).{0,20}' + ltp(positive_l) + '.{0,100}{{B}}',
        text, re.I)
    return 0 if negated else 1
def LF_neg_h(c):
    """
    Look for a negation pattern shortly before the {{B}} mention.

    Returns -1 when neg_rgx matches and is followed, within at most 50
    characters, by the {{B}} placeholder in the tagged text
    (case-insensitive); otherwise returns 0.
    """
    if re.search(neg_rgx + '.{0,50}{{B}}', get_tagged_text(c), flags=re.I):
        return -1
    return 0
def LF_h_v(c):
    """
    Look for {{B}} followed closely by {{A}} in a sentence with no negation.

    Returns 1 when the tagged text contains {{B}} followed by {{A}} within at
    most 250 characters and neg_rgx does not match anywhere in the tagged
    text (both checks case-insensitive). Otherwise returns 0.
    """
    text = get_tagged_text(c)

    if not re.search(r'{{B}}.{0,250}{{A}}', text, re.I):
        return 0
    if re.search(neg_rgx, text, re.I):
        return 0
    return 1
def LF_v_cause_h(c):
    """
    Look for a causal phrase between {{A}} and {{B}} that is not negated.

    Returns 1 when the tagged text matches "{{A}} ... <causal phrase> ...
    {{B}}" (each gap at most 50 characters, case-insensitive) and does not
    match the same shape with "not", "no" or "negative" appearing up to 20
    characters before the causal phrase. Otherwise returns 0.
    """
    text = get_tagged_text(c)

    causal_link = re.search(
        r'{{A}}.{0,50} ' + ltp(causal) + '.{0,50}{{B}}', text, re.I)
    if not causal_link:
        return 0

    # Only checked once the causal pattern has matched.
    negated_link = re.search(
        '{{A}}.{0,50}(not|no|negative).{0,20}' + ltp(causal) + '.{0,50}{{B}}',
        text, re.I)
    return 0 if negated_link else 1
LF_disease_context ] # # Test out Label Functions # In[ ]: labeled = [] candidates = session.query(DiseaseGene).filter(DiseaseGene.split == 0).all() #candidates = [session.query(DiseaseGene).filter(DiseaseGene.id == ids).one() for ids in [19817,19818,19830,19862,19980,20001,20004]] for c in candidates: if c[0].get_parent().id != 14264: continue print c print get_tagged_text(c) print c[1].sentence.entity_cids[c[1].get_word_start()] # # Label The Candidates # This block of code will run through the label functions and label each candidate in the training and development groups. # In[ ]: labeler = LabelAnnotator(f=LFs) get_ipython().magic(u'time L_train = labeler.apply(split=0)') get_ipython().magic(u'time L_dev = labeler.apply_existing(split=1)') get_ipython().magic(u'time L_test = labeler.apply_existing(split=2)') # In[ ]:
def LF_CD_TRIAL(c):
    """
    Look for phrases that indicate a trial.

    keyword arguments:
    c - The candidate object to be labeled

    Returns 1 when a trial-indication phrase matches anywhere in the tagged
    text (case-insensitive), otherwise 0.
    """
    # ltp(...) is used everywhere else in this file as regex source, so it is
    # run through re.search here too (as LF_CtD_TRIAL does). The previous
    # literal substring test `ltp(...) in text` compared the pattern string
    # itself against the sentence.
    return 1 if re.search(
        ltp(trial_indications), get_tagged_text(c), flags=re.I) else 0
def LF_DG_PURPOSE(c):
    """
    Flag sentences that look like the purpose statement of an abstract.

    Some abstracts are written with an explicit "PURPOSE:" heading.
    Returns -1 when the tagged text contains "PURPOSE:" anywhere
    (case-sensitive substring check), otherwise 0.
    """
    if "PURPOSE:" in get_tagged_text(c):
        return -1
    return 0