def cand_creation(df_subset):
    # Generating candidates: spans whose tokens each look like a (possibly negative) decimal number
    global T1
    T1 = candidate_subclass('T1', ['Features'])
    r = r'^-?\d*\.\d*'          # escape the dot so it only matches a decimal point
    ngrams = Ngrams(n_max=300)
    regex_matcher = RegexMatchEach(rgx=r)
    cand_extractor = CandidateExtractor(T1, [ngrams], [regex_matcher])
    return T1, cand_extractor
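# Hedged usage sketch (not from the original source): assumes a SnorkelSession
# and a set of parsed sentences (`session`, `sents`) already exist, as in the
# later snippets.
T1, cand_extractor = cand_creation(df_subset)
cand_extractor.apply(sents, split=0)
print("Number of candidates:", session.query(T1).filter(T1.split == 0).count())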
def def_cand_extractor():
    """
    Defines a sentence-level candidate extractor.
    Adjust the candidate subclass, span, matcher, and candidate extractor as needed.
    :return: cand_extractor, Text
    """
    Text = candidate_subclass('Text', ['text'], values=['Positive', 'Negative', False])
    sent_span = SentCandidate()
    defaultMatcher = Matcher()
    cand_extractor = CandidateExtractor(Text, [sent_span], [defaultMatcher])
    return cand_extractor, Text
def get_segment_class_and_matcher(name, ngrams=None,
                                  non_comma_matcher=DictionaryMatch(d=[','], longest_match_only=True, reverse=True)):
    # Cues pointing at prior work, shared by several segment types
    transition_prev_work = DictionaryMatch(d=['previous', 'earlier', 'past'], longest_match_only=True)

    if name.lower() == "background":
        Background = candidate_subclass('Background', ['background_cue'])
        transition_word = DictionaryMatch(d=['while', 'unlike', 'despite'], longest_match_only=True)
        dict_background_matcher = DictionaryMatch(d=['previous work', 'traditionally', 'researchers'],
                                                  longest_match_only=True)
        excluded_dict_background_matcher = DictionaryMatch(d=['we', 'unlike', 'our'],
                                                           longest_match_only=True, reverse=True)
        non_comma_dict_background_matcher = CandidateExtractor(
            Background, [ngrams],
            [Intersection(non_comma_matcher,
                          Union(dict_background_matcher,
                                Intersection(transition_word, transition_prev_work)),
                          excluded_dict_background_matcher)])
        return Background, non_comma_dict_background_matcher

    elif name.lower() == "purpose":
        Purpose = candidate_subclass('Purpose', ['purpose_cue'])
        transition_regex_matcher = RegexMatchSpan(rgx=r"((^|\s)however.*$)|((^|\s)but(?!(also))*$)",
                                                  longest_match_only=True)
        # The parent sentence must not contain "but also" / "but without" / "but sometimes"
        excluded_dict_purpose_matcher = SentenceMatch(d=['but also', 'but without', 'but sometimes'],
                                                      longest_match_only=True, reverse=True)
        transition_matcher = Intersection(transition_regex_matcher, excluded_dict_purpose_matcher)
        comparative_degree_matcher = Intersection(
            RegexMatchSpan(rgx=r"(.*more.*than.*$)|(.*er than.*$)", longest_match_only=True),
            transition_prev_work)
        other_regex_matcher = RegexMatchSpan(rgx=r"(.*extend.*$)|(.*offer.*$)", longest_match_only=True)
        dict_purpose_matcher = DictionaryMatch(
            d=['in this paper', 'in the paper', ' that can ', 'in this study',
               'to examine', 'we examine', 'to investigate', 'implications'],
            longest_match_only=True)
        non_comma_dict_purpose_matcher = CandidateExtractor(
            Purpose, [ngrams],
            [Intersection(non_comma_matcher,
                          Union(comparative_degree_matcher, other_regex_matcher,
                                dict_purpose_matcher, transition_matcher))])
        return Purpose, non_comma_dict_purpose_matcher

    elif name.lower() == "mechanism":
        Mechanism = candidate_subclass('Mechanism', ['mechanism_cue'])
        dict_mechanism_matcher = DictionaryMatch(
            d=['introduce', 'introduces', 'propose', 'proposes', 'we propose', 'we develop', 'approach'],
            longest_match_only=True)
        non_comma_dict_mechanism_matcher = CandidateExtractor(
            Mechanism, [ngrams], [Intersection(non_comma_matcher, dict_mechanism_matcher)])
        return Mechanism, non_comma_dict_mechanism_matcher

    elif name.lower() == "method":
        Method = candidate_subclass('Method', ['method_cue'])
        dict_method_matcher = DictionaryMatch(
            d=['dataset', 'benchmark', 'experiment ', 'experiments', 'empirical',
               'participant', 'survey', ' conduct', ' analyze'],
            longest_match_only=True)
        non_comma_dict_method_matcher = CandidateExtractor(
            Method, [ngrams], [Intersection(non_comma_matcher, dict_method_matcher)])
        return Method, non_comma_dict_method_matcher

    elif name.lower() == "finding":
        Finding = candidate_subclass('Finding', ['finding_cue'])
        dict_finding_matcher = DictionaryMatch(
            d=['show that', 'shows that', 'found', 'indicate', 'results', 'performance', 'find'],
            longest_match_only=True)
        non_comma_dict_finding_matcher = CandidateExtractor(
            Finding, [ngrams], [Intersection(non_comma_matcher, dict_finding_matcher)])
        return Finding, non_comma_dict_finding_matcher

    elif name.lower() == "general":
        General = candidate_subclass('General', ['general_cue'])
        general_extractor = CandidateExtractor(General, [ngrams], [non_comma_matcher])
        return General, general_extractor
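# Illustrative call (the n-gram width and the existence of `session`/`sents`
# are assumptions): build the "purpose" segment extractor and apply it.
Purpose, purpose_extractor = get_segment_class_and_matcher("purpose", ngrams=Ngrams(n_max=300))
purpose_extractor.apply(sents, split=0)
print("Purpose candidates:", session.query(Purpose).filter(Purpose.split == 0).count())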
bucket = int(ind / docs_per_bucket)
for s in doc.sentences:
    sents_split[bucket] += [s]

print("Number of buckets (expected ~100):", len(sents_split))

from snorkel.models import candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import *
import datetime

Unigram = candidate_subclass('Unigram', ['unigram_cue'], values=['PP', 'MN', 'NULL'])
ngrams = Ngrams(n_max=1)
ngram_matcher = NgramMatcher()
unigram_segment_extractor = CandidateExtractor(Unigram, [ngrams], [ngram_matcher])

# from snorkel.lf_helpers import *
from snorkel.annotations import LabelAnnotator
# from LF.util_common_default_categorical import purpose_LFs, mechanism_LFs, null_LFs
from LF.util_common_default_categorical_onset_1026 import *  # purpose_LFs, mechanism_LFs, null_LFs

print("total LF count", len(purpose_LFs + mechanism_LFs + null_LFs),
      "unique count", len(set(purpose_LFs + mechanism_LFs + null_LFs)),
      "purpose_LFs", len(purpose_LFs),
      "mechanism_LFs", len(mechanism_LFs))
print("\n\npurpose_LFs\n", [lf.__name__ for lf in purpose_LFs])
print("\n\nmechanism_LFs\n", [lf.__name__ for lf in mechanism_LFs])
print("\n\nnull_LFs\n", [lf.__name__ for lf in null_LFs])
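# A minimal sketch of the labeling step that would follow, assuming the standard
# LabelAnnotator API and that the Unigram candidates were extracted into split 0.
labeler = LabelAnnotator(lfs=purpose_LFs + mechanism_LFs + null_LFs)
L_train = labeler.apply(split=0)
print(L_train.lf_stats(session))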
session = SnorkelSession()

n_docs = 500
doc_preprocessor = TSVDocPreprocessor('pdfs_big.tsv', max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost,
                                    [ngrams, ngrams],
                                    [virus_matcher, animals_matcher],
                                    nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text-pattern-based labeling functions, which look for certain keywords

# List to parenthetical, e.g. ['a', 'b'] -> '(a|b)'
def ltp(x):
    return '(' + '|'.join(x) + ')'

# --------------------------------
# Positive LFs:
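# One illustrative positive LF using the ltp helper; the keyword list is an
# assumption, not from the original source.
import re

infect_words = ['infect', 'infects', 'infected', 'infection']

def LF_infection_keywords(c):
    # Label a VirusHost pair positive when an infection keyword appears in its sentence
    return 1 if re.search(ltp(infect_words), c.get_parent().text, flags=re.I) else 0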
##### snorkeling
session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(path)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])

docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
# print(sentences)

sents = set()
for i, doc in enumerate(docs):
    for s in doc.sentences:
        sents.add(s)

cand_extractor.apply(sents)
print("Number of candidates:", session.query(pairs).count())
    max_docs=350)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

Sensitive = candidate_subclass('Sensitive', ['sensitive'], values=[
    'person', 'job', 'event', 'place', 'date', 'time', 'product', 'email', 'phone',
    'quantity', 'address', 'url', 'org', 'file', 'password', False
])

# Generating candidates
ngrams = Ngrams(n_max=6)
ngramMatcher = NgramMatcher(longest_match_only=False)
cand_extractor = CandidateExtractor(Sensitive, [ngrams], [ngramMatcher],
                                    symmetric_relations=False)
sents = session.query(Sentence).all()
cand_extractor.apply(sents, split=0)
train_cands = session.query(Sensitive).filter(Sensitive.split == 0).all()

finder = FinderAcora()

def find(array, word):
    # Indices of all occurrences of `word` in `array`
    return [i for i, each in enumerate(array) if each == word]

def LF_product(c):
    # Label the span 'product' when every token in it is NER-tagged PRODUCT
    if len(c.sensitive.get_attrib_tokens("words")) == len(
            find(c.sensitive.get_attrib_tokens("ner_tags"), "PRODUCT")):
        print("PRODUCT:" + c.sensitive.get_span())
        return "product"
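# A hedged companion LF sketch (regex and label value are illustrative): mark
# spans that look like email addresses with the 'email' class, abstain otherwise.
import re

def LF_email(c):
    if re.match(r"[^@\s]+@[^@\s]+\.[^@\s]+$", c.sensitive.get_span()):
        return "email"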
def extract_binary_candidates(predicate_resume, clear=False, parallelism=8,
                              split=None, documents_titles=None, limit=None,
                              page_size=10000):
    # Create spans and candidates
    logging.info("Starting candidates extraction")
    subject_ne = predicate_resume['subject_ne']
    object_ne = predicate_resume['object_ne']

    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]
    ngrams = Ngrams(n_max=7)
    subject_matcher = get_matcher(subject_ne)
    object_matcher = get_matcher(object_ne)
    cand_extractor = CandidateExtractor(CandidateSubclass,
                                        [ngrams, ngrams],
                                        [subject_matcher, object_matcher])

    # Skip sentences already extracted
    logging.info("Count candidates")
    sents_query_id = session.query(Sentence.id)
    candidates_count = session.query(CandidateSubclass).count()
    # logging.info("Delete span orphans")
    # delete_orphan_spans()

    if documents_titles is None and candidates_count > 1 and clear is False:
        sents_query_id = get_sentences_ids_not_extracted(predicate_resume, session)
    elif documents_titles is not None:
        # Delete candidates for test and dev
        logging.info("Deleting candidates")
        update_candidates_by_page_titles(predicate_resume, documents_titles, split)
        sents_query_id = get_sentences_ids_by_title_not_extracted(predicate_resume, session, documents_titles)

    if limit is not None and documents_titles is None:
        sents_query_id = sents_query_id.limit(limit)

    sents_query = session.query(Sentence).filter(Sentence.id.in_(sents_query_id))

    logging.info("Counting sentences")
    sents_count = sents_query.count()
    logging.info("Sents count " + str(sents_count))
    print("Sents count " + str(sents_count))

    # Page through the sentences so extraction works on large corpora
    if sents_count > page_size:
        page = page_size
    else:
        page = sents_count

    i = 1
    while True:
        if split is None:
            set_name = "train"
            split2 = 0
        else:
            set_name = str(split)
            split2 = split

        logging.info('\tQuerying sentences from %s to %s, in set \'%s\'',
                     (page * (i - 1)), page * i, set_name)
        sents = sents_query.order_by(Sentence.id).slice((page * (i - 1)), page * i).all()

        logging.info("Extracting")
        if sents is None or len(sents) < 1:
            break
        cand_extractor.apply(sents, split=split2, clear=clear,
                             progress_bar=False, parallelism=parallelism)
        logging.info('\t\tcandidates extracted for %s', CandidateSubclass.__name__)
        i = i + 1
        clear = False

    logging.info("Finished candidates extraction")
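# Illustrative invocation (all values are assumptions): predicate_resume must
# supply the subject/object NE types and the candidate subclass used above.
predicate_resume = {
    "subject_ne": "PERSON",
    "object_ne": "ORG",
    "candidate_subclass": candidate_subclass('EmployedBy', ['subject', 'object']),
}
extract_binary_candidates(predicate_resume, split=1, parallelism=4, limit=50000)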
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=7)
drug_ngrams = Ngrams(n_max=5)
medium_ngrams = Ngrams(n_max=5)
type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?

# Construct our Matchers
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getDiseaseMatcher()
dMatcher = matchers.getDrugMatcher()
mMatcher = matchers.getMediumMatcher()
tMatcher = matchers.getTypeMatcher()

# Building the CandidateExtractors
candidate_extractor_BC = CandidateExtractor(BiomarkerCondition,
                                            [biomarker_ngrams, condition_ngrams],
                                            [bMatcher, cMatcher])
candidate_extractor_BD = CandidateExtractor(BiomarkerDrug,
                                            [biomarker_ngrams, drug_ngrams],
                                            [bMatcher, dMatcher])
candidate_extractor_BM = CandidateExtractor(BiomarkerMedium,
                                            [biomarker_ngrams, medium_ngrams],
                                            [bMatcher, mMatcher])
candidate_extractor_BT = CandidateExtractor(BiomarkerType,
                                            [biomarker_ngrams, type_ngrams],
                                            [bMatcher, tMatcher])

# List of Candidate Sets for each relation type: [train, dev, test]
cands_BC = grabCandidates(candidate_extractor_BC, BiomarkerCondition)
cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
# type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
# level_ngrams = Ngrams(n_max=1)
# unit_ngrams = Ngrams(n_max=1)

# Construct our Matchers
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getConditionMatcher()
# dMatcher = matchers.getDrugMatcher()
# mMatcher = matchers.getMediumMatcher()
# tMatcher = matchers.getTypeMatcher()
# lMatcher = matchers.getLevelMatcher()
# uMatcher = matchers.getUnitMatcher()

# Building the CandidateExtractors
candidate_extractor_BC = CandidateExtractor(BiomarkerCondition,
                                            [biomarker_ngrams, condition_ngrams],
                                            [bMatcher, cMatcher])
# candidate_extractor_BD = CandidateExtractor(BiomarkerDrug, [biomarker_ngrams, drug_ngrams], [bMatcher, dMatcher])
# candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])
# candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher])
# candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher])

# List of Candidate Sets for each relation type: [train, dev, test]
cands_BC = grabCandidates(candidate_extractor_BC, BiomarkerCondition)
# cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
# cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
# cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
# cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)

# In[ ]:
from snorkel.candidates import Ngrams
from snorkel.models import candidate_subclass

# Binary protein-protein relation class used by the extractor below
entity = candidate_subclass('entity', ['entity1', 'entity2'])

import pandas as pd
ROOT = 'data/dicts/'
proteins = set(pd.read_csv(ROOT + 'protein_names.csv', header=None, index_col=0,
                           encoding='utf-8').dropna()[1])

ngrams = Ngrams(n_max=1)

from snorkel.matchers import DictionaryMatch
longest_match_only = True
dict_proteins = DictionaryMatch(d=proteins, ignore_case=True,
                                longest_match_only=longest_match_only)
# misc_matcher = MiscMatcher(longest_match_only=True)

from snorkel.candidates import CandidateExtractor
ce = CandidateExtractor(entity,
                        [ngrams, ngrams],
                        [dict_proteins, dict_proteins],
                        symmetric_relations=False,
                        nested_relations=False,
                        self_relations=False)

%time c = ce.extract(sentences, 'Protein1 Training Candidates', session)

for corpus_name in ['Protein Development']:
    corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    sentences = set()
    for document in corpus:
        for sentence in document.sentences:
            sentences.add(sentence)
    %time c = ce.extract(sentences, 'Protein1 Development Candidates', session)
    session.add(c)
    session.commit()
from snorkel.candidates import Ngrams
ngrams = Ngrams(n_max=3)

from snorkel.matchers import PersonMatcher
from snorkel.matchers import OrganizationMatcher
person_matcher = PersonMatcher(longest_match_only=True)
org_matcher = OrganizationMatcher(longest_match_only=True)

from snorkel.candidates import CandidateExtractor
ce = CandidateExtractor(Title,
                        [ngrams, ngrams],
                        [person_matcher, org_matcher],
                        symmetric_relations=False,
                        nested_relations=False,
                        self_relations=False)

%time c = ce.extract(sentences, 'Emails Training Candidates', session)
print("Number of candidates:", len(c))
session.add(c)
session.commit()

for corpus_name in ['Emails Development', 'Emails Test']:
    corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    sentences = set()
    for document in corpus:
        for sentence in document.sentences:
            if number_of_people(sentence) < 5:
                sentences.add(sentence)
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name, corpus_parser):
    print("Started")
    session = SnorkelSession()

    # Define the candidate subclass for this relation pairing
    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    # Collect all sentences from the parsed corpus
    sentences = set()
    docs = session.query(Document).order_by(Document.name).all()
    for doc in docs:
        for s in doc.sentences:
            sentences.add(s)

    # N-gram candidate spaces for each side of the relation
    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)

    # Build the CandidateExtractor for the pairing
    candidate_extractor = CandidateExtractor(candidate_pair,
                                             [cand_1_ngrams, cand_2_ngrams],
                                             [cand1Matcher, cand2Matcher])

    # Extract candidates into split 4 and load them back, ordered by id
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()

    if len(cands) == 0:
        print("No Candidates Found")
        return

    if pairing_name == 'BiomarkerCondition':
        # session.rollback()
        # print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1])
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        # fix_specificity(session, BiomarkerCondition, cands_BC[1])
        # print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 4).count()
        session.commit()

    # Load the trained LSTM and score the candidates
    lstm = reRNN(seed=1701, n_threads=None)
    lstm.load(model_name)
    predictions = lstm.predictions(cands)

    # Write one row per candidate: document, sentence, both spans, prediction
    output_file = open(output_file_name, 'wb')
    import csv
    csvWriter = csv.writer(output_file)
    csvWriter.writerow(['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
    for i in range(len(cands)):
        doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:]
        sentence_string = cands[i].get_parent().text
        cand_1_string = cands[i].get_contexts()[0].get_span()
        cand_2_string = cands[i].get_contexts()[1].get_span()
        prediction = predictions[i]
        csvWriter.writerow([unidecode(doc_string), unidecode(sentence_string),
                            unidecode(cand_1_string), unidecode(cand_2_string),
                            prediction])
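# Hedged example call; the model path and output file name are placeholders,
# while the matchers and n-gram widths mirror the BiomarkerCondition setup above.
run('biomarker', 'condition', 'BiomarkerCondition', 1, 7,
    matchers.getBiomarkerMatcher(), matchers.getDiseaseMatcher(),
    'biomarker_condition_model', 'biomarker_condition_predictions.csv',
    corpus_parser)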
from final_candidates import GM, PM
from snorkel.candidates import Ngrams, CandidateSpace, CandidateExtractor
from snorkel.models import Document, Sentence, candidate_subclass
from snorkel.viewer import SentenceNgramViewer

SPLIT_ON_DOCS = False
ALL_DOCS = True  # if True, create train/dev/test; if False, push everything to dev cands

session = SnorkelSession()

GenePhenoPair = candidate_subclass('GenePhenoPair2', ['gene', 'pheno'])

gene_ngrams = Ngrams(n_max=5)
pheno_ngrams = Ngrams(n_max=10)
cand_extractor = CandidateExtractor(GenePhenoPair,
                                    [gene_ngrams, pheno_ngrams],
                                    [GM, PM],
                                    symmetric_relations=True)

print("Splitting Docs...")
pathname = 'small_data/' if os.environ['AGP_DATA_SIZE'] == 'small-data' else 'data/'
with open(pathname + 'pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)
train_ids, dev_ids, test_ids = (set(sent_dicts['train']),
                                set(sent_dicts['dev']),
                                set(sent_dicts['test']))
all_ids = train_ids.union(dev_ids).union(test_ids)  # 40, 10, 10

train_sents, dev_sents, test_sents, all_sents = set(), set(), set(), set()
train_docs, dev_docs, test_docs = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()
doc_sents = dict()
# In[7]:

from snorkel.models import candidate_subclass

LocationPer = candidate_subclass('LocationPer', ['location', 'person'])
# Location = candidate_subclass('Location', ['location'])

# In[8]:

from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import PersonMatcher, LocationMatcher

ngrams = Ngrams(n_max=3)
person_matcher = PersonMatcher(longest_match_only=True)
location_matcher = LocationMatcher(longest_match_only=True)
cand_extractor = CandidateExtractor(LocationPer,
                                    [ngrams, ngrams],
                                    [person_matcher, location_matcher],
                                    symmetric_relations=False)
# cand_extractor2 = CandidateExtractor(Location,
#                                      [ngrams], [location_matcher],
#                                      symmetric_relations=False)

# In[9]:

def number_of_people(sentence):
    active_sequence = False
    count = 0
    for tag in sentence.ner_tags:
        if tag == 'LOCATION' and not active_sequence:
            active_sequence = True
# Setting extraction type -- should be a subfield in your data source extractions field!
extraction_type = 'location'

# Creating candidate class
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Defining ngrams for candidates
location_ngrams = Ngrams(n_max=3)

# Define matchers
geotext_location_matcher = LambdaFunctionMatcher(func=fast_loc)
spacy_location_matcher = LocationMatcher(longest_match_only=True)

# Union matchers and create candidate extractor
location_matcher = Union(geotext_location_matcher)
cand_extractor = CandidateExtractor(candidate_class, [location_ngrams], [location_matcher])

# In[ ]:

# Applying candidate extractor to each split (train, dev, test)
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=k, parallelism=parallelism)
    print("Number of candidates:",
          session.query(candidate_class).filter(candidate_class.split == k).count())

# Add gold labels.