# Assign each document's sentences to the train, dev, or test split
train_sents = set()
dev_sents = set()
test_sents = set()
for i, doc in enumerate(docs):
    for s in doc.sentences:
        if i in X_dev:
            dev_sents.add(s)
        elif i in X_test:
            test_sents.add(s)
        else:
            train_sents.add(s)

# Number of sentences per set
print("Sentences per train, dev, and test sets:",
      len(train_sents), len(dev_sents), len(test_sents))

# Candidate extraction over the train, dev, and test splits
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=i)

train_cands = session.query(VirusHost).filter(VirusHost.split == 0).order_by(VirusHost.id).all()
dev_cands = session.query(VirusHost).filter(VirusHost.split == 1).order_by(VirusHost.id).all()
test_cands = session.query(VirusHost).filter(VirusHost.split == 2).order_by(VirusHost.id).all()

# Apply labeler to all sets
L_train = labeler.apply(split=0)
L_dev = labeler.apply(split=1)
L_test = labeler.apply(split=2)

# Load gold labels
missed = load_external_labels(session,
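# With the gold labels loaded, the labeling functions can be scored against the
# dev split. A minimal sketch, assuming the standard Snorkel 0.6 annotations API
# and that load_external_labels stored the labels under the annotator name
# 'gold' (an assumption, not shown in the snippet above):
from snorkel.annotations import load_gold_labels

L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)

# Per-LF coverage, overlap, conflict, and empirical accuracy on the dev set
print(L_dev.lf_stats(session, L_gold_dev))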
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])

docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
# print(sentences)

sents = set()
for i, doc in enumerate(docs):
    for s in doc.sentences:
        sents.add(s)

cand_extractor.apply(sents)
print("Number of candidates:", session.query(pairs).count())

labeler = LabelAnnotator(lfs=LFs)
L_train = labeler.apply()
print(L_train.lf_stats(session))

# Generative model; training_marginals are probabilistic training labels
gen_model = GenerativeModel()
gen_model.train(L_train, epochs=100, decay=0.95,
                step_size=0.1 / L_train.shape[0], reg_param=1e-6)
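# Once the generative model is trained, the probabilistic training labels can be
# read off and persisted for a downstream discriminative model. A minimal
# sketch, assuming the standard Snorkel 0.6 annotations API (not part of the
# original snippet):
from snorkel.annotations import save_marginals

# Marginal probabilities P(y = 1) for each training candidate
train_marginals = gen_model.marginals(L_train)

# Save them to the database so a discriminative model can train on them later
save_marginals(session, L_train, train_marginals)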
def extract_binary_candidates(predicate_resume, clear=False, parallelism=8,
                              split=None, documents_titles=None, limit=None,
                              page_size=10000):
    # Create spans and candidates
    logging.info("Starting candidates extraction")
    subject_ne = predicate_resume['subject_ne']
    object_ne = predicate_resume['object_ne']

    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]

    ngrams = Ngrams(n_max=7)
    subject_matcher = get_matcher(subject_ne)
    object_matcher = get_matcher(object_ne)
    cand_extractor = CandidateExtractor(CandidateSubclass,
                                        [ngrams, ngrams],
                                        [subject_matcher, object_matcher])

    # Skip sentences already extracted
    logging.info("Count candidates")
    sents_query_id = session.query(Sentence.id)
    candidates_count = session.query(CandidateSubclass).count()
    # logging.info("Delete span orphans")
    # delete_orphan_spans()
    if documents_titles is None and candidates_count > 1 and not clear:
        sents_query_id = get_sentences_ids_not_extracted(predicate_resume, session)
    elif documents_titles is not None:
        # Delete candidates for test and dev
        logging.info("Deleting candidates")
        update_candidates_by_page_titles(predicate_resume, documents_titles, split)
        sents_query_id = get_sentences_ids_by_title_not_extracted(predicate_resume, session, documents_titles)

    if limit is not None and documents_titles is None:
        sents_query_id = sents_query_id.limit(limit)

    sents_query = session.query(Sentence).filter(Sentence.id.in_(sents_query_id))

    logging.info("Counting sentences")
    sents_count = sents_query.count()
    logging.info("Sents count: %s", sents_count)
    print("Sents count: " + str(sents_count))

    # Page through the sentences so we never hold more than page_size in memory
    if sents_count > page_size:
        page = page_size
    else:
        page = sents_count

    i = 1
    while True:
        if split is None:
            set_name = "train"
            split2 = 0
        else:
            set_name = str(split)
            split2 = split
        logging.info("\tQuerying sentences from %s to %s, in set '%s'",
                     page * (i - 1), page * i, set_name)
        sents = sents_query.order_by(Sentence.id).slice(page * (i - 1), page * i).all()
        logging.info("Extracting")
        if sents is None or len(sents) < 1:
            break
        cand_extractor.apply(sents, split=split2, clear=clear,
                             progress_bar=False, parallelism=parallelism)
        logging.info("\t\tCandidates extracted for %s", CandidateSubclass.__name__)
        i = i + 1
        clear = False
    logging.info("Finished candidates extraction")
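# A hypothetical call to extract_binary_candidates. The candidate subclass and
# NE tags below are placeholders chosen for illustration; the function only
# requires that predicate_resume provide 'subject_ne', 'object_ne', and
# 'candidate_subclass'.
from snorkel.models import candidate_subclass

FounderOf = candidate_subclass('FounderOf', ['subject', 'object'])  # placeholder relation

predicate_resume = {
    'subject_ne': 'PERSON',          # NE tag passed to get_matcher for the subject
    'object_ne': 'ORG',              # NE tag passed to get_matcher for the object
    'candidate_subclass': FounderOf,
}

# Extract candidates for the train split (split=None is mapped to split 0 inside the function)
extract_binary_candidates(predicate_resume, parallelism=4, limit=50000)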
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name,
        corpus_parser):
    print "Started"
    session = SnorkelSession()

    # The following line is for testing only. Feel free to ignore it.
    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    sentences = set()
    docs = session.query(Document).order_by(Document.name).all()
    for doc in docs:
        for s in doc.sentences:
            sentences.add(s)

    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)
    # condition_ngrams = Ngrams(n_max=7)
    # medium_ngrams = Ngrams(n_max=5)
    # type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
    # level_ngrams = Ngrams(n_max=1)
    # unit_ngrams = Ngrams(n_max=1)

    # Construct our Matchers
    # cMatcher = matchers.getConditionMatcher()
    # mMatcher = matchers.getMediumMatcher()
    # tMatcher = matchers.getTypeMatcher()
    # lMatcher = matchers.getLevelMatcher()
    # uMatcher = matchers.getUnitMatcher()

    # Building the CandidateExtractors
    candidate_extractor = CandidateExtractor(candidate_pair,
                                             [cand_1_ngrams, cand_2_ngrams],
                                             [cand1Matcher, cand2Matcher])
    # candidate_extractor_BC = CandidateExtractor(BiomarkerCondition, [biomarker_ngrams, condition_ngrams], [bMatcher, cMatcher])
    # candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])
    # candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher])
    # candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher])

    # List of Candidate Sets for each relation type: [train, dev, test]
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()
    # cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
    # cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
    # cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
    # cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)

    if len(cands) == 0:
        print "No Candidates Found"
        return

    if pairing_name == 'BiomarkerCondition':
        # session.rollback()
        # print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1])
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        # fix_specificity(session, BiomarkerCondition, cands_BC[1])
        # print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 4).count()
        session.commit()

    # Load the trained LSTM and predict over the extracted candidates
    lstm = reRNN(seed=1701, n_threads=None)
    lstm.load(model_name)
    predictions = lstm.predictions(cands)

    # Write one CSV row per candidate with its document, sentence, spans, and prediction
    output_file = open(output_file_name, 'wb')
    import csv
    csvWriter = csv.writer(output_file)
    csvWriter.writerow(
        ['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
    for i in range(len(cands)):
        doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:]
        sentence_string = cands[i].get_parent().text
        cand_1_string = cands[i].get_contexts()[0].get_span()
        cand_2_string = cands[i].get_contexts()[1].get_span()
        prediction = predictions[i]
        csvWriter.writerow([
            unidecode(doc_string),
            unidecode(sentence_string),
            unidecode(cand_1_string),
            unidecode(cand_2_string),
            prediction
        ])
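# A hypothetical invocation of run(). The matcher helpers, model name, and
# output path below are placeholders for illustration; only
# matchers.getConditionMatcher() appears in the snippet above, the rest are
# assumptions.
run(candidate1='biomarker',
    candidate2='condition',
    pairing_name='BiomarkerCondition',
    cand1_ngrams=5,                                   # n_max for the first argument's Ngrams space
    cand2_ngrams=7,                                   # n_max for the second argument's Ngrams space
    cand1Matcher=matchers.getBiomarkerMatcher(),      # placeholder matcher
    cand2Matcher=matchers.getConditionMatcher(),
    model_name='biomarker_condition_lstm',            # name of a previously saved reRNN model
    output_file_name='biomarker_condition_predictions.csv',
    corpus_parser=corpus_parser)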
# (Fragment: begins inside the loop that assigns each document and its
# sentences to the dev/test/train id sets.)
            dev_docs.add(name)
            dev_sents.add(s)
        elif name in test_ids:
            test_docs.add(name)
            test_sents.add(s)
        else:
            raise Exception('ID <{0}> not found in any id set'.format(doc.name))

print "Docs Split"
print "Extracting Candidates..."

if SPLIT_ON_DOCS:
    for split, sents in doc_sents.iteritems():
        cand_extractor.apply(sents, split=split,
                             parallelism=multiprocessing.cpu_count())
    all_cands = session.query(GenePhenoPair).filter(
        GenePhenoPair.split < len(doc_sents)).all()
    print "Number of candidates:", len(all_cands)
else:
    if ALL_DOCS:
        cand_extractor.apply(train_sents, split=0,
                             parallelism=multiprocessing.cpu_count())
        train_cands = session.query(GenePhenoPair).filter(
            GenePhenoPair.split == 0).all()
        cand_extractor.apply(dev_sents, split=1,
                             parallelism=multiprocessing.cpu_count())
        dev_cands = session.query(GenePhenoPair).filter(
geotext_location_matcher = LambdaFunctionMatcher(func=fast_loc)
spacy_location_matcher = LocationMatcher(longest_match_only=True)

# Union matchers and create candidate extractor
location_matcher = Union(geotext_location_matcher)
cand_extractor = CandidateExtractor(candidate_class,
                                    [location_ngrams],
                                    [location_matcher])

# Applying candidate extractor to each split (train, dev, test)

# In[ ]:

# Applying candidate extractor to each split
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=k, parallelism=parallelism)
    print("Number of candidates:",
          session.query(candidate_class).filter(candidate_class.split == k).count())

# Add gold labels.

# In[ ]:

from dataset_utils import get_gold_labels_from_meta

# Adding dev gold labels using dictionary
missed_dev = get_gold_labels_from_meta(session, candidate_class,
                                       extraction_type, 1,
                                       annotator='gold', gold_dict=None)

# Adding test gold labels using dictionary
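# The fast_loc function handed to LambdaFunctionMatcher above is not shown in
# this excerpt. A minimal sketch of what such a matcher function could look
# like, assuming the geotext package (suggested by the matcher's name); this is
# an illustration, not the original implementation:
from geotext import GeoText

def fast_loc(span):
    # Treat a span as a location if geotext recognizes it as a city or country
    text = span.get_span()
    places = GeoText(text)
    return bool(places.cities or places.countries)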