def cand_creation(df_subset):
    # Generating candidates
    global T1
    T1 = candidate_subclass('T1', ['Features'])
    # Signed decimal number, e.g. -0.25; the original pattern '^-?\d*.\d*'
    # left the dot unescaped, matching any character between digit runs.
    r = r'^-?\d*\.\d*'
    ngrams = Ngrams(n_max=300)
    regex_matcher = RegexMatchEach(rgx=r)
    cand_extractor = CandidateExtractor(T1, [ngrams], [regex_matcher])
    return T1, cand_extractor
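# Usage sketch (assumes a populated Snorkel database and the imports used
# above; applying to split 0 is an illustrative choice, not from the source):
from snorkel import SnorkelSession
from snorkel.models import Sentence

session = SnorkelSession()
T1, cand_extractor = cand_creation(None)  # df_subset is unused by the function
cand_extractor.apply(session.query(Sentence).all(), split=0)
print("numeric candidates:", session.query(T1).filter(T1.split == 0).count())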
sents_split = defaultdict(list)
for ind, doc in enumerate(docs):
    bucket = int(ind / docs_per_bucket)
    for s in doc.sentences:
        sents_split[bucket] += [s]
print("Number of buckets (expected around 100):", len(sents_split))

from snorkel.models import candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import *
import datetime

Unigram = candidate_subclass('Unigram', ['unigram_cue'],
                             values=['PP', 'MN', 'NULL'])
ngrams = Ngrams(n_max=1)
ngram_matcher = NgramMatcher()
unigram_segment_extractor = CandidateExtractor(Unigram, [ngrams], [ngram_matcher])

# from snorkel.lf_helpers import *
from snorkel.annotations import LabelAnnotator
# from LF.util_common_default_categorical import purpose_LFs, mechanism_LFs, null_LFs
from LF.util_common_default_categorical_onset_1026 import *  # purpose_LFs, mechanism_LFs, null_LFs

all_LFs = purpose_LFs + mechanism_LFs + null_LFs
print("total LF count", len(all_LFs),
      "unique count", len(set(all_LFs)),
      "purpose_LFs", len(purpose_LFs),
      "mechanism_LFs", len(mechanism_LFs))
print("\n\npurpose_LFs\n", [lf.__name__ for lf in purpose_LFs])
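# Sketch: applying these LFs to the extracted unigram candidates
# (assumes the `session` and extracted split-0 candidates from the
# surrounding script; the split number is an assumption):
labeler = LabelAnnotator(lfs=purpose_LFs + mechanism_LFs + null_LFs)
L_train = labeler.apply(split=0)
print(L_train.lf_stats(session))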
doc_preprocessor = TSVDocPreprocessor(
    '/Users/fanglinchen/Desktop/PersonalDataStack/DeepScrub/DeepScrub/algorithms/input.tsv',
    max_docs=350)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

Sensitive = candidate_subclass('Sensitive', ['sensitive'], values=[
    'person', 'job', 'event', 'place', 'date', 'time', 'product', 'email',
    'phone', 'quantity', 'address', 'url', 'org', 'file', 'password', False
])

# Generating candidates.
ngrams = Ngrams(n_max=6)
ngramMatcher = NgramMatcher(longest_match_only=False)
cand_extractor = CandidateExtractor(Sensitive, [ngrams], [ngramMatcher],
                                    symmetric_relations=False)
sents = session.query(Sentence).all()
cand_extractor.apply(sents, split=0)
train_cands = session.query(Sensitive).filter(Sensitive.split == 0).all()

finder = FinderAcora()

def find(array, word):
    """Return all indices at which `word` occurs in `array`."""
    return [i for i, each in enumerate(array) if each == word]

def LF_product(c):
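    # Hypothetical body -- the original is truncated in the source. An LF
    # of this shape would return the 'product' label when spaCy's NER
    # tagged part of the span as a PRODUCT entity, and the False value
    # (declared in the schema above) otherwise.
    if 'PRODUCT' in c.sensitive.get_attrib_tokens('ner_tags'):
        return 'product'
    return False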
def extract_binary_candidates(predicate_resume, clear=False, parallelism=8,
                              split=None, documents_titles=None, limit=None,
                              page_size=10000):
    # Create spans and candidates
    logging.info("Starting candidates extraction")
    subject_ne = predicate_resume['subject_ne']
    object_ne = predicate_resume['object_ne']

    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]

    ngrams = Ngrams(n_max=7)
    subject_matcher = get_matcher(subject_ne)
    object_matcher = get_matcher(object_ne)
    cand_extractor = CandidateExtractor(CandidateSubclass,
                                        [ngrams, ngrams],
                                        [subject_matcher, object_matcher])

    # Skip sentences already extracted
    logging.info("Count candidates")
    sents_query_id = session.query(Sentence.id)
    candidates_count = session.query(CandidateSubclass).count()
    # logging.info("Delete span orphans")
    # delete_orphan_spans()
    if documents_titles is None and candidates_count > 1 and not clear:
        sents_query_id = get_sentences_ids_not_extracted(predicate_resume, session)
    elif documents_titles is not None:
        # Delete candidates for test and dev
        logging.info("Deleting candidates")
        update_candidates_by_page_titles(predicate_resume, documents_titles, split)
        sents_query_id = get_sentences_ids_by_title_not_extracted(
            predicate_resume, session, documents_titles)

    if limit is not None and documents_titles is None:
        sents_query_id = sents_query_id.limit(limit)

    sents_query = session.query(Sentence).filter(Sentence.id.in_(sents_query_id))

    logging.info("Counting sentences")
    sents_count = sents_query.count()
    logging.info("Sents count: " + str(sents_count))
    print("Sents count: " + str(sents_count))

    # Page through the sentences so very large corpora fit in memory
    page = min(page_size, sents_count)
    i = 1
    while True:
        if split is None:
            set_name = "train"
            split2 = 0
        else:
            set_name = str(split)
            split2 = split
        logging.info('\tQuerying sentences from %s to %s, in set \'%s\'',
                     page * (i - 1), page * i, set_name)
        sents = sents_query.order_by(Sentence.id).slice(page * (i - 1), page * i).all()
        logging.info("Extracting")
        if sents is None or len(sents) < 1:
            break
        cand_extractor.apply(sents, split=split2, clear=clear,
                             progress_bar=False, parallelism=parallelism)
        logging.info('\t\tcandidates extracted for %s', CandidateSubclass.__name__)
        i = i + 1
        clear = False
    logging.info("Finished candidates extraction")
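# Illustrative call (the dict keys are the ones read by the function above;
# the BornIn subclass and the NE types are hypothetical, not from the source):
BornIn = candidate_subclass('BornIn', ['subject', 'object'])
predicate_resume = {
    'subject_ne': 'PERSON',
    'object_ne': 'LOCATION',
    'candidate_subclass': BornIn,
}
extract_binary_candidates(predicate_resume, parallelism=4, limit=1000)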
raise Exception('ID <{0}> not found in any id set'.format(doc.name))

# ----------------------
# Candidate Extraction
# ----------------------

# Defining the candidate schemas
BiomarkerCondition = candidate_subclass('BiomarkerCondition', ['biomarker', 'condition'])
BiomarkerDrug = candidate_subclass('BiomarkerDrug', ['biomarker', 'drug'])
BiomarkerMedium = candidate_subclass('BiomarkerMedium', ['biomarker', 'medium'])

# N-grams: the search space for our entities
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=7)
drug_ngrams = Ngrams(n_max=5)
medium_ngrams = Ngrams(n_max=5)
type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?

# Construct our matchers
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getDiseaseMatcher()
dMatcher = matchers.getDrugMatcher()
mMatcher = matchers.getMediumMatcher()
tMatcher = matchers.getTypeMatcher()

# Building the CandidateExtractors
candidate_extractor_BC = CandidateExtractor(
    BiomarkerCondition,
    [biomarker_ngrams, condition_ngrams],
    [bMatcher, cMatcher])
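# The Drug and Medium extractors presumably follow the same pattern
# (a sketch, not code recovered from the source):
candidate_extractor_BD = CandidateExtractor(
    BiomarkerDrug, [biomarker_ngrams, drug_ngrams], [bMatcher, dMatcher])
candidate_extractor_BM = CandidateExtractor(
    BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])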
    @classmethod
    def setUpClass(cls):
        with open(DATA_PATH + 'CDR_TestSet_sents.pkl', 'rb') as f:
            cls.CDR_sents = cPickle.load(f)
        cls.sp = SentenceParser()
        cls.ngrams = Ngrams()
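    # A hypothetical smoke test over these fixtures (not from the source):
    def test_sents_loaded(self):
        self.assertTrue(len(self.CDR_sents) > 0)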
def main(argv):
    parser = argparse.ArgumentParser(description='Process some arguments.')
    parser.add_argument('--dbPath', type=str,
                        default=os.getcwd() + os.sep + 'snorkel.db',
                        help='the path of the snorkel database')
    parser.add_argument('--lfPath', type=str,
                        default=os.getcwd() + os.sep + 'util_default.py',
                        help='the path of the util.py file where the labeling functions are defined')
    args = parser.parse_args()

    # Connect to the db and get a session
    util_module = imp.load_source("module.name", args.lfPath)
    # Map doc_id to a dict over the segment classes
    # ["Background", "Purpose", "Mechanism", "Method", "Finding"]
    train_doc_breakdown_map = dict()
    test_doc_breakdown_map = dict()
    SnorkelSession = create_session_with_conn("sqlite:///" + args.dbPath)
    session = SnorkelSession()

    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())

    sents = session.query(Sentence).all()
    n_max_corpus = 0
    for sent in sents:
        n_max_corpus = max(n_max_corpus, len(sent.words))
    print("The longest sentence has " + str(n_max_corpus) + " tokens.")
    ngrams = Ngrams(n_max=n_max_corpus)

    # from util import number_of_people
    docs = session.query(Document).all()
    train_sents = set()
    dev_sents = set()
    test_sents = set()
    for i, doc in enumerate(docs):
        for s in doc.sentences:
            if i % 10 == 8 and "cscw18" != doc.name[:6]:
                dev_sents.add(s)
            elif "cscw18" == doc.name[:6]:
                # The cscw'18 annotation-guideline documents replace the
                # earlier 10% test split
                test_sents.add(s)
            else:
                train_sents.add(s)

    General, general_extractor = util_module.get_segment_class_and_matcher("General", ngrams)
    general_cands = extract_and_display(
        train_sents, dev_sents, test_sents, session, general_extractor,
        General, "General",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)
    input("Finished general ")

    # Load each segment candidate class and its corresponding matcher,
    # e.g. (Background, non_comma_dict_background_matcher)
    Background, background_matcher = util_module.get_segment_class_and_matcher("Background", ngrams)
    background_cands = extract_and_display(
        train_sents, dev_sents, test_sents, session, background_matcher,
        Background, "Background",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)
    debug_sess_eval(session, Background, background_matcher)
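# Conventional entry point (assumed; not shown in the source):
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])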
# ----------------------
# Candidate Extraction
# ----------------------

# Defining the candidate schemas
# BiomarkerCondition = candidate_subclass('BiomarkerCondition', ['biomarker', 'condition'])
# BiomarkerDrug = candidate_subclass('BiomarkerDrug', ['biomarker', 'drug'])
# BiomarkerMedium = candidate_subclass('BiomarkerMedium', ['biomarker', 'medium'])
# BiomarkerType = candidate_subclass('BiomarkerType', ['biomarker', 'typ3'])
# BiomarkerLevelUnit = candidate_subclass('BiomarkerLevelUnit', ['biomarker', 'level', 'unit'])
# Can eventually add MEASUREMENT and COHORT SIZE among other entities

# N-grams: the search space for our entities
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=7)
# drug_ngrams = Ngrams(n_max=5)
# medium_ngrams = Ngrams(n_max=5)
# type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
# level_ngrams = Ngrams(n_max=1)
# unit_ngrams = Ngrams(n_max=1)

# Construct our matchers
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getConditionMatcher()
# dMatcher = matchers.getDrugMatcher()
# mMatcher = matchers.getMediumMatcher()
# tMatcher = matchers.getTypeMatcher()
# lMatcher = matchers.getLevelMatcher()
# uMatcher = matchers.getUnitMatcher()
for document in corpus:
    for sentence in document.sentences:
        if number_of_people(sentence) < 5:
            sentences.add(sentence)

from snorkel.models import candidate_subclass
Title = candidate_subclass('Person_Org', ['person1', 'organization'])

from snorkel.candidates import Ngrams
ngrams = Ngrams(n_max=3)

from snorkel.matchers import PersonMatcher, OrganizationMatcher
person_matcher = PersonMatcher(longest_match_only=True)
org_matcher = OrganizationMatcher(longest_match_only=True)

from snorkel.candidates import CandidateExtractor
ce = CandidateExtractor(Title,
                        [ngrams, ngrams],
                        [person_matcher, org_matcher],
                        symmetric_relations=False,
                        nested_relations=False,
                        self_relations=False)

%time c = ce.extract(sentences, 'Emails Training Candidates', session)
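# Inspecting the extracted person-organization pairs in the notebook
# (an assumption, not shown in the source; SentenceNgramViewer is
# snorkel's standard candidate browser):
from snorkel.viewer import SentenceNgramViewer
SentenceNgramViewer(c, session)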
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name,
        corpus_parser):
    print "Started"
    session = SnorkelSession()

    # The following line is for testing only. Feel free to ignore it.
    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    # Collect every sentence in the corpus
    sentences = set()
    docs = session.query(Document).order_by(Document.name).all()
    for doc in docs:
        for s in doc.sentences:
            sentences.add(s)

    # N-grams: the search space for each candidate
    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)

    # Build the CandidateExtractor for this pairing
    candidate_extractor = CandidateExtractor(candidate_pair,
                                             [cand_1_ngrams, cand_2_ngrams],
                                             [cand1Matcher, cand2Matcher])

    # Extract candidates into split 4
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()

    if len(cands) == 0:
        print "No Candidates Found"
        return

    if pairing_name == 'BiomarkerCondition':
        # Boost recall with adjacent-word candidates before predicting
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        session.commit()

    # Load the trained LSTM and predict over the candidates
    lstm = reRNN(seed=1701, n_threads=None)
    lstm.load(model_name)
    predictions = lstm.predictions(cands)

    # Write one row per candidate: doc id, sentence, both spans, prediction
    import csv
    output_file = open(output_file_name, 'wb')
    csvWriter = csv.writer(output_file)
    csvWriter.writerow(['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
    for i in range(len(cands)):
        # str(Document) renders as "Document <name>"; [9:] keeps the name
        doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:]
        sentence_string = cands[i].get_parent().text
        cand_1_string = cands[i].get_contexts()[0].get_span()
        cand_2_string = cands[i].get_contexts()[1].get_span()
        prediction = predictions[i]
        csvWriter.writerow([unidecode(doc_string), unidecode(sentence_string),
                            unidecode(cand_1_string), unidecode(cand_2_string),
                            prediction])
    output_file.close()
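# Illustrative invocation (matcher getters as in the sibling scripts; the
# model and output file names are placeholders, not from the source):
run('biomarker', 'condition', 'BiomarkerCondition', 1, 7,
    matchers.getBiomarkerMatcher(), matchers.getConditionMatcher(),
    'biomarker_condition.lstm', 'predictions.csv', None)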
from snorkel import SnorkelSession
from snorkel.matchers import DictionaryMatch
from final_candidates import GM, PM
from snorkel.candidates import Ngrams, CandidateSpace, CandidateExtractor
from snorkel.models import Document, Sentence, candidate_subclass
from snorkel.viewer import SentenceNgramViewer

SPLIT_ON_DOCS = False
ALL_DOCS = True  # if True, create train, dev, and test; if False, push everything to dev cands

session = SnorkelSession()

GenePhenoPair = candidate_subclass('GenePhenoPair2', ['gene', 'pheno'])

gene_ngrams = Ngrams(n_max=5)
pheno_ngrams = Ngrams(n_max=10)
cand_extractor = CandidateExtractor(GenePhenoPair,
                                    [gene_ngrams, pheno_ngrams],
                                    [GM, PM],
                                    symmetric_relations=True)

print "Splitting Docs..."
pathname = 'small_data/' if os.environ['AGP_DATA_SIZE'] == 'small-data' else 'data/'
with open(pathname + 'pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)
train_ids, dev_ids, test_ids = (set(sent_dicts['train']),
                                set(sent_dicts['dev']),
                                set(sent_dicts['test']))
all_ids = train_ids.union(dev_ids).union(test_ids)  # 40, 10, 10
train_sents, dev_sents, test_sents, all_sents = set(), set(), set(), set()
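# A sketch of how the sentence sets are presumably filled (the exact loop
# is not shown in the source; doc.name holding the PMC id is an assumption):
for doc in session.query(Document).all():
    for s in doc.sentences:
        all_sents.add(s)
        if doc.name in train_ids:
            train_sents.add(s)
        elif doc.name in dev_ids:
            dev_sents.add(s)
        elif doc.name in test_ids:
            test_sents.add(s)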
# In[12]:

from snorkel.candidates import Ngrams, CandidateExtractor
from dataset_utils import create_candidate_class, LocationMatcher, fast_loc
from snorkel.matchers import Union, LambdaFunctionMatcher

# Setting extraction type -- should be a subfield in your data source's extractions field!
extraction_type = 'location'

# Creating candidate class
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Defining ngrams for candidates
location_ngrams = Ngrams(n_max=3)

# Define matchers
geotext_location_matcher = LambdaFunctionMatcher(func=fast_loc)
spacy_location_matcher = LocationMatcher(longest_match_only=True)

# Union matchers and create the candidate extractor
location_matcher = Union(geotext_location_matcher)
cand_extractor = CandidateExtractor(candidate_class,
                                    [location_ngrams],
                                    [location_matcher])

# Applying the candidate extractor to each split (train, dev, test)

# In[ ]:
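# A sketch of the split application described by the comment above (the
# split sentence sets and `session` are assumed from earlier notebook cells):
for split, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=split)
    print("Candidates in split {}: {}".format(
        split, session.query(candidate_class).filter(
            candidate_class.split == split).count()))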