Example #1
    train_sents = set()
    dev_sents = set()
    test_sents = set()
    for i, doc in enumerate(docs):
        for s in doc.sentences:
            if i in X_dev:
                dev_sents.add(s)
            elif i in X_test:
                test_sents.add(s)
            else:
                train_sents.add(s)
    # Number of sentences per set
    print("Sentences per train, dev, and test sets:", len(train_sents),
          len(dev_sents), len(test_sents))

    # Extract candidates and assign them to the train, dev, and test splits
    for i, sents in enumerate([train_sents, dev_sents, test_sents]):
        cand_extractor.apply(sents, split=i)

    train_cands = session.query(VirusHost).filter(
        VirusHost.split == 0).order_by(VirusHost.id).all()
    dev_cands = session.query(VirusHost).filter(VirusHost.split == 1).order_by(
        VirusHost.id).all()
    test_cands = session.query(VirusHost).filter(
        VirusHost.split == 2).order_by(VirusHost.id).all()

    # Apply labeler to all sets
    L_train = labeler.apply(split=0)
    L_dev = labeler.apply(split=1)
    L_test = labeler.apply(split=2)

    # Load gold labels
    missed = load_external_labels(session, VirusHost, annotator_name='gold')  # assumed arguments beyond `session`

regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])


docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

sents = set()
for doc in docs:
    for s in doc.sentences:
        sents.add(s)


cand_extractor.apply(sents)

print("Number of candidates:", session.query(pairs).count())


labeler = LabelAnnotator(lfs=LFs)

L_train = labeler.apply()

print(L_train.lf_stats(session))


# Train the generative model; its marginals serve as the probabilistic training labels
gen_model = GenerativeModel()
gen_model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6)
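
# A minimal follow-up sketch (not part of the original example), assuming the
# Snorkel 0.6-style API used above: pull the probabilistic labels out of the
# trained generative model and persist them for discriminative training.
from snorkel.annotations import save_marginals

train_marginals = gen_model.marginals(L_train)
print("Marginals shape:", train_marginals.shape)
save_marginals(session, L_train, train_marginals)
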
def extract_binary_candidates(predicate_resume, clear=False, parallelism=8,
                              split=None, documents_titles=None, limit=None,
                              page_size=10000):
    # Create spans and candidates
    logging.info("Starting candidate extraction")
    subject_ne = predicate_resume['subject_ne']
    object_ne = predicate_resume['object_ne']

    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]


    ngrams = Ngrams(n_max=7)
    subject_matcher = get_matcher(subject_ne)
    object_matcher = get_matcher(object_ne)
    cand_extractor = CandidateExtractor(CandidateSubclass,
                                        [ngrams, ngrams],
                                        [subject_matcher, object_matcher])

    # Skip sentences that have already been extracted
    logging.info("Counting candidates")
    sents_query_id = session.query(Sentence.id)
    candidates_count = session.query(CandidateSubclass).count()
    #logging.info("Delete span orphans")
    #delete_orphan_spans()
    if documents_titles is None and candidates_count > 1 and not clear:
        sents_query_id = get_sentences_ids_not_extracted(predicate_resume, session)
    elif documents_titles is not None:
        # Delete existing candidates for the test and dev documents
        logging.info("Deleting candidates")
        update_candidates_by_page_titles(predicate_resume, documents_titles, split)
        sents_query_id = get_sentences_ids_by_title_not_extracted(predicate_resume, session, documents_titles)

    if limit is not None and documents_titles is None:
        sents_query_id = sents_query_id.limit(limit)

    sents_query = session.query(Sentence).filter(Sentence.id.in_(sents_query_id))

    logging.info("Counting sentences")
    sents_count = sents_query.count()
    logging.info("Sentence count: %s", sents_count)
    print("Sentence count: " + str(sents_count))
    # Page through the sentences in chunks of at most page_size
    page = min(sents_count, page_size)
    i = 1
    while True:
        if split is None:
            set_name = "train"
            split2 = 0
        else:
            set_name = str(split)
            split2 = split

        logging.info('\tQuerying sentences from %s to %s, in set \'%s\'', page * (i - 1), page * i, set_name)
        sents = sents_query.order_by(Sentence.id).slice(page * (i - 1), page * i).all()
        logging.info("Extracting")
        if sents is None or len(sents) < 1:
            break
        cand_extractor.apply(sents, split=split2, clear=clear, progress_bar=False, parallelism=parallelism)
        logging.info('\t\tCandidates extracted for %s', CandidateSubclass.__name__)
        i += 1
        # Only clear existing candidates on the first page
        clear = False
    logging.info("Finished candidate extraction")
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name,
        corpus_parser):
    print "Started"
    session = SnorkelSession()

    # The following line is for testing only. Feel free to ignore it.

    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    sentences = set()
    docs = session.query(Document).order_by(Document.name).all()
    for doc in docs:
        for s in doc.sentences:
            sentences.add(s)

    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    # condition_ngrams = Ngrams(n_max=7)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)
    # medium_ngrams = Ngrams(n_max=5)
    # type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
    # # level_ngrams = Ngrams(n_max=1)
    # unit_ngrams = Ngrams(n_max=1)

    # Construct our Matchers

    # cMatcher = matchers.getConditionMatcher()
    # mMatcher = matchers.getMediumMatcher()
    # tMatcher = matchers.getTypeMatcher()
    # lMatcher = matchers.getLevelMatcher()
    # uMatcher = matchers.getUnitMatcher()

    # Building the CandidateExtractors
    # candidate_extractor_BC = CandidateExtractor(BiomarkerCondition, [biomarker_ngrams, condition_ngrams], [bMatcher, cMatcher])
    candidate_extractor = CandidateExtractor(candidate_pair,
                                             [cand_1_ngrams, cand_2_ngrams],
                                             [cand1Matcher, cand2Matcher])
    # candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])
    # candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher])
    # candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher])

    # List of Candidate Sets for each relation type: [train, dev, test]
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()
    # cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
    # cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
    # cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
    # cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)

    if len(cands) == 0:
        print "No Candidates Found"
        return
    if pairing_name == 'BiomarkerCondition':
        # session.rollback()
        # print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1])
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        # fix_specificity(session, BiomarkerCondition, cands_BC[1])
        # print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 4).count()
        session.commit()

    lstm = reRNN(seed=1701, n_threads=None)

    lstm.load(model_name)

    predictions = lstm.predictions(cands)
    import csv
    from unidecode import unidecode  # needed for the unidecode() calls below

    output_file = open(output_file_name, 'wb')
    csvWriter = csv.writer(output_file)
    csvWriter.writerow(
        ['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
    for i in range(len(cands)):
        doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:]
        sentence_string = cands[i].get_parent().text
        cand_1_string = cands[i].get_contexts()[0].get_span()
        cand_2_string = cands[i].get_contexts()[1].get_span()
        prediction = predictions[i]
        csvWriter.writerow([
            unidecode(doc_string),
            unidecode(sentence_string),
            unidecode(cand_1_string),
            unidecode(cand_2_string), prediction
        ])
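
# A hypothetical call to run() (not from the original script). Every argument is a
# placeholder assumption except matchers.getConditionMatcher(), which the commented
# code above refers to; corpus_parser is unused inside run(), so None is passed.
from snorkel.matchers import DictionaryMatch

biomarker_matcher = DictionaryMatch(d=['TP53', 'BRCA1', 'HER2'], longest_match_only=True)
run('biomarker', 'condition', 'BiomarkerCondition', 5, 7,
    biomarker_matcher, matchers.getConditionMatcher(),
    'best_BC_model', 'biomarker_condition_predictions.csv',
    None)
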
Example #5
            dev_docs.add(name)
            dev_sents.add(s)
        elif name in test_ids:
            test_docs.add(name)
            test_sents.add(s)
        else:
            raise Exception('ID <{0}> not found in any id set'.format(
                doc.name))

print "Docs Split"
print "Extracting Candidates..."

if SPLIT_ON_DOCS:
    for split, sents in doc_sents.iteritems():
        cand_extractor.apply(sents,
                             split=split,
                             parallelism=multiprocessing.cpu_count())
    all_cands = session.query(GenePhenoPair).filter(
        GenePhenoPair.split < len(doc_sents)).all()
    print "Number of candidates:", len(all_cands)
else:
    if ALL_DOCS:
        cand_extractor.apply(train_sents,
                             split=0,
                             parallelism=multiprocessing.cpu_count())
        train_cands = session.query(GenePhenoPair).filter(
            GenePhenoPair.split == 0).all()
        cand_extractor.apply(dev_sents,
                             split=1,
                             parallelism=multiprocessing.cpu_count())
        dev_cands = session.query(GenePhenoPair).filter(
            GenePhenoPair.split == 1).all()

geotext_location_matcher = LambdaFunctionMatcher(func=fast_loc)
spacy_location_matcher = LocationMatcher(longest_match_only=True)

# Union matchers and create candidate extractor
location_matcher = Union(geotext_location_matcher)
cand_extractor   = CandidateExtractor(candidate_class, [location_ngrams], [location_matcher])
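
# Note: spacy_location_matcher is constructed above but not added to the Union.
# If both matchers were intended to be combined, the extractor could be built as
# follows (a sketch, assuming the same candidate_class and location_ngrams):
#
#   location_matcher = Union(geotext_location_matcher, spacy_location_matcher)
#   cand_extractor   = CandidateExtractor(candidate_class, [location_ngrams], [location_matcher])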


# Applying candidate extractor to each split (train, dev, test)

# In[ ]:


# Applying candidate extractor to each split
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=k, parallelism=parallelism)
    print("Number of candidates:", session.query(candidate_class).filter(candidate_class.split == k).count())


# Add gold labels.

# In[ ]:


from dataset_utils import get_gold_labels_from_meta


# Adding dev gold labels using dictionary
missed_dev = get_gold_labels_from_meta(session, candidate_class, extraction_type, 1, annotator='gold', gold_dict=None)

# Adding test gold labels using dictionary
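missed_test = get_gold_labels_from_meta(session, candidate_class, extraction_type, 2, annotator='gold', gold_dict=None)  # assumed parallel to the dev call above, with split 2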