Example #1
def cand_creation(df_subset):
    #Generating Candidates
    global T1
    T1 = candidate_subclass('T1', ['Features'])
    r = r'^-?\d*\.\d*'  # raw string; escape the dot so it matches a decimal point
    ngrams = Ngrams(n_max=300)
    regex_matcher = RegexMatchEach(rgx=r)
    cand_extractor = CandidateExtractor(T1, [ngrams], [regex_matcher])
    return T1, cand_extractor
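A minimal usage sketch for the function above (assumed context: a SnorkelSession named session and parsed Sentence objects in sents, as in the later examples; df_subset is whatever the caller passes in):

T1, cand_extractor = cand_creation(df_subset)
cand_extractor.apply(sents, split=0)
print("Number of candidates:", session.query(T1).filter(T1.split == 0).count())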
Example #2
def def_cand_extractor():
    """
    Defines a candidate extractor
    Make necessary changes to cand subclass, span, matcher and cand extractor
    :return: candExtractor, cSubClass
    """
    Text = candidate_subclass('Text', ['text'], values=['Positive', 'Negative', False])
    sent_span = SentCandidate()
    defaultMatcher = Matcher()
    cand_extractor = CandidateExtractor(Text, [sent_span], [defaultMatcher])
    return cand_extractor, Text
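The categorical extractor returned here would be applied the same way; a minimal sketch assuming parsed sentences are available as sents:

cand_extractor, Text = def_cand_extractor()
cand_extractor.apply(sents, split=0)
print("Number of candidates:", session.query(Text).filter(Text.split == 0).count())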
Example #3
def get_segment_class_and_matcher(name,ngrams=None,non_comma_matcher=DictionaryMatch(d=[','],longest_match_only=True,reverse=True)):

	transition_prev_work=DictionaryMatch(d=['previous','earlier','past'],longest_match_only=True)

	if name.lower()=="background":
		Background = candidate_subclass('Background', ['background_cue'])
		transition_word=DictionaryMatch(d=['while','unlike','despite'],longest_match_only=True) 
		dict_background_matcher=DictionaryMatch(d=['previous work','traditionally','researchers'],longest_match_only=True) 
		excluded_dict_background_matcher=DictionaryMatch(d=['we','unlike','our'],longest_match_only=True,reverse=True) 
		non_comma_dict_background_matcher=CandidateExtractor(Background, [ngrams], [Intersection(non_comma_matcher,Union(dict_background_matcher,Intersection(transition_word,transition_prev_work)),excluded_dict_background_matcher)])
		return Background,non_comma_dict_background_matcher

	elif name.lower()=="purpose":
		Purpose=candidate_subclass('Purpose',['purpose_cue'])

		transition_regex_matcher=RegexMatchSpan(rgx="((^|\s)however.*$)|((^|\s)but(?!(also))*$)",longest_match_only=True)  # Correction: purpose 
		excluded_dict_purpose_matcher=SentenceMatch(d=['but also','but without','but sometimes'],longest_match_only=True,reverse=True)  # the parent sentence shall not include "but also"
		transition_matcher=Intersection(transition_regex_matcher,excluded_dict_purpose_matcher)
		comparative_degree_matcher=Intersection(RegexMatchSpan(rgx="(.*more.*than.*$)|(.*er than.*$)",longest_match_only=True),transition_prev_work)  # Correction: purpose 
		other_regex_matcher=RegexMatchSpan(rgx="(.*extend.*$)|(.*offer.*$)",longest_match_only=True)
		dict_purpose_matcher=DictionaryMatch(d=['in this paper','in the paper',' that can ','in this study','to examine','we examine','to investigate','implications'],longest_match_only=True) 
		non_comma_dict_purpose_matcher=CandidateExtractor(Purpose, [ngrams], [Intersection(non_comma_matcher,Union(comparative_degree_matcher,other_regex_matcher,dict_purpose_matcher,transition_matcher))]) #,intersection(excluded_dict_purpose_matcher,transition_regex_matcher)])
		return Purpose, non_comma_dict_purpose_matcher

	elif name.lower()=="mechanism":
		Mechanism = candidate_subclass('Mechanism', ['mechanism_cue']) 
		dict_mechanism_matcher=DictionaryMatch(d=['introduce','introduces','propose','proposes','we propose','we develop','approach'],longest_match_only=True) 
		non_comma_dict_mechanism_matcher=CandidateExtractor(Mechanism, [ngrams], [Intersection(non_comma_matcher,dict_mechanism_matcher)])
		return Mechanism, non_comma_dict_mechanism_matcher

	elif name.lower()=="method":
		Method = candidate_subclass('Method', ['method_cue'])
		dict_method_matcher=DictionaryMatch(d=['dataset','benchmark','experiment ','experiments',"empirical","participant","survey"," conduct"," analyze"],longest_match_only=True) 
		non_comma_dict_method_matcher=CandidateExtractor(Method, [ngrams], [Intersection(non_comma_matcher,dict_method_matcher)])
		return Method, non_comma_dict_method_matcher

	elif name.lower()=="finding":
		Finding = candidate_subclass('Finding', ['finding_cue'])
		dict_finding_matcher=DictionaryMatch(d=['show that','shows that','found','indicate','results','performance','find'],longest_match_only=True) 
		non_comma_dict_finding_matcher=CandidateExtractor(Finding, [ngrams], [Intersection(non_comma_matcher,dict_finding_matcher)])
		return Finding, non_comma_dict_finding_matcher

	elif name.lower()=="general":
		General=candidate_subclass('General',['general_cue'])
		general_extractor=CandidateExtractor(General,[ngrams],[non_comma_matcher])
		return General,general_extractor
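A hedged sketch of driving the function above for every segment type; the Ngrams span size is an assumption, since the span space is passed in as a parameter:

ngrams = Ngrams(n_max=10)  # assumed span size
for name in ['background', 'purpose', 'mechanism', 'method', 'finding', 'general']:
    SegmentClass, extractor = get_segment_class_and_matcher(name, ngrams=ngrams)
    extractor.apply(sents, split=0)  # sents: parsed Sentence objects, as in the other examples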
		


		
Example #4
    bucket = int(ind / docs_per_bucket)
    for s in doc.sentences:
        sents_split[bucket] += [s]
print("Number of buckets: (should have around ~100 buckets??)",
      len(sents_split))

from snorkel.models import candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import *
import datetime

Unigram = candidate_subclass('Unigram', ['unigram_cue'],
                             values=['PP', 'MN', 'NULL'])
ngrams = Ngrams(n_max=1)
ngram_matcher = NgramMatcher()
unigram_segment_extractor = CandidateExtractor(Unigram, [ngrams],
                                               [ngram_matcher])
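
# Hedged sketch (assumption): apply the unigram extractor bucket by bucket, one split per
# bucket, assuming sents_split built above is a list of sentence buckets.
for k, bucket_sents in enumerate(sents_split):
    unigram_segment_extractor.apply(bucket_sents, split=k)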

# from snorkel.lf_helpers import *
from snorkel.annotations import LabelAnnotator

# from LF.util_common_default_categorical import purpose_LFs,mechanism_LFs,null_LFs
from LF.util_common_default_categorical_onset_1026 import *
# purpose_LFs,mechanism_LFs,null_LFs
print("total LF count",
      len(purpose_LFs + mechanism_LFs + null_LFs), "unique count",
      len(set(purpose_LFs + mechanism_LFs + null_LFs)), "purpose_LFs",
      len(purpose_LFs), "mechanism_LFs", len(mechanism_LFs))
print("\n\npurpose_LFs\n", [lf.__name__ for lf in purpose_LFs])
print("\n\nmechanism_LFs\n", [lf.__name__ for lf in mechanism_LFs])
print("\n\nnull_LFs\n", [lf.__name__ for lf in null_LFs])
Example #5
session = SnorkelSession()

n_docs = 500

doc_preprocessor = TSVDocPreprocessor('pdfs_big.tsv',
                                      max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
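# Assumption: virus_list and animals_list are defined earlier in the project, e.g. as
# dictionaries of virus names and host-animal names loaded from external files.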
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams],
                                    [virus_matcher, animals_matcher],
                                    nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords


# List to parenthetical
def ltp(x):
    return '(' + '|'.join(x) + ')'
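
# ltp turns a keyword list into a regex alternation, e.g. ltp(['cat', 'dog']) -> '(cat|dog)'.
# Hedged sketch of embedding it in a pattern-based labeling function (keywords illustrative only):
host_rgx = r'hosts?\s+' + ltp(['virus', 'viruses'])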


# --------------------------------

# Positive LFs:
##### snorkeling


session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(path)

corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)


pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])


docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

sents = set()
for doc in docs:
    for s in doc.sentences:
        sents.add(s)


cand_extractor.apply(sents)

print("Number of candidates:", session.query(pairs).count())
Example #7
    max_docs=350)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

Sensitive = candidate_subclass('Sensitive', ['sensitive'],
                               values=[
                                   'person', 'job', 'event', 'place', 'date',
                                   'time', 'product', 'email', 'phone',
                                   'quantity', 'address', 'url', 'org', 'file',
                                   'password', False
                               ])
# generating candidates.
ngrams = Ngrams(n_max=6)
ngramMatcher = NgramMatcher(longest_match_only=False)

cand_extractor = CandidateExtractor(Sensitive, [ngrams], [ngramMatcher],
                                    symmetric_relations=False)
sents = session.query(Sentence).all()
cand_extractor.apply(sents, split=0)
train_cands = session.query(Sensitive).filter(Sensitive.split == 0).all()
finder = FinderAcora()


def find(array, word):
    return [i for i, each in enumerate(array) if each == word]


def LF_product(c):
    if len(c.sensitive.get_attrib_tokens("words")) == len(
            find(c.sensitive.get_attrib_tokens("ner_tags"), "PRODUCT")):
        print "PRODUCT:" + c.sensitive.get_span()
        return "product"
Example #8
def extract_binary_candidates(predicate_resume, clear=False, parallelism=8,
                              split=None, documents_titles=None, limit=None,
                              page_size=10000):
    #create span and candidates
    logging.info("Starting candidates extraction ")
    subject_ne=predicate_resume['subject_ne']
    object_ne=predicate_resume['object_ne']

    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]


    ngrams= Ngrams(n_max=7)
    subject_matcher = get_matcher(subject_ne)
    object_matcher = get_matcher(object_ne)
    cand_extractor = CandidateExtractor(CandidateSubclass,
                                        [ngrams, ngrams],
                                        [subject_matcher,object_matcher])

    #skip sentences already extracted
    logging.info("Count candidates")
    sents_query_id = session.query(Sentence.id)
    candidates_count = session.query(CandidateSubclass).count()
    #logging.info("Delete span orphans")
    #delete_orphan_spans()
    if documents_titles is None and candidates_count > 1 and not clear:
        sents_query_id = get_sentences_ids_not_extracted(predicate_resume, session)
    elif documents_titles is not None:
        #delete candidates for test and dev
        logging.info("Deleting candidates")
        update_candidates_by_page_titles(predicate_resume,documents_titles, split)
        sents_query_id=get_sentences_ids_by_title_not_extracted(predicate_resume,session,documents_titles)

    if limit is not None and documents_titles is None:
        sents_query_id=sents_query_id.limit(limit)


    sents_query=session.query(Sentence).filter(Sentence.id.in_(sents_query_id))


    logging.info("Counting sentences")
    sents_count = sents_query.count()
    logging.info("Sentence count: " + str(sents_count))
    print("Sentence count: " + str(sents_count))
    if sents_count > page_size:
        page = page_size
    else:
        page = sents_count
    i = 1
    while True:
        set_name = ""
        if split is None:
            set_name = "train"
            split2 = 0
        else:
            set_name = str(split)
            split2 = split

        logging.info('\tQuerying sentences from %s to %s, in set \'%s\'', (page*(i-1)), page*i, set_name)
        sents=sents_query.order_by(Sentence.id).slice((page*(i-1)), page*i).all()
        logging.info("Extracting")
        if sents is None or len(sents) < 1:
            break
        cand_extractor.apply(sents, split=split2, clear=clear, progress_bar=False, parallelism=parallelism)
        logging.info('\t\tcandidates extracted for %s', CandidateSubclass.__name__)
        i += 1
        clear = False
    logging.info("Finished candidates extraction ")
Example #9
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=7)
drug_ngrams = Ngrams(n_max=5)
medium_ngrams = Ngrams(n_max=5)
type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?

# Construct our Matchers
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getDiseaseMatcher()
dMatcher = matchers.getDrugMatcher()
mMatcher = matchers.getMediumMatcher()
tMatcher = matchers.getTypeMatcher()

# Building the CandidateExtractors
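# Assumption: BiomarkerCondition, BiomarkerDrug, BiomarkerMedium and BiomarkerType are
# candidate_subclass relations defined earlier in the project (not shown in this snippet).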
candidate_extractor_BC = CandidateExtractor(
    BiomarkerCondition, [biomarker_ngrams, condition_ngrams],
    [bMatcher, cMatcher])
candidate_extractor_BD = CandidateExtractor(BiomarkerDrug,
                                            [biomarker_ngrams, drug_ngrams],
                                            [bMatcher, dMatcher])
candidate_extractor_BM = CandidateExtractor(BiomarkerMedium,
                                            [biomarker_ngrams, medium_ngrams],
                                            [bMatcher, mMatcher])
candidate_extractor_BT = CandidateExtractor(BiomarkerType,
                                            [biomarker_ngrams, type_ngrams],
                                            [bMatcher, tMatcher])

# List of Candidate Sets for each relation type: [train, dev, test]
cands_BC = grabCandidates(candidate_extractor_BC, BiomarkerCondition)
cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
# type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
# # level_ngrams = Ngrams(n_max=1)
# unit_ngrams = Ngrams(n_max=1)

# Construct our Matchers
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getConditionMatcher()
# dMatcher = matchers.getDrugMatcher()
# mMatcher = matchers.getMediumMatcher()
# tMatcher = matchers.getTypeMatcher()
# lMatcher = matchers.getLevelMatcher()
# uMatcher = matchers.getUnitMatcher()

# Building the CandidateExtractors
candidate_extractor_BC = CandidateExtractor(
    BiomarkerCondition, [biomarker_ngrams, condition_ngrams],
    [bMatcher, cMatcher])
# candidate_extractor_BD = CandidateExtractor(BiomarkerDrug, [biomarker_ngrams, drug_ngrams], [bMatcher, dMatcher])
# candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])
# candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher])
# candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher])

# List of Candidate Sets for each relation type: [train, dev, test]
cands_BC = grabCandidates(candidate_extractor_BC, BiomarkerCondition)
# cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
# cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
# cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
# cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)

# In[ ]:
Example #11
from snorkel.candidates import Ngrams
from snorkel.models import candidate_subclass
entity = candidate_subclass('entity', ['entity1', 'entity2'])  # uncommented: needed by the CandidateExtractor below
import pandas as pd
ROOT = 'data/dicts/'
proteins   = set(pd.read_csv(ROOT + 'protein_names.csv', header=None, index_col=0, encoding='utf-8').dropna()[1])
ngrams = Ngrams(n_max=1)
from snorkel.matchers import DictionaryMatch

longest_match_only = True
dict_proteins = DictionaryMatch(d=proteins, ignore_case=True, 
                                longest_match_only=longest_match_only)
#misc_matcher = MiscMatcher(longest_match_only=True)
from snorkel.candidates import CandidateExtractor
ce = CandidateExtractor(entity, [ngrams, ngrams], [dict_proteins, dict_proteins],
                        symmetric_relations=False, nested_relations=False, self_relations=False)

%time c = ce.extract(sentences, 'Protein1 Training Candidates', session)



for corpus_name in ['Protein Development']:
    corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    sentences = set()
    for document in corpus:
        for sentence in document.sentences:
            sentences.add(sentence)
    
    %time c = ce.extract(sentences, 'Protein1 Development Candidates', session)
    session.add(c)
session.commit()
from snorkel.candidates import Ngrams

ngrams = Ngrams(n_max=3)

from snorkel.matchers import PersonMatcher

from snorkel.matchers import OrganizationMatcher

person_matcher = PersonMatcher(longest_match_only=True)

org_matcher = OrganizationMatcher(longest_match_only=True)

from snorkel.candidates import CandidateExtractor

ce = CandidateExtractor(Title, [ngrams, ngrams], [person_matcher, org_matcher],
                        symmetric_relations=False, nested_relations=False, self_relations=False)
						
%time c = ce.extract(sentences, 'Emails Training Candidates', session)
print "Number of candidates:", len(c)

session.add(c)
session.commit()

for corpus_name in ['Emails Development', 'Emails Test']:
    #corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    sentences = set()
    for document in corpus:
        for sentence in document.sentences:
            if number_of_people(sentence) < 5:
                sentences.add(sentence)
    
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name,
        corpus_parser):
    print "Started"
    session = SnorkelSession()

    # The following line is for testing only. Feel free to ignore it.

    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    sentences = set()
    docs = session.query(Document).order_by(Document.name).all()
    for doc in docs:
        for s in doc.sentences:
            sentences.add(s)

    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    # condition_ngrams = Ngrams(n_max=7)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)
    # medium_ngrams = Ngrams(n_max=5)
    # type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
    # # level_ngrams = Ngrams(n_max=1)
    # unit_ngrams = Ngrams(n_max=1)

    # Construct our Matchers

    # cMatcher = matchers.getConditionMatcher()
    # mMatcher = matchers.getMediumMatcher()
    # tMatcher = matchers.getTypeMatcher()
    # lMatcher = matchers.getLevelMatcher()
    # uMatcher = matchers.getUnitMatcher()

    # Building the CandidateExtractors
    # candidate_extractor_BC = CandidateExtractor(BiomarkerCondition, [biomarker_ngrams, condition_ngrams], [bMatcher, cMatcher])
    candidate_extractor = CandidateExtractor(candidate_pair,
                                             [cand_1_ngrams, cand_2_ngrams],
                                             [cand1Matcher, cand2Matcher])
    # candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])
    # candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher])
    # candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher])

    # List of Candidate Sets for each relation type: [train, dev, test]
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()
    # cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
    # cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
    # cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
    # cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)

    if len(cands) == 0:
        print("No Candidates Found")
        return
    if (pairing_name == 'BiomarkerCondition'):
        # session.rollback()
        # print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1])
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        # fix_specificity(session, BiomarkerCondition, cands_BC[1])
        # print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 4).count()
        session.commit()

    lstm = reRNN(seed=1701, n_threads=None)

    lstm.load(model_name)

    predictions = lstm.predictions(cands)
    output_file = open(output_file_name, 'w', newline='')  # text mode for csv.writer under Python 3
    import csv
    csvWriter = csv.writer(output_file)
    csvWriter.writerow(
        ['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
    for i in range(len(cands)):
        doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:]
        sentence_string = cands[i].get_parent().text
        cand_1_string = cands[i].get_contexts()[0].get_span()
        cand_2_string = cands[i].get_contexts()[1].get_span()
        prediction = predictions[i]
        csvWriter.writerow([
            unidecode(doc_string),
            unidecode(sentence_string),
            unidecode(cand_1_string),
            unidecode(cand_2_string), prediction
        ])
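A hedged example of how run might be invoked; the matcher factories and n-gram sizes echo Example #9, while the model and file names are hypothetical:

run(candidate1='biomarker', candidate2='condition',
    pairing_name='BiomarkerCondition',
    cand1_ngrams=1, cand2_ngrams=7,
    cand1Matcher=matchers.getBiomarkerMatcher(),
    cand2Matcher=matchers.getDiseaseMatcher(),
    model_name='biomarker_condition_lstm',  # hypothetical saved reRNN checkpoint
    output_file_name='predictions.csv',     # hypothetical output path
    corpus_parser=corpus_parser)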
Example #14
from final_candidates import GM, PM
from snorkel.candidates import Ngrams, CandidateSpace, CandidateExtractor
from snorkel.models import Document, Sentence, candidate_subclass
from snorkel.viewer import SentenceNgramViewer

SPLIT_ON_DOCS = False
ALL_DOCS = True  # if true, create train dev and test. if false, push everything to dev cands.

session = SnorkelSession()

GenePhenoPair = candidate_subclass('GenePhenoPair2', ['gene', 'pheno'])

gene_ngrams = Ngrams(n_max=5)
pheno_ngrams = Ngrams(n_max=10)
cand_extractor = CandidateExtractor(GenePhenoPair, [gene_ngrams, pheno_ngrams],
                                    [GM, PM],
                                    symmetric_relations=True)

print "Splitting Docs..."
pathname = 'small_data/' if os.environ[
    'AGP_DATA_SIZE'] == 'small-data' else 'data/'
with open(pathname + 'pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)
train_ids, dev_ids, test_ids = set(sent_dicts['train']), set(
    sent_dicts['dev']), set(sent_dicts['test'])
all_ids = train_ids.union(dev_ids).union(test_ids)
# 40, 10, 10
train_sents, dev_sents, test_sents, all_sents = set(), set(), set(), set()
train_docs, dev_docs, test_docs = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()
doc_sents = dict()
# In[7]:

from snorkel.models import candidate_subclass
LocationPer = candidate_subclass('LocationPer', ['location', 'person'])
# Location = candidate_subclass('Location', ['location'])

# In[8]:

from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import PersonMatcher, LocationMatcher

ngrams = Ngrams(n_max=3)
person_matcher = PersonMatcher(longest_match_only=True)
location_matcher = LocationMatcher(longest_match_only=True)
cand_extractor = CandidateExtractor(LocationPer, [ngrams, ngrams],
                                    [person_matcher, location_matcher],
                                    symmetric_relations=False)

# cand_extractor2 = CandidateExtractor(Location,
#                                     [ngrams], [location_matcher],
#                                     symmetric_relations=False)

# In[9]:


def number_of_people(sentence):
    active_sequence = False
    count = 0
    for tag in sentence.ner_tags:
        if tag == 'LOCATION' and not active_sequence:
            active_sequence = True
# Setting extraction type -- should be a subfield in your data source extractions field!
extraction_type = 'location'

# Creating candidate class
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Defining ngrams for candidates
location_ngrams = Ngrams(n_max=3)

# Define matchers
geotext_location_matcher = LambdaFunctionMatcher(func=fast_loc)
spacy_location_matcher = LocationMatcher(longest_match_only=True)

# Union matchers and create candidate extractor
location_matcher = Union(geotext_location_matcher)
cand_extractor   = CandidateExtractor(candidate_class, [location_ngrams], [location_matcher])


# Applying candidate extractor to each split (train, dev, test)

# In[ ]:


# Applying candidate extractor to each split
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=k, parallelism=parallelism)
    print("Number of candidates:", session.query(candidate_class).filter(candidate_class.split == k).count())


# Add gold labels.
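A hedged sketch of the gold-label step that would typically follow, using Snorkel's standard helper (the annotator name and split index are assumptions):

from snorkel.annotations import load_gold_labels
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)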