Example #1
def cand_creation(df_subset):
    #Generating Candidates
    global T1
    T1 = candidate_subclass('T1', ['Features'])
    r = r'^-?\d*\.\d*'  # optionally signed decimal number
    ngrams = Ngrams(n_max=300)
    regex_matcher = RegexMatchEach(rgx=r)
    cand_extractor = CandidateExtractor(T1, [ngrams], [regex_matcher])
    return T1, cand_extractor
sents_split = defaultdict(list)
for ind, doc in enumerate(docs):
    bucket = int(ind / docs_per_bucket)
    for s in doc.sentences:
        sents_split[bucket] += [s]
print("Number of buckets (expected to be roughly 100):",
      len(sents_split))

from snorkel.models import candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import *
import datetime

Unigram = candidate_subclass('Unigram', ['unigram_cue'],
                             values=['PP', 'MN', 'NULL'])
ngrams = Ngrams(n_max=1)
ngram_matcher = NgramMatcher()
unigram_segment_extractor = CandidateExtractor(Unigram, [ngrams],
                                               [ngram_matcher])
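Neither the sentence buckets nor this extractor are applied in the excerpt above. A minimal sketch of how they might be combined, assuming one Snorkel split per bucket (that mapping is an assumption, not shown in the source):

# Sketch only: extract unigram candidates bucket by bucket, one split per bucket.
for bucket, bucket_sents in sorted(sents_split.items()):
    unigram_segment_extractor.apply(bucket_sents, split=bucket)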

# from snorkel.lf_helpers import *
from snorkel.annotations import LabelAnnotator

# from LF.util_common_default_categorical import purpose_LFs,mechanism_LFs,null_LFs
from LF.util_common_default_categorical_onset_1026 import *
# purpose_LFs,mechanism_LFs,null_LFs
print("total LF count",
      len(purpose_LFs + mechanism_LFs + null_LFs), "unique count",
      len(set(purpose_LFs + mechanism_LFs + null_LFs)), "purpose_LFs",
      len(purpose_LFs), "mechanism_LFs", len(mechanism_LFs))
print("\n\npurpose_LFs\n", [lf.__name__ for lf in purpose_LFs])
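The example imports LabelAnnotator but stops before using it. A minimal sketch of the usual next step, assuming the combined LF list above is applied to split 0 and that a SnorkelSession is open as `session`:

# Sketch only: apply the labeling functions to the training split and inspect them.
labeler = LabelAnnotator(lfs=purpose_LFs + mechanism_LFs + null_LFs)
L_train = labeler.apply(split=0)
print("Label matrix shape:", L_train.shape)
L_train.lf_stats(session)  # per-LF coverage/overlap/conflict statistics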
Example #3
doc_preprocessor = TSVDocPreprocessor(
    '/Users/fanglinchen/Desktop/PersonalDataStack/DeepScrub/DeepScrub/algorithms/input.tsv',
    max_docs=350)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

Sensitive = candidate_subclass('Sensitive', ['sensitive'],
                               values=[
                                   'person', 'job', 'event', 'place', 'date',
                                   'time', 'product', 'email', 'phone',
                                   'quantity', 'address', 'url', 'org', 'file',
                                   'password', False
                               ])
# generating candidates.
ngrams = Ngrams(n_max=6)
ngramMatcher = NgramMatcher(longest_match_only=False)

cand_extractor = CandidateExtractor(Sensitive, [ngrams], [ngramMatcher],
                                    symmetric_relations=False)
sents = session.query(Sentence).all()
cand_extractor.apply(sents, split=0)
train_cands = session.query(Sensitive).filter(Sensitive.split == 0).all()
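At this point the candidates are usually inspected before writing labeling functions; a minimal sketch, assuming a Jupyter notebook and the SentenceNgramViewer widget from snorkel.viewer:

from snorkel.viewer import SentenceNgramViewer

# Sketch only: browse a sample of the extracted candidates in a notebook widget.
print("Training candidates:", len(train_cands))
SentenceNgramViewer(train_cands[:300], session)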
finder = FinderAcora()


def find(array, word):
    return [i for i, each in enumerate(array) if each == word]


def LF_product(c):
    # Body truncated in the original example.
    pass
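As a rough illustration only, a categorical labeling function over Sensitive candidates could look like the following; the regex and the returned value are placeholders, not the original LF_product logic, and it assumes the old-Snorkel convention that categorical LFs return one of the declared values or 0 to abstain:

import re

# Illustrative only: label a candidate span as 'email' if it looks like an
# e-mail address, otherwise abstain (0).
EMAIL_RGX = re.compile(r'[^@\s]+@[^@\s]+\.[^@\s]+')

def LF_email_shape(c):
    span_text = c.sensitive.get_span()  # text covered by the candidate span
    return 'email' if EMAIL_RGX.search(span_text) else 0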
def extract_binary_candidates(predicate_resume, clear=False, parallelism=8,
                              split=None, documents_titles=None, limit=None,
                              page_size=10000):
    #create span and candidates
    logging.info("Starting candidates extraction ")
    subject_ne = predicate_resume['subject_ne']
    object_ne = predicate_resume['object_ne']

    session = SnorkelSession()
    CandidateSubclass = predicate_resume["candidate_subclass"]

    ngrams = Ngrams(n_max=7)
    subject_matcher = get_matcher(subject_ne)
    object_matcher = get_matcher(object_ne)
    cand_extractor = CandidateExtractor(CandidateSubclass,
                                        [ngrams, ngrams],
                                        [subject_matcher, object_matcher])

    #skip sentences already extracted
    logging.info("Count candidates")
    sents_query_id = session.query(Sentence.id)
    candidates_count = session.query(CandidateSubclass).count()
    #logging.info("Delete span orphans")
    #delete_orphan_spans()
    if documents_titles is None and candidates_count > 1 and not clear:
        sents_query_id = get_sentences_ids_not_extracted(predicate_resume, session)
    elif documents_titles is not None:
        # delete candidates for test and dev
        logging.info("Deleting candidates")
        update_candidates_by_page_titles(predicate_resume, documents_titles, split)
        sents_query_id = get_sentences_ids_by_title_not_extracted(
            predicate_resume, session, documents_titles)

    if limit is not None and documents_titles is None:
        sents_query_id = sents_query_id.limit(limit)

    sents_query = session.query(Sentence).filter(Sentence.id.in_(sents_query_id))


    logging.info("Counting sentences")
    sents_count = sents_query.count()
    logging.info("Sents count: " + str(sents_count))
    print("Sents count: " + str(sents_count))
    if sents_count > page_size:
        page=page_size
    else:
        page=sents_count
    i = 1
    while True:
        if split is None:
            set_name = "train"
            split2 = 0
        else:
            set_name = str(split)
            split2 = split

        logging.info('\tQuerying sentences from %s to %s, in set \'%s\'',
                     page * (i - 1), page * i, set_name)
        sents = sents_query.order_by(Sentence.id).slice(page * (i - 1), page * i).all()
        logging.info("Extracting")
        if sents is None or len(sents) < 1:
            break
        cand_extractor.apply(sents, split=split2, clear=clear,
                             progress_bar=False, parallelism=parallelism)
        logging.info('\t\tcandidates extracted for %s', CandidateSubclass.__name__)
        i += 1
        clear = False
    logging.info("Finished candidates extraction ")
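A hedged usage sketch for extract_binary_candidates; the candidate subclass and NE tags below are placeholders chosen only to show the shape of predicate_resume that the function reads (helpers such as get_matcher may expect different tag names in the real project):

# Hypothetical predicate description; only the keys read directly above are shown.
BornIn = candidate_subclass('BornIn', ['subject', 'object'])
predicate_resume = {
    'subject_ne': 'PERSON',      # placeholder NE tag handed to get_matcher
    'object_ne': 'LOCATION',     # placeholder NE tag
    'candidate_subclass': BornIn,
}
extract_binary_candidates(predicate_resume, clear=True, parallelism=4)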
Example #5
            raise Exception('ID <{0}> not found in any id set'.format(
                doc.name))

#----------------------
# Candidate Extraction
#----------------------

# Defining the Candidate Schemas
BiomarkerCondition = candidate_subclass('BiomarkerCondition',
                                        ['biomarker', 'condition'])
BiomarkerDrug = candidate_subclass('BiomarkerDrug', ['biomarker', 'drug'])
BiomarkerMedium = candidate_subclass('BiomarkerMedium',
                                     ['biomarker', 'medium'])

# N-grams: the probabilistic search space of our entities
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=7)
drug_ngrams = Ngrams(n_max=5)
medium_ngrams = Ngrams(n_max=5)
type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?

# Construct our Matchers
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getDiseaseMatcher()
dMatcher = matchers.getDrugMatcher()
mMatcher = matchers.getMediumMatcher()
tMatcher = matchers.getTypeMatcher()

# Building the CandidateExtractors
candidate_extractor_BC = CandidateExtractor(
    BiomarkerCondition, [biomarker_ngrams, condition_ngrams],
    [bMatcher, cMatcher])
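The excerpt is cut off before the extractor is applied. The usual next step, sketched under the assumption that train/dev/test sentence sets and a session exist with these names (they are not defined in this excerpt):

# Sketch only: extract BiomarkerCondition candidates for each split and count them.
for split, sents in enumerate([train_sents, dev_sents, test_sents]):
    candidate_extractor_BC.apply(sents, split=split)
    print("BiomarkerCondition candidates in split", split, ":",
          session.query(BiomarkerCondition).filter(BiomarkerCondition.split == split).count())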
Example #6
    @classmethod
    def setUpClass(cls):
        with open(DATA_PATH + 'CDR_TestSet_sents.pkl', 'rb') as f:
            cls.CDR_sents = cPickle.load(f)
        cls.sp = SentenceParser()
        cls.ngrams = Ngrams()
Example #7
def main(argv):
    parser = argparse.ArgumentParser(description='Process some arguments.')
    parser.add_argument('--dbPath',
                        type=str,
                        default=os.getcwd() + os.sep + 'snorkel.db',
                        help='the path of snorkel database')
    parser.add_argument(
        '--lfPath',
        type=str,
        default=os.getcwd() + os.sep + 'util_default.py',
        help='the path of util.py file where labelling functions were defined')

    args = parser.parse_args()

    # Connect to db, and get session

    util_module = imp.load_source("module.name", args.lfPath)
    # maps doc_id to a dict keyed by ["Background", "Purpose", "Mechanism", "Method", "Finding"]
    train_doc_breakdown_map = dict()
    test_doc_breakdown_map = dict()

    SnorkelSession = create_session_with_conn("sqlite:///" + args.dbPath)
    session = SnorkelSession()

    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())

    sents = session.query(Sentence).all()
    n_max_corpus = 0
    for sent in sents:
        n_max_corpus = max(n_max_corpus, len(sent.words))

    print("The longest sentence has " + str(n_max_corpus) + " tokens.")

    ngrams = Ngrams(n_max=n_max_corpus)

    # from util import number_of_people

    docs = session.query(Document).all()

    train_sents = set()
    dev_sents = set()
    test_sents = set()

    for i, doc in enumerate(docs):
        for s in doc.sentences:
            if i % 10 == 8 and "cscw18" != doc.name[:6]:
                dev_sents.add(s)
            elif "cscw18" == doc.name[:6]:
                # Use the cscw'18 annotation guideline documents (10 examples) as the
                # test set, replacing the earlier 10% test split.
                test_sents.add(s)
            elif "cscw18" != doc.name[:6]:
                train_sents.add(s)

    General, general_extractor = util_module.get_segment_class_and_matcher(
        "General", ngrams)
    general_cands = extract_and_display(
        train_sents,
        dev_sents,
        test_sents,
        session,
        general_extractor,
        General,
        "General",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)

    input("Finished general ")

    # load segment_candidate_class and corresponding_matcher, e.g. (Background, non_comma_dict_background_matcher)
    Background, background_matcher = util_module.get_segment_class_and_matcher(
        "Background", ngrams)
    background_cands = extract_and_display(
        train_sents,
        dev_sents,
        test_sents,
        session,
        background_matcher,
        Background,
        "Background",
        train_doc_breakdown_map=train_doc_breakdown_map,
        test_doc_breakdown_map=test_doc_breakdown_map)

    debug_sess_eval(session, Background, background_matcher)
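The excerpt defines main(argv) but omits the entry point; the conventional guard would be:

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])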
#----------------------
# Candidate Extraction
#----------------------

# Defining the Candidate Schemas
# BiomarkerCondition = candidate_subclass('BiomarkerCondition', ['biomarker', 'condition'])

# BiomarkerDrug = candidate_subclass('BiomarkerDrug', ['biomarker', 'drug'])
# BiomarkerMedium = candidate_subclass('BiomarkerMedium', ['biomarker', 'medium'])
# BiomarkerType = candidate_subclass('BiomarkerType', ['biomarker', 'typ3'])
# # BiomarkerLevelUnit = candidate_subclass('BiomarkerLevelUnit', ['biomarker', 'level', 'unit'])
#can eventually add MEASUREMENT and COHORT SIZE among other entities

# N-grams: the probabilistic search space of our entities
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=7)
# drug_ngrams = Ngrams(n_max=5)
# medium_ngrams = Ngrams(n_max=5)
# type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
# # level_ngrams = Ngrams(n_max=1)
# unit_ngrams = Ngrams(n_max=1)

# Construct our Matchers
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getConditionMatcher()
# dMatcher = matchers.getDrugMatcher()
# mMatcher = matchers.getMediumMatcher()
# tMatcher = matchers.getTypeMatcher()
# lMatcher = matchers.getLevelMatcher()
# uMatcher = matchers.getUnitMatcher()
for document in corpus:
    for sentence in document.sentences:
        if number_of_people(sentence) < 5:
            sentences.add(sentence)

from snorkel.models import candidate_subclass

Title = candidate_subclass('Person_Org', ['person1', 'organization'])

from snorkel.candidates import Ngrams

ngrams = Ngrams(n_max=3)

from snorkel.matchers import PersonMatcher

from snorkel.matchers import OrganizationMatcher

person_matcher = PersonMatcher(longest_match_only=True)

org_matcher = OrganizationMatcher(longest_match_only=True)

from snorkel.candidates import CandidateExtractor

ce = CandidateExtractor(Title, [ngrams, ngrams], [person_matcher, org_matcher],
                        symmetric_relations=False, nested_relations=False, self_relations=False)

%time c = ce.extract(sentences, 'Emails Training Candidates', session)
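In the older Snorkel API used in this example, extract returns a candidate set that still has to be persisted; a minimal sketch of the usual follow-up, assuming this notebook's session:

# Sketch only: persist the extracted candidate set and report its size.
session.add(c)
session.commit()
print("Number of candidates:", len(c))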
def run(candidate1, candidate2, pairing_name, cand1_ngrams, cand2_ngrams,
        cand1Matcher, cand2Matcher, model_name, output_file_name,
        corpus_parser):
    print("Started")
    session = SnorkelSession()

    # The following line is for testing only. Feel free to ignore it.

    candidate_pair = candidate_subclass(pairing_name, [candidate1, candidate2])

    sentences = set()
    docs = session.query(Document).order_by(Document.name).all()
    for doc in docs:
        for s in doc.sentences:
            sentences.add(s)

    cand_1_ngrams = Ngrams(n_max=cand1_ngrams)
    # condition_ngrams = Ngrams(n_max=7)
    cand_2_ngrams = Ngrams(n_max=cand2_ngrams)
    # medium_ngrams = Ngrams(n_max=5)
    # type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
    # # level_ngrams = Ngrams(n_max=1)
    # unit_ngrams = Ngrams(n_max=1)

    # Construct our Matchers

    # cMatcher = matchers.getConditionMatcher()
    # mMatcher = matchers.getMediumMatcher()
    # tMatcher = matchers.getTypeMatcher()
    # lMatcher = matchers.getLevelMatcher()
    # uMatcher = matchers.getUnitMatcher()

    # Building the CandidateExtractors
    # candidate_extractor_BC = CandidateExtractor(BiomarkerCondition, [biomarker_ngrams, condition_ngrams], [bMatcher, cMatcher])
    candidate_extractor = CandidateExtractor(candidate_pair,
                                             [cand_1_ngrams, cand_2_ngrams],
                                             [cand1Matcher, cand2Matcher])
    # candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])
    # candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher])
    # candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher])

    # List of Candidate Sets for each relation type: [train, dev, test]
    candidate_extractor.apply(sentences, split=4, clear=True)
    cands = session.query(candidate_pair).filter(
        candidate_pair.split == 4).order_by(candidate_pair.id).all()
    session.commit()
    # cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
    # cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
    # cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
    # cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)

    if len(cands) == 0:
        print("No Candidates Found")
        return
    if pairing_name == 'BiomarkerCondition':
        # session.rollback()
        # print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1])
        add_adj_candidate_BC(session, candidate_pair, cands, 4)
        # fix_specificity(session, BiomarkerCondition, cands_BC[1])
        # print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 4).count()
        session.commit()

    lstm = reRNN(seed=1701, n_threads=None)

    lstm.load(model_name)

    predictions = lstm.predictions(cands)
    output_file = open(output_file_name, 'wb')
    import csv
    csvWriter = csv.writer(output_file)
    csvWriter.writerow(
        ['doc_id', 'sentence', candidate1, candidate2, 'prediction'])
    for i in range(len(cands)):
        doc_string = 'PMC' + str(cands[i].get_parent().get_parent())[9:]
        sentence_string = cands[i].get_parent().text
        cand_1_string = cands[i].get_contexts()[0].get_span()
        cand_2_string = cands[i].get_contexts()[1].get_span()
        prediction = predictions[i]
        csvWriter.writerow([
            unidecode(doc_string),
            unidecode(sentence_string),
            unidecode(cand_1_string),
            unidecode(cand_2_string), prediction
        ])
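A purely hypothetical invocation of run, to illustrate the argument shapes; every concrete value below (matchers, model name, file name, parser) is a placeholder:

run(candidate1='Biomarker',
    candidate2='Condition',
    pairing_name='BiomarkerCondition',
    cand1_ngrams=1,
    cand2_ngrams=7,
    cand1Matcher=matchers.getBiomarkerMatcher(),
    cand2Matcher=matchers.getConditionMatcher(),
    model_name='biomarker_condition_lstm',   # placeholder model checkpoint name
    output_file_name='predictions.csv',      # placeholder output path
    corpus_parser=corpus_parser)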
Example #11
from snorkel import SnorkelSession
from snorkel.matchers import DictionaryMatch
from final_candidates import GM, PM
from snorkel.candidates import Ngrams, CandidateSpace, CandidateExtractor
from snorkel.models import Document, Sentence, candidate_subclass
from snorkel.viewer import SentenceNgramViewer

SPLIT_ON_DOCS = False
ALL_DOCS = True  # If True, create train, dev, and test splits; if False, push everything to dev candidates.

session = SnorkelSession()

GenePhenoPair = candidate_subclass('GenePhenoPair2', ['gene', 'pheno'])

gene_ngrams = Ngrams(n_max=5)
pheno_ngrams = Ngrams(n_max=10)
cand_extractor = CandidateExtractor(GenePhenoPair, [gene_ngrams, pheno_ngrams],
                                    [GM, PM],
                                    symmetric_relations=True)

print("Splitting Docs...")
pathname = 'small_data/' if os.environ['AGP_DATA_SIZE'] == 'small-data' else 'data/'
with open(pathname + 'pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)
train_ids = set(sent_dicts['train'])
dev_ids = set(sent_dicts['dev'])
test_ids = set(sent_dicts['test'])
all_ids = train_ids.union(dev_ids).union(test_ids)
# 40, 10, 10
train_sents, dev_sents, test_sents, all_sents = set(), set(), set(), set()
# In[12]:


from snorkel.candidates import Ngrams
from snorkel.candidates import CandidateExtractor
from dataset_utils import create_candidate_class, LocationMatcher, fast_loc
from snorkel.matchers import Union, LambdaFunctionMatcher

# Setting extraction type -- should be a subfield in your data source extractions field!
extraction_type = 'location'

# Creating candidate class
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Defining ngrams for candidates
location_ngrams = Ngrams(n_max=3)

# Define matchers
geotext_location_matcher = LambdaFunctionMatcher(func=fast_loc)
spacy_location_matcher = LocationMatcher(longest_match_only=True)

# Union matchers and create candidate extractor
location_matcher = Union(geotext_location_matcher)
cand_extractor = CandidateExtractor(candidate_class, [location_ngrams], [location_matcher])


# Applying candidate extractor to each split (train, dev, test)

# In[ ]:
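The following cell is empty in the excerpt; the usual pattern for applying the extractor to each split, assuming train_sents, dev_sents, and test_sents have been built as in the earlier examples:

# Sketch only: extract location candidates for each split and count them.
for split, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=split, parallelism=1)
    print("Candidates in split {}: {}".format(
        split,
        session.query(candidate_class).filter(candidate_class.split == split).count()))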