Esempio n. 1
0
def reload_external_labels(session: SnorkelSession,
                           input_file: Union[str, Path],
                           annotator_name: str = "gold"):
    """Import externally produced labels and reload them for splits 1 and 2.

    Reads a JSON collection of label records from ``input_file``. For each
    record, a ``StableLabel`` keyed by ``"<person>~~<organization>"`` and
    ``annotator_name`` is added to the session unless a row with the same key
    and annotator is already stored, so running the function again does not
    insert duplicates. After the commit, ``reload_annotator_labels`` is
    called for split 1 and then split 2.

    Args:
        session: Database session used for the lookups, inserts and commit.
        input_file: Path of the JSON file; every record must provide the keys
            ``person``, ``organization`` and ``value``.
        annotator_name: Annotator name under which the labels are stored.
    """
    candidate_cls = get_candidate_class()
    with open(str(input_file), "r") as handle:
        records = ujson.load(handle)

    for record in records:
        stable_key = "~~".join((record['person'], record['organization']))
        already_stored = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_key)
            .filter(StableLabel.annotator_name == annotator_name)
            .count()
        )
        # Skip labels that an earlier execution already inserted.
        if already_stored:
            continue
        session.add(
            StableLabel(context_stable_ids=stable_key,
                        annotator_name=annotator_name,
                        value=record['value']))

    # Persist the new labels before they are reloaded below.
    session.commit()

    # Same reload call for both splits, in the order 1 then 2.
    for split in (1, 2):
        reload_annotator_labels(session,
                                candidate_cls,
                                annotator_name,
                                split=split,
                                filter_label_split=False)
Esempio n. 2
0
# Build the candidate extractor for the `entity` class. `ngrams` and
# `dict_proteins` are each passed twice, once per argument slot
# (presumably the candidate space and matcher for each of the two entities —
# confirm against CandidateExtractor). The symmetric, nested and self relation
# options are all switched off.
ce = CandidateExtractor(entity, [ngrams, ngrams], [dict_proteins, dict_proteins],
                        symmetric_relations=False, nested_relations=False, self_relations=False)

# Extract candidates from `sentences` under the name 'Protein1 Training Candidates'.
# `%time` is an IPython magic that reports how long the statement took.
# NOTE(review): unlike the development loop further down, the result `c` is not
# passed to session.add() here — confirm that extract() persists it on its own.
%time c = ce.extract(sentences, 'Protein1 Training Candidates', session)



# Repeat the extraction for each listed corpus (currently only 'Protein Development').
for corpus_name in ['Protein Development']:
    # Look the corpus up by name; .one() is expected to fail unless exactly one
    # row matches (SQLAlchemy-style query — confirm for SnorkelSession).
    corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    # Gather every sentence of every document in the corpus into one set.
    sentences = set()
    for document in corpus:
        for sentence in document.sentences:
            sentences.add(sentence)
    
    # NOTE(review): the candidate-set name is hard-coded rather than derived from
    # corpus_name, so a second entry in the list above would reuse the same name.
    %time c = ce.extract(sentences, 'Protein1 Development Candidates', session)
    session.add(c)
# One commit after the loop covers everything added inside it.
session.commit()


# IPython magics: load the autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

# Open a fresh Snorkel database session for the statements below.
from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.models import CandidateSet
from snorkel.models import candidate_subclass
# The candidate class definition is left disabled here.
#entity = candidate_subclass('entity', ['entity1', 'entity2'])

# Load the stored candidate set named 'Protein1 Training Candidates'; .one() is
# expected to fail unless exactly one such set exists (SQLAlchemy-style query).
train = session.query(CandidateSet).filter(CandidateSet.name == 'Protein1 Training Candidates').one()

Esempio n. 3
0
# Build the training subset: walk the buckets count_dict[1] .. count_dict[n],
# shuffle each bucket in place and append its leading slice to train_cand_list.
# NOTE(review): no random.seed() call is visible before this point, so the
# shuffle (and therefore the split) may differ between runs — confirm.
for i in range(1, n + 1):
    cand_list = count_dict[i]
    random.shuffle(cand_list)
    # take 40 percent of the shuffled bucket (factor 0.4, rounded down by int())
    train_cand_list += cand_list[0:int(len(cand_list) * 0.4)]

# Summary of the data sizes involved.
print(" -number of pairs:", len(cand_dict))
print(" -number of signals:", n)
print(" -number of pair to train GEN model", len(train_cand_list))

# Store candidates as Tweet rows, tagged with split 0 when the candidate was
# sampled into train_cand_list and split 1 otherwise.
# A set gives constant-time membership tests; checking against the list
# directly would make this loop quadratic in the number of candidates.
train_cand_set = set(train_cand_list)

# NOTE(review): cand_list still refers to the LAST bucket bound by the sampling
# loop above (count_dict[n]), so only that bucket is stored here — confirm this
# is intended and that the loop should not cover all candidates instead.
for cand in cand_list:
    split = 0 if cand in train_cand_set else 1

    # The candidate value itself serves as stable id, name and text.
    raw_text = RawText(stable_id=cand, name=cand, text=cand)
    tweet = Tweet(tweet=raw_text, split=split)
    session.add(tweet)

# Persist all tweets added above in a single commit.
session.commit()

print("Commit to snorkel database done...")


#writing label generator
def worker_label_generator(t):
    """Yield ``(worker_id, label)`` pairs recorded for one tweet candidate.

    The labels are read from the module-level ``cand_dict``, indexed by the
    stable id of the candidate's tweet (``t.tweet.stable_id``).
    """
    votes = cand_dict[t.tweet.stable_id]
    for worker_id in votes:
        yield worker_id, votes[worker_id]


# Seed NumPy's global random generator with a fixed value.
np.random.seed(1701)
# Wrap the per-tweet label generator in a LabelAnnotator and apply it to
# split 0, the split given to candidates that were sampled for training.
labeler = LabelAnnotator(label_generator=worker_label_generator)
L_train = labeler.apply(split=0)
Esempio n. 4
0
# Open the Snorkel database session used by everything below.
from snorkel import SnorkelSession
session = SnorkelSession()
import os

# Document parser pointed at the training TSV file.
from snorkel.parser import TSVDocParser
doc_parser = TSVDocParser(path="data/clinton_train.tsv")

from snorkel.parser import SentenceParser

sent_parser = SentenceParser()
from snorkel.parser import CorpusParser

# Combine both parsers and parse the training file into a corpus named
# "Emails Training"; `%time` is an IPython magic that reports the elapsed time.
cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, "Emails Training")
# Register the parsed corpus with the session and persist it.
session.add(corpus)
session.commit()


# Parse the development and test files with the same CorpusParser by pointing
# the shared doc_parser at each TSV path in turn.
for name, path in [('Emails Development', 'data/clinton_dev.tsv'),
                   ('Emails Test', 'data/clinton_test.tsv')]:
    doc_parser.path=path
    %time corpus = cp.parse_corpus(session, name)
    # NOTE(review): the training corpus is passed to session.add() before its
    # commit, these corpora are not — confirm parse_corpus() registers them.
    session.commit()

# Collect the sentences for which number_of_people() returns fewer than 5.
# NOTE(review): `corpus` is whatever was bound last; if the statements run in
# file order that is the 'Emails Test' corpus from the loop above, not the
# training corpus — confirm which corpus is intended.
# number_of_people is not defined in the visible code.
sentences = set()
for document in corpus:
    for sentence in document.sentences:
        if number_of_people(sentence) < 5:
            sentences.add(sentence)