def reload_external_labels(session: SnorkelSession, input_file: Union[str, Path], annotator_name: str = "gold"):
    """Load hand-annotated (person, organization) labels from a JSON file into
    the Snorkel database, then refresh the annotator-label tables.

    Safe to run repeatedly (e.g. when a notebook cell is re-executed): a
    StableLabel row is inserted only if no row with the same context id and
    annotator name already exists.

    :param session: active SnorkelSession used for all queries and inserts
    :param input_file: path to a JSON file of records with 'person',
        'organization', and 'value' keys
    :param annotator_name: annotator tag stored on each StableLabel
    """
    Education = get_candidate_class()

    with open(str(input_file), "r") as handle:
        records = ujson.load(handle)

    for record in records:
        # The stable context id joins the two entity ids with the '~~' separator.
        stable_ids = "~~".join((record['person'], record['organization']))
        existing = (
            session.query(StableLabel)
            .filter(StableLabel.context_stable_ids == stable_ids)
            .filter(StableLabel.annotator_name == annotator_name)
        )
        # Skip labels already present so re-running this function is idempotent.
        if existing.count() == 0:
            session.add(
                StableLabel(
                    context_stable_ids=stable_ids,
                    annotator_name=annotator_name,
                    value=record['value'],
                )
            )

    # commit session
    session.commit()

    # reload annotator labels for splits 1 and 2
    reload_annotator_labels(session, Education, annotator_name, split=1, filter_label_split=False)
    reload_annotator_labels(session, Education, annotator_name, split=2, filter_label_split=False)
# Candidate extraction for protein-protein relation mentions.
# (IPython notebook cells collapsed into one script; `%...` lines are IPython magics.)
# NOTE(review): `entity`, `ngrams`, `dict_proteins`, `sentences`, `session`,
# `Corpus`, and `CandidateExtractor` must come from earlier cells not visible
# here — confirm before running.
# NOTE(review): cell order looks scrambled — the SnorkelSession/import setup
# appears *after* the extractor is used; verify the intended execution order.
ce = CandidateExtractor(entity, [ngrams, ngrams], [dict_proteins, dict_proteins], symmetric_relations=False, nested_relations=False, self_relations=False)
# Extract training candidates from the previously collected sentences.
%time c = ce.extract(sentences, 'Protein1 Training Candidates', session)
# Re-collect sentences from the development corpus and extract dev candidates.
for corpus_name in ['Protein Development']:
    corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    sentences = set()
    for document in corpus:
        for sentence in document.sentences:
            sentences.add(sentence)
    %time c = ce.extract(sentences, 'Protein1 Development Candidates', session)
    session.add(c)
    session.commit()
# Notebook setup: auto-reload edited modules and open a database session.
%load_ext autoreload
%autoreload 2
from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.models import CandidateSet
from snorkel.models import candidate_subclass
#entity = candidate_subclass('entity', ['entity1', 'entity2'])
# Fetch the training candidate set created above by name.
train = session.query(CandidateSet).filter(CandidateSet.name == 'Protein1 Training Candidates').one()
# Build the generative-model training split from crowd-sourced candidate labels
# and write tweets + worker labels into the Snorkel database.
# NOTE(review): `n`, `count_dict`, `cand_dict`, `train_cand_list`, `random`,
# `np`, `session`, `RawText`, `Tweet`, and `LabelAnnotator` come from earlier
# cells not visible here — confirm before running.
for i in range(1, n + 1):
    cand_list = count_dict[i]
    random.shuffle(cand_list)
    # Take the first 40% of each shuffled bucket for training.
    # NOTE(review): the original comment said "take 10 percent" but the slice
    # takes 0.4 of the list — confirm which fraction is intended.
    train_cand_list += cand_list[0:int(len(cand_list) * 0.4)]
print(" -number of pairs:", len(cand_dict))
print(" -number of signals:", n)
print(" -number of pair to train GEN model", len(train_cand_list))
# NOTE(review): this iterates `cand_list`, which at this point holds only the
# *last* bucket from the loop above — presumably `cand_dict` (all pairs) was
# intended; verify against the original notebook before relying on this.
for i, cand in enumerate(cand_list):
    # split 0 = training candidates, split 1 = everything else
    split = 0 if cand in train_cand_list else 1
    raw_text = RawText(stable_id=cand, name=cand, text=cand)
    tweet = Tweet(tweet=raw_text, split=split)
    session.add(tweet)
session.commit()
print("Commit to snorkel database done...")
#writing label generator
def worker_label_generator(t):
    """Yield (worker_id, worker_label) pairs recorded for candidate ``t``'s tweet."""
    for worker_id in cand_dict[t.tweet.stable_id]:
        yield worker_id, cand_dict[t.tweet.stable_id][worker_id]
np.random.seed(1701)
labeler = LabelAnnotator(label_generator=worker_label_generator)
# Apply the worker labels over the training split (split=0).
L_train = labeler.apply(split=0)
# Parse the Clinton email corpora (train/dev/test TSVs) into the Snorkel
# database, then collect a filtered sentence set.
# (IPython notebook cells collapsed into one script; `%time` is an IPython magic.)
from snorkel import SnorkelSession
session = SnorkelSession()
import os
from snorkel.parser import TSVDocParser
# Document parser pointed at the training TSV; its `path` attribute is
# mutated below to reuse the same parser for the dev and test files.
doc_parser = TSVDocParser(path="data/clinton_train.tsv")
from snorkel.parser import SentenceParser
sent_parser = SentenceParser()
from snorkel.parser import CorpusParser
cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, "Emails Training")
session.add(corpus)
session.commit()
# Parse the dev and test corpora by repointing the doc parser at each file.
for name, path in [('Emails Development', 'data/clinton_dev.tsv'), ('Emails Test', 'data/clinton_test.tsv')]:
    doc_parser.path=path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()
# Collect sentences mentioning fewer than five people.
# NOTE(review): after the loop, `corpus` is whichever corpus was parsed last
# ('Emails Test') — confirm that is intended rather than the training corpus.
# NOTE(review): `number_of_people` is not defined in this chunk — confirm it
# exists in an earlier cell.
sentences = set()
for document in corpus:
    for sentence in document.sentences:
        if number_of_people(sentence) < 5:
            sentences.add(sentence)