%autoreload 2 %matplotlib inline from snorkel import SnorkelSession session = SnorkelSession() from snorkel.parser import TSVDocParser from snorkel.parser import TSVDocParser doc_parser = TSVDocParser(path='data/proteincorpus_sm.tsv') from snorkel.parser import SentenceParser sent_parser = SentenceParser() from snorkel.parser import CorpusParser cp = CorpusParser(doc_parser, sent_parser) %time corpus = cp.parse_corpus(session, 'Protein Training') for name, path in [('Protein Development', 'data/protein_dev.tsv'), ('Protein Test', 'data/protein_test.tsv')]: doc_parser.path=path %time corpus = cp.parse_corpus(session, name) session.commit() from snorkel import SnorkelSession session = SnorkelSession() from snorkel.models import Corpus corpus = session.query(Corpus).filter(Corpus.name == 'Protein Training').one() corpus
from snorkel import SnorkelSession session = SnorkelSession() import os from snorkel.parser import TSVDocParser doc_parser = TSVDocParser(path="data/clinton_train.tsv") from snorkel.parser import SentenceParser sent_parser = SentenceParser() from snorkel.parser import CorpusParser cp = CorpusParser(doc_parser, sent_parser) %time corpus = cp.parse_corpus(session, "Emails Training") session.add(corpus) session.commit() for name, path in [('Emails Development', 'data/clinton_dev.tsv'), ('Emails Test', 'data/clinton_test.tsv')]: doc_parser.path=path %time corpus = cp.parse_corpus(session, name) session.commit() sentences = set() for document in corpus: for sentence in document.sentences: if number_of_people(sentence) < 5: sentences.add(sentence)