def doc_creation(df_features, session):
    # Write the subset to a .csv, then convert it to the tab-separated format
    # expected by TSVDocPreprocessor (doc_name <tab> doc_text per line).
    df_features.to_csv('dataset.csv', header=False)
    csv.writer(open('dataset.tsv', 'w+'), delimiter='\t').writerows(
        csv.reader(open('dataset.csv')))
    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
def docs_to_sentences():
    # Must set SNORKELDB before importing SnorkelSession
    import os
    import multiprocessing
    from snorkel import SnorkelSession
    from snorkel.parser import TSVDocPreprocessor
    from snorkel.parser import CorpusParser
    from snorkel.models import Document, Sentence

    session = SnorkelSession()
    pathname = ('small_data/data_400.tsv'
                if os.environ['AGP_DATA_SIZE'] == 'small-data'
                else 'data/full_pp.tsv')
    doc_preprocessor = TSVDocPreprocessor(pathname)
    corpus_parser = CorpusParser()
    corpus_parser.apply(doc_preprocessor, parallelism=multiprocessing.cpu_count())
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
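# A minimal sketch of what "Must set SNORKELDB before importing SnorkelSession"
# means: Snorkel reads the connection string at import time, so it has to be
# exported first, otherwise the default SQLite database is used. The postgres
# URL below is an assumed example, not taken from the snippets here.
import os
os.environ['SNORKELDB'] = 'postgres:///snorkel'  # must be set BEFORE the snorkel import
from snorkel import SnorkelSession
session = SnorkelSession()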
def doc_parse(path):
    """
    Loads a TSV file and parses it into Snorkel Contexts.

    :param path: Path to TSV file
    :return: None
    """
    try:
        doc_preprocessor = TSVDocPreprocessor(path, encoding=u'utf-8', max_docs=2500)
        corpus_parser = CorpusParser()
        corpus_parser.apply(doc_preprocessor)
        print("Documents:", session.query(Document).count())
        print("Sentences:", session.query(Sentence).count())
    except Exception:
        print('Error loading TSV file')
filename = os.environ[
    'SNORKELHOME'] + "../semantic/hard-drive-slim-top-tier-citation-" + str(
    train_bucket)
docID = set()
newfile = os.environ[
    'SNORKELHOME'] + "../semantic/hard-drive-slim-top-tier-citation-" + str(
    train_bucket) + "-new"

# De-duplicate documents by doc ID and drop malformed lines before parsing.
with open(newfile, "w") as fout:
    for line in open(filename, "r", errors='ignore').readlines():
        if line.split("\t")[0] in docID or len(line.split("\t")) != 2:
            continue
        docID.add(line.split("\t")[0])
        fout.write(line.replace("\n", " ").strip() + "\n")
print("total docID count", len(docID))

doc_preprocessor = TSVDocPreprocessor(newfile, encoding="utf-8", max_docs=n_docs)

from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence  # defined in context.py file

# Only parse the corpus if it has not already been loaded into the database.
if session.query(Document).count() == 0:
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor, count=n_docs)  # , parallelism=5)
print("Documents:", session.query(Document).count())

from snorkel import SnorkelSession
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
dont_want2 = [
    # (earlier entries truncated)
    'Cherry', 'cherry', 'ihc', 'IHC', '(IHC)', 'hit'
]
# Remove unwanted terms from the virus dictionary.
for word in dont_want2:
    if word in virus_list:
        virus_list.remove(word)

# ------------------------------------------
# START SNORKEL SESSION
session = SnorkelSession()

n_docs = 500
doc_preprocessor = TSVDocPreprocessor('pdfs_big.tsv', max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

# Candidate schema: a (virus, host) relation mention.
VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost,
                                    [ngrams, ngrams],
                                    [virus_matcher, animals_matcher],
                                    nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords
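# A minimal sketch of what such a keyword labeling function could look like for
# the VirusHost candidates defined above (illustrative only; the keyword list
# and label choices are assumptions, not taken from this code):
def LF_infection_keywords(c):
    # Vote "true" (+1) if an infection-related keyword appears in the
    # candidate's sentence, otherwise abstain (0).
    keywords = {'infect', 'infected', 'infection', 'isolated from'}
    sentence_text = c.get_parent().text.lower()
    return 1 if any(k in sentence_text for k in keywords) else 0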
##### LIST OF LF FUNCTIONS TO CHECK
LFs = [LF_edit_index, LF_recall_projections2, LF_jackard_index]
# LFs = [LF_edit_index, LF_jackard_index]

##### snorkeling
session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(path)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])

docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
# print(sentences)
from snorkel.lf_helpers import (
    get_tagged_text,
    rule_regex_search_tagged_text,
    rule_regex_search_btw_AB,
    rule_regex_search_btw_BA,
    rule_regex_search_before_A,
    rule_regex_search_before_B,
)

# A ContextSpace defines the "space" of all candidates we even potentially consider;
# in this case we use the Ngrams subclass, and look for all n-grams up to 6 words long.
session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(
    '/Users/fanglinchen/Desktop/PersonalDataStack/DeepScrub/DeepScrub/algorithms/input.tsv',
    max_docs=350)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

# Categorical candidate schema: each Sensitive candidate takes one of these
# values (or False for "not sensitive").
Sensitive = candidate_subclass('Sensitive', ['sensitive'], values=[
    'person', 'job', 'event', 'place', 'date', 'time', 'product', 'email',
    'phone', 'quantity', 'address', 'url', 'org', 'file', 'password', False
])

# Generating candidates.
ngrams = Ngrams(n_max=6)
ngramMatcher = NgramMatcher(longest_match_only=False)
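# A minimal sketch of how candidate generation would typically continue from
# the setup above (the split index and variable names are assumptions, not
# taken from this code):
cand_extractor = CandidateExtractor(Sensitive, [ngrams], [ngramMatcher])
sents = session.query(Sentence).all()
cand_extractor.apply(sents, split=0)  # write candidates into split 0
cands = session.query(Sensitive).filter(Sensitive.split == 0).all()
print("Candidates:", len(cands))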
import os

# TO USE A DATABASE OTHER THAN SQLITE, USE THIS LINE
# Note that this is necessary for parallel execution amongst other things...
# os.environ['SNORKELDB'] = 'postgres:///snorkel-intro'

from snorkel import SnorkelSession
session = SnorkelSession()

# Here, we just set a global variable related to automatic testing - you can safely ignore this!
max_docs = 50 if 'CI' in os.environ else float('inf')

# In[4]:

from snorkel.parser import TSVDocPreprocessor
doc_preprocessor = TSVDocPreprocessor('tutorials/intro/data/articles.tsv', max_docs=max_docs)

# In[5]:

from snorkel.parser import CorpusParser
corpus_parser = CorpusParser()
get_ipython().magic(u'time corpus_parser.apply(doc_preprocessor)')

# In[6]:

from snorkel.models import Document, Sentence
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())