def main(args):
    session = SnorkelSession()

    # ---------------------------------------
    # 1: Split into blocks
    # ---------------------------------------
    split_pubtator_corpus(args.input_file, split_size=args.split_size)

    # ---------------------------------------
    # 2: Parse documents
    # ---------------------------------------
    filelist = glob.glob("{}.splits_{}/*".format(args.input_file, args.split_size))

    # Iterate through the splits
    start_ts = time()
    for fp in filelist:
        doc_preprocessor = PubTatorDocPreprocessor(fp)
        parser = Spacy() if args.parser == "spacy" else StanfordCoreNLPServer()
        corpus_parser = CorpusParser(parser=parser)
        corpus_parser.apply(doc_preprocessor, parallelism=args.num_procs, clear=False)
        end_ts = time()
        # cumulative parse time since the start of the loop
        print "Split completed in [%s]" % (end_ts - start_ts,)

    # pubtator_tags = PubTatorTagProcessor()
    # for fp in filelist:
    #     # load entity tags
    #     pubtator_tags.load_data(session, fp)

    print "\nDONE in [%s]" % (time() - start_ts,)
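# A minimal, hypothetical driver for main(args) above. The flag names and
# defaults below are assumptions; only the attribute names (input_file,
# split_size, parser, num_procs) come from the function body.
import argparse

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description="Split and parse a PubTator corpus")
    arg_parser.add_argument("--input_file", required=True, help="path to the PubTator dump")
    arg_parser.add_argument("--split_size", type=int, default=10000, help="documents per split")
    arg_parser.add_argument("--parser", choices=["spacy", "corenlp"], default="spacy")
    arg_parser.add_argument("--num_procs", type=int, default=1, help="parallel parser processes")
    main(arg_parser.parse_args())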
def main():
    from snorkel import SnorkelSession
    session = SnorkelSession()

    import os
    from snorkel.parser import XMLMultiDocPreprocessor

    # The following line is for testing only. Feel free to ignore it.
    file_path = 'data/CDR.BioC.small.xml' if 'CI' in os.environ else 'data/CDR.BioC.xml'

    doc_preprocessor = XMLMultiDocPreprocessor(
        path=file_path,
        doc='.//document',
        text='.//passage/text/text()',
        id='.//id/text()')

    from snorkel.parser import CorpusParser
    from utils import TaggerOneTagger

    tagger_one = TaggerOneTagger()
    corpus_parser = CorpusParser(fn=tagger_one.tag)
    corpus_parser.apply(list(doc_preprocessor)[:100])  # parsed result saved in session

    return doc_preprocessor, corpus_parser, session
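# A hypothetical follow-up to main() above: unpack its return values and count
# what was written to the database, assuming the standard snorkel.models
# imports used elsewhere in these examples.
from snorkel.models import Document, Sentence

doc_preprocessor, corpus_parser, session = main()
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())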
def parse_corpus(to_process_file):
    file_path = to_process_file
    doc_preprocessor = XMLMultiDocPreprocessor(
        path=file_path,
        doc='.//Article',
        text='./text/text()',
        id='./article-id/text()')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(list(doc_preprocessor))
    return corpus_parser
def doc_creation(df_features, session):
    # Write the subset to a .csv and convert it to a tab-delimited .tsv file
    df_features.to_csv('dataset.csv', header=False)
    csv.writer(open('dataset.tsv', 'w+'), delimiter='\t').writerows(
        csv.reader(open('dataset.csv')))

    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)

    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
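# Hypothetical inputs for doc_creation() above: the DataFrame index serves as
# the document name and the single column as the document text, so the
# generated TSV matches the two-column layout TSVDocPreprocessor expects. The
# example rows and column name are made up.
import pandas as pd
from snorkel import SnorkelSession

df_features = pd.DataFrame(
    {'text': ['First example sentence.', 'Second example sentence.']},
    index=['doc_1', 'doc_2'])
session = SnorkelSession()
doc_creation(df_features, session)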
def doc_parse(path):
    """
    Loads TSV file and parses to Snorkel Contexts
    :param path: Path to TSV file
    :return: None
    """
    try:
        doc_preprocessor = TSVDocPreprocessor(path, encoding=u'utf-8', max_docs=2500)
        corpus_parser = CorpusParser()
        corpus_parser.apply(doc_preprocessor)
        print("Documents:", session.query(Document).count())
        print("Sentences:", session.query(Sentence).count())
    except Exception:
        print('Error loading TSV file')
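# doc_parse() above relies on a module-level `session` plus the Snorkel imports
# used throughout these examples; a minimal setup sketch (the TSV path is an
# assumption):
from snorkel import SnorkelSession
from snorkel.parser import TSVDocPreprocessor, CorpusParser
from snorkel.models import Document, Sentence

session = SnorkelSession()
doc_parse('data/articles.tsv')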
def docs_to_sentences():
    # Must set SNORKELDB before importing SnorkelSession
    from snorkel import SnorkelSession
    from snorkel.parser import TSVDocPreprocessor
    from snorkel.parser import CorpusParser
    from snorkel.models import Document, Sentence

    # standard-library modules used below
    import os
    import multiprocessing

    session = SnorkelSession()

    pathname = 'small_data/data_400.tsv' if os.environ['AGP_DATA_SIZE'] == 'small-data' else 'data/full_pp.tsv'
    doc_preprocessor = TSVDocPreprocessor(pathname)

    corpus_parser = CorpusParser()
    corpus_parser.apply(doc_preprocessor, parallelism=multiprocessing.cpu_count())

    print "Documents:", session.query(Document).count()
    print "Sentences:", session.query(Sentence).count()
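# docs_to_sentences() expects SNORKELDB (the database connection string) and
# AGP_DATA_SIZE in the environment before it runs; the values below are
# assumptions for a local SQLite run on the small sample.
import os

os.environ['SNORKELDB'] = 'sqlite:///snorkel.db'
os.environ['AGP_DATA_SIZE'] = 'small-data'
docs_to_sentences()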
def parse_wikipedia_dump(
        dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
        clear=False,
        parallelism=8):
    logging.info("Corpus parsing start")
    session = SnorkelSession()
    corpus_parser = CorpusParser(parser=Spacy())

    onlyfiles = [
        f for f in listdir(dumps_folder_path)
        if isfile(join(dumps_folder_path, f))
    ]

    i = 0
    for file in onlyfiles:
        if file.endswith(".xml"):
            print file
            doc_preprocessor = XMLMultiDocPreprocessor(
                path=dumps_folder_path + file,
                doc='.//doc',
                text='./text()',
                id='./@title')
            if i > 0:
                clear = False
            try:
                corpus_parser.apply(doc_preprocessor, clear=clear, parallelism=parallelism)
            except IntegrityError as e:
                print("Already parsed " + file)
                logging.error("Already parsed " + file)
            i = i + 1

    # logging.debug("Documents: %d", session.query(Document).count())
    # logging.debug("Sentences: %d", session.query(Sentence).count())
    logging.info("Corpus parsing end")
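# A small usage sketch for parse_wikipedia_dump() above; the folder path is the
# function's own default, and clear=True asks the first apply() call to wipe
# any previously parsed corpus before re-parsing.
import logging

logging.basicConfig(level=logging.INFO)
parse_wikipedia_dump(
    dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
    clear=True,
    parallelism=4)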
for word in dont_want2:
    if word in virus_list:
        virus_list.remove(word)

# ------------------------------------------
# START SNORKEL SESSION

session = SnorkelSession()

n_docs = 500
doc_preprocessor = TSVDocPreprocessor('pdfs_big.tsv', max_docs=n_docs)  # new files (88 papers)

corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost,
                                    [ngrams, ngrams],
                                    [virus_matcher, animals_matcher],
                                    nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords
# The code below is designed to read and parse data gathered from PubTator.
# PubTator outputs its annotated text in XML format, so that is the standard
# file format we are going to use.

# In[ ]:

working_path = os.environ['WORKINGPATH']
xml_parser = XMLMultiDocPreprocessor(
    path=working_path + '/Database/epilepsy_data.xml',
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()')

# In[ ]:

working_path = os.environ['WORKINGPATH']
dg_tagger = Tagger(working_path + "/Database/epilepsy_tags_shelve")
corpus_parser = CorpusParser(fn=dg_tagger.tag)
get_ipython().magic(u'time corpus_parser.apply(list(xml_parser))')

# In[ ]:

print "Documents: ", session.query(Document).count()
print "Sentences: ", session.query(Sentence).count()

# # Get each candidate relation
# The block of code below is designed to gather and tag each sentence found.
# **Note**: This does include the title of each abstract.

# In[ ]:

gene_df = pd.read_csv("epilepsy-genes.tsv", sep="\t")
# term = r'(\$?\d\d\d?.*?per|\$?\d\d\d?.*?hours?|\$?\d\d\d?.*?half|\$?\d\d\d?.*?minutes?)'
term = r'([Ll]ocation:.{0,100}|[cC]ity:.{0,100}|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)'

# Doc length in characters; remove to have no max
max_doc_length = None

# Setting preprocessor
print(f'Preprocessing folder: {data_loc}')
doc_preprocessor = set_preprocessor(data_source,
                                    data_loc,
                                    max_docs=max_docs,
                                    verbose=False,
                                    clean_docs=False,
                                    content_fields=['raw_content', 'url'],
                                    term=term,
                                    max_doc_length=max_doc_length)

# Setting parser and applying corpus preprocessor
parser = SimpleTokenizer(delim='<|>')
corpus_parser = CorpusParser(parser=parser)
corpus_parser.apply(list(doc_preprocessor), parallelism=parallelism, verbose=False)

# Printing number of docs/sentences
print("==============================")
print(f"DB creation results for {postgres_db_name}:")
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())
print("==============================")
##### LIST OF LF FUNCTIONS TO CHECK
LFs = [LF_edit_index, LF_recall_projections2, LF_jackard_index]
# LFs = [LF_edit_index, LF_jackard_index]

##### snorkeling
session = SnorkelSession()
doc_preprocessor = TSVDocPreprocessor(path)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])

docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
# print(sentences)

sents = set()
for i, doc in enumerate(docs):
# Refer to https://github.com/greenelab/pubtator for instructions
# to download and parse Pubtator
working_path = '/home/danich1/Documents/Database/pubmed_docs.xml'
xml_parser = XMLMultiDocPreprocessor(
    path=working_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()',
    tag_filter=set(filter_df['pubmed_id']))

# In[ ]:

dg_tagger = Tagger(grouped)

# In[ ]:

corpus_parser = CorpusParser(fn=dg_tagger.tag)
document_chunk = []
for document in tqdm.tqdm(xml_parser.generate()):
    document_chunk.append(document)

    # chunk the data because snorkel cannot
    # scale properly
    if len(document_chunk) >= 5e4:
        corpus_parser.apply(document_chunk, parallelism=5, clear=False)
        document_chunk = []

# If the generator is exhausted and there are still
# documents left to parse
if len(document_chunk) > 0:
    corpus_parser.apply(document_chunk, parallelism=5, clear=False)
    id='.//front/article-meta/article-id/text()')

file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()')

file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()')

# Parsing
corpus_parser = CorpusParser()
# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
corpus_parser.apply(list(train_preprocessor))
corpus_parser.apply(list(dev_preprocessor), clear=False)
corpus_parser.apply(list(test_preprocessor), clear=False)

# Retrieving Stable IDs for each of the candidate sentences
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)
train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)

train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Assigning each sentence to {train,dev,test}-set based on Stable ID
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.parser import TSVDocParser
doc_parser = TSVDocParser(path='data/proteincorpus_sm.tsv')

from snorkel.parser import SentenceParser
sent_parser = SentenceParser()

from snorkel.parser import CorpusParser
cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, 'Protein Training')

for name, path in [('Protein Development', 'data/protein_dev.tsv'),
                   ('Protein Test', 'data/protein_test.tsv')]:
    doc_parser.path = path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import Corpus
corpus = session.query(Corpus).filter(Corpus.name == 'Protein Training').one()
corpus
LFs = [LF_political_title, LF_title_left_window, LF_title_right_window, LF_no_title_in_sentence]

from snorkel import SnorkelSession
session = SnorkelSession()

import os
from snorkel.parser import TSVDocParser
doc_parser = TSVDocParser(path="data/clinton_train.tsv")

from snorkel.parser import SentenceParser
sent_parser = SentenceParser()

from snorkel.parser import CorpusParser
cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, "Emails Training")
session.add(corpus)
session.commit()

for name, path in [('Emails Development', 'data/clinton_dev.tsv'),
                   ('Emails Test', 'data/clinton_test.tsv')]:
    doc_parser.path = path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()

sentences = set()
for document in corpus:
    for sentence in document.sentences:
        if number_of_people(sentence) < 5:
if line.split("\t")[0] in docID or len(line.split("\t")) != 2: continue docID.add(line.split("\t")[0]) fout.write(line.replace("\n", " ").strip() + "\n") print("total docID count", len(docID)) doc_preprocessor = TSVDocPreprocessor(newfile, encoding="utf-8", max_docs=n_docs) from snorkel.parser.spacy_parser import Spacy from snorkel.parser import CorpusParser from snorkel.models import Document, Sentence # defined in context.py file if session.query(Document).count() == 0: corpus_parser = CorpusParser(parser=Spacy()) corpus_parser.apply(doc_preprocessor, count=n_docs) # ,parallelism=5) print("Documents:", session.query(Document).count()) from snorkel import SnorkelSession from snorkel.parser.spacy_parser import Spacy from snorkel.parser import CorpusParser from snorkel.models import Document, Sentence from collections import defaultdict import numpy as np session = SnorkelSession() docs = session.query(Document).all() sents = session.query(Sentence).all() # get all sentences from snorkel.db
session = SnorkelSession()

# Here, we just set a global variable related to automatic testing; you can safely ignore this!
max_docs = 50 if 'CI' in os.environ else float('inf')

# In[4]:

from snorkel.parser import TSVDocPreprocessor

doc_preprocessor = TSVDocPreprocessor('tutorials/intro/data/articles.tsv', max_docs=max_docs)

# In[5]:

from snorkel.parser import CorpusParser

corpus_parser = CorpusParser()
get_ipython().magic(u'time corpus_parser.apply(doc_preprocessor)')

# In[6]:

from snorkel.models import Document, Sentence

print "Documents:", session.query(Document).count()
print "Sentences:", session.query(Sentence).count()

dict_final = {}
crimetype_murder = [
    'killed', 'kill', 'kills', 'killing', 'murder', 'shot', 'shooting',
    'convicted', 'murdered'
]