def main():

    from snorkel import SnorkelSession
    session = SnorkelSession()

    import os
    from snorkel.parser import XMLMultiDocPreprocessor

    # The following line is for testing only. Feel free to ignore it.
    file_path = 'data/CDR.BioC.small.xml' if 'CI' in os.environ else 'data/CDR.BioC.xml'

    doc_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                               doc='.//document',
                                               text='.//passage/text/text()',
                                               id='.//id/text()')

    from snorkel.parser import CorpusParser
    from utils import TaggerOneTagger

    tagger_one = TaggerOneTagger()
    corpus_parser = CorpusParser(fn=tagger_one.tag)
    corpus_parser.apply(list(doc_preprocessor)[:100])
    # parsed result saved in session

    return doc_preprocessor, corpus_parser, session
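# Usage sketch (not part of the original example): the parsed Documents and
# Sentences are stored in the Snorkel session and can be inspected after
# main() has run.
if __name__ == '__main__':
    doc_preprocessor, corpus_parser, session = main()

    from snorkel.models import Document, Sentence
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())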
Example #2
def main(args):

    session = SnorkelSession()

    # ---------------------------------------
    # 1: Split into blocks
    # ---------------------------------------
    split_pubtator_corpus(args.input_file, split_size=args.split_size)

    # ---------------------------------------
    # 2: Parse documents
    # ---------------------------------------
    filelist = glob.glob("{}.splits_{}/*".format(args.input_file,
                                                 args.split_size))

    # Iterate through the splits
    start_ts = time()
    for fp in filelist:
        split_start = time()
        doc_preprocessor = PubTatorDocPreprocessor(fp)
        parser = Spacy() if args.parser == "spacy" else StanfordCoreNLPServer()
        corpus_parser = CorpusParser(parser=parser)
        corpus_parser.apply(doc_preprocessor,
                            parallelism=args.num_procs,
                            clear=False)
        print("Split completed in [%s]" % (time() - split_start,))

    # pubtator_tags = PubTatorTagProcessor()
    # for fp in filelist:
    #     # load entity tags
    #     pubtator_tags.load_data(session, fp)

    print("\nDONE in [%s]" % (time() - start_ts,))
Example #3
def parse_corpus(to_process_file):
    file_path = to_process_file
    doc_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                               doc='.//Article',
                                               text='./text/text()',
                                               id='./article-id/text()')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(list(doc_preprocessor))
    return corpus_parser
Example #4
def doc_creation(df_features, session):
    # write the subset to a .csv and convert it to a .tsv file
    df_features.to_csv('dataset.csv', header=False)
    # rewrite the .csv as a tab-delimited .tsv
    csv.writer(open('dataset.tsv', 'w+'),
               delimiter='\t').writerows(csv.reader(open("dataset.csv")))
    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
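# Alternative sketch (not from the original example): the .csv round-trip above
# can be skipped by letting pandas write the tab-separated file directly,
# assuming df_features is a pandas DataFrame as in doc_creation().
def doc_creation_direct(df_features, session):
    from snorkel.parser import TSVDocPreprocessor, CorpusParser
    from snorkel.parser.spacy_parser import Spacy
    from snorkel.models import Document, Sentence

    # write the subset straight to a .tsv file
    df_features.to_csv('dataset.tsv', sep='\t', header=False)
    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())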
Example #5
def docs_to_sentences():
	# Must set SNORKELDB before importing SnorkelSession
	from snorkel import SnorkelSession
	from snorkel.parser import TSVDocPreprocessor
	from snorkel.parser import CorpusParser
	from snorkel.models import Document, Sentence
	session = SnorkelSession()

	pathname = 'small_data/data_400.tsv' if os.environ.get('AGP_DATA_SIZE') == 'small-data' else 'data/full_pp.tsv'
	doc_preprocessor = TSVDocPreprocessor(pathname)

	corpus_parser = CorpusParser()
	corpus_parser.apply(doc_preprocessor, parallelism=multiprocessing.cpu_count())

	print("Documents:", session.query(Document).count())
	print("Sentences:", session.query(Sentence).count())
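# Regarding the note at the top of docs_to_sentences(): a minimal sketch of
# pointing Snorkel at a specific database via SNORKELDB before the first
# snorkel import (the connection string below is a placeholder, not a real
# database from this project).
import os
os.environ['SNORKELDB'] = 'postgresql://user:password@localhost:5432/snorkel'

from snorkel import SnorkelSession  # import only after SNORKELDB is set
session = SnorkelSession()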
Example #6
def doc_parse(path):
    """
    Loads TSV file and parses to Snorkel Contexts
    :param path: Path to TSV file
    :return: None
    """
    try:
        doc_preprocessor = TSVDocPreprocessor(path, encoding=u'utf-8', max_docs=2500)

        corpus_parser = CorpusParser()
        corpus_parser.apply(doc_preprocessor)
        print("Documents:", session.query(Document).count())
        print("Sentences:", session.query(Sentence).count())

    except Exception as e:
        print('Error loading TSV file:', e)
Example #7
def parse_wikipedia_dump(
        dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
        clear=False,
        parallelism=8):

    logging.info("Corpus parsing start")
    session = SnorkelSession()

    corpus_parser = CorpusParser(parser=Spacy())
    onlyfiles = [
        f for f in listdir(dumps_folder_path)
        if isfile(join(dumps_folder_path, f))
    ]

    i = 0
    for file in onlyfiles:
        if file.endswith(".xml"):
            print(file)
            doc_preprocessor = XMLMultiDocPreprocessor(
                path=dumps_folder_path + file,
                doc='.//doc',
                text='./text()',
                id='./@title')
            if i > 0:
                clear = False
            try:
                corpus_parser.apply(doc_preprocessor,
                                    clear=clear,
                                    parallelism=parallelism)
            except IntegrityError as e:
                print("Already parsed " + file)
                logging.error("Already parsed " + file)
            i = i + 1
    #logging.debug("Documents: %d", session.query(Document).count())
    #logging.debug("Sentences: %d", session.query(Sentence).count())
    logging.info("Corpus parsing end")
Example #8
            continue
        docID.add(line.split("\t")[0])
        fout.write(line.replace("\n", " ").strip() + "\n")

print("total docID count", len(docID))
doc_preprocessor = TSVDocPreprocessor(newfile,
                                      encoding="utf-8",
                                      max_docs=n_docs)

from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence  # defined in context.py file

if session.query(Document).count() == 0:
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor, count=n_docs)  # ,parallelism=5)

print("Documents:", session.query(Document).count())

from snorkel import SnorkelSession
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from collections import defaultdict
import numpy as np

session = SnorkelSession()
docs = session.query(Document).all()
sents = session.query(Sentence).all()  # get all sentences from snorkel.db

docs_per_bucket = 150
Example #9
for word in dont_want2:
    if word in virus_list:
        virus_list.remove(word)

# ------------------------------------------

# START SNORKEL SESSION

session = SnorkelSession()

n_docs = 500

doc_preprocessor = TSVDocPreprocessor('pdfs_big.tsv',
                                      max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams],
                                    [virus_matcher, animals_matcher],
                                    nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords
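# A minimal sketch of one such keyword LF, assuming the VirusHost candidates
# defined above; the keyword list is illustrative, not taken from this project.
def LF_infection_keyword(c):
    # c.virus is a Span; its parent context is the containing Sentence
    sentence_text = c.virus.get_parent().text.lower()
    keywords = ['infect', 'isolated from', 'detected in']
    return 1 if any(k in sentence_text for k in keywords) else 0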

Example #10
#term = r'(\$?\d\d\d?.*?per|\$?\d\d\d?.*?hours?|\$?\d\d\d?.*?half|\$?\d\d\d?.*?minutes?)'
term = r'([Ll]ocation:.{0,100}|[cC]ity:.{0,100}|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)'

# Doc length in characters, remove to have no max
max_doc_length = None

# Setting preprocessor
print(f'Preprocessing folder: {data_loc}')
doc_preprocessor = set_preprocessor(data_source,
                                    data_loc,
                                    max_docs=max_docs,
                                    verbose=False,
                                    clean_docs=False,
                                    content_fields=['raw_content', 'url'],
                                    term=term,
                                    max_doc_length=max_doc_length)

# Setting parser and applying corpus preprocessor
parser = SimpleTokenizer(delim='<|>')
corpus_parser = CorpusParser(parser=parser)
corpus_parser.apply(list(doc_preprocessor),
                    parallelism=parallelism,
                    verbose=False)

# Printing number of docs/sentences
print("==============================")
print(f"DB creation results for {postgres_db_name}:")
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())
print("==============================")
##### LIST OF LF FUNCTIONS TO CHECK


LFs = [LF_edit_index, LF_recall_projections2, LF_jackard_index]
#LFs=[LF_edit_index,LF_jackard_index]


##### snorkeling


session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(path)

corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)


pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])


docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

sents = set()
for i, doc in enumerate(docs):
    for s in doc.sentences:
Example #12
dg_tagger = Tagger(grouped)

# In[ ]:

corpus_parser = CorpusParser(fn=dg_tagger.tag)
document_chunk = []

for document in tqdm.tqdm(xml_parser.generate()):

    document_chunk.append(document)

    # chunk the data because snorkel cannot
    # scale properly
    if len(document_chunk) >= 5e4:
        corpus_parser.apply(document_chunk, parallelism=5, clear=False)
        document_chunk = []

# If the generator is exhausted and there are still
# documents left to parse
if len(document_chunk) > 0:
    corpus_parser.apply(document_chunk, parallelism=5, clear=False)
    document_chunk = []

# # Get each candidate relation

# After parsing the above abstracts, the next step in this pipeline is to extract candidates from all the tagged sentences. A pair of mentions is considered a candidate if both mentions occur in the same sentence. For this pilot study, we are only considering the following candidate relationships: Disease-Gene, Gene-Gene, Compound-Gene, Compound-Disease. In conjunction with extracting candidates, this part of the pipeline also stratifies each sentence into three different categories: Train (70%), Dev (20%), and Test (10%). These categories will be used in subsequent notebooks ([3](3.data-gen-model.ipynb), [4](4.data-disc-model.ipynb), [5](5.data-analysis.ipynb)) for training and testing the machine learning algorithms.

# In[ ]:

chunk_size = 2e5
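# A minimal sketch of the candidate-extraction step described above, using
# Snorkel's PretaggedCandidateExtractor over the already-tagged Sentences.
# The DiseaseGene class and the random 70/20/10 assignment are illustrative,
# not the exact stratification used in this pipeline.
import numpy as np
from snorkel.models import Sentence, candidate_subclass
from snorkel.candidates import PretaggedCandidateExtractor

DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])
candidate_extractor = PretaggedCandidateExtractor(DiseaseGene,
                                                  ['Disease', 'Gene'])

# session is the SnorkelSession created earlier in this pipeline
sents = session.query(Sentence).all()
np.random.seed(100)
assignments = np.random.choice([0, 1, 2], len(sents), p=[0.7, 0.2, 0.1])

# split 0 = train, 1 = dev, 2 = test
for split in (0, 1, 2):
    split_sents = [s for s, a in zip(sents, assignments) if a == split]
    candidate_extractor.apply(split_sents, split=split)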
Example #13
dev_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                           doc='.//document',
                                           text='.//passage/text/text()',
                                           id='.//id/text()')

file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                            doc='.//document',
                                            text='.//passage/text/text()',
                                            id='.//id/text()')

# Parsing
corpus_parser = CorpusParser()

# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
corpus_parser.apply(list(train_preprocessor))
corpus_parser.apply(list(dev_preprocessor), clear=False)
corpus_parser.apply(list(test_preprocessor), clear=False)

# Retrieving Stable IDs for each of the candidate sentences
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)

train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)
train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Assigning each sentence to {train,dev,test}-set based on Stable ID
for i, doc in enumerate(docs):
    for s in doc.sentences:
        if doc.name in train_ids: