Example #1
def main(args):

    session = SnorkelSession()

    # ---------------------------------------
    # 1: Split into blocks
    # ---------------------------------------
    split_pubtator_corpus(args.input_file, split_size=args.split_size)

    # ---------------------------------------
    # 2: Parse documents
    # ---------------------------------------
    filelist = glob.glob("{}.splits_{}/*".format(args.input_file,
                                                 args.split_size))

    # Iterate through the splits
    start_ts = time()
    for fp in filelist:
        split_ts = time()  # time each split individually
        doc_preprocessor = PubTatorDocPreprocessor(fp)
        parser = Spacy() if args.parser == "spacy" else StanfordCoreNLPServer()
        corpus_parser = CorpusParser(parser=parser)
        corpus_parser.apply(doc_preprocessor,
                            parallelism=args.num_procs,
                            clear=False)
        print("Split completed in [%s]" % (time() - split_ts,))

    # pubtator_tags = PubTatorTagProcessor()
    # for fp in filelist:
    #     # load entity tags
    #     pubtator_tags.load_data(session, fp)

    print("\nDONE in [%s]" % (time() - start_ts,))
def main():

    from snorkel import SnorkelSession
    session = SnorkelSession()

    import os
    from snorkel.parser import XMLMultiDocPreprocessor

    # The following line is for testing only. Feel free to ignore it.
    file_path = 'data/CDR.BioC.small.xml' if 'CI' in os.environ else 'data/CDR.BioC.xml'

    doc_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                               doc='.//document',
                                               text='.//passage/text/text()',
                                               id='.//id/text()')

    from snorkel.parser import CorpusParser
    from utils import TaggerOneTagger

    tagger_one = TaggerOneTagger()
    corpus_parser = CorpusParser(fn=tagger_one.tag)
    corpus_parser.apply(list(doc_preprocessor)[:100])
    # parsed result saved in session
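    # e.g. to sanity-check the parse (needs `from snorkel.models import Document, Sentence`):
    #   print(session.query(Document).count(), session.query(Sentence).count())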

    return doc_preprocessor, corpus_parser, session
def parse_corpus(to_process_file):
    file_path = to_process_file
    doc_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                               doc='.//Article',
                                               text='./text/text()',
                                               id='./article-id/text()')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(list(doc_preprocessor))
    return corpus_parser
Example #4
def doc_creation(df_features, session):
    # write the subset to a .csv and convert it to a .tsv file
    df_features.to_csv('dataset.csv', header=False)
    with open('dataset.csv') as csv_in, open('dataset.tsv', 'w+') as tsv_out:
        csv.writer(tsv_out, delimiter='\t').writerows(csv.reader(csv_in))
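    # TSVDocPreprocessor expects one tab-separated (doc_name, doc_text) pair per line,
    # hence the header-less CSV rewritten with a tab delimiter above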
    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
Example #5
def doc_parse(path):
    """
    Loads TSV file and parses to Snorkel Contexts
    :param path: Path to TSV file
    :return: None
    """
    try:
        doc_preprocessor = TSVDocPreprocessor(path, encoding=u'utf-8', max_docs=2500)

        corpus_parser = CorpusParser()
        corpus_parser.apply(doc_preprocessor)
        print("Documents:", session.query(Document).count())
        print("Sentences:", session.query(Sentence).count())

    except Exception as e:
        print('Error loading TSV file:', e)
Example #6
def docs_to_sentences():
	# Must set SNORKELDB before importing SnorkelSession
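	# e.g. (hypothetical connection string; point it at your own database):
	#   import os; os.environ['SNORKELDB'] = 'postgres:///snorkel'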
	from snorkel import SnorkelSession
	from snorkel.parser import TSVDocPreprocessor
	from snorkel.parser import CorpusParser
	from snorkel.models import Document, Sentence
	import multiprocessing
	import os
	session = SnorkelSession()

	pathname = 'small_data/data_400.tsv' if os.environ['AGP_DATA_SIZE'] == 'small-data' else 'data/full_pp.tsv'
	doc_preprocessor = TSVDocPreprocessor(pathname)

	corpus_parser = CorpusParser()
	corpus_parser.apply(doc_preprocessor, parallelism=multiprocessing.cpu_count())

	print("Documents:", session.query(Document).count())
	print("Sentences:", session.query(Sentence).count())
Example #7
def parse_wikipedia_dump(
        dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
        clear=False,
        parallelism=8):

    logging.info("Corpus parsing start")
    session = SnorkelSession()

    corpus_parser = CorpusParser(parser=Spacy())
    onlyfiles = [
        f for f in listdir(dumps_folder_path)
        if isfile(join(dumps_folder_path, f))
    ]

    i = 0
    for file in onlyfiles:
        if file.endswith(".xml"):
            print(file)
            doc_preprocessor = XMLMultiDocPreprocessor(path=dumps_folder_path +
                                                       file,
                                                       doc='.//doc',
                                                       text='./text()',
                                                       id='./@title')
            if i > 0:
                clear = False
            try:
                corpus_parser.apply(doc_preprocessor,
                                    clear=clear,
                                    parallelism=parallelism)
            except IntegrityError as e:
                print("Already parsed " + file)
                logging.error("Already parsed " + file)
            i = i + 1
    #logging.debug("Documents: %d", session.query(Document).count())
    #logging.debug("Sentences: %d", session.query(Sentence).count())
    logging.info("Corpus parsing end")
Example #8
for word in dont_want2:
    if word in virus_list:
        virus_list.remove(word)

# ------------------------------------------

# START SNORKEL SESSION

session = SnorkelSession()

n_docs = 500

doc_preprocessor = TSVDocPreprocessor('pdfs_big.tsv',
                                      max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams],
                                    [virus_matcher, animals_matcher],
                                    nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords
Example #9
# The code below reads and parses data gathered from PubTator. PubTator outputs its annotated text in XML format, so that is the file format used here.

# In[ ]:

working_path = os.environ['WORKINGPATH']
xml_parser = XMLMultiDocPreprocessor(path=working_path +
                                     '/Database/epilepsy_data.xml',
                                     doc='.//document',
                                     text='.//passage/text/text()',
                                     id='.//id/text()')

# In[ ]:

working_path = os.environ['WORKINGPATH']
dg_tagger = Tagger(working_path + "/Database/epilepsy_tags_shelve")
corpus_parser = CorpusParser(fn=dg_tagger.tag)
get_ipython().magic(u'time corpus_parser.apply(list(xml_parser))')

# In[ ]:

print("Documents: ", session.query(Document).count())
print("Sentences: ", session.query(Sentence).count())

# # Get each candidate relation

# The block of code below gathers and tags each sentence found. **Note**: this includes the title of each abstract.

# In[ ]:

gene_df = pd.read_csv("epilepsy-genes.tsv", sep="\t")
Example #10
#term = r'(\$?\d\d\d?.*?per|\$?\d\d\d?.*?hours?|\$?\d\d\d?.*?half|\$?\d\d\d?.*?minutes?)'
term = r'([Ll]ocation:.{0,100}|[cC]ity:.{0,100}|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)'

# Doc length in characters, remove to have no max
max_doc_length = None

# Setting preprocessor
print(f'Preprocessing folder: {data_loc}')
doc_preprocessor = set_preprocessor(data_source,
                                    data_loc,
                                    max_docs=max_docs,
                                    verbose=False,
                                    clean_docs=False,
                                    content_fields=['raw_content', 'url'],
                                    term=term,
                                    max_doc_length=max_doc_length)

# Setting parser and applying corpus preprocessor
parser = SimpleTokenizer(delim='<|>')
corpus_parser = CorpusParser(parser=parser)
corpus_parser.apply(list(doc_preprocessor),
                    parallelism=parallelism,
                    verbose=False)

# Printing number of docs/sentences
print("==============================")
print(f"DB creation results for {postgres_db_name}:")
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())
print("==============================")
##### LIST OF LF FUNCTIONS TO CHECK


LFs = [LF_edit_index, LF_recall_projections2, LF_jackard_index]
#LFs=[LF_edit_index,LF_jackard_index]


##### snorkeling


session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(path)

corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)


pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])


docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)

sents = set()
for i,doc in enumerate(docs):
Example #12
# Refer to https://github.com/greenelab/pubtator for instructions
# to download and parse Pubtator
working_path = '/home/danich1/Documents/Database/pubmed_docs.xml'
xml_parser = XMLMultiDocPreprocessor(path=working_path,
                                     doc='.//document',
                                     text='.//passage/text/text()',
                                     id='.//id/text()',
                                     tag_filter=set(filter_df['pubmed_id']))

# In[ ]:

dg_tagger = Tagger(grouped)

# In[ ]:

corpus_parser = CorpusParser(fn=dg_tagger.tag)
document_chunk = []

for document in tqdm.tqdm(xml_parser.generate()):

    document_chunk.append(document)

    # chunk the data because snorkel cannot
    # scale properly
    if len(document_chunk) >= 5e4:
        corpus_parser.apply(document_chunk, parallelism=5, clear=False)
        document_chunk = []

# If the generator is exhausted and there are still
# documents left to parse, flush the final chunk
if len(document_chunk) > 0:
    corpus_parser.apply(document_chunk, parallelism=5, clear=False)
Example #13
    id='.//front/article-meta/article-id/text()')

file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                           doc='.//document',
                                           text='.//passage/text/text()',
                                           id='.//id/text()')

file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(path=file_path,
                                            doc='.//document',
                                            text='.//passage/text/text()',
                                            id='.//id/text()')

# Parsing
corpus_parser = CorpusParser()

# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
corpus_parser.apply(list(train_preprocessor))
corpus_parser.apply(list(dev_preprocessor), clear=False)
corpus_parser.apply(list(test_preprocessor), clear=False)
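# With a Postgres-backed session, the same calls could be parallelized, e.g.
# corpus_parser.apply(list(train_preprocessor), parallelism=4)  # hypothetical worker count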

# Retrieving Stable IDs for each of the candidate sentences
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)

train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)
train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Assigning each sentence to {train,dev,test}-set based on Stable ID
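# A minimal sketch of that assignment, assuming Document.name matches the pickled stable IDs:
for doc in docs:
    for sent in doc.sentences:
        if doc.name in train_ids:
            train_sents.add(sent)
        elif doc.name in dev_ids:
            dev_sents.add(sent)
        elif doc.name in test_ids:
            test_sents.add(sent)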
Example #14
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.parser import TSVDocParser
doc_parser = TSVDocParser(path='data/proteincorpus_sm.tsv')
from snorkel.parser import SentenceParser

sent_parser = SentenceParser()

from snorkel.parser import CorpusParser

cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, 'Protein Training')

for name, path in [('Protein Development', 'data/protein_dev.tsv'),
                   ('Protein Test', 'data/protein_test.tsv')]:
    doc_parser.path = path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()


from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.models import Corpus

corpus = session.query(Corpus).filter(Corpus.name == 'Protein Training').one()
corpus
LFs = [LF_political_title, LF_title_left_window, LF_title_right_window, LF_no_title_in_sentence]


from snorkel import SnorkelSession
session = SnorkelSession()
import os

from snorkel.parser import TSVDocParser
doc_parser = TSVDocParser(path="data/clinton_train.tsv")

from snorkel.parser import SentenceParser

sent_parser = SentenceParser()
from snorkel.parser import CorpusParser

cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, "Emails Training")
session.add(corpus)
session.commit()


for name, path in [('Emails Development', 'data/clinton_dev.tsv'),
                   ('Emails Test', 'data/clinton_test.tsv')]:
    doc_parser.path = path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()

sentences = set()
for document in corpus:
    for sentence in document.sentences:
        if number_of_people(sentence) < 5:
            sentences.add(sentence)
        if line.split("\t")[0] in docID or len(line.split("\t")) != 2:
            continue
        docID.add(line.split("\t")[0])
        fout.write(line.replace("\n", " ").strip() + "\n")

print("total docID count", len(docID))
doc_preprocessor = TSVDocPreprocessor(newfile,
                                      encoding="utf-8",
                                      max_docs=n_docs)

from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence  # defined in context.py file

if session.query(Document).count() == 0:
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor, count=n_docs)  # ,parallelism=5)

print("Documents:", session.query(Document).count())

from snorkel import SnorkelSession
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from collections import defaultdict
import numpy as np

session = SnorkelSession()
docs = session.query(Document).all()
sents = session.query(Sentence).all()  # get all sentences from snorkel.db
session = SnorkelSession()
# Here, we just set a global variable related to automatic testing; you can safely ignore this!
max_docs = 50 if 'CI' in os.environ else float('inf')

# In[4]:

from snorkel.parser import TSVDocPreprocessor

doc_preprocessor = TSVDocPreprocessor('tutorials/intro/data/articles.tsv',
                                      max_docs=max_docs)

# In[5]:

from snorkel.parser import CorpusParser

corpus_parser = CorpusParser()
get_ipython().magic(u'time corpus_parser.apply(doc_preprocessor)')

# In[6]:

from snorkel.models import Document, Sentence

print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

dict_final = {}

crimetype_murder = [
    'killed', 'kill', 'kills', 'killing', 'murder', 'shot', 'shooting',
    'convicted', 'murdered'
]