def main(args):
    session = SnorkelSession()

    # ---------------------------------------
    # 1: Split into blocks
    # ---------------------------------------
    split_pubtator_corpus(args.input_file, split_size=args.split_size)

    # ---------------------------------------
    # 2: Parse documents
    # ---------------------------------------
    filelist = glob.glob("{}.splits_{}/*".format(args.input_file, args.split_size))

    # Iterate through the splits
    start_ts = time()
    for fp in filelist:
        doc_preprocessor = PubTatorDocPreprocessor(fp)
        parser = Spacy() if args.parser == "spacy" else StanfordCoreNLPServer()
        corpus_parser = CorpusParser(parser=parser)
        corpus_parser.apply(doc_preprocessor, parallelism=args.num_procs, clear=False)
        end_ts = time()
        # cumulative parse time since the start of the loop
        print "Split completed in [%s]" % (end_ts - start_ts,)

    # pubtator_tags = PubTatorTagProcessor()
    # for fp in filelist:
    #     # load entity tags
    #     pubtator_tags.load_data(session, fp)

    print "\nDONE in [%s]" % (time() - start_ts,)
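# A minimal, hypothetical driver for main(args) above. The flag names and
# defaults below are assumptions; only the attribute names (input_file,
# split_size, parser, num_procs) come from the function body.
import argparse

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description="Split and parse a PubTator corpus")
    arg_parser.add_argument("--input_file", required=True, help="path to the PubTator dump")
    arg_parser.add_argument("--split_size", type=int, default=10000, help="documents per split")
    arg_parser.add_argument("--parser", choices=["spacy", "corenlp"], default="spacy")
    arg_parser.add_argument("--num_procs", type=int, default=1, help="parallel parser processes")
    main(arg_parser.parse_args())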
def main():
    from snorkel import SnorkelSession
    session = SnorkelSession()

    import os
    from snorkel.parser import XMLMultiDocPreprocessor

    # The following line is for testing only. Feel free to ignore it.
    file_path = 'data/CDR.BioC.small.xml' if 'CI' in os.environ else 'data/CDR.BioC.xml'

    doc_preprocessor = XMLMultiDocPreprocessor(
        path=file_path,
        doc='.//document',
        text='.//passage/text/text()',
        id='.//id/text()')

    from snorkel.parser import CorpusParser
    from utils import TaggerOneTagger

    tagger_one = TaggerOneTagger()
    corpus_parser = CorpusParser(fn=tagger_one.tag)
    corpus_parser.apply(list(doc_preprocessor)[:100])  # parsed result saved in session

    return doc_preprocessor, corpus_parser, session
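# A hypothetical follow-up to main() above: unpack its return values and count
# what was written to the database, assuming the standard snorkel.models
# imports used elsewhere in these examples.
from snorkel.models import Document, Sentence

doc_preprocessor, corpus_parser, session = main()
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())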
def parse_corpus(to_process_file):
    file_path = to_process_file
    doc_preprocessor = XMLMultiDocPreprocessor(
        path=file_path,
        doc='.//Article',
        text='./text/text()',
        id='./article-id/text()')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(list(doc_preprocessor))
    return corpus_parser
def doc_creation(df_features, session):
    # Write the subset to a .csv and convert it to a tab-delimited .tsv file
    df_features.to_csv('dataset.csv', header=False)
    csv.writer(open('dataset.tsv', 'w+'), delimiter='\t').writerows(
        csv.reader(open('dataset.csv')))

    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)

    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
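# Hypothetical inputs for doc_creation() above: the DataFrame index serves as
# the document name and the single column as the document text, so the
# generated TSV matches the two-column layout TSVDocPreprocessor expects. The
# example rows and column name are made up.
import pandas as pd
from snorkel import SnorkelSession

df_features = pd.DataFrame(
    {'text': ['First example sentence.', 'Second example sentence.']},
    index=['doc_1', 'doc_2'])
session = SnorkelSession()
doc_creation(df_features, session)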
def doc_parse(path):
    """
    Loads TSV file and parses to Snorkel Contexts
    :param path: Path to TSV file
    :return: None
    """
    try:
        doc_preprocessor = TSVDocPreprocessor(path, encoding=u'utf-8', max_docs=2500)
        corpus_parser = CorpusParser()
        corpus_parser.apply(doc_preprocessor)
        print("Documents:", session.query(Document).count())
        print("Sentences:", session.query(Sentence).count())
    except Exception:
        print('Error loading TSV file')
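# doc_parse() above relies on a module-level `session` plus the Snorkel imports
# used throughout these examples; a minimal setup sketch (the TSV path is an
# assumption):
from snorkel import SnorkelSession
from snorkel.parser import TSVDocPreprocessor, CorpusParser
from snorkel.models import Document, Sentence

session = SnorkelSession()
doc_parse('data/articles.tsv')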
def docs_to_sentences():
    # Must set SNORKELDB before importing SnorkelSession
    from snorkel import SnorkelSession
    from snorkel.parser import TSVDocPreprocessor
    from snorkel.parser import CorpusParser
    from snorkel.models import Document, Sentence

    # standard-library modules used below
    import os
    import multiprocessing

    session = SnorkelSession()

    pathname = 'small_data/data_400.tsv' if os.environ['AGP_DATA_SIZE'] == 'small-data' else 'data/full_pp.tsv'
    doc_preprocessor = TSVDocPreprocessor(pathname)

    corpus_parser = CorpusParser()
    corpus_parser.apply(doc_preprocessor, parallelism=multiprocessing.cpu_count())

    print "Documents:", session.query(Document).count()
    print "Sentences:", session.query(Sentence).count()
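# docs_to_sentences() expects SNORKELDB (the database connection string) and
# AGP_DATA_SIZE in the environment before it runs; the values below are
# assumptions for a local SQLite run on the small sample.
import os

os.environ['SNORKELDB'] = 'sqlite:///snorkel.db'
os.environ['AGP_DATA_SIZE'] = 'small-data'
docs_to_sentences()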
def parse_wikipedia_dump(
        dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
        clear=False,
        parallelism=8):
    logging.info("Corpus parsing start")
    session = SnorkelSession()
    corpus_parser = CorpusParser(parser=Spacy())

    onlyfiles = [
        f for f in listdir(dumps_folder_path)
        if isfile(join(dumps_folder_path, f))
    ]

    i = 0
    for file in onlyfiles:
        if file.endswith(".xml"):
            print file
            doc_preprocessor = XMLMultiDocPreprocessor(
                path=dumps_folder_path + file,
                doc='.//doc',
                text='./text()',
                id='./@title')
            if i > 0:
                clear = False
            try:
                corpus_parser.apply(doc_preprocessor, clear=clear, parallelism=parallelism)
            except IntegrityError as e:
                print("Already parsed " + file)
                logging.error("Already parsed " + file)
            i = i + 1

    # logging.debug("Documents: %d", session.query(Document).count())
    # logging.debug("Sentences: %d", session.query(Sentence).count())
    logging.info("Corpus parsing end")
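# A small usage sketch for parse_wikipedia_dump() above; the folder path is the
# function's own default, and clear=True asks the first apply() call to wipe
# any previously parsed corpus before re-parsing.
import logging

logging.basicConfig(level=logging.INFO)
parse_wikipedia_dump(
    dumps_folder_path='../../data/wikipedia/dump/en/extracted_text/AA/',
    clear=True,
    parallelism=4)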
for word in dont_want2:
    if word in virus_list:
        virus_list.remove(word)

# ------------------------------------------
# START SNORKEL SESSION

session = SnorkelSession()

n_docs = 500
doc_preprocessor = TSVDocPreprocessor('pdfs_big.tsv', max_docs=n_docs)  # new files (88 papers)

corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost,
                                    [ngrams, ngrams],
                                    [virus_matcher, animals_matcher],
                                    nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords
# The code below is designed to read and parse data gathered from PubTator.
# PubTator outputs its annotated text in XML format, so that is the standard
# file format we are going to use.

# In[ ]:

working_path = os.environ['WORKINGPATH']
xml_parser = XMLMultiDocPreprocessor(
    path=working_path + '/Database/epilepsy_data.xml',
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()')

# In[ ]:

working_path = os.environ['WORKINGPATH']
dg_tagger = Tagger(working_path + "/Database/epilepsy_tags_shelve")
corpus_parser = CorpusParser(fn=dg_tagger.tag)
get_ipython().magic(u'time corpus_parser.apply(list(xml_parser))')

# In[ ]:

print "Documents: ", session.query(Document).count()
print "Sentences: ", session.query(Sentence).count()

# # Get each candidate relation
# The block of code below is designed to gather and tag each sentence found.
# **Note**: This does include the title of each abstract.

# In[ ]:

gene_df = pd.read_csv("epilepsy-genes.tsv", sep="\t")
# term = r'(\$?\d\d\d?.*?per|\$?\d\d\d?.*?hours?|\$?\d\d\d?.*?half|\$?\d\d\d?.*?minutes?)'
term = r'([Ll]ocation:.{0,100}|[cC]ity:.{0,100}|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)'

# Doc length in characters; remove to have no max
max_doc_length = None

# Setting preprocessor
print(f'Preprocessing folder: {data_loc}')
doc_preprocessor = set_preprocessor(data_source,
                                    data_loc,
                                    max_docs=max_docs,
                                    verbose=False,
                                    clean_docs=False,
                                    content_fields=['raw_content', 'url'],
                                    term=term,
                                    max_doc_length=max_doc_length)

# Setting parser and applying corpus preprocessor
parser = SimpleTokenizer(delim='<|>')
corpus_parser = CorpusParser(parser=parser)
corpus_parser.apply(list(doc_preprocessor), parallelism=parallelism, verbose=False)

# Printing number of docs/sentences
print("==============================")
print(f"DB creation results for {postgres_db_name}:")
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())
print("==============================")
##### LIST OF LF FUNCTIONS TO CHECK
LFs = [LF_edit_index, LF_recall_projections2, LF_jackard_index]
# LFs = [LF_edit_index, LF_jackard_index]

##### snorkeling
session = SnorkelSession()
doc_preprocessor = TSVDocPreprocessor(path)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])

docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
# print(sentences)

sents = set()
for i, doc in enumerate(docs):
# Refer to https://github.com/greenelab/pubtator for instructions
# to download and parse Pubtator
working_path = '/home/danich1/Documents/Database/pubmed_docs.xml'
xml_parser = XMLMultiDocPreprocessor(
    path=working_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()',
    tag_filter=set(filter_df['pubmed_id']))

# In[ ]:

dg_tagger = Tagger(grouped)

# In[ ]:

corpus_parser = CorpusParser(fn=dg_tagger.tag)
document_chunk = []
for document in tqdm.tqdm(xml_parser.generate()):
    document_chunk.append(document)

    # chunk the data because snorkel cannot
    # scale properly
    if len(document_chunk) >= 5e4:
        corpus_parser.apply(document_chunk, parallelism=5, clear=False)
        document_chunk = []

# If the generator is exhausted and there are still
# documents left to parse
if len(document_chunk) > 0:
    corpus_parser.apply(document_chunk, parallelism=5, clear=False)
    id='.//front/article-meta/article-id/text()')

file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()')

file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()')

# Parsing
corpus_parser = CorpusParser()
# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
corpus_parser.apply(list(train_preprocessor))
corpus_parser.apply(list(dev_preprocessor), clear=False)
corpus_parser.apply(list(test_preprocessor), clear=False)

# Retrieving Stable IDs for each of the candidate sentences
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)
train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)

train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Assigning each sentence to {train,dev,test}-set based on Stable ID
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.parser import TSVDocParser
doc_parser = TSVDocParser(path='data/proteincorpus_sm.tsv')

from snorkel.parser import SentenceParser
sent_parser = SentenceParser()

from snorkel.parser import CorpusParser
cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, 'Protein Training')

for name, path in [('Protein Development', 'data/protein_dev.tsv'),
                   ('Protein Test', 'data/protein_test.tsv')]:
    doc_parser.path = path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import Corpus
corpus = session.query(Corpus).filter(Corpus.name == 'Protein Training').one()
corpus
LFs = [LF_political_title, LF_title_left_window, LF_title_right_window, LF_no_title_in_sentence]

from snorkel import SnorkelSession
session = SnorkelSession()

import os
from snorkel.parser import TSVDocParser
doc_parser = TSVDocParser(path="data/clinton_train.tsv")

from snorkel.parser import SentenceParser
sent_parser = SentenceParser()

from snorkel.parser import CorpusParser
cp = CorpusParser(doc_parser, sent_parser)
%time corpus = cp.parse_corpus(session, "Emails Training")
session.add(corpus)
session.commit()

for name, path in [('Emails Development', 'data/clinton_dev.tsv'),
                   ('Emails Test', 'data/clinton_test.tsv')]:
    doc_parser.path = path
    %time corpus = cp.parse_corpus(session, name)
    session.commit()

sentences = set()
for document in corpus:
    for sentence in document.sentences:
        if number_of_people(sentence) < 5:
if line.split("\t")[0] in docID or len(line.split("\t")) != 2: continue docID.add(line.split("\t")[0]) fout.write(line.replace("\n", " ").strip() + "\n") print("total docID count", len(docID)) doc_preprocessor = TSVDocPreprocessor(newfile, encoding="utf-8", max_docs=n_docs) from snorkel.parser.spacy_parser import Spacy from snorkel.parser import CorpusParser from snorkel.models import Document, Sentence # defined in context.py file if session.query(Document).count() == 0: corpus_parser = CorpusParser(parser=Spacy()) corpus_parser.apply(doc_preprocessor, count=n_docs) # ,parallelism=5) print("Documents:", session.query(Document).count()) from snorkel import SnorkelSession from snorkel.parser.spacy_parser import Spacy from snorkel.parser import CorpusParser from snorkel.models import Document, Sentence from collections import defaultdict import numpy as np session = SnorkelSession() docs = session.query(Document).all() sents = session.query(Sentence).all() # get all sentences from snorkel.db
session = SnorkelSession()

# Here, we just set a global variable related to automatic testing; you can safely ignore this!
max_docs = 50 if 'CI' in os.environ else float('inf')

# In[4]:

from snorkel.parser import TSVDocPreprocessor

doc_preprocessor = TSVDocPreprocessor('tutorials/intro/data/articles.tsv', max_docs=max_docs)

# In[5]:

from snorkel.parser import CorpusParser

corpus_parser = CorpusParser()
get_ipython().magic(u'time corpus_parser.apply(doc_preprocessor)')

# In[6]:

from snorkel.models import Document, Sentence

print "Documents:", session.query(Document).count()
print "Sentences:", session.query(Sentence).count()

dict_final = {}
crimetype_murder = [
    'killed', 'kill', 'kills', 'killing', 'murder', 'shot', 'shooting',
    'convicted', 'murdered'
]