Example No. 1
def doc_creation(df_features, session):
    # write the subset to a .csv and convert it to a .tsv file
    df_features.to_csv('dataset.csv', header=False)
    #
    csv.writer(open('dataset.tsv', 'w+'),
               delimiter='\t').writerows(csv.reader(open("dataset.csv")))
    doc_preprocessor = TSVDocPreprocessor('dataset.tsv')
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor)
    print("Documents:", session.query(Document).count())
    print("Sentences:", session.query(Sentence).count())
Example No. 2
def docs_to_sentences():
	# Must set SNORKELDB before importing SnorkelSession
	import os
	import multiprocessing
	from snorkel import SnorkelSession
	from snorkel.parser import TSVDocPreprocessor
	from snorkel.parser import CorpusParser
	from snorkel.models import Document, Sentence
	session = SnorkelSession()

	pathname = 'small_data/data_400.tsv' if os.environ['AGP_DATA_SIZE'] == 'small-data' else 'data/full_pp.tsv'
	doc_preprocessor = TSVDocPreprocessor(pathname)

	corpus_parser = CorpusParser()
	corpus_parser.apply(doc_preprocessor, parallelism=multiprocessing.cpu_count())

	print "Documents:", session.query(Document).count()
	print "Sentences:", session.query(Sentence).count()
Example No. 3
def doc_parse(path):
    """
    Loads TSV file and parses to Snorkel Contexts
    :param path: Path to TSV file
    :return: None
    """
    try:
        doc_preprocessor = TSVDocPreprocessor(path, encoding=u'utf-8', max_docs=2500)

        corpus_parser = CorpusParser()
        corpus_parser.apply(doc_preprocessor)
        print("Documents:", session.query(Document).count())
        print("Sentences:", session.query(Sentence).count())

    except Exception as e:
        print('Error loading TSV file:', e)
Example No. 4
filename = os.environ[
    'SNORKELHOME'] + "../semantic/hard-drive-slim-top-tier-citation-" + str(
        train_bucket)
docID = set()
newfile = os.environ[
    'SNORKELHOME'] + "../semantic/hard-drive-slim-top-tier-citation-" + str(
        train_bucket) + "-new"
with open(newfile, "w") as fout:
    for line in open(filename, "r", errors='ignore').readlines():
        if line.split("\t")[0] in docID or len(line.split("\t")) != 2:
            continue
        docID.add(line.split("\t")[0])
        fout.write(line.replace("\n", " ").strip() + "\n")

print("total docID count", len(docID))
doc_preprocessor = TSVDocPreprocessor(newfile,
                                      encoding="utf-8",
                                      max_docs=n_docs)

from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence  # defined in context.py file

if session.query(Document).count() == 0:
    corpus_parser = CorpusParser(parser=Spacy())
    corpus_parser.apply(doc_preprocessor, count=n_docs)  # ,parallelism=5)

print("Documents:", session.query(Document).count())

from snorkel import SnorkelSession
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
Example No. 5
dont_want2 = [  # leading items not shown in this excerpt
    'Cherry', 'cherry', 'ihc', 'IHC', '(IHC)', 'hit'
]

for word in dont_want2:
    if word in virus_list:
        virus_list.remove(word)

# ------------------------------------------

# START SNORKEL SESSION

session = SnorkelSession()

n_docs = 500

doc_preprocessor = TSVDocPreprocessor('pdfs_big.tsv',
                                      max_docs=n_docs)  # new files (88 papers)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor, count=n_docs)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d=virus_list)
animals_matcher = DictionaryMatch(d=animals_list)
cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams],
                                    [virus_matcher, animals_matcher],
                                    nested_relations=True)

docs = session.query(Document).order_by(Document.name).all()

# Text Pattern based labeling functions, which look for certain keywords
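The comment above refers to text-pattern labeling functions over the VirusHost candidates; a minimal keyword-based sketch (the keyword and label convention are assumptions; get_text_between comes from snorkel.lf_helpers) could look like:

from snorkel.lf_helpers import get_text_between

def LF_virus_infects_host(c):
    # Hypothetical LF: label positive if "infect" appears between the
    # virus and host spans, otherwise abstain.
    return 1 if 'infect' in get_text_between(c).lower() else 0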


Example No. 6
##### LIST OF LF FUNCTIONS TO CHECK


LFs = [LF_edit_index, LF_recall_projections2, LF_jackard_index]
#LFs=[LF_edit_index,LF_jackard_index]


##### snorkeling


session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(path)

corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)


pairs = candidate_subclass('pairs1', ['queryPair'])
regexpmatch = RegexMatchSpan(rgx=".*")
cs = queryCandidate()
cand_extractor = CandidateExtractor(pairs, [cs], [regexpmatch])


docs = session.query(Document).order_by(Document.name).all()
sentences = session.query(Sentence).all()
#print(sentences)
Example No. 7
from snorkel.lf_helpers import (
    get_tagged_text,
    rule_regex_search_tagged_text,
    rule_regex_search_btw_AB,
    rule_regex_search_btw_BA,
    rule_regex_search_before_A,
    rule_regex_search_before_B,
)

# A ContextSpace defines the "space" of all candidates we even potentially consider; in this case we use the Ngrams subclass, and look for all n-grams up to 7 words long

session = SnorkelSession()

doc_preprocessor = TSVDocPreprocessor(
    '/Users/fanglinchen/Desktop/PersonalDataStack/DeepScrub/DeepScrub/algorithms/input.tsv',
    max_docs=350)
corpus_parser = CorpusParser(parser=Spacy())
corpus_parser.apply(doc_preprocessor)

Sensitive = candidate_subclass('Sensitive', ['sensitive'],
                               values=[
                                   'person', 'job', 'event', 'place', 'date',
                                   'time', 'product', 'email', 'phone',
                                   'quantity', 'address', 'url', 'org', 'file',
                                   'password', False
                               ])
# generating candidates.
ngrams = Ngrams(n_max=6)
ngramMatcher = NgramMatcher(longest_match_only=False)
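Relating back to the ContextSpace comment earlier in this example, a minimal sketch of an Ngrams space covering n-grams up to 7 tokens, assuming the usual snorkel.candidates import (this example's own code uses n_max=6; the 7 simply mirrors the comment):

from snorkel.candidates import Ngrams

# ContextSpace of all n-grams up to 7 tokens long.
ngrams_up_to_7 = Ngrams(n_max=7)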
Example No. 8
import os

# TO USE A DATABASE OTHER THAN SQLITE, USE THIS LINE
# Note that this is necessary for parallel execution amongst other things...
# os.environ['SNORKELDB'] = 'postgres:///snorkel-intro'

from snorkel import SnorkelSession
session = SnorkelSession()
# Here, we just set a global variable related to automatic testing - you can safely ignore this!
max_docs = 50 if 'CI' in os.environ else float('inf')

# In[4]:

from snorkel.parser import TSVDocPreprocessor

doc_preprocessor = TSVDocPreprocessor('tutorials/intro/data/articles.tsv',
                                      max_docs=max_docs)

# In[5]:

from snorkel.parser import CorpusParser

corpus_parser = CorpusParser()
get_ipython().magic(u'time corpus_parser.apply(doc_preprocessor)')

# In[6]:

from snorkel.models import Document, Sentence

print "Documents:", session.query(Document).count()
print "Sentences:", session.query(Sentence).count()