Example #1
def get_spacy(self, load_from_file=False, model_name='en_core_web_sm'):
    import spacy
    global nlp
    if not nlp:
        # Load the pipeline once and cache it at module level.
        nlp = spacy.load(model_name)

    doc = None
    if self.parsed and load_from_file:
        from spacy.tokens.doc import Doc

        try:
            # spaCy 1.x: read_bytes() yields one serialized doc per record.
            for byte_string in Doc.read_bytes(open(self.fnfn_spacy, 'rb')):
                doc = Doc(nlp.vocab)
                doc.from_bytes(byte_string)
        except UnicodeDecodeError:
            print("!! UNICODE ERROR:", self.fnfn_spacy)

    if not doc:
        # No cached parse on disk: parse the cleaned text from scratch.
        txt = clean_text(self.text)
        doc = nlp(txt)

    return doc
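A note on the global cache: get_spacy() declares `global nlp` and then checks `if not nlp`, which raises NameError unless the module binds the name first. A minimal sketch of the assumed module-level setup:

# Module-level cache assumed by get_spacy(); spacy.load() then
# runs only once, on the first call.
nlp = None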
Example #2
def test_efficient_binary_serialization(doc):
    import spacy.en
    from spacy.tokens.doc import Doc

    # Write the whole doc as one byte string.
    byte_string = doc.to_bytes()
    with open('moby_dick.bin', 'wb') as f:
        f.write(byte_string)

    # Read it back: read_bytes() yields each serialized doc in the file.
    nlp = spacy.en.English()
    for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
        doc = Doc(nlp.vocab)
        doc.from_bytes(byte_string)
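Doc.read_bytes() is a spaCy 1.x API that was removed in later releases. For reference, a minimal sketch of the same round trip with DocBin (available from spaCy 2.2 on; assumes a loaded pipeline `nlp` and an existing `doc`):

from spacy.tokens import DocBin

# Pack one or more docs into a single byte string.
doc_bin = DocBin()
doc_bin.add(doc)
with open('moby_dick.bin', 'wb') as f:
    f.write(doc_bin.to_bytes())

# Restore the docs against the same vocab.
with open('moby_dick.bin', 'rb') as f:
    doc_bin = DocBin().from_bytes(f.read())
docs = list(doc_bin.get_docs(nlp.vocab))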
Example #3
def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc
    loc = '/tmp/test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2
Example #4
def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc
    loc = 'test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2
Example #5
import io
from os import path

import spacy.en
from spacy.tokens.doc import Doc


def load_and_transform(batch_id, in_loc, out_dir):
    out_loc = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(out_loc):
        return None
    print('Batch', batch_id)
    # Only the vocab is needed to deserialize; disable the heavy components.
    nlp = spacy.en.English(parser=False, tagger=False, matcher=False, entity=False)
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        with io.open(in_loc, 'rb') as in_file:
            for byte_string in Doc.read_bytes(in_file):
                doc = Doc(nlp.vocab).from_bytes(byte_string)
                # Mark as parsed so downstream sentence iteration works.
                doc.is_parsed = True
                out_file.write(transform_doc(doc))  # transform_doc defined elsewhere
Example #6
def __iter__(self, week=None):
    with open(self.path + ".info") as info:
        # The .bin file holds serialized docs, so it must be opened in binary mode.
        with open(self.path + ".title.bin", "rb") as title_bin:
            for byte_string in Doc.read_bytes(title_bin):
                info_line = info.readline()
                comment_info = self._parse_info(info_line)
                if week is not None and get_week(comment_info["timestamp"]) != week:
                    continue
                if self.clean_deleted and comment_info["author"] == "[deleted]":
                    continue
                if self.clean_bots and (is_bot(comment_info["author"]) or
                                        comment_info["author"] in FILTERED_USERS):
                    continue
                comment_info["doc"] = Doc(self._vocab).from_bytes(byte_string)
                yield comment_info
Example #7
def read_spacy_docs(spacy_vocab, filepath):
    """
    Stream ``spacy.Doc`` objects from disk at ``filepath``, where they were
    serialized using spaCy's ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_vocab (``spacy.Vocab``): the spacy vocab object used to serialize
            the docs in ``filepath``
        filepath (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        the next deserialized ``spacy.Doc``
    """
    with open_sesame(filepath, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
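A hypothetical usage sketch (the path is illustrative; open_sesame is the snippet's own file opener, and nlp is assumed to be a loaded spaCy 1.x pipeline):

# Stream back docs written one doc.to_bytes() record at a time.
for doc in read_spacy_docs(nlp.vocab, '/path/to/docs.bin'):
    print(len(doc))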
Example #8
def read_spacy_docs(spacy_vocab, filename):
    """
    Stream ``spacy.Doc`` objects from disk at ``filename``, where they were
    serialized using spaCy's ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_vocab (``spacy.Vocab``): the spacy vocab object used to serialize
            the docs in ``filename``
        filename (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        the next deserialized ``spacy.Doc``
    """
    with io.open(filename, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
Example #9
def __iter__(self, week=None):
    with open(self.path + ".bin", "rb") as bin_file:
        with open(self.path + ".info") as info:
            for byte_string in Doc.read_bytes(bin_file):
                comment_info = self._parse_info(next(info))
                if week is not None and get_week(comment_info["timestamp"]) != week:
                    continue
                if self.clean_deleted and comment_info["author"] == "[deleted]":
                    continue
                if self.clean_bots and (is_bot(comment_info["author"]) or
                                        comment_info["author"] in FILTERED_USERS):
                    continue
                doc = Doc(self._vocab).from_bytes(byte_string)
                comment_info["doc"] = doc
                comment_info["text"] = self._text_from_doc(doc)
                yield comment_info
Example #10
def dependency_labels_to_root(token):
    # Walk up the dependency tree from token to the sentence root,
    # collecting the arc label at each step.
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels

for sentence in doc.sents:
    for token in sentence:
        print(token)
        print(token.orth)
        dep_labels = dependency_labels_to_root(token)
        print(dep_labels)
        for dep_label in dep_labels:
            print(nlp.vocab.strings[dep_label])

doc = nlp(u"Mr. Best flew to New York on Saturday morning.")

for ent in doc.ents:
    print(ent, ent.label_, ent.orth_)
    print(ent.root, ent.root.head, ent.root.head.pos,
          nlp.vocab.strings[ent.root.head.pos], ent.root.head.lemma_)

from spacy.tokens.doc import Doc

byte_string = doc.to_bytes()
with open('moby_dick.bin', 'wb') as f:
    f.write(byte_string)

doc = Doc(nlp.vocab)
for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
    doc.from_bytes(byte_string)
print(doc)
Example #11
def read_doc(spacy_fname, nlp):
    from spacy.tokens.doc import Doc
    print('reading ' + spacy_fname)
    # Take only the first serialized doc in the file.
    byte_string = next(Doc.read_bytes(open(spacy_fname, 'rb')))
    doc = Doc(nlp.vocab)
    doc.from_bytes(byte_string)
    return doc