def get_spacy(self, load_from_file=False, model_name='en_core_web_sm'):
    import spacy
    global nlp
    if not nlp:
        # Load the spaCy model once and cache it at module level.
        nlp = spacy.load(model_name)
    doc = None
    if self.parsed and load_from_file:
        from spacy.tokens.doc import Doc
        try:
            # Rebuild the doc from its serialized bytes on disk.
            for byte_string in Doc.read_bytes(open(self.fnfn_spacy, 'rb')):
                doc = Doc(nlp.vocab)
                doc.from_bytes(byte_string)
        except UnicodeDecodeError:
            print("!! UNICODE ERROR:", self.fnfn_spacy)
    if not doc:
        # No serialized doc available: parse the cleaned text from scratch.
        txt = clean_text(self.text)
        doc = nlp(txt)
    return doc
def test_efficient_binary_serialization(doc):
    import spacy.en
    from spacy.tokens.doc import Doc

    # Serialize the doc and write the byte string to disk.
    byte_string = doc.to_bytes()
    with open('moby_dick.bin', 'wb') as f:
        f.write(byte_string)

    # Read it back: Doc.read_bytes yields one byte string per serialized doc.
    nlp = spacy.en.English()
    for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
        doc = Doc(nlp.vocab)
        doc.from_bytes(byte_string)
def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc
    loc = '/tmp/test_serialize.bin'
    # Write two serialized docs back to back into a single binary file.
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    # Doc.read_bytes yields one byte string per doc in the file.
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2
def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc
    loc = 'test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2
import io
from os import path

import spacy.en
from spacy.tokens.doc import Doc

def load_and_transform(batch_id, in_loc, out_dir):
    out_loc = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(out_loc):
        return None  # this batch has already been processed
    print('Batch', batch_id)
    # Only the vocab is needed to reconstruct docs, so disable the pipeline.
    nlp = spacy.en.English(parser=False, tagger=False, matcher=False, entity=False)
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        with io.open(in_loc, 'rb') as in_file:
            for byte_string in Doc.read_bytes(in_file):
                doc = Doc(nlp.vocab).from_bytes(byte_string)
                doc.is_parsed = True
                # transform_doc is assumed to be defined elsewhere in the module.
                out_file.write(transform_doc(doc))
def __iter__(self, week=None): with open(self.path + ".info") as info: with open(self.path + ".title.bin") as title_bin: for byte_string in Doc.read_bytes(title_bin): info_line = info.readline() comment_info = self._parse_info(info_line) if not (week is None) and get_week(comment_info["timestamp"]) != week: continue if self.clean_deleted and comment_info["author"] == "[deleted]": continue if self.clean_bots and (is_bot(comment_info["author"]) or comment_info["author"] in FILTERED_USERS): continue comment_info["doc"] = Doc(self._vocab).from_bytes(byte_string) yield comment_info
def read_spacy_docs(spacy_vocab, filepath):
    """
    Stream ``spacy.Doc`` s from disk at ``filepath`` where they were serialized
    using Spacy's ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_vocab (``spacy.Vocab``): the spacy vocab object used to serialize
            the docs in ``filepath``
        filepath (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        the next deserialized ``spacy.Doc``
    """
    with open_sesame(filepath, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
import io

from spacy.tokens.doc import Doc as SpacyDoc

def read_spacy_docs(spacy_vocab, filename):
    """
    Stream ``spacy.Doc`` s from disk at ``filename`` where they were serialized
    using Spacy's ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_vocab (``spacy.Vocab``): the spacy vocab object used to serialize
            the docs in ``filename``
        filename (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        the next deserialized ``spacy.Doc``
    """
    with io.open(filename, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
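# A minimal usage sketch for read_spacy_docs above, assuming spaCy 1.x (where
# ``Doc.read_bytes`` exists). The names demo_read_spacy_docs and 'corpus.bin'
# are hypothetical; the file is first written with ``to_bytes()`` so the same
# vocab is used to serialize and deserialize.
def demo_read_spacy_docs():
    import spacy
    nlp = spacy.load('en')
    # Write two docs back to back into one hypothetical binary file.
    with open('corpus.bin', 'wb') as f:
        for text in (u'First document.', u'Second document.'):
            f.write(nlp(text).to_bytes())
    # Stream them back as fully deserialized Doc objects.
    for doc in read_spacy_docs(nlp.vocab, 'corpus.bin'):
        print(doc.text)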
def __iter__(self, week=None): with open(self.path + ".bin", "rb") as bin: with open(self.path + ".info") as info: for byte_string in Doc.read_bytes(bin): comment_info = self._parse_info(info.next()) if (not week is None) and get_week(comment_info["timestamp"]) != week: continue if self.clean_deleted and comment_info["author"] == "[deleted]": continue if self.clean_bots and (is_bot(comment_info["author"]) or comment_info["author"] in FILTERED_USERS): continue doc = Doc(self._vocab).from_bytes(byte_string) comment_info["doc"] = doc comment_info["text"] = self._text_from_doc(doc) yield comment_info
def dependency_labels_to_root(token):
    # Walk up the dependency tree, collecting the arc label at each step.
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels

for sentence in doc.sents:
    for token in sentence:
        print(token)
        print(token.orth)
        dep_labels = dependency_labels_to_root(token)
        print(dep_labels)
        for dep_label in dep_labels:
            print(nlp.vocab.strings[dep_label])

doc = nlp(u"Mr. Best flew to New York on Saturday morning.")
for ent in doc.ents:
    print(ent, ent.label_, ent.orth_)
    print(ent.root, ent.root.head, ent.root.head.pos,
          nlp.vocab.strings[ent.root.head.pos], ent.root.head.lemma_)

from spacy.tokens.doc import Doc

# Round-trip the doc through its binary serialization.
byte_string = doc.to_bytes()
with open('moby_dick.bin', 'wb') as f:
    f.write(byte_string)

doc = Doc(nlp.vocab)
for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
    doc.from_bytes(byte_string)
print(doc)
def read_doc(spacy_fname, nlp):
    print('reading ' + spacy_fname)
    # Take only the first serialized doc in the file.
    byte_string = next(Doc.read_bytes(open(spacy_fname, 'rb')))
    doc = Doc(nlp.vocab)
    doc.from_bytes(byte_string)
    return doc