def get_spacy(self, load_from_file=False, model_name='en_core_web_sm'):
    import spacy
    global nlp  # assumes a module-level `nlp = None` cache
    if not nlp:
        #print('>> loading spacy...')
        nlp = spacy.load(model_name)

    doc = None
    if self.parsed and load_from_file:
        #print(self.fnfn_spacy)
        from spacy.tokens.doc import Doc
        try:
            # spaCy 1.x: Doc.read_bytes yields one byte string per doc stored in the file
            for byte_string in Doc.read_bytes(open(self.fnfn_spacy, 'rb')):
                doc = Doc(nlp.vocab)
                doc.from_bytes(byte_string)
        except UnicodeDecodeError:
            print("!! UNICODE ERROR:", self.fnfn_spacy)

    if not doc:
        #print('>> making spacy document for text', self.id)
        txt = self.text
        txt = clean_text(txt)
        doc = nlp(txt)
    return doc
def test_efficient_binary_serialization(doc):
    import spacy.en
    from spacy.tokens.doc import Doc

    # serialize the parsed doc to a single byte string and write it out
    byte_string = doc.to_bytes()
    open('moby_dick.bin', 'wb').write(byte_string)

    # reload: Doc.read_bytes (spaCy 1.x) streams the byte strings back from the file
    nlp = spacy.en.English()
    for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
        doc = Doc(nlp.vocab)
        doc.from_bytes(byte_string)
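# A minimal invocation sketch, assuming a spaCy 1.x install with the English
# data downloaded; the input sentence is a placeholder, not from the original.
import spacy.en

nlp = spacy.en.English()
doc = nlp(u'All human beings are born free and equal in dignity and rights.')
test_efficient_binary_serialization(doc)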
import pickle

from spacy.tokens.doc import Doc

def read_docs(filepath):
    """Deserialize a list of documents + associated metadata"""
    spacy_parser = get_spacy_parser()  # helper defined elsewhere; returns the shared nlp object
    data = pickle.load(open(filepath, 'rb'))
    for row in data:
        doc = Doc(spacy_parser.vocab)
        # read doc object from serialized byte array
        row['content'] = doc.from_bytes(row.pop('binary_content'))
    return data
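# For context, a matching serializer might look like the sketch below. write_docs
# is a hypothetical counterpart (not in the original) that mirrors the row shape
# read_docs expects: each row is a dict whose 'content' value is a spaCy Doc.
import pickle

def write_docs(filepath, rows):
    data = []
    for row in rows:
        row = dict(row)  # leave the caller's row untouched
        # replace the Doc with its serialized byte array under 'binary_content'
        row['binary_content'] = row.pop('content').to_bytes()
        data.append(row)
    with open(filepath, 'wb') as f:
        pickle.dump(data, f)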
def dependency_labels_to_root(token):
    """Walk up the syntactic tree, collecting the arc labels to the root."""
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels

# `nlp` and `doc` are assumed to exist from earlier in the session
for sentence in doc.sents:
    for token in sentence:
        print(token)
        print(token.orth)
        dep_labels = dependency_labels_to_root(token)
        print(dep_labels)
        for dep_label in dep_labels:
            print(nlp.vocab.strings[dep_label])

doc = nlp(u"Mr. Best flew to New York on Saturday morning.")
for ent in doc.ents:
    print(ent, ent.label_, ent.orth_)
    print(ent.root, ent.root.head, ent.root.head.pos,
          nlp.vocab.strings[ent.root.head.pos], ent.root.head.lemma_)

from spacy.tokens.doc import Doc
byte_string = doc.to_bytes()
open('moby_dick.bin', 'wb').write(byte_string)

for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
    doc = Doc(nlp.vocab)  # a fresh Doc per byte string; from_bytes fills an empty doc
    doc.from_bytes(byte_string)
print(doc)
from spacy.tokens.doc import Doc

def read_doc(spacy_fname, nlp):
    print('reading ' + spacy_fname)
    # the file holds exactly one serialized doc, written as a single byte string
    byte_string = open(spacy_fname, 'rb').read()
    doc = Doc(nlp.vocab)
    doc.from_bytes(byte_string)
    return doc
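# A minimal counterpart producing the file read_doc consumes, assuming the
# one-doc-per-file layout above; write_doc is a hypothetical name.
def write_doc(spacy_fname, doc):
    with open(spacy_fname, 'wb') as f:
        f.write(doc.to_bytes())  # whole doc as one byte string, read back via .read()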