def __call__(self, doc: Doc):
    save_parsed = doc.is_parsed
    doc.is_parsed = False
    if self.split_matcher:
        matches = self.split_matcher(doc)
        for match_id, start, end in matches:
            token = doc[end - 1]
            token.is_sent_start = True
            if end - 2 >= 0 and doc[end - 2].is_sent_start is True:
                doc[end - 2].is_sent_start = False
    if self.join_matcher:
        matches = self.join_matcher(doc)
        for match_id, start, end in matches:
            # If there is a sent start in the match, just remove it
            for token in doc[start:end]:
                if token.is_sent_start:
                    token.is_sent_start = False
    if doc.is_sentenced:
        # Trim starting spaces
        for sent in doc.sents:
            sentlen = len(sent)
            first_non_space = 0
            while first_non_space < sentlen and sent[first_non_space].is_space:
                first_non_space += 1
            if first_non_space > 0 and first_non_space < sentlen:
                sent[0].is_sent_start = False
                sent[first_non_space].is_sent_start = True
    doc.is_parsed = save_parsed if doc.is_sentenced else True
    return doc

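# A minimal standalone sketch of the same idea, assuming spaCy 2.x (where
# Token.is_sent_start is only writable while doc.is_parsed is False, which is
# why the component above toggles it). The pattern, component name and model
# name below are illustrative, not taken from the original component.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm', disable=['parser'])  # no parser: boundaries are set manually

split_matcher = Matcher(nlp.vocab)
# a semicolon followed by any token: the token after ';' starts a new sentence
split_matcher.add('SPLIT_ON_SEMICOLON', None, [{'ORTH': ';'}, {}])

def split_on_semicolon(doc):
    for match_id, start, end in split_matcher(doc):
        doc[end - 1].is_sent_start = True
    return doc

nlp.add_pipe(split_on_semicolon)
doc = nlp('First clause; second clause.')
print([sent.text for sent in doc.sents])
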
def handle(self, *args, **options):
    spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER,
                             disable=['parser', 'tagger', 'textcat'])
    Span.set_extension('is_phone', getter=Command.is_phone_getter, force=True)
    Span.set_extension('line_number', getter=Command.line_number_getter, force=True)
    Doc.set_extension('lines', getter=Command.get_lines, setter=Command.set_lines)
    Doc.set_extension('_lines', default=list())
    logger.debug("Loaded spacy server")
    main_socks, read_socks, write_socks = socket_bind('', settings.SPACY_PORT)
    while True:
        readable, writeable, exceptions = select(read_socks, write_socks, [])
        for sockobj in readable:
            if sockobj in main_socks:
                new_sock, address = sockobj.accept()
                logger.debug('Connect: %s - %s', address, id(new_sock))
                read_socks.append(new_sock)
            else:
                try:
                    entities = []
                    data = recv_end(sockobj)
                    if not data:
                        sockobj.close()
                        read_socks.remove(sockobj)
                    else:
                        for doc in spacy_model.pipe([data]):
                            doc._.lines = [x.start() for x in re.finditer('\n', doc.text)]
                            for ent in doc.ents:
                                current_entity = self.get_ent(ent)
                                if current_entity:
                                    entities.append(current_entity)
                        sockobj.sendall(json.dumps(entities).encode('utf8')
                                        + '--end--'.encode('utf8'))
                except Exception:
                    # Keep the server loop alive if a single request fails;
                    # ideally this would log the error instead of passing silently.
                    pass

def test_match_zero(matcher):
    matcher.add('Quote', '', {}, [[{'ORTH': '"'},
                                   {'OP': '!', 'IS_PUNCT': True},
                                   {'OP': '!', 'IS_PUNCT': True},
                                   {'ORTH': '"'}]])
    doc = Doc(matcher.vocab, words='He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1
    doc = Doc(matcher.vocab, words='He said , " some three words " ...'.split())
    assert len(matcher(doc)) == 0
    matcher.add('Quote', '', {}, [[{'ORTH': '"'},
                                   {'IS_PUNCT': True},
                                   {'IS_PUNCT': True},
                                   {'IS_PUNCT': True},
                                   {'ORTH': '"'}]])
    assert len(matcher(doc)) == 0

def _make_span(self, doc: Doc, start: int, end: int, label: str,
               is_char: bool, retok: bool):
    span: Span
    if is_char:
        if label is None:
            span = doc.char_span(start, end)
        else:
            span = doc.char_span(start, end, label=label)
    else:
        if label is None:
            span = Span(doc, start, end)
        else:
            span = Span(doc, start, end, label=label)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'span ({start}, {end}) for {label}: {span}')
    if span is not None:
        # this is a span object or None if the match doesn't map to a valid
        # token sequence
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'match: {span.text}')
        if label is not None:
            doc.ents += (span, )
        if retok:
            # https://github.com/explosion/spaCy/discussions/4806
            with doc.retokenize() as retokenizer:
                # Iterate over all spans and merge them into one
                # token. This is done after setting the entities –
                # otherwise, it would cause mismatched indices!
                retokenizer.merge(span)

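# A minimal sketch of the ordering the comment above describes (register the
# entity first, then merge the span), using only a blank English pipeline; the
# text, offsets and label are illustrative assumptions.
import spacy

nlp = spacy.blank('en')
doc = nlp('Contact ACME Corp for details.')

span = doc.char_span(8, 17, label='ORG')    # "ACME Corp"
if span is not None:                        # None if offsets don't align with token boundaries
    doc.ents += (span,)                     # set the entity first ...
    with doc.retokenize() as retokenizer:
        retokenizer.merge(span)             # ... then merge it into a single token

print([(ent.text, ent.label_) for ent in doc.ents])
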
def __init__(self, nlp: Language = None, support_overlap: bool = False,
             log_level: int = logging.WARNING, encoding: str = None,
             doc_name_depth: int = 0, **kwargs):
    """
    @param nlp: spaCy Language model
    @param support_overlap: whether overlapped annotations need to be supported
    @param log_level: logging level configuration
    @param encoding: txt encoding
    @param doc_name_depth: depth of parent directories to add into doc_name
            default is 0: only use file name
            1: use 1 level parent directory name + file name
            -1: use full absolute path
            If you are dealing with multiple directories, this is helpful to
            locate the original files
    @param kwargs: other parameters
    """
    for param_name, value in kwargs.items():
        setattr(self, param_name, value)
    if nlp is None:
        raise NameError('parameter "nlp" needs to be defined')
    self.nlp = nlp
    self.encoding = encoding
    self.doc_name_depth = doc_name_depth
    self.support_overlap = support_overlap
    self.set_logger(log_level)
    if not Doc.has_extension('doc_name'):
        Doc.set_extension('doc_name', default='')

def get_spacy(self, load_from_file=False, model_name='en_core_web_sm'):
    import spacy
    global nlp
    if not nlp:
        #print('>> loading spacy...')
        nlp = spacy.load(model_name)

    doc = None
    if self.parsed and load_from_file:
        #print self.fnfn_spacy
        from spacy.tokens.doc import Doc
        try:
            for byte_string in Doc.read_bytes(open(self.fnfn_spacy, 'rb')):
                doc = Doc(nlp.vocab)
                doc.from_bytes(byte_string)
        except UnicodeDecodeError:
            print("!! UNICODE ERROR:", self.fnfn_spacy)
    #else:
    if not doc:
        #print '>> making spacy document for text', self.id
        txt = self.text
        txt = clean_text(txt)
        doc = nlp(txt)
    return doc

def __init__(self):
    super().__init__()
    if not Doc.has_extension(self.name):
        Doc.set_extension(self.name, default=[])
    if not Token.has_extension('is_lexical'):
        Token.set_extension('is_lexical', default=False)

def add_span_extensions():
    Doc.set_extension("relations", default=None)
    Doc.set_extension("entities", default=None)
    for span_extension in ['entity_type', 'entity_id', 'foodon', 'hansard',
                           'hansardClosest', 'hansardParent', 'snomedct', 'synonyms']:
        Span.set_extension(span_extension, default=None)

def make_doc_from_text_chunks(text, lang, chunk_size=100000):
    """
    Make a single spaCy-processed document from 1 or more chunks of ``text``.
    This is a workaround for processing very long texts, for which spaCy
    is unable to allocate enough RAM.

    Although this function's performance is *pretty good*, it's inherently
    less performant than just processing the entire text in one shot.
    Only use it if necessary!

    Args:
        text (str): Text document to be chunked and processed by spaCy.
        lang (str or ``spacy.Language``): A 2-letter language code (e.g. "en"),
            the name of a spaCy model for the desired language, or
            an already-instantiated spaCy language pipeline.
        chunk_size (int): Number of characters comprising each text chunk
            (excluding the last chunk, which is probably smaller). For best
            performance, value should be somewhere between 1e3 and 1e7,
            depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunk edges probably
               won't respect natural language segmentation, which means that
               every ``chunk_size`` characters, spaCy will probably get tripped
               up and make weird parsing errors.

    Returns:
        ``spacy.Doc``: A single processed document, initialized from
        components accumulated chunk by chunk.
    """
    if isinstance(lang, compat.unicode_):
        lang = cache.load_spacy(lang)
    elif not isinstance(lang, SpacyLang):
        raise TypeError('`lang` must be {}, not {}'.format(
            {compat.unicode_, SpacyLang}, type(lang)))
    words = []
    spaces = []
    np_arrays = []
    cols = [attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB, attrs.ENT_TYPE]
    text_len = len(text)
    i = 0
    # iterate over text chunks and accumulate components needed to make a doc
    while i < text_len:
        chunk_doc = lang(text[i:i + chunk_size])
        words.extend(tok.text for tok in chunk_doc)
        spaces.extend(bool(tok.whitespace_) for tok in chunk_doc)
        np_arrays.append(chunk_doc.to_array(cols))
        i += chunk_size
    # now, initialize the doc from words and spaces
    # then load attribute values from the concatenated np array
    doc = SpacyDoc(lang.vocab, words=words, spaces=spaces)
    doc = doc.from_array(cols, np.concatenate(np_arrays, axis=0))
    return doc

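# A hypothetical call of the helper above, assuming the `en_core_web_sm` model
# is installed; the tiny chunk_size is only there to force several chunks on
# toy text.
import spacy

nlp = spacy.load('en_core_web_sm')
long_text = 'This is one fairly short sentence. ' * 3000   # roughly 100k characters
doc = make_doc_from_text_chunks(long_text, lang=nlp, chunk_size=10000)
print(len(doc), doc[:8].text)
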
def __init__(self, clf, extension='score'):
    """
    :type clf: Classifier, needs to have a predict(X) function
    """
    self.clf = clf
    self.extension = extension
    if not Doc.has_extension(extension):
        Doc.set_extension(extension, default=-1)

def read_docs(filepath):
    """Deserialize a list of documents + associated metadata"""
    spacy_parser = get_spacy_parser()
    with open(filepath, 'rb') as f:
        data = pickle.load(f)
    for row in data:
        doc = Doc(spacy_parser.vocab)
        # read doc object from serialized byte array
        row['content'] = doc.from_bytes(row.pop('binary_content'))
    return data

def test_efficient_binary_serialization(doc):
    from spacy.tokens.doc import Doc
    byte_string = doc.to_bytes()
    open('moby_dick.bin', 'wb').write(byte_string)

    nlp = spacy.en.English()
    for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
        doc = Doc(nlp.vocab)
        doc.from_bytes(byte_string)

def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc
    loc = 'test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2

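# Doc.read_bytes is a spaCy 1.x API. For comparison, a roughly equivalent
# round trip in spaCy 2.2+/3.x uses DocBin; a sketch assuming `en_core_web_sm`
# is installed and the file name is arbitrary.
import spacy
from spacy.tokens import DocBin

nlp = spacy.load('en_core_web_sm')
doc_bin = DocBin(store_user_data=True)
doc_bin.add(nlp('This is a document.'))
doc_bin.add(nlp('This is another.'))

with open('test_serialize.spacy', 'wb') as f:
    f.write(doc_bin.to_bytes())

with open('test_serialize.spacy', 'rb') as f:
    docs = list(DocBin().from_bytes(f.read()).get_docs(nlp.vocab))
assert len(docs) == 2
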
def deserialize_dataset(file_path, max_items):
    vocab = spacy.load('en_default').vocab
    with open(file_path, 'rb') as input_file:
        for data_point in pickle.load(input_file)[:max_items]:
            yield {
                'question1': Doc(vocab).from_bytes(data_point['question1']),
                'question2': Doc(vocab).from_bytes(data_point['question2']),
                'id': data_point['id'],
                'is_duplicate': data_point.get('is_duplicate', None)
            }

def load_and_transform(batch_id, in_loc, out_dir):
    out_loc = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(out_loc):
        return None
    print('Batch', batch_id)
    nlp = spacy.en.English(parser=False, tagger=False, matcher=False, entity=False)
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        with io.open(in_loc, 'rb') as in_file:
            for byte_string in Doc.read_bytes(in_file):
                doc = Doc(nlp.vocab).from_bytes(byte_string)
                doc.is_parsed = True
                out_file.write(transform_doc(doc))

def test_docs_to_sents_df(self):
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")
    dir_reader = EhostDirReader(nlp=self.nlp, support_overlap=False, recursive=True,
                                schema_file='data/ehost_test_corpus/config/projectschema.xml')
    docs = dir_reader.read(txt_dir='data/ehost_test_corpus/')
    df = Vectorizer.docs_to_sents_df(docs, type_filter=set(), track_doc_name=True)
    print(df)
    assert (df.shape[0] == 12)
    df = Vectorizer.docs_to_sents_df(docs, type_filter=set())
    print(df)
    df = Vectorizer.docs_to_sents_df(docs, sent_window=2)
    assert (df.shape[0] == 20)

def __iter__(self, week=None): with open(self.path + ".info") as info: with open(self.path + ".title.bin") as title_bin: for byte_string in Doc.read_bytes(title_bin): info_line = info.readline() comment_info = self._parse_info(info_line) if not (week is None) and get_week(comment_info["timestamp"]) != week: continue if self.clean_deleted and comment_info["author"] == "[deleted]": continue if self.clean_bots and (is_bot(comment_info["author"]) or comment_info["author"] in FILTERED_USERS): continue comment_info["doc"] = Doc(self._vocab).from_bytes(byte_string) yield comment_info
def make_docs(nlp, batch, heads=True):
    docs = []
    for record in batch:
        text = record["text"]
        if "tokens" in record:
            doc = Doc(nlp.vocab, words=record["tokens"])
        else:
            doc = nlp.make_doc(text)
        if "heads" in record:
            heads = record["heads"]
            heads = numpy.asarray(heads, dtype="uint64")
            heads = heads.reshape((len(doc), 1))
            doc = doc.from_array([HEAD], heads)
        if len(doc) >= 1 and len(doc) < 200:
            docs.append(doc)
    return docs

def get_tokens(model: Language, doc_id: int):
    fn = os.path.join(settings.TOKEN_DIR, str(doc_id))
    if not os.path.exists(fn):
        raise ValueError(
            "Document {doc_id} has not been preprocessed ({fn} does not exist)"
            .format(**locals()))
    return Doc(model.vocab).from_disk(fn)

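# A hypothetical write-side counterpart to get_tokens(), assuming spaCy 2+
# (Doc.to_disk / Doc.from_disk); `save_tokens` and the `token_dir` parameter
# are illustrative and not part of the original code.
import os
from spacy.language import Language

def save_tokens(model: Language, doc_id: int, text: str, token_dir: str) -> None:
    """Serialize the processed Doc so that get_tokens() can load it later."""
    doc = model(text)
    doc.to_disk(os.path.join(token_dir, str(doc_id)))
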
def __iter__(self, week=None): with open(self.path + ".bin", "rb") as bin: with open(self.path + ".info") as info: for byte_string in Doc.read_bytes(bin): comment_info = self._parse_info(info.next()) if (not week is None) and get_week(comment_info["timestamp"]) != week: continue if self.clean_deleted and comment_info["author"] == "[deleted]": continue if self.clean_bots and (is_bot(comment_info["author"]) or comment_info["author"] in FILTERED_USERS): continue doc = Doc(self._vocab).from_bytes(byte_string) comment_info["doc"] = doc comment_info["text"] = self._text_from_doc(doc) yield comment_info
def decode(self, output: TaskOutput) -> TaskOutput:
    # The dims are: batch, top_k, tags
    output.tags: List[List[List[str]]] = [
        self._decode_tags(paths) for paths in output.viterbi_paths
    ]
    output.scores: List[List[float]] = [
        [score for tags, score in paths] for paths in output.viterbi_paths
    ]

    output.entities: List[List[List[Dict]]] = []
    output.tokens: List[List[Dict]] = []
    # iterate over batch
    for raw_text, k_tags in zip(output.raw_text, output.tags):
        pre_tokenized = not isinstance(raw_text, str)
        if pre_tokenized:
            # compose spacy doc from tokens
            doc = Doc(Vocab(), words=raw_text)
        else:
            doc = self.backbone.tokenizer.nlp(raw_text)
        output.entities.append(self._decode_entities(doc, k_tags, pre_tokenized))
        output.tokens.append(self._decode_tokens(doc) if not pre_tokenized else None)

    if not any(output.tokens):
        # drop tokens field if no data
        del output.tokens

    del output.logits
    del output.mask
    del output.probs
    del output.raw_text
    del output.viterbi_paths

    return output

def test_get_entity_via_match(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add_entity('TestEntity', attrs={u'Hello': u'World'})
    assert matcher.n_patterns == 0
    assert matcher(Doc(en_vocab, words=[u'Test', u'Entity'])) == []
    matcher.add_pattern(u'TestEntity', [{ORTH: u'Test'}, {ORTH: u'Entity'}])
    assert matcher.n_patterns == 1
    matches = matcher(Doc(en_vocab, words=[u'Test', u'Entity']))
    assert len(matches) == 1
    assert len(matches[0]) == 4
    ent_id, label, start, end = matches[0]
    assert ent_id == matcher.vocab.strings[u'TestEntity']
    assert label == 0
    assert start == 0
    assert end == 2
    attrs = matcher.get_entity(ent_id)
    assert attrs == {u'Hello': u'World'}

def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
    doc = Doc(en_parser.vocab, words=text)
    assert len(doc) == length
    with en_parser.step_through(doc) as _:  # noqa: F841
        pass
    assert doc[0].is_space
    for token in doc:
        assert token.head.i == length - 1

def extract_entity(self, doc: Doc) -> List[Span]:
    food_spans = []
    for food in self.food_names:
        food_index = doc.text.lower().find(food)
        if food_index > -1:
            span = doc.char_span(food_index, food_index + len(food))
            # char_span returns None if the character offsets don't align with
            # token boundaries; skip those cases so only Span objects are returned
            if span is not None:
                food_spans.append(span)
    return food_spans

def __init__(self, first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
             last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
    self.token_extension_name = self.TOKEN_EXTENSION_NAME
    self.span_extension_name = self.SPAN_EXTENSION_NAME
    self.doc_extension_name = self.DOC_EXTENSION_NAME
    self.first_name_extension_name = first_name_extension_name
    self.last_name_extension_name = last_name_extension_name

    if not Token.has_extension(self.token_extension_name):
        Token.set_extension(self.token_extension_name, default=self.ANOT_NONE)
    if not Span.has_extension(self.span_extension_name):
        Span.set_extension(self.span_extension_name, getter=self.is_full_name_getter)
    if not Doc.has_extension(self.doc_extension_name):
        Doc.set_extension(self.doc_extension_name, default=[])

def test_match_zero_plus(matcher):
    matcher.add('Quote', '', {}, [[{'ORTH': '"'},
                                   {'OP': '*', 'IS_PUNCT': False},
                                   {'ORTH': '"'}]])
    doc = Doc(matcher.vocab, 'He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1

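# The four-argument matcher.add('Quote', '', {}, patterns) form above is the
# spaCy 1.x API. The same zero-or-more quote pattern with the spaCy 3.x Matcher
# would look roughly like this (a sketch on a blank English vocab):
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
# opening quote, any number of non-punctuation tokens, closing quote
matcher.add('Quote', [[{'ORTH': '"'}, {'IS_PUNCT': False, 'OP': '*'}, {'ORTH': '"'}]])

words = 'He said , " some words " ...'.split()
doc = Doc(nlp.vocab, words=words)
print([doc[start:end].text for match_id, start, end in matcher(doc)])
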
def forward(texts, drop=0.):
    if tokenized:
        docs = [Doc(nlp.vocab, words) for words in texts]
    else:
        docs = [nlp(text) for text in texts]
    features = [doc.to_array(attrs) for doc in docs]

    def backward(d_features, sgd=None):
        return d_features

    return features, backward

def __call__(self, doc: Doc):
    save_parsed = doc.is_parsed
    doc.is_parsed = False
    if self.split_matcher:
        matches = self.split_matcher(doc)
        for match_id, start, end in matches:
            token = doc[end - 1]
            token.is_sent_start = True
            if end - 2 >= 0 and doc[end - 2].is_sent_start is True:
                doc[end - 2].is_sent_start = False
    if self.join_matcher:
        matches = self.join_matcher(doc)
        for match_id, start, end in matches:
            # If there is a sent start in the match, just remove it
            for token in doc[start:end]:
                if token.is_sent_start:
                    token.is_sent_start = False
    doc.is_parsed = save_parsed if doc.is_sentenced else True
    return doc

def process_non_content_bearing_samples(
        self, empty_samples: List[Tuple[int, Text]]) -> List[Tuple[int, "Doc"]]:
    """Creates empty Doc objects from zero-length training sample strings."""
    from spacy.tokens import Doc

    n_docs = [
        (empty_sample[0], doc)
        for empty_sample, doc in zip(
            empty_samples, [Doc(self.nlp.vocab) for doc in empty_samples])
    ]
    return n_docs

def test_override_sentiment(EN):
    '''Test new span.sentiment property's default averaging behaviour'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0

    doc = Doc(EN.vocab, [u'good', 'stuff', u'bad', u'stuff'])
    doc.user_span_hooks['sentiment'] = lambda span: 10.0

    good_stuff = doc[:2]
    assert good_stuff.sentiment == 10.0

    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == 10.0

    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == 10.0

def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc
    loc = '/tmp/test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2

def read_spacy_docs(spacy_vocab, filename):
    """
    Stream ``spacy.Doc`` s from disk at ``filename`` where they were serialized
    using Spacy's ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_vocab (``spacy.Vocab``): the spacy vocab object used to serialize
            the docs in ``filename``
        filename (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        the next deserialized ``spacy.Doc``
    """
    with io.open(filename, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)

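# Hypothetical usage of the generator above, assuming 'docs.bin' was written by
# appending Doc.to_bytes() payloads under the same vocab, and that a spaCy 1.x
# model is linked under the old 'en' shortcut (both are assumptions).
import spacy

vocab = spacy.load('en').vocab
for spacy_doc in read_spacy_docs(vocab, 'docs.bin'):
    print(len(spacy_doc), spacy_doc[:5].text)
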
def dependency_labels_to_root(token):
    # Walk up the dependency tree, collecting the arc labels on the way to ROOT
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels

for sentence in doc.sents:
    for token in sentence:
        print(token)
        print(token.orth)
        dep_labels = dependency_labels_to_root(token)
        print(dep_labels)
        for dep_label in dep_labels:
            print(nlp.vocab.strings[dep_label])

doc = nlp(u"Mr. Best flew to New York on Saturday morning.")
for ent in doc.ents:
    print(ent, ent.label_, ent.orth_)
    print(ent.root, ent.root.head, ent.root.head.pos,
          nlp.vocab.strings[ent.root.head.pos], ent.root.head.lemma_)

from spacy.tokens.doc import Doc

byte_string = doc.to_bytes()
open('moby_dick.bin', 'wb').write(byte_string)

doc = Doc(nlp.vocab)
for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
    doc.from_bytes(byte_string)
print(doc)