Example #1
    def __call__(self, doc: Doc):
        save_parsed = doc.is_parsed
        doc.is_parsed = False
        if self.split_matcher:
            matches = self.split_matcher(doc)
            for match_id, start, end in matches:
                token = doc[end - 1]
                token.is_sent_start = True
                if end - 2 >= 0 and doc[end - 2].is_sent_start is True:
                    doc[end - 2].is_sent_start = False
        if self.join_matcher:
            matches = self.join_matcher(doc)
            for match_id, start, end in matches:
                # If there is a sent start in the match, just remove it
                for token in doc[start:end]:
                    if token.is_sent_start:
                        token.is_sent_start = False
        if doc.is_sentenced:
            # Trim starting spaces
            sent_start = None
            for sent in doc.sents:
                sentlen = len(sent)
                first_non_space = 0
                while first_non_space < sentlen and sent[
                        first_non_space].is_space:
                    first_non_space += 1
                if first_non_space > 0 and first_non_space < sentlen:
                    sent[0].is_sent_start = False
                    sent[first_non_space].is_sent_start = True

        doc.is_parsed = save_parsed if doc.is_sentenced else True
        return doc
Example #2
    def handle(self, *args, **options):
        spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER, disable=['parser', 'tagger', 'textcat'])
        Span.set_extension('is_phone', getter=Command.is_phone_getter, force=True)
        Span.set_extension('line_number', getter=Command.line_number_getter, force=True)
        Doc.set_extension('lines', getter=Command.get_lines, setter=Command.set_lines)
        Doc.set_extension('_lines', default=list())

        logger.debug("Loaded spacy server")
        main_socks, read_socks, write_socks = socket_bind('', settings.SPACY_PORT)
        while True:
            readable, writeable, exceptions = select(read_socks, write_socks, [])
            for sockobj in readable:
                if sockobj in main_socks:
                    new_sock, address = sockobj.accept()
                    logger.debug('Connect: %s - %s', address, id(new_sock))
                    read_socks.append(new_sock)
                else:
                    try:
                        entities = []
                        data = recv_end(sockobj)
                        if not data:
                            sockobj.close()
                            read_socks.remove(sockobj)
                        else:
                            for doc in spacy_model.pipe([data]):
                                doc._.lines = [x.start() for x in re.finditer('\n', doc.text)]
                                for ent in doc.ents:
                                    current_entity = self.get_ent(ent)
                                    if current_entity:
                                        entities.append(current_entity)

                            sockobj.sendall(json.dumps(entities).encode('utf8') + '--end--'.encode('utf8'))
                    except Exception:
                        # Don't let one bad request kill the server loop, but
                        # avoid a bare except that would also swallow
                        # KeyboardInterrupt and SystemExit
                        logger.exception('Failed to process socket data')
Example #3
def test_match_zero(matcher):
    matcher.add('Quote', '', {}, [[{
        'ORTH': '"'
    }, {
        'OP': '!',
        'IS_PUNCT': True
    }, {
        'OP': '!',
        'IS_PUNCT': True
    }, {
        'ORTH': '"'
    }]])
    doc = Doc(matcher.vocab, words='He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1
    doc = Doc(matcher.vocab,
              words='He said , " some three words " ...'.split())
    assert len(matcher(doc)) == 0
    matcher.add('Quote', '', {}, [[{
        'ORTH': '"'
    }, {
        'IS_PUNCT': True
    }, {
        'IS_PUNCT': True
    }, {
        'IS_PUNCT': True
    }, {
        'ORTH': '"'
    }]])
    assert len(matcher(doc)) == 0
Example #4
 def _make_span(self, doc: Doc, start: int, end: int, label: str,
                is_char: bool, retok: bool):
     span: Span
     if is_char:
         if label is None:
             span = doc.char_span(start, end)
         else:
             span = doc.char_span(start, end, label=label)
     else:
         if label is None:
             span = Span(doc, start, end)
         else:
             span = Span(doc, start, end, label=label)
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'span ({start}, {end}) for {label}: {span}')
     if span is not None:
         # this is a span object or none if match doesn't map to valid token
         # sequence
         if logger.isEnabledFor(logging.DEBUG):
             logger.debug(f'match: {span.text}')
         if label is not None:
             doc.ents += (span, )
         if retok:
             # https://github.com/explosion/spaCy/discussions/4806
             with doc.retokenize() as retokenizer:
                 # Iterate over all spans and merge them into one
                 # token. This is done after setting the entities –
                 # otherwise, it would cause mismatched indices!
                 retokenizer.merge(span)
Example #5
    def __init__(self,
                 nlp: Language = None,
                 support_overlap: bool = False,
                 log_level: int = logging.WARNING,
                 encoding: str = None,
                 doc_name_depth: int = 0,
                 **kwargs):
        """

        @param nlp: spaCy Language model
        @param support_overlap: whether overlapping annotations need to be supported
        @param log_level: logging level configuration
        @param encoding: text file encoding
        @param doc_name_depth: depth of parent directories to add into doc_name
                default is 0: only use the file name
                1: use 1 level of parent directory name + file name
                -1: use the full absolute path
                if you are dealing with multiple directories, this is helpful to
                locate the original files
        @param kwargs: other parameters
        """
        for param_name, value in kwargs.items():
            setattr(self, param_name, value)
        if nlp is None:
            raise NameError('parameter "nlp" needs to be defined')
        self.nlp = nlp
        self.encoding = encoding
        self.doc_name_depth = doc_name_depth
        self.support_overlap = support_overlap
        self.set_logger(log_level)
        if not Doc.has_extension('doc_name'):
            Doc.set_extension('doc_name', default='')
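
A minimal usage sketch for this constructor, assuming it belongs to a directory-based reader such as the EhostDirReader used in Example #17; the import path, the schema_file location, and the assumption that the subclass forwards these keyword arguments are all hypothetical:

import logging
import spacy
from medspacy_io.reader import EhostDirReader  # import path is an assumption

nlp = spacy.blank('en')
reader = EhostDirReader(nlp=nlp,
                        support_overlap=False,
                        log_level=logging.INFO,
                        doc_name_depth=1,  # prefix doc_name with one parent directory
                        schema_file='data/ehost_test_corpus/config/projectschema.xml')
docs = reader.read(txt_dir='data/ehost_test_corpus/')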
Example #6
	def get_spacy(self,load_from_file=False,model_name='en_core_web_sm'):
		import spacy
		global nlp
		if not nlp:
			#print('>> loading spacy...')
			nlp = spacy.load(model_name)

		doc=None
		if self.parsed and load_from_file:
			#print self.fnfn_spacy
			from spacy.tokens.doc import Doc

			try:
				for byte_string in Doc.read_bytes(open(self.fnfn_spacy, 'rb')):
					doc = Doc(nlp.vocab)
					doc.from_bytes(byte_string)
			except UnicodeDecodeError:
				print("!! UNICODE ERROR:",self.fnfn_spacy)
		#else:

		if not doc:
			#print '>> making spacy document for text',self.id
			txt=self.text
			txt=clean_text(txt)
			doc=nlp(txt)

		return doc
Example #7
    def __init__(self):
        super().__init__()

        if not Doc.has_extension(self.name):
            Doc.set_extension(self.name, default=[])

        if not Token.has_extension('is_lexical'):
            Token.set_extension('is_lexical', default=False)
Example #8
def add_span_extensions():
    Doc.set_extension("relations", default=None)
    Doc.set_extension("entities", default=None)
    for span_extension in [
            'entity_type', 'entity_id', 'foodon', 'hansard', 'hansardClosest',
            'hansardParent', 'snomedct', 'synonyms'
    ]:
        Span.set_extension(span_extension, default=None)
Example #9
def make_doc_from_text_chunks(text, lang, chunk_size=100000):
    """
    Make a single spaCy-processed document from 1 or more chunks of ``text``.
    This is a workaround for processing very long texts, for which spaCy
    is unable to allocate enough RAM.

    Although this function's performance is *pretty good*, it's inherently
    less performant than just processing the entire text in one shot.
    Only use it if necessary!

    Args:
        text (str): Text document to be chunked and processed by spaCy.
        lang (str or ``spacy.Language``): A 2-letter language code (e.g. "en"),
            the name of a spaCy model for the desired language, or
            an already-instantiated spaCy language pipeline.
        chunk_size (int): Number of characters comprising each text chunk
            (excluding the last chunk, which is probably smaller). For best
            performance, value should be somewhere between 1e3 and 1e7,
            depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunk edges probably
               won't respect natural language segmentation, which means that every
               ``chunk_size`` characters, spaCy will probably get tripped up and
               make weird parsing errors.

    Returns:
        ``spacy.Doc``: A single processed document, initialized from
        components accumulated chunk by chunk.
    """
    if isinstance(lang, compat.unicode_):
        lang = cache.load_spacy(lang)
    elif not isinstance(lang, SpacyLang):
        raise TypeError('`lang` must be {}, not {}'.format(
            {compat.unicode_, SpacyLang}, type(lang)))

    words = []
    spaces = []
    np_arrays = []
    cols = [
        attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB,
        attrs.ENT_TYPE
    ]
    text_len = len(text)
    i = 0
    # iterate over text chunks and accumulate components needed to make a doc
    while i < text_len:
        chunk_doc = lang(text[i:i + chunk_size])
        words.extend(tok.text for tok in chunk_doc)
        spaces.extend(bool(tok.whitespace_) for tok in chunk_doc)
        np_arrays.append(chunk_doc.to_array(cols))
        i += chunk_size
    # now, initialize the doc from words and spaces
    # then load attribute values from the concatenated np array
    doc = SpacyDoc(lang.vocab, words=words, spaces=spaces)
    doc = doc.from_array(cols, np.concatenate(np_arrays, axis=0))

    return doc
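
A brief usage sketch for the chunking helper above; the model name and the input file are placeholders, and per the docstring any ``spacy.Language`` instance can be passed as ``lang``:

import spacy

nlp = spacy.load('en_core_web_sm')  # or pass a 2-letter language code as `lang`
with open('big_corpus.txt', encoding='utf8') as f:  # placeholder input file
    very_long_text = f.read()

# A larger chunk_size means fewer chunk-boundary artifacts but higher peak RAM usage
doc = make_doc_from_text_chunks(very_long_text, lang=nlp, chunk_size=500000)
print(len(doc))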
Example #10
    def __init__(self, clf, extension='score'):
        """

        :type clf: Classifier, needs to have a predict(X) function
        """
        self.clf = clf
        self.extension = extension
        if not Doc.has_extension(extension):
            Doc.set_extension(extension, default=-1)
Example #11
def read_docs(filepath):
    """Deserialize a list of documents + associated metadata"""
    spacy_parser = get_spacy_parser()
    data = pickle.load(open(filepath, 'rb'))
    for row in data:
        doc = Doc(spacy_parser.vocab)
        # read doc object from serialized byte array
        row['content'] = doc.from_bytes(row.pop('binary_content'))
    return data
Example #12
def test_efficient_binary_serialization(doc):
    from spacy.tokens.doc import Doc

    byte_string = doc.to_bytes()
    open('moby_dick.bin', 'wb').write(byte_string)

    nlp = spacy.en.English()
    for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
        doc = Doc(nlp.vocab)
        doc.from_bytes(byte_string)
Example #13
def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc
    loc = 'test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2
Example #14
def deserialize_dataset(file_path, max_items):
    vocab = spacy.load('en_default').vocab

    with open(file_path, 'rb') as input_file:
        for data_point in pickle.load(input_file)[:max_items]:
            yield {
                'question1': Doc(vocab).from_bytes(data_point['question1']),
                'question2': Doc(vocab).from_bytes(data_point['question2']),
                'id': data_point['id'],
                'is_duplicate': data_point.get('is_duplicate', None)
            }
Example #15
def load_and_transform(batch_id, in_loc, out_dir):
    out_loc = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(out_loc):
        return None
    print('Batch', batch_id)
    nlp = spacy.en.English(parser=False, tagger=False, matcher=False, entity=False)
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        with io.open(in_loc, 'rb') as in_file:
            for byte_string in Doc.read_bytes(in_file):
                doc = Doc(nlp.vocab).from_bytes(byte_string)
                doc.is_parsed = True
                out_file.write(transform_doc(doc))
Example #17
    def test_docs_to_sents_df(self):
        if Doc.has_extension("concepts"):
            Doc.remove_extension("concepts")
        dir_reader = EhostDirReader(nlp=self.nlp, support_overlap=False,
                                    recursive=True,
                                    schema_file='data/ehost_test_corpus/config/projectschema.xml')

        docs = dir_reader.read(txt_dir='data/ehost_test_corpus/')
        df = Vectorizer.docs_to_sents_df(docs, type_filter=set(), track_doc_name=True)
        print(df)
        assert (df.shape[0] == 12)
        df = Vectorizer.docs_to_sents_df(docs, type_filter=set())
        print(df)
        df = Vectorizer.docs_to_sents_df(docs, sent_window=2)
        assert (df.shape[0] == 20)
Example #18
 def __iter__(self, week=None):
     with open(self.path + ".info") as info:
         with open(self.path + ".title.bin", "rb") as title_bin:
             for byte_string in Doc.read_bytes(title_bin):
                 info_line = info.readline()
                 comment_info = self._parse_info(info_line)
                 if not (week is None) and get_week(comment_info["timestamp"]) != week:
                     continue
                 if self.clean_deleted and comment_info["author"] == "[deleted]":
                     continue
                 if self.clean_bots and (is_bot(comment_info["author"]) or 
                     comment_info["author"] in FILTERED_USERS):
                     continue
                 comment_info["doc"] = Doc(self._vocab).from_bytes(byte_string)
                 yield comment_info
Example #19
def make_docs(nlp, batch, heads=True):
    docs = []
    for record in batch:
        text = record["text"]
        if "tokens" in record:
            doc = Doc(nlp.vocab, words=record["tokens"])
        else:
            doc = nlp.make_doc(text)
        if "heads" in record:
            heads = record["heads"]
            heads = numpy.asarray(heads, dtype="uint64")
            heads = heads.reshape((len(doc), 1))
            doc = doc.from_array([HEAD], heads)
        if len(doc) >= 1 and len(doc) < 200:
            docs.append(doc)
    return docs
Example #20
def get_tokens(model: Language, doc_id: int):
    fn = os.path.join(settings.TOKEN_DIR, str(doc_id))
    if not os.path.exists(fn):
        raise ValueError(
            "Document {doc_id} has not been preprocessed ({fn} does not exist)"
            .format(**locals()))
    return Doc(model.vocab).from_disk(fn)
Example #21
 def __iter__(self, week=None):
     with open(self.path + ".bin", "rb") as bin:
         with open(self.path + ".info")  as info:
             for byte_string in Doc.read_bytes(bin):
                 comment_info = self._parse_info(next(info))
                 if week is not None and get_week(comment_info["timestamp"]) != week:
                     continue
                 if self.clean_deleted and comment_info["author"] == "[deleted]":
                     continue
                 if self.clean_bots and (is_bot(comment_info["author"]) or 
                     comment_info["author"] in FILTERED_USERS):
                     continue
                 doc = Doc(self._vocab).from_bytes(byte_string)
                 comment_info["doc"] = doc
                 comment_info["text"] = self._text_from_doc(doc)
                 yield comment_info
Example #22
    def decode(self, output: TaskOutput) -> TaskOutput:
        # The dims are: batch, top_k, tags
        output.tags: List[List[List[str]]] = [
            self._decode_tags(paths) for paths in output.viterbi_paths
        ]
        output.scores: List[List[float]] = [[score for tags, score in paths]
                                            for paths in output.viterbi_paths]

        output.entities: List[List[List[Dict]]] = []
        output.tokens: List[List[Dict]] = []
        # iterate over batch
        for raw_text, k_tags in zip(output.raw_text, output.tags):
            pre_tokenized = not isinstance(raw_text, str)
            if pre_tokenized:
                # compose spacy doc from tokens
                doc = Doc(Vocab(), words=raw_text)
            else:
                doc = self.backbone.tokenizer.nlp(raw_text)

            output.entities.append(
                self._decode_entities(doc, k_tags, pre_tokenized))
            output.tokens.append(
                self._decode_tokens(doc) if not pre_tokenized else None)

        if not any(output.tokens):  # drop tokens field if no data
            del output.tokens

        del output.logits
        del output.mask
        del output.probs
        del output.raw_text
        del output.viterbi_paths

        return output
Example #23
def test_get_entity_via_match(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add_entity('TestEntity', attrs={u'Hello': u'World'})
    assert matcher.n_patterns == 0
    assert matcher(Doc(en_vocab, words=[u'Test', u'Entity'])) == []
    matcher.add_pattern(u'TestEntity', [{ORTH: u'Test'}, {ORTH: u'Entity'}])
    assert matcher.n_patterns == 1
    matches = matcher(Doc(en_vocab, words=[u'Test', u'Entity']))
    assert len(matches) == 1
    assert len(matches[0]) == 4
    ent_id, label, start, end = matches[0]
    assert ent_id == matcher.vocab.strings[u'TestEntity']
    assert label == 0
    assert start == 0
    assert end == 2
    attrs = matcher.get_entity(ent_id)
    assert attrs == {u'Hello': u'World'}
Example #24
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
    doc = Doc(en_parser.vocab, words=text)
    assert len(doc) == length
    with en_parser.step_through(doc) as _:  # noqa: F841
        pass
    assert doc[0].is_space
    for token in doc:
        assert token.head.i == length - 1
Example #25
 def extract_entity(self, doc: Doc) -> List[Span]:
     food_spans = []
     for food in self.food_names:
         food_index = doc.text.lower().find(food)
         if food_index > -1:
             span = doc.char_span(food_index, food_index + len(food))
             # char_span returns None when the character offsets do not align
             # with token boundaries, so guard against appending None
             if span is not None:
                 food_spans.append(span)
     return food_spans
Example #26
    def __init__(self,
                 first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
                 last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):

        self.token_extension_name = self.TOKEN_EXTENSION_NAME
        self.span_extension_name = self.SPAN_EXTENSION_NAME
        self.doc_extension_name = self.DOC_EXTENSION_NAME
        self.first_name_extension_name = first_name_extension_name
        self.last_name_extension_name = last_name_extension_name

        if not Token.has_extension(self.token_extension_name):
            Token.set_extension(self.token_extension_name,
                                default=self.ANOT_NONE)
        if not Span.has_extension(self.span_extension_name):
            Span.set_extension(self.span_extension_name,
                               getter=self.is_full_name_getter)
        if not Doc.has_extension(self.doc_extension_name):
            Doc.set_extension(self.doc_extension_name, default=[])
Example #27
def test_match_zero_plus(matcher):
    matcher.add('Quote', '', {}, [
        [
            {'ORTH': '"'},
            {'OP': '*', 'IS_PUNCT': False},
            {'ORTH': '"'}
        ]])
    doc = Doc(matcher.vocab, 'He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1
Example #28
 def forward(texts, drop=0.):
     if tokenized:
         docs = [Doc(nlp.vocab, words) for words in texts]
     else:
         docs = [nlp(text) for text in texts]
     features = [doc.to_array(attrs) for doc in docs]
     def backward(d_features, sgd=None):
         return d_features
     return features, backward
Example #29
 def __call__(self, doc : Doc):
     save_parsed = doc.is_parsed
     doc.is_parsed = False
     if self.split_matcher:
         matches = self.split_matcher(doc)
         for match_id, start, end in matches:
             token = doc[end-1]
             token.is_sent_start = True
             if end-2>=0 and doc[end-2].is_sent_start is True:
                 doc[end-2].is_sent_start = False
     if self.join_matcher:
         matches = self.join_matcher(doc)
         for match_id, start, end in matches:
             # If there is a sent start in the match, just remove it
             for token in doc[start:end]:
                 if token.is_sent_start:
                     token.is_sent_start = False
     doc.is_parsed = save_parsed if doc.is_sentenced else True
     return doc
Example #30
    def process_non_content_bearing_samples(
            self, empty_samples: List[Tuple[int,
                                            Text]]) -> List[Tuple[int, "Doc"]]:
        """Creates empty Doc-objects from zero-lengthed training samples strings."""

        from spacy.tokens import Doc

        n_docs = [(empty_sample[0], doc) for empty_sample, doc in zip(
            empty_samples, [Doc(self.nlp.vocab) for doc in empty_samples])]
        return n_docs
Example #31
def test_override_sentiment(EN):
    '''Test new span.sentiment property's default averaging behaviour'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0

    doc = Doc(EN.vocab, [u'good', 'stuff', u'bad', u'stuff'])

    doc.user_span_hooks['sentiment'] = lambda span: 10.0

    good_stuff = doc[:2]
    assert good_stuff.sentiment == 10.0

    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == 10.0

    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == 10.0
Example #32
def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc
    loc = '/tmp/test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2
Example #33
def read_spacy_docs(spacy_vocab, filename):
    """
    Stream ``spacy.Doc`` s from disk at ``filename`` where they were serialized
    using Spacy's ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_vocab (``spacy.Vocab``): the spacy vocab object used to serialize
            the docs in ``filename``
        filename (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        the next deserialized ``spacy.Doc``
    """
    with io.open(filename, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
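
A short usage sketch for the streaming reader above, assuming the docs in the file were serialized from the same vocab; the model name and file name are placeholders:

import spacy

nlp = spacy.load('en')  # must supply the same vocab used when the docs were serialized
for doc in read_spacy_docs(nlp.vocab, 'corpus_docs.bin'):
    print(len(doc), doc[:5])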
Example #34
def dependency_labels_to_root(token):
    # Walk up the syntactic tree, collecting the arc labels
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels

for sentence in doc.sents:
    for token in sentence:
        print(token)
        print(token.orth)
        dep_labels = dependency_labels_to_root(token)
        print(dep_labels)
        for dep_label in dep_labels:
            print(nlp.vocab.strings[dep_label])

doc = nlp(u"Mr. Best flew to New York on Saturday morning.")

for ent in doc.ents:
    print(ent, ent.label_, ent.orth_)
    print(ent.root, ent.root.head, ent.root.head.pos, nlp.vocab.strings[ent.root.head.pos], ent.root.head.lemma_)

from spacy.tokens.doc import Doc

byte_string = doc.to_bytes()
open('moby_dick.bin', 'wb').write(byte_string)

doc = Doc(nlp.vocab)
for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
    doc.from_bytes(byte_string)
print(doc)