Example #1
def test_match_zero(matcher):
    matcher.add('Quote', '', {}, [[
        {'ORTH': '"'},
        {'OP': '!', 'IS_PUNCT': True},
        {'OP': '!', 'IS_PUNCT': True},
        {'ORTH': '"'},
    ]])
    doc = Doc(matcher.vocab, words='He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1
    doc = Doc(matcher.vocab,
              words='He said , " some three words " ...'.split())
    assert len(matcher(doc)) == 0
    matcher.add('Quote', '', {}, [[
        {'ORTH': '"'},
        {'IS_PUNCT': True},
        {'IS_PUNCT': True},
        {'IS_PUNCT': True},
        {'ORTH': '"'},
    ]])
    assert len(matcher(doc)) == 0
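The Matcher calls above use the spaCy 1.x signature, matcher.add(key, label, attrs, specs). A minimal sketch of the same negated-punctuation pattern with the current API (assuming spaCy v3; not part of the original example):

# Sketch only: spaCy v3 Matcher.add takes a list of patterns per key.
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
pattern = [
    {'ORTH': '"'},
    {'IS_PUNCT': True, 'OP': '!'},
    {'IS_PUNCT': True, 'OP': '!'},
    {'ORTH': '"'},
]
matcher.add('Quote', [pattern])
doc = Doc(nlp.vocab, words='He said , " some words " ...'.split())
matches = matcher(doc)  # expect one (match_id, start, end) triple for the quote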
Example #2
def deserialize_dataset(file_path, max_items):
    vocab = spacy.load('en_default').vocab

    with open(file_path, 'rb') as input_file:
        for data_point in pickle.load(input_file)[:max_items]:
            yield {
                'question1': Doc(vocab).from_bytes(data_point['question1']),
                'question2': Doc(vocab).from_bytes(data_point['question2']),
                'id': data_point['id'],
                'is_duplicate': data_point.get('is_duplicate', None)
            }
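The writer that produced this pickle is not shown; a minimal sketch of one, assuming the same field names (serialize_dataset itself is a hypothetical helper):

import pickle

def serialize_dataset(file_path, records, nlp):
    # Store each question as Doc bytes so the reader above can rebuild it
    # with Doc(vocab).from_bytes(...) against a compatible vocab.
    data = [{
        'question1': nlp(record['question1']).to_bytes(),
        'question2': nlp(record['question2']).to_bytes(),
        'id': record['id'],
        'is_duplicate': record.get('is_duplicate'),
    } for record in records]
    with open(file_path, 'wb') as output_file:
        pickle.dump(data, output_file)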
Example #3
def get_tokens(model: Language, doc_id: int):
    fn = os.path.join(settings.TOKEN_DIR, str(doc_id))
    if not os.path.exists(fn):
        raise ValueError(
            "Document {doc_id} has not been preprocessed ({fn} does not exist)"
            .format(**locals()))
    return Doc(model.vocab).from_disk(fn)
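The preprocessing step this reader expects is not shown; a minimal sketch of it, assuming the same settings.TOKEN_DIR layout (save_tokens is a hypothetical name):

import os

def save_tokens(doc, doc_id):
    # Doc.to_disk writes the binary form that Doc.from_disk reads back above.
    os.makedirs(settings.TOKEN_DIR, exist_ok=True)
    doc.to_disk(os.path.join(settings.TOKEN_DIR, str(doc_id)))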
Example #4
    def decode(self, output: TaskOutput) -> TaskOutput:
        # The dims are: batch, top_k, tags
        output.tags: List[List[List[str]]] = [
            self._decode_tags(paths) for paths in output.viterbi_paths
        ]
        output.scores: List[List[float]] = [[score for tags, score in paths]
                                            for paths in output.viterbi_paths]

        output.entities: List[List[List[Dict]]] = []
        output.tokens: List[List[Dict]] = []
        # iterate over batch
        for raw_text, k_tags in zip(output.raw_text, output.tags):
            pre_tokenized = not isinstance(raw_text, str)
            if pre_tokenized:
                # compose spacy doc from tokens
                doc = Doc(Vocab(), words=raw_text)
            else:
                doc = self.backbone.tokenizer.nlp(raw_text)

            output.entities.append(
                self._decode_entities(doc, k_tags, pre_tokenized))
            output.tokens.append(
                self._decode_tokens(doc) if not pre_tokenized else None)

        if not any(output.tokens):  # drop tokens field if no data
            del output.tokens

        del output.logits
        del output.mask
        del output.probs
        del output.raw_text
        del output.viterbi_paths

        return output
Example #5
	def get_spacy(self,load_from_file=False,model_name='en_core_web_sm'):
		import spacy
		global nlp
		if not nlp:
			#print('>> loading spacy...')
			nlp = spacy.load(model_name)

		doc=None
		if self.parsed and load_from_file:
			#print self.fnfn_spacy
			from spacy.tokens.doc import Doc

			try:
				for byte_string in Doc.read_bytes(open(self.fnfn_spacy, 'rb')):
					doc = Doc(nlp.vocab)
					doc.from_bytes(byte_string)
			except UnicodeDecodeError:
				print("!! UNICODE ERROR:",self.fnfn_spacy)
		#else:

		if not doc:
			#print '>> making spacy document for text',self.id
			txt=self.text
			txt=clean_text(txt)
			doc=nlp(txt)

		return doc
Example #6
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
    doc = Doc(en_parser.vocab, words=text)
    assert len(doc) == length
    with en_parser.step_through(doc) as _:  # noqa: F841
        pass
    assert doc[0].is_space
    for token in doc:
        assert token.head.i == length - 1
Example #7
def test_get_entity_via_match(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add_entity('TestEntity', attrs={u'Hello': u'World'})
    assert matcher.n_patterns == 0
    assert matcher(Doc(en_vocab, words=[u'Test', u'Entity'])) == []
    matcher.add_pattern(u'TestEntity', [{ORTH: u'Test'}, {ORTH: u'Entity'}])
    assert matcher.n_patterns == 1
    matches = matcher(Doc(en_vocab, words=[u'Test', u'Entity']))
    assert len(matches) == 1
    assert len(matches[0]) == 4
    ent_id, label, start, end = matches[0]
    assert ent_id == matcher.vocab.strings[u'TestEntity']
    assert label == 0
    assert start == 0
    assert end == 2
    attrs = matcher.get_entity(ent_id)
    assert attrs == {u'Hello': u'World'}
Example #8
def read_docs(filepath):
    """Deserialize a list of documents + associated metadata"""
    spacy_parser = get_spacy_parser()
    data = pickle.load(open(filepath, 'rb'))
    for row in data:
        doc = Doc(spacy_parser.vocab)
        # read doc object from serialized byte array
        row['content'] = doc.from_bytes(row.pop('binary_content'))
    return data
Example #9
def test_match_zero_plus(matcher):
    matcher.add('Quote', '', {}, [
        [
            {'ORTH': '"'},
            {'OP': '*', 'IS_PUNCT': False},
            {'ORTH': '"'}
        ]])
    doc = Doc(matcher.vocab, 'He said , " some words " ...'.split())
    assert len(matcher(doc)) == 1
Example #10
 def forward(texts, drop=0.):
     if tokenized:
         docs = [Doc(nlp.vocab, words) for words in texts]
     else:
         docs = [nlp(text) for text in texts]
     features = [doc.to_array(attrs) for doc in docs]
     def backward(d_features, sgd=None):
         return d_features
     return features, backward
Example #11
    def process_non_content_bearing_samples(
            self, empty_samples: List[Tuple[int,
                                            Text]]) -> List[Tuple[int, "Doc"]]:
        """Creates empty Doc-objects from zero-lengthed training samples strings."""

        from spacy.tokens import Doc

        n_docs = [(empty_sample[0], doc) for empty_sample, doc in zip(
            empty_samples, [Doc(self.nlp.vocab) for doc in empty_samples])]
        return n_docs
Example #12
def test_efficient_binary_serialization(doc):
    from spacy.tokens.doc import Doc

    byte_string = doc.to_bytes()
    open('moby_dick.bin', 'wb').write(byte_string)

    nlp = spacy.en.English()
    for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')):
       doc = Doc(nlp.vocab)
       doc.from_bytes(byte_string)
Example #13
def test_read_bytes(nlp):
    from spacy.tokens.doc import Doc
    loc = 'test_serialize.bin'
    with open(loc, 'wb') as file_:
        file_.write(nlp(u'This is a document.').to_bytes())
        file_.write(nlp(u'This is another.').to_bytes())
    docs = []
    with open(loc, 'rb') as file_:
        for byte_string in Doc.read_bytes(file_):
            docs.append(Doc(nlp.vocab).from_bytes(byte_string))
    assert len(docs) == 2
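Doc.read_bytes in this and several other examples is a spaCy 1.x API; a rough modern equivalent of the same round trip, assuming spaCy v2.2+ where DocBin is available:

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank('en')
doc_bin = DocBin()
doc_bin.add(nlp(u'This is a document.'))
doc_bin.add(nlp(u'This is another.'))
with open('test_serialize.bin', 'wb') as file_:
    file_.write(doc_bin.to_bytes())
with open('test_serialize.bin', 'rb') as file_:
    docs = list(DocBin().from_bytes(file_.read()).get_docs(nlp.vocab))
assert len(docs) == 2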
Example #14
 def __call__(self, text):
     if self._word_tokenizer:
         words = self._word_tokenizer.tokenize(text)
     else:
         words = text.split(' ')
     if self.return_doc:
         words = self._remove_empty_words(words)
         spaces = [True] * len(words)
         return Doc(self.vocab, words=words, spaces=spaces)
     else:
         return words
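A tokenizer like this is typically installed on the pipeline directly; a minimal usage sketch, assuming the __call__ above belongs to a class named WhitespaceTokenizer that is constructed with a vocab (both names are assumptions):

import spacy

nlp = spacy.blank('en')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)  # hypothetical constructor
doc = nlp('some pre tokenized text')
print([token.text for token in doc])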
Example #15
def load_and_transform(batch_id, in_loc, out_dir):
    out_loc = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(out_loc):
        return None
    print('Batch', batch_id)
    nlp = spacy.en.English(parser=False, tagger=False, matcher=False, entity=False)
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        with io.open(in_loc, 'rb') as in_file:
            for byte_string in Doc.read_bytes(in_file):
                doc = Doc(nlp.vocab).from_bytes(byte_string)
                doc.is_parsed = True
                out_file.write(transform_doc(doc)) 
Example #16
 def _json_to_instance(self, json_dict: JsonDict) -> Instance:
     doc = Doc(spacy_model.vocab,
               words=json_dict['sentence'],
               spaces=[True for _ in range(len(json_dict['sentence']))])
     spacy_tokens = spacy_model.pipeline[0][1](doc)
     if self._dataset_reader.use_language_specific_pos:  # type: ignore
         # fine-grained part of speech
         pos_tags = [token.tag_ for token in spacy_tokens]
     else:
         # coarse-grained part of speech (Universal Dependencies format)
         pos_tags = [token.pos_ for token in spacy_tokens]
     return self._dataset_reader.text_to_instance(json_dict['sentence'], pos_tags)
Example #17
def parse(sntnc, nlp, tokenized=False):
    # if it's tokenized, split the string by space and convert to a list for input
    doc = None

    if tokenized:
        doc = Doc(nlp.vocab, words=sntnc.split(" "))
        nlp.tagger(doc)
        nlp.parser(doc)
        nlp.entity(doc)
    else:
        doc = nlp(sntnc)

    return doc
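nlp.tagger, nlp.parser and nlp.entity are spaCy 1.x pipeline components; since spaCy v3 a pre-built Doc can be passed straight through the pipeline instead. A hedged sketch of the same idea (parse_v3 is a hypothetical name):

from spacy.tokens import Doc

def parse_v3(sentence, nlp, tokenized=False):
    # Language.__call__ accepts a Doc in spaCy v3, so the whole pipeline
    # (tagger, parser, NER) runs over the pre-tokenized input in one call.
    if tokenized:
        return nlp(Doc(nlp.vocab, words=sentence.split(' ')))
    return nlp(sentence)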
Example #18
def test_graph_edges_and_nodes():
    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(doc, name="hello")
    node1 = graph.add_node((0, ))
    assert graph.get_node((0, )) == node1
    node2 = graph.add_node((1, 3))
    assert list(node2) == [1, 3]
    graph.add_edge(node1, node2, label="one", weight=-10.5)
    assert graph.has_edge(node1, node2, label="one")
    assert node1.heads() == []
    assert [tuple(h) for h in node2.heads()] == [(0, )]
    assert [tuple(t) for t in node1.tails()] == [(1, 3)]
    assert [tuple(t) for t in node2.tails()] == []
Example #19
    def load(cls, nlp: Language, path: Union[str, Path]) -> Corpus:
        index = []
        docs = []
        with zipfile.ZipFile(path, 'r',
                             compression=zipfile.ZIP_DEFLATED) as archive:
            for fname in archive.namelist():
                i, _ = os.path.splitext(fname)
                index.append(i)
                with archive.open(fname) as f:
                    doc = Doc(nlp.vocab).from_bytes(f.read())
                    docs.append(doc)
        indexed_documents = zip(docs, index)

        return cls(nlp, indexed_documents=indexed_documents)
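The matching save side is not shown; a sketch under the same layout assumption (one serialized Doc per zip entry, with the filename stem used as the index; save_corpus is a hypothetical name):

import zipfile

def save_corpus(path, indexed_documents):
    # Counterpart to load above: write doc.to_bytes() per archive entry.
    with zipfile.ZipFile(path, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        for doc, i in indexed_documents:
            archive.writestr('{}.bin'.format(i), doc.to_bytes())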
Example #20
 def __iter__(self, week=None):
     with open(self.path + ".info")  as info:
         with open(self.path + ".title.bin") as title_bin:
             for byte_string in Doc.read_bytes(title_bin):
                 info_line = info.readline()
                 comment_info = self._parse_info(info_line)
                 if not (week is None) and get_week(comment_info["timestamp"]) != week:
                     continue
                 if self.clean_deleted and comment_info["author"] == "[deleted]":
                     continue
                 if self.clean_bots and (is_bot(comment_info["author"]) or 
                     comment_info["author"] in FILTERED_USERS):
                     continue
                 comment_info["doc"] = Doc(self._vocab).from_bytes(byte_string)
                 yield comment_info
Example #21
def make_docs(nlp, batch, heads=True):
    docs = []
    for record in batch:
        text = record["text"]
        if "tokens" in record:
            doc = Doc(nlp.vocab, words=record["tokens"])
        else:
            doc = nlp.make_doc(text)
        if "heads" in record:
            heads = record["heads"]
            heads = numpy.asarray(heads, dtype="uint64")
            heads = heads.reshape((len(doc), 1))
            doc = doc.from_array([HEAD], heads)
        if len(doc) >= 1 and len(doc) < 200:
            docs.append(doc)
    return docs
Example #22
 def __iter__(self, week=None):
     with open(self.path + ".bin", "rb") as bin:
         with open(self.path + ".info")  as info:
             for byte_string in Doc.read_bytes(bin):
                 comment_info = self._parse_info(next(info))
                 if (not week is None) and get_week(comment_info["timestamp"]) != week:
                     continue
                 if self.clean_deleted and comment_info["author"] == "[deleted]":
                     continue
                 if self.clean_bots and (is_bot(comment_info["author"]) or 
                     comment_info["author"] in FILTERED_USERS):
                     continue
                 doc = Doc(self._vocab).from_bytes(byte_string)
                 comment_info["doc"] = doc
                 comment_info["text"] = self._text_from_doc(doc)
                 yield comment_info
Example #23
def test_default_sentiment(EN):
    '''Test new span.sentiment property's default averaging behaviour'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0

    doc = Doc(EN.vocab, [u'good', 'stuff', u'bad', u'stuff'])

    good_stuff = doc[:2]
    assert good_stuff.sentiment == 3.0 / 2

    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == -2. / 2

    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == (3. + -2) / 3.
Example #24
def test_override_sentiment(EN):
    '''Test that span.sentiment can be overridden via doc.user_span_hooks'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0

    doc = Doc(EN.vocab, [u'good', 'stuff', u'bad', u'stuff'])

    doc.user_span_hooks['sentiment'] = lambda span: 10.0

    good_stuff = doc[:2]
    assert good_stuff.sentiment == 10.0

    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == 10.0

    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == 10.0
Example #25
def test_issue7056():
    """Test that the Unshift transition works properly, and doesn't cause
    sentence segmentation errors."""
    vocab = Vocab()
    ae = ArcEager(
        vocab.strings,
        ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]))
    doc = Doc(vocab, words="Severe pain , after trauma".split())
    state = ae.init_batch([doc])[0]
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "L-amod")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "S")
    ae.apply_transition(state, "R-pobj")
    ae.apply_transition(state, "D")
    ae.apply_transition(state, "D")
    ae.apply_transition(state, "D")
    assert not state.eol()
Example #26
def test_graph_walk():
    doc = Doc(Vocab(), words=["a", "b", "c", "d"])
    graph = Graph(
        doc,
        name="hello",
        nodes=[(0, ), (1, ), (2, ), (3, )],
        edges=[(0, 1), (0, 2), (0, 3), (3, 0)],
        labels=None,
        weights=None,
    )
    node0, node1, node2, node3 = list(graph.nodes)
    assert [tuple(h) for h in node0.heads()] == [(3, )]
    assert [tuple(h) for h in node1.heads()] == [(0, )]
    assert [tuple(h) for h in node0.walk_heads()] == [(3, ), (0, )]
    assert [tuple(h) for h in node1.walk_heads()] == [(0, ), (3, ), (0, )]
    assert [tuple(h) for h in node2.walk_heads()] == [(0, ), (3, ), (0, )]
    assert [tuple(h) for h in node3.walk_heads()] == [(0, ), (3, )]
    assert [tuple(t) for t in node0.walk_tails()] == [(1, ), (2, ), (3, ),
                                                      (0, )]
    assert [tuple(t) for t in node1.walk_tails()] == []
    assert [tuple(t) for t in node2.walk_tails()] == []
    assert [tuple(t) for t in node3.walk_tails()] == [(0, ), (1, ), (2, ),
                                                      (3, )]
Example #27
    def _make_task_prediction(
        self,
        single_forward_output: Dict,
        instance: Instance,
    ) -> TokenClassificationPrediction:
        # The dims are: top_k, tags
        tags: List[List[str]] = self._make_tags(
            single_forward_output["viterbi_paths"])
        # construct a spacy Doc
        pre_tokenized = not isinstance(single_forward_output["raw_text"], str)
        if pre_tokenized:
            # compose doc from tokens
            doc = Doc(Vocab(), words=single_forward_output["raw_text"])
        else:
            doc = self.backbone.tokenizer.nlp(
                single_forward_output["raw_text"])

        return TokenClassificationPrediction(
            tags=tags,
            scores=[
                score for tags, score in single_forward_output["viterbi_paths"]
            ],
            entities=self._make_entities(doc, tags, pre_tokenized),
        )
Example #28
 def __call__(self, tokens):
     spaces = [True] * len(tokens)
     return Doc(self.vocab, words=tokens, spaces=spaces)
Example #29
def main():

    _arg = parse_args()
    # list of articles from which line list features are to be extracted
    # for each infected case
    ll_articles = [json.loads(l) for l in io.open(_arg.MERSbulletins, "r")]
    # word embeddings specific to WHO corpus extracted by word2vec models (SGNS
    # or SGHS)
    w2v_model = Word2Vec.load(_arg.whovec)
    # Number of predictor keywords (excluding the seed keyword)
    K = np.int(_arg.numind)
    # Seed keywords for each line list feature guiding the extraction process
    seed_keywords = {
        "Onset Date": "onset",
        "Hospital Date": "hospitalized",
        "Outcome Date": "died",
        "Specified Proximity to Animals or Animal Products": "animals",
        "Specified Contact with Other Cases": "case",
        "Specified HCW": "healthcare",
        "Specified Comorbidities": "comorbidities"
    }
    auto_ll = []
    ll_extract = LineList()
    # Extracting the number of infected cases and the line list features
    # corresponding to each case from each article
    for ll_artl in ll_articles:
        ll_text = ""
        dt_offsets = []
        dt_dict = defaultdict()
        # dt_dict: mapping date phrases to proper datetime strings, e.g.
        for dtphrase_elm in ll_artl['eventSemantics']['datetimes']:
            dt_dict["-".join(
                dtphrase_elm['phrase'].split())] = dtphrase_elm["date"]
            dt_offsets.append({
                'start':
                ll_artl['BasisEnrichment']['tokens'][int(
                    dtphrase_elm['offset'].split(":")[0])]['start'],
                'end':
                ll_artl['BasisEnrichment']['tokens'][
                    int(dtphrase_elm['offset'].split(":")[1]) - 1]['end']
            })
        for i in xrange(len(ll_artl["content"])):
            is_offset = 0
            for offset_elm in dt_offsets:
                if int(offset_elm['start']) <= i < int(offset_elm['end']):
                    is_offset = 1
                    if ll_artl["content"][i] == " ":
                        ll_text += "-"
                    else:
                        ll_text += ll_artl["content"][i]
            if not is_offset:
                if ll_artl["content"][i] == ".":
                    ll_text += " "
                ll_text += ll_artl["content"][i]
        num_cases = []
        en_nlp = spacy.load('en')
        ll_doc = Doc(en_nlp.vocab)
        ll_doc = en_nlp(ll_text)
        ll_sents = []
        for sent in ll_doc.sents:
            ll_sents.append(sent)
        # Extracting the number of cases mentioned in the article
        # using age and gender information
        for sent_ind in xrange(len(ll_sents)):
            ag_out = ll_extract.get_age_gender(ll_sents[sent_ind].text)
            if ag_out['age'] is not None and ag_out['gender'] is not None:
                case_feature = defaultdict()
                case_feature['age'] = ag_out['age']
                case_feature['gender'] = ag_out['gender']
                case_feature['start'] = sent_ind
                case_feature['link'] = ll_artl["link"]
                num_cases.append(case_feature)
        # Identifying the start sentence and end sentence for each case
        for case_ind in xrange(len(num_cases)):
            sent_start = num_cases[case_ind]['start']
            sent_end = sent_start + 2
            for ind_case in xrange(len(num_cases)):
                if ind_case == case_ind:
                    continue
                if num_cases[ind_case]['start'] > sent_start:
                    sent_end = num_cases[ind_case]['start'] - 1
                    break
            try:
                for ll_feat in seed_keywords:
                    num_cases[case_ind][ll_feat] = defaultdict()
                # Extracting the disease onset features for each case
                for dt_feat in ["Onset Date", "Hospital Date", "Outcome Date"]:
                    kwargs = {
                        'K': K,
                        'w2v': w2v_model,
                        'start': sent_start,
                        'end': sent_end,
                        'll_sents': ll_sents,
                        'seed': seed_keywords[dt_feat],
                        'dt_dict': dt_dict
                    }
                    num_cases[case_ind][dt_feat] = ll_extract.infer_date(
                        **kwargs)['final']
                # Extracting the clinical features corresponding to each case
                for clin_feat in [
                        "Specified Proximity to Animals or Animal Products",
                        "Specified Contact with Other Cases", "Specified HCW",
                        "Specified Comorbidities"
                ]:
                    kwargs = {
                        'K': K,
                        'w2v': w2v_model,
                        'start': sent_start,
                        'end': sent_end,
                        'll_sents': ll_sents,
                        'seed': seed_keywords[clin_feat]
                    }
                    num_cases[case_ind][clin_feat] = ll_extract.infer_clinical(
                        **kwargs)['final']
            except Exception as e:
                print e
        if len(num_cases) != 0:
            auto_ll.extend(num_cases)
    # Writing the automatically extracted line lists to a file
    if len(auto_ll) != 0:
        with open(_arg.outputll, "w") as f_ll:
            for cs in auto_ll:
                print >> f_ll, json.dumps(cs, encoding='utf-8')
Example #30
def apple_orange(en_vocab):
    return Doc(en_vocab, words=[u'apple', u'orange'])