def get_extended_token(self, tok):
    """Given a token, find a more descriptive span by extending it with its children.

    :param tok: the token to extend
    :return: a Span covering the token and its contiguous children
    """
    allowed_dep = [
        "nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl",
        "dobj", "attr", "oprd", "pobj",
        # "conj",
        "compound", "amod", "meta", "npadvmod", "nmod",
    ]  # add "prep" to extend for "of" and "in"
    extended_tokens = [
        i for i in tok.subtree
        if (i.dep_ in allowed_dep and i in tok.children) or (i == tok)
    ]
    # just keep tokens contiguous with tok
    span_range = [tok.i, tok.i]
    ext_tokens_i = [i.i for i in extended_tokens]
    max_bound = max(ext_tokens_i)
    min_bound = min(ext_tokens_i)
    # walk right from tok, extending while the indices stay consecutive
    curr_pos = tok.i
    for cursor in range(tok.i, max_bound + 1):
        if cursor in ext_tokens_i and cursor == curr_pos + 1:
            span_range[1] = cursor
            curr_pos = cursor
    # walk left from tok the same way
    curr_pos = tok.i
    for cursor in range(tok.i, min_bound - 1, -1):
        if cursor in ext_tokens_i and cursor == curr_pos - 1:
            span_range[0] = cursor
            curr_pos = cursor
    return Span(self.doc, span_range[0], span_range[1] + 1)
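# Hypothetical usage sketch (not from the original source): the method reads
# self.doc, so we call it with a minimal holder object; `_Holder`, the model,
# and the sentence are illustrative only, and the output is parse-dependent.
import spacy
from spacy.tokens import Span

class _Holder:
    def __init__(self, doc):
        self.doc = doc

nlp = spacy.load("en_core_web_sm")
doc = nlp("The big red dog barked.")
span = get_extended_token(_Holder(doc), doc[3])  # doc[3] == "dog"
print(span.text)  # expected (parse-dependent): "big red dog"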
def __call__(self, doc):
    matches = self.matcher(doc)
    spans = []
    for _, start, end in matches:
        entity = Span(doc, start, end, label=self.label)
        spans.append(entity)
        for token in entity:
            token._.set('is_nfl_player', True)
        doc.ents = list(doc.ents) + [entity]
    for span in spans:
        # Note: Span.merge() is deprecated since spaCy v2.1; doc.retokenize() is preferred
        span.merge()
    return doc
def __call__(self, doc: Doc):
    phrases = []
    last_start = 0
    for _, start, end in self._matcher(doc):
        # skip matches that end inside the previously kept match
        if last_start >= end:
            continue
        last_start = end
        span = Span(doc, start, end)
        phrases.append(span)
    # order by start, preferring longer spans on ties
    phrases.sort(key=lambda x: (x.start, -len(x)))
    setattr(doc._, self._phrases_name, _fix_overlappings(phrases))
    return doc
def _extract_content(tag: Span) -> Span:
    # collect the first contiguous run of content tokens, stopping at the
    # first non-content token
    start = None
    end = None
    for token in tag:
        if _is_content(token):
            if start is None:
                start = token.i
            end = token.i + 1
        else:
            break
    assert start is not None and end is not None
    return Span(tag.doc, start, end)
from typing import List, Sequence

from spacy.tokens import Doc, Span, Token


def decode_bilou(labels: Sequence[str], tokens: Sequence[Token], doc: Doc) -> List[Span]:
    """Convert a sequence of BILOU labels (e.g. "B-PER") into Spans."""
    outside = True
    current_type = None
    start_index = 0
    res = []
    for label, token in zip(labels, tokens):
        if "-" in label:
            # maxsplit=1 keeps hyphenated entity types such as "WORK-OF-ART" intact
            tag, entity_type = label.split("-", 1)
        else:
            tag, entity_type = "O", None
        if outside and tag != "O":
            # open a new entity
            current_type = entity_type
            outside = False
            start_index = token.i
        elif not outside:
            if tag == "O":
                # close the current entity
                outside = True
                res.append(
                    Span(doc=doc, start=start_index, end=token.i, label=current_type))
            elif tag == "B" or tag == "U" or entity_type != current_type:
                # close the current entity and open a new one
                res.append(
                    Span(doc=doc, start=start_index, end=token.i, label=current_type))
                current_type = entity_type
                start_index = token.i
    if not outside:
        # flush the entity still open at the end of the sequence
        res.append(
            Span(doc=doc, start=start_index, end=tokens[-1].i + 1, label=current_type))
    return res
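# Hypothetical usage sketch (not from the original source): decode BILOU
# labels produced by an external tagger back into spaCy Spans.
import spacy

nlp = spacy.blank("en")
doc = nlp("Alice met Bob in Paris")
labels = ["U-PER", "O", "U-PER", "O", "U-LOC"]
spans = decode_bilou(labels, list(doc), doc)
print([(s.text, s.label_) for s in spans])
# [('Alice', 'PER'), ('Bob', 'PER'), ('Paris', 'LOC')]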
def custom_pipe_component_Name_et_al(self, doc):
    new_ents = []
    for ent in doc.ents:
        replaced = False
        if ent.label_ == "PERSON":
            # assign the new label if "et al." is already inside the PERSON span
            if 'et' in ent.text and ('al' in ent.text or 'al.' in ent.text):
                new_ent = Span(doc, ent.start, ent.end, label="REF")
                replaced = True
            elif ent.end + 1 < len(doc):
                # check whether the tokens following the entity are "et al";
                # ent.end is exclusive, so doc[ent.end] is the next token
                # (the original read doc[ent.end + 1], skipping one token)
                next_token = doc[ent.end]
                next_next_token = doc[ent.end + 1]
                if next_token.text == "et" and next_next_token.text in ("al.", "al"):
                    new_ent = Span(doc, ent.start, ent.end + 2, label="REF")
                    replaced = True
        if replaced:
            # append the relabeled entity
            new_ents.append(new_ent)
        else:
            # carry over the old entity unchanged
            new_ents.append(ent)
    doc.ents = new_ents
    return doc
def tag_placing(doc, tag, string, sent):
    """Add tag to the word `string` in text `sent`."""
    doc.vocab.strings.add(tag)  # add the new tag to the StringStore
    TAG = doc.vocab.strings[tag]  # resolve it to a vocab hash
    indexes = list(find_all(sent, string))
    if len(indexes) > 0:
        st = string.split()
        # find the token indices of matching first words (allowing a plural "s")
        token_pos = [token.i for token in doc
                     if token.text.lower() in (st[0], st[0] + "s")]
        token_pos_real = []
        number_of_tokens = len(doc)
        for i in token_pos:
            if i + len(st) <= number_of_tokens:
                spantext = Span(doc, i, i + len(st)).text.lower()
                if spantext in (string, string + "s"):
                    token_pos_real.append(i)
        tag1 = tag  # keep a copy of our tag
        for i in token_pos_real:
            tag = tag1
            fb_ent = Span(doc, i, i + len(st))  # create a Span for the new entity
            if fb_ent in doc.ents:
                # handle words that carry several tags,
                # e.g. "Signalisation--BF Batiment--BF"
                j = doc.ents.index(fb_ent)
                ent = doc.ents[j].label_
                if tag not in ent:  # pre-existing tag
                    if ent not in ['PER', 'ORG', 'LOC', 'MISC']:
                        tag = ent + ' ' + tag
                    doc.vocab.strings.add(tag)
                    TAG = doc.vocab.strings[tag]
                    fb_ent = Span(doc, i, i + len(st), label=TAG)
                    doc.ents = list(doc.ents) + [fb_ent]  # update the NER tag list
            else:
                fb_ent = Span(doc, i, i + len(st), label=TAG)
                doc.ents = list(doc.ents) + [fb_ent]  # update the NER tag list
        return len(token_pos_real)
    return 0
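# `find_all` is not defined in this excerpt; a minimal compatible helper
# (hypothetical) that yields each start offset of `string` within `sent`:
def find_all(sent, string):
    start = sent.find(string)
    while start != -1:
        yield start
        start = sent.find(string, start + 1)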
def _fix_overlabelings(doc):
    good_labelings = set()
    for span in doc._.labelings:
        should_add_span = False
        for other_span in doc._.labelings:
            # good if spans are identical or they aren't overlapping
            if (span.start == other_span.start and span.end == other_span.end
                    or span.start >= other_span.end
                    or span.end <= other_span.start):
                should_add_span = True
                continue
            # exit loop as spans overlap but one is larger
            if (span.start > other_span.start and span.end <= other_span.end
                    or span.start >= other_span.start and span.end < other_span.end):
                should_add_span = False
                break
            # merge spans overlapping in a tail-head manner (last label wins)
            if (span.start < other_span.start
                    and span.end > other_span.start
                    and span.end < other_span.end) or (
                    span.start > other_span.start
                    and span.start < other_span.end
                    and span.end > other_span.end):
                if span.start < other_span.start:
                    start = span.start
                    end = other_span.end
                    label = other_span.label
                else:
                    start = other_span.start
                    end = span.end
                    label = span.label
                merge_span = Span(doc, start, end, label)
                good_labelings.add(merge_span)
                should_add_span = False
                break
        if should_add_span:
            good_labelings.add(span)
    doc._.labelings = list(good_labelings)
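# Hypothetical usage sketch (not from the original source): the function
# assumes a custom "labelings" extension holding candidate Spans; the label
# and sentence are illustrative only.
import spacy
from spacy.tokens import Doc, Span

Doc.set_extension("labelings", default=[], force=True)
nlp = spacy.blank("en")
doc = nlp("acute kidney failure after surgery")
doc._.labelings = [Span(doc, 0, 3, "DISEASE"), Span(doc, 1, 3, "DISEASE")]
_fix_overlabelings(doc)
print([s.text for s in doc._.labelings])
# the contained span is dropped; only "acute kidney failure" remains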
def _merge_abbrs_labelings(doc):
    num_labelings = len(doc._.labelings)
    chunk2label = {s.text: s.label for s in doc._.labelings}
    for abbr in doc._.abbrs:
        # first check if the long form is labeled and the short one is not
        if (abbr._.long_form.text in chunk2label
                and abbr.text not in chunk2label):
            label = chunk2label[abbr._.long_form.text]
            short_span = Span(doc, abbr.start, abbr.end, label)
            doc._.labelings.append(short_span)
        # otherwise check if the short form is labeled and the long one is not
        elif (abbr.text in chunk2label
              and abbr._.long_form.text not in chunk2label):
            lf = abbr._.long_form
            label = chunk2label[abbr.text]
            long_span = Span(doc, lf.start, lf.end, label)
            doc._.labelings.append(long_span)
    # re-sort labelings only if there were changes
    if num_labelings < len(doc._.labelings):
        _sort_labelings(doc)
def test_is_negated(self):
    doc = nlp("There is no evidence of pneumonia.")
    context = ConTextComponent(nlp, add_attrs=True, rules=None)
    rules = [
        ConTextRule("no evidence of", "NEGATED_EXISTENCE", direction="forward")
    ]
    context.add(rules)
    doc.ents = (Span(doc, 5, 6, "CONDITION"),)
    context(doc)
    assert doc.ents[0]._.is_negated is True
def product_component(doc):
    # apply the matcher to the doc
    matches = matcher(doc)
    # create a Span for each match and assign the label 'PRODUCT'
    spans = [
        Span(doc, start, end, label="PRODUCT")
        for match_id, start, end in matches
    ]
    # overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc
def animal_component(self, doc):
    # apply the matcher to the doc
    matches = self.matcher(doc)
    # create a Span for each match and assign the label 'ANIMAL'
    spans = [
        Span(doc, start, end, label="ANIMAL")
        for match_id, start, end in matches
    ]
    # extend the existing entities with the matched spans
    doc.ents = doc.ents + tuple(spans)
    return doc
def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.has_annotation("ENT_IOB")
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.has_annotation("ENT_IOB")
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.has_annotation("ENT_IOB")
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.has_annotation("ENT_IOB")
def test_spans(doc):
    print(doc)
    print(doc.spans)
    for encoding in ["IO", "BIO", "BILUO"]:
        aggregator = aggregation.BaseAggregator("", ["GPE", "NORP", "ORG", "PERSON"],
                                                prefixes=encoding)
        obs = aggregator.get_observation_df(doc)
        print(obs)
        for source in ["name", "org", "place"]:
            spans = utils.token_array_to_spans(obs[source].values, aggregator.out_labels)
            spans = [Span(doc, start, end, label=label)
                     for (start, end), label in spans.items()]
            all_spans = utils.get_spans(doc, [source])
            assert spans == all_spans
def test_hmm3(doc2, combi_annotator):
    hmm = aggregation.HMM("hmm", ["GPE", "PRODUCT", "MONEY", "PERSON", "ORG", "DATE"])
    hmm.add_underspecified_label("ENT", {"GPE", "PRODUCT", "MONEY", "PERSON", "ORG", "DATE"})
    combi_annotator(doc2)
    hmm.fit([doc2] * 100)
    doc2 = hmm(doc2)
    assert len(doc2.spans["hmm"]) > 30
    assert len(doc2.spans["hmm"]) < 45
    assert Span(doc2, 1, 2, "GPE") in doc2.spans["hmm"]
    found_entities = {(span.text, span.label_) for span in doc2.spans["hmm"]}
    assert ('Scott Moore', 'PERSON') in found_entities
    assert (('197', 'MONEY') in found_entities
            or ("$197", "MONEY") in found_entities)
    assert ('iPhone 3Gs', 'PRODUCT') in found_entities
def __call__(self, doc):
    nlp = self.nlp
    with doc.retokenize() as retokenizer:
        # match and tag volume units
        matches = self.volume_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=nlp.vocab.strings['volume_unit'])
            for token in span:
                token._.feature_is_volume_unit = True
            if len(span) > 1:
                retokenizer.merge(span)
            doc.ents = list(doc.ents) + [span]
    return doc
def animal_component(doc):
    # apply the matcher to the doc
    matches = matcher(doc)
    # create a Span for each match and assign the label 'ANIMAL'
    spans = [
        Span(doc, start, end, label="ANIMAL")
        for match_id, start, end in matches
    ]
    # overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc
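# Hypothetical registration sketch (not from the original source), assuming
# spaCy v3 and a module-level `matcher` like the one the component reads.
import spacy
from spacy.language import Language
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", [nlp.make_doc("cat"), nlp.make_doc("turtle")])

Language.component("animal_component", func=animal_component)
nlp.add_pipe("animal_component")
doc = nlp("I have a cat and a turtle")
print([(ent.text, ent.label_) for ent in doc.ents])
# [('cat', 'ANIMAL'), ('turtle', 'ANIMAL')]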
def test_docbins(nlp_small, temp_file="data/temporary_test.docbin"):
    doc = nlp_small("Pierre Lison is working at the Norwegian Computing Center.")
    doc2 = nlp_small("He is working on various NLP topics.")
    doc.spans["test"] = [Span(doc, 0, 2, label="PERSON")]
    utils.docbin_writer([doc, doc2], temp_file)
    doc3, doc4 = list(utils.docbin_reader(temp_file, "en_core_web_sm"))
    assert doc.text == doc3.text
    assert doc2.text == doc4.text
    assert [(e.text, e.label_) for e in doc.ents] == [(e.text, e.label_) for e in doc3.ents]
    assert doc.user_data == doc3.user_data
    os.remove(temp_file)
def merge_phrases(matcher, doc, i, matches):
    """Merge a phrase. We have to be careful here because we'll change the
    token indices. To avoid problems, merge all the phrases once we're called
    on the last match."""
    if i != len(matches) - 1:
        return None
    spans = [Span(doc, start, end, label=label) for label, start, end in matches]
    with doc.retokenize() as retokenizer:
        for span in spans:
            tag = "NNP" if span.label_ else span.root.tag_
            attrs = {"tag": tag, "lemma": span.text}
            retokenizer.merge(span, attrs=attrs)
            doc.ents = doc.ents + (span,)
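# Hypothetical registration sketch (not from the original source): hook
# merge_phrases up as an on_match callback of a PhraseMatcher (spaCy v3 API).
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("GPE", [nlp.make_doc("New York")], on_match=merge_phrases)
doc = nlp("I like New York in Autumn.")
matcher(doc)
print([t.text for t in doc])  # "New York" is now a single token
print(doc.ents)               # and also an entity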
def __call__(self, doc):
    nlp = self.nlp
    with doc.retokenize() as retokenizer:
        # match units of measurement (x/y etc.)
        matches = self.unit_of_measurement_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=nlp.vocab.strings['measurement_unit'])
            for token in span:
                token._.feature_is_measurement_unit = True
            if len(span) > 1:
                retokenizer.merge(span)
            doc.ents = list(doc.ents) + [span]
    return doc
def __call__(self, doc):
    relations = []
    for match_id, start, end in self.matcher(doc):
        match = doc[start:end]
        for tok in match:
            tok.ent_type_ = self.label
            relations.append(tok)
        span = Span(doc, start, end, label=match_id)
        if span not in doc.ents:
            doc.ents = list(doc.ents) + [span]
    doc.user_data[self.label] = relations
    return doc
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span


def animal_component(doc, vocab):
    animals = [
        "Golden Retriever", "cat", "turtle", "dog", "fish",
        "Rattus norvegicus", "snake", "lion", "tigger"
    ]
    matcher = PhraseMatcher(vocab)
    # the original never added patterns, so the matcher could not match;
    # build pattern Docs from the animal names (assumes simple whitespace
    # tokenization)
    patterns = [Doc(vocab, words=animal.split()) for animal in animals]
    matcher.add("ANIMAL", patterns)
    matches = matcher(doc)
    spans = [
        Span(doc, start, end, label="ANIMAL")
        for match_id, start, end in matches
    ]
    doc.ents = spans
    return doc
def _add_ann(self, cui, doc, tkns, acc=-1, name=None, is_disamb=False):
    """Add an annotation to a document.

    cui:  concept id
    doc:  spacy document where the concept was found
    tkns: tokens for this cui
    acc:  accuracy for this annotation
    name: concept name
    """
    # Skip if a TUI filter is set and this concept's TUI is not in it
    if self.TUI_FILTER is None or self.cdb.cui2tui[cui] in self.TUI_FILTER:
        if not is_disamb and cui in self.cdb.cui_disamb_always:
            self.to_disamb.append((list(tkns), name))
        else:
            if self.LBL_STYLE == 'long':
                lbl = "{} - {} - {} - {} - {:.2}".format(
                    cui,
                    self.cdb.cui2pretty_name.get(cui, ''),
                    self.cdb.cui2tui.get(cui, ''),
                    self.cdb.tui2name.get(self.cdb.cui2tui.get(cui, ''), ''),
                    float(acc))
            elif self.LBL_STYLE == 'ent':
                lbl = "{} - {:.2}".format(
                    self.cdb.tui2name.get(self.cdb.cui2tui.get(cui, ''), ''),
                    float(acc))
            else:
                lbl = cui
            lbl = doc.vocab.strings.add(lbl)
            ent = Span(doc, tkns[0].i, tkns[-1].i + 1, label=lbl)
            if self.ACC_ALWAYS:
                acc = self._calc_acc(cui, doc, tkns, name)
            ent._.acc = acc
            ent._.cui = cui
            ent._.tui = self.cdb.cui2tui.get(cui, 'None')
            ent._.id = self.ent_id
            self.ent_id += 1
            doc._.ents.append(ent)
            # Increase counter for cui_count_ext if not already added
            if cui not in self._cuis:
                if cui in self.cdb.cui_count_ext:
                    self.cdb.cui_count_ext[cui] += 1
                else:
                    self.cdb.cui_count_ext[cui] = 1
            if self.train or self.force_train:
                self._add_cntx_vec(cui, doc, tkns)
            self._cuis.add(cui)
def Serialization():
    print("\nThe outcomes of Serialization are:")
    try:
        # 'r' decodes the file as text and read() returns str;
        # 'rb' would read raw bytes and read() would return bytes.
        text = open("/home/wangdi498/SpaCy/diary2.txt", 'r').read()
        print("\nInfo: The Serialization file can be read.\n")
    except FileNotFoundError:
        print("\nError! The Serialization file cannot be read!\n")
        # unlike os._exit(), sys.exit() raises SystemExit and allows cleanup to run
        sys.exit(0)
    except:
        print("\nError! The .txt file must be UTF-8 encoded format!\n")
    doc = nlp(text)
    doc.to_disk("/home/wangdi498/SpaCy/diary1.bin")

    from spacy.tokens import Doc
    from spacy.vocab import Vocab
    doc = Doc(Vocab()).from_disk("/home/wangdi498/SpaCy/diary1.bin")
    print("The texts are:\n{}".format(doc))

    from spacy.tokens import Span
    doc = nlp(text)
    print("\nThe 1st round of Save and Load is:")
    for ent in doc.ents:
        # index by ent.start to get the entity's first token
        # (the original indexed doc by the entity's position in doc.ents)
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[ent.start].ent_iob_, doc[ent.start].ent_type_))
    assert len(doc.ents) != 0, "\nError! This document cannot be empty!"  # guard against an empty Doc
    augment = [Span(doc, 0, 2, label=doc.vocab.strings[u'EVENT'])]
    doc.ents = list(doc.ents) + augment
    doc.to_disk("/home/wangdi498/SpaCy/diary2.bin")
    print("\nThe 2nd round of Save and Load is:")
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[ent.start].ent_iob_, doc[ent.start].ent_type_))
    paragraph = Doc(Vocab()).from_disk("/home/wangdi498/SpaCy/diary2.bin")
    assert len(paragraph.ents) != 0, "\nError! This document cannot be empty!"  # guard against an empty Doc
    print("\nThe 3rd round of Save and Load is:")
    for ent in paragraph.ents:
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            paragraph[ent.start].ent_iob_, paragraph[ent.start].ent_type_))
    assert [(ent.text, ent.label_) for ent in paragraph.ents] != [
        (u'2018年9月27日', u'EVENT')
    ], "\nHere! The entity '%s' has matched the specified one." % ent.text
def test_span_idx2i(nlp, start_idx, end_idx, start_i, end_i, slice_at):
    doc = nlp(_TEXT_SAMPLE)
    doc_bounds = span_idx2i(doc, start_idx, end_idx)
    assert doc_bounds == (start_i, end_i)
    offset_idx = len(doc[:slice_at].text_with_ws)
    fix_start_idx = start_idx - offset_idx
    fix_end_idx = end_idx - offset_idx
    span = Span(doc, slice_at, len(doc))
    span_bounds = span_idx2i(span, fix_start_idx, fix_end_idx)
    tokens = doc[slice_at:]
    tokens_bounds = span_idx2i(tokens, fix_start_idx, fix_end_idx)
    fix_bounds = (start_i - slice_at, end_i - slice_at)
    assert span_bounds == fix_bounds
    assert tokens_bounds == fix_bounds
def test_context_attributes(self):
    sectionizer = Sectionizer(
        nlp,
        rules=None,
        add_attrs={"past_medical_history": {"is_negated": True}})
    sectionizer.add([SectionRule("Past Medical History:", "past_medical_history")])
    doc = nlp("Past Medical History: Pneumonia")
    from spacy.tokens import Span

    doc.ents = (Span(doc, 4, 5),)
    sectionizer(doc)
    assert doc.ents[0]._.is_negated is True
def test_displacy_render_wrapper(en_vocab):
    """Test that displaCy accepts custom rendering wrapper."""

    def wrapper(html):
        return "TEST" + html + "TEST"

    displacy.set_render_wrapper(wrapper)
    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    html = displacy.render(doc, style="ent")
    assert html.startswith("TEST<div")
    assert html.endswith("/div>TEST")
    # Restore the default no-op wrapper
    displacy.set_render_wrapper(lambda html: html)
def bunsetu_spans(doc):
    if type(doc) == Doc:
        # collect the indices of "B" labels, then pair consecutive boundaries
        b = [i for i, j in enumerate(doc.user_data["bunsetu_bi_labels"]) if j == "B"]
        b.append(len(doc))
        return [Span(doc, i, j) for i, j in zip(b, b[1:])]
    elif type(doc) == Span:
        b = doc[0].doc.user_data["bunsetu_bi_labels"]
        # include the bunsetu the span starts inside of
        s = [bunsetu_span(doc[0])] if b[doc[0].i] == "I" else []
        for t in doc:
            if b[t.i] == "B":
                s.append(bunsetu_span(t))
        return s
    elif type(doc) == Token:
        return [bunsetu_span(doc)]
def doc2(en_vocab):
    words = ["I", "like", "New", "York", "in", "Autumn", "."]
    heads = [1, 1, 3, 1, 1, 4, 1]
    tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
    pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
    deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
    doc = Doc(en_vocab, words=words, heads=heads, tags=tags, pos=pos, deps=deps)
    doc.ents = [Span(doc, 2, 4, label="GPE")]
    return doc
def Entity():
    print("\nThe outcomes of Entity Extraction are:")
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u"Apple isn't looking at buying U.S.A. startup for $1 billion.")
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}".format(ent.text, ent.start_char, ent.end_char, ent.label_))

    from spacy.tokens import Span
    doc = nlp(u"FB is hiring a new VP of global policy.")
    doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}".format(ent.text, ent.start_char, ent.end_char, ent.label_))