Example #1
0
def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
    """Merge per-sentence CoNLL annotations into a single ``(doc, gold)`` pair.

    Head indices of each sentence are shifted by the number of words already
    collected, so they index into the flattened word list. When ``text`` is
    None it is rebuilt from the words and their ``spaces`` values. After the
    GoldParse is built, each head/label pair is set to None with probability
    ``drop_deps``.
    """
    merged = defaultdict(list)
    sent_starts = []
    copied_fields = (
        "words", "tags", "deps", "morphology", "entities", "spaces"
    )
    for sent in sent_annots:
        # Heads are sentence-relative: offset them by the words gathered so far.
        offset = len(merged["words"])
        merged["heads"].extend(offset + head for head in sent["heads"])
        for name in copied_fields:
            merged[name].extend(sent[name])
        # One True for the first word of the sentence, False for the rest.
        sent_starts.extend([True] + [False] * (len(sent["words"]) - 1))

    assert len(merged["words"]) == len(merged["spaces"])
    if text is None:
        pieces = [
            word + " " * space
            for word, space in zip(merged["words"], merged["spaces"])
        ]
        text = "".join(pieces)
    doc = nlp.make_doc(text)
    # Drop "spaces" so it is not forwarded to GoldParse as a keyword.
    del merged["spaces"]
    gold = GoldParse(doc, **merged)
    gold.sent_starts = sent_starts
    n_heads = len(gold.heads)
    for idx in range(n_heads):
        if random.random() < drop_deps:
            gold.heads[idx] = None
            gold.labels[idx] = None

    return doc, gold
Example #2
0
def Train():
    """Build a few sample GoldParse objects, then train the 'ner' pipe and save the model.

    Relies on module-level ``nlp``, ``random`` and ``convert_JSON_python``.
    Training data is read from a hard-coded JSON path and the model is written
    to a hard-coded directory.
    """
    print("\nThe outcomes of Training and Updating are:")
    from spacy.tokens import Doc
    from spacy.vocab import Vocab
    from spacy.gold import GoldParse
    # The three doc/gold pairs below are each overwritten by the next one and
    # are not referenced by the training loop further down.
    vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
    doc = Doc(vocab, words=['用户', '体验', 'APP'])
    gold = GoldParse(doc, tags=['N', 'V', 'N'])
    doc = Doc(Vocab(), words=['陆金所', '成立', 'AI实验室', '已经', '一年'])
    gold = GoldParse(doc,
                     entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE'])
    doc = Doc(nlp.vocab,
              words=[u'刘强东', u'章泽天', u'大学生', u'遇见'],
              spaces=[False, False, False, False])
    gold = GoldParse(doc, entities=[u'PERSON', u'PERSON', u'PRODUCT', u'O'])

    train_data = convert_JSON_python('/home/wangdi498/SpaCy/NER_example2.json')
    # Disable every pipe except 'ner' for the duration of training.
    with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names
                             if pipe != 'ner']):
        optimizer = nlp.begin_training()
        for i in range(10):
            random.shuffle(train_data)
            # The training data is shuffled every epoch so the model does not generalise from the training order. A dropout rate can also be set so the model drops some features and representations with a given probability, which keeps it from memorising the training data too firmly.
            for text, annotations in train_data:
                # doc = nlp.make_doc(text)
                # gold = GoldParse(doc, entities=entity_offsets)
                # nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
                nlp.update([text], [annotations], sgd=optimizer)  # Update the model with the obtained data.
    nlp.to_disk("/home/wangdi498/SpaCy/models")
Example #3
0
def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
    """Flatten sentence-level CoNLL annotations and return ``(doc, gold)``.

    Sentence-relative head indices are shifted by the count of words collected
    so far. If ``text`` is None, it is reconstructed from the words and their
    ``spaces`` values. Each head/label pair of the resulting GoldParse is set
    to None with probability ``drop_deps``.
    """
    merged = defaultdict(list)
    sent_starts = []
    for sent in sent_annots:
        # Shift heads so they point into the flattened word list.
        shift = len(merged["words"])
        merged["heads"].extend(shift + head for head in sent["heads"])
        for key in ("words", "tags", "deps", "entities", "spaces"):
            merged[key].extend(sent[key])
        # First word opens the sentence; every following word does not.
        sent_starts.extend([True] + [False] * (len(sent["words"]) - 1))

    assert len(merged["words"]) == len(merged["spaces"])
    if text is None:
        chunks = []
        for word, space in zip(merged["words"], merged["spaces"]):
            chunks.append(word + " " * space)
        text = "".join(chunks)
    doc = nlp.make_doc(text)
    # "spaces" must not be passed on to GoldParse as a keyword.
    del merged["spaces"]
    gold = GoldParse(doc, **merged)
    gold.sent_starts = sent_starts
    head_count = len(gold.heads)
    for position in range(head_count):
        if random.random() < drop_deps:
            gold.heads[position] = None
            gold.labels[position] = None

    return doc, gold
Example #4
0
def train_ner(_nlp, train_data, iterations, learn_rate=1e-3, dropout=0., tags_complete=True):
    """
    Train the entity recogniser stored in ``_nlp.entity``.

    :param _nlp: pipeline object; ``make_doc``, ``tagger`` and ``entity`` are used
    :param train_data: list of ``(doc, entity_offsets)`` pairs, shuffled in place on every pass
    :param iterations: maximum number of full passes through ``train_data``
    :param learn_rate: value assigned to ``_nlp.entity.model.learn_rate``
    :param dropout: passed as ``drop`` to ``_nlp.entity.update``
    :param tags_complete: if False, every 'O' tag is replaced by '-' before the update
    :return: None; training stops early once a pass accumulates a loss of 0
    """
    _nlp.entity.model.learn_rate = learn_rate
    for itn in range(1, iterations + 1):
        random.shuffle(train_data)
        epoch_loss = 0.
        for source_doc, offsets in train_data:
            # A fresh Doc is built from the raw text even though the data was already processed.
            doc = _nlp.make_doc(source_doc.text)
            gold = GoldParse(doc, entities=offsets)

            if not tags_complete:
                # GoldParse assumes offset-described entities are complete and
                # tags everything else 'O'; '-' means "no assumption" for a word.
                for idx, tag in enumerate(gold.ner):
                    if tag == 'O':
                        gold.ner[idx] = '-'

            # todo: unclear whether tagging is required for updating an existing or a new model
            _nlp.tagger(doc)

            epoch_loss += _nlp.entity.update(doc, gold, drop=dropout)
        log.info('train_ner: iter #{}/{}, loss: {}'.format(itn, iterations, epoch_loss))
        if epoch_loss == 0:
            break
Example #5
0
def test_get_oracle_moves_negative_O(tsys, vocab):
    """The oracle returns a non-empty action sequence when some tags are "!O"."""
    tokens = ["A", "B", "C", "D"]
    doc = Doc(vocab, words=tokens)
    gold = GoldParse(doc, entities=[])
    gold.ner = ["O", "!O", "O", "!O"]
    tsys.preprocess_gold(gold)
    names = [
        tsys.get_class_name(action)
        for action in tsys.get_oracle_sequence(doc, gold)
    ]
    assert names
Example #6
0
def test_get_oracle_moves_negative_O(tsys, vocab):
    """Oracle sequence must be non-empty for gold tags mixing "O" and "!O"."""
    doc = Doc(vocab, words=["A", "B", "C", "D"])
    gold = GoldParse(doc, entities=[])
    # Overwrite the tags produced by GoldParse with alternating "O" / "!O".
    gold.ner = ["O", "!O", "O", "!O"]
    tsys.preprocess_gold(gold)
    oracle_actions = tsys.get_oracle_sequence(doc, gold)
    names = []
    for action in oracle_actions:
        names.append(tsys.get_class_name(action))
    assert names
Example #7
0
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
    """The oracle returns actions when every entity label is prefixed with "!"."""
    negated = [(start, end, "!" + label) for start, end, label in entity_annots]
    gold = GoldParse(doc, entities=negated)
    # Replace each "L-!GPE" tag with "-".
    for idx, tag in enumerate(gold.ner):
        if tag == "L-!GPE":
            gold.ner[idx] = "-"
    tsys.preprocess_gold(gold)
    names = [
        tsys.get_class_name(action)
        for action in tsys.get_oracle_sequence(doc, gold)
    ]
    assert names
Example #8
0
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
    """Oracle sequence must be non-empty when all entity labels are "!"-prefixed."""
    negated = []
    for start, end, label in entity_annots:
        negated.append((start, end, "!" + label))
    gold = GoldParse(doc, entities=negated)
    # Every "L-!GPE" tag is swapped for "-" before preprocessing.
    for position, tag in enumerate(gold.ner):
        if tag == "L-!GPE":
            gold.ner[position] = "-"
    tsys.preprocess_gold(gold)
    oracle_actions = tsys.get_oracle_sequence(doc, gold)
    names = [tsys.get_class_name(action) for action in oracle_actions]
    assert names
def test_ner_update_batch(ner, nlp):
    """Updating ``ner`` on a two-document batch records a loss under ``PIPES.ner``."""
    texts = ("Hello world. This is sentence 2.", "Hi again. This is sentence 4.")
    docs = [nlp(text) for text in texts]
    for doc in docs:
        assert len(list(doc.sents)) == 2
    optimizer = nlp.resume_training()
    # Every token is annotated "O" in both documents.
    golds = [GoldParse(doc, entities=["O"] * len(doc)) for doc in docs]
    losses = {}
    ner.update(docs, golds, sgd=optimizer, losses=losses)
    assert PIPES.ner in losses
def test_textcat_update_batch(textcat, nlp):
    """Updating ``textcat`` on a two-document batch records a "pytt_textcat" loss."""
    first = nlp("Hello world. This is sentence 2.")
    second = nlp("Hi again. This is sentence 4.")
    for doc in (first, second):
        assert len(list(doc.sents)) == 2
    optimizer = nlp.resume_training()
    # Category "Hello" is 1.0 for the first doc and 0.0 for the second.
    golds = [
        GoldParse(doc, cats={"Hello": value})
        for doc, value in ((first, 1.0), (second, 0.0))
    ]
    losses = {}
    textcat.update([first, second], golds, sgd=optimizer, losses=losses)
    assert "pytt_textcat" in losses
Example #11
0
def test_ner_per_type(en_vocab):
    """Check overall and per-type NER precision/recall/F reported by ``Scorer``."""
    # Case 1: Gold and Doc are identical.
    scorer = Scorer()
    for sentence, annot in test_ner_cardinal:
        doc = get_doc(
            en_vocab,
            words=sentence.split(" "),
            ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
        )
        scorer.score(doc, GoldParse(doc, entities=annot["entities"]))
    results = scorer.scores

    for metric in ("ents_p", "ents_f", "ents_r"):
        assert results[metric] == 100
    for metric in ("p", "f", "r"):
        assert results["ents_per_type"]["CARDINAL"][metric] == 100

    # Case 2: Doc has one missing and one extra entity;
    # entity type MONEY is not present in Doc.
    scorer = Scorer()
    for sentence, annot in test_ner_apple:
        doc = get_doc(
            en_vocab,
            words=sentence.split(" "),
            ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
        )
        scorer.score(doc, GoldParse(doc, entities=annot["entities"]))
    results = scorer.scores

    for metric in ("ents_p", "ents_r", "ents_f"):
        assert results[metric] == approx(66.66666)
    for label in ("GPE", "MONEY", "ORG"):
        assert label in results["ents_per_type"]
    for metric in ("p", "r", "f"):
        assert results["ents_per_type"]["GPE"][metric] == 100
    for metric in ("p", "r", "f"):
        assert results["ents_per_type"]["MONEY"][metric] == 0
    org_scores = results["ents_per_type"]["ORG"]
    assert org_scores["p"] == 50
    assert org_scores["r"] == 100
    assert org_scores["f"] == approx(66.66666)
Example #12
0
def evaluate(Language,
             gold_tuples,
             model_dir,
             gold_preproc=False,
             verbose=False,
             beam_width=None):
    """Score a model loaded from ``model_dir`` against ``gold_tuples``.

    With ``gold_preproc`` the tokens come from ``annot_tuples[1]`` and the
    tagger, entity recogniser and parser are run one after another; otherwise
    the sentences are merged with ``_merge_sents`` and the raw text is passed
    through ``nlp``. Returns the populated ``Scorer``.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, _brackets in sents:
            if raw_text is not None:
                tokens = nlp(raw_text, merge_mwes=False)
            else:
                # No raw text: start from the gold tokenisation.
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            scorer.score(tokens, GoldParse(tokens, annot_tuples), verbose=verbose)
    return scorer
Example #13
0
def _get_gold_parse(doc, entities, dev, kb, labels_discard):
    """Build a ``GoldParse`` whose ``links`` map ``(start, end)`` to KB-id scores.

    An annotated entity is kept only if a ``doc.ents`` span with a label
    outside ``labels_discard`` has the same character offsets, its sentence
    text passes ``is_valid_sentence``, and either ``dev`` is truthy or the
    entity id is among the KB candidates for its alias. The gold id scores
    1.0; when ``dev`` is falsy the other candidate ids score 0.0.
    """
    spans_by_offset = {}
    for ent in doc.ents:
        if ent.label_ not in labels_discard:
            spans_by_offset[(ent.start_char, ent.end_char)] = ent

    links = {}
    for entity in entities:
        entity_id = entity["entity"]
        alias = entity["alias"]
        offsets = (entity["start"], entity["end"])

        # Candidates are only looked up when a KB is given and dev is falsy.
        candidate_ids = []
        if kb and not dev:
            candidate_ids = [cand.entity_ for cand in kb.get_candidates(alias)]

        span = spans_by_offset.get(offsets, None)
        if not span:
            continue
        # TODO: check that alias == doc.text[start:end]
        if not (dev or entity_id in candidate_ids):
            continue
        if not is_valid_sentence(span.sent.text):
            continue

        value_by_id = {entity_id: 1.0}
        if not dev:
            random.shuffle(candidate_ids)
            for kb_id in candidate_ids:
                if kb_id != entity_id:
                    value_by_id[kb_id] = 0.0
        links[offsets] = value_by_id

    return GoldParse(doc, links=links)
Example #14
0
def test_goldparse_startswith_space(en_tokenizer):
    """For text " a", GoldParse reports words [" ", "a"] with None for the space token."""
    doc = en_tokenizer(" a")
    gold = GoldParse(
        doc,
        words=["a"],
        entities=["U-DATE"],
        deps=["ROOT"],
        heads=[0],
    )
    assert gold.words == [" ", "a"]
    assert gold.ner == [None, "U-DATE"]
    assert gold.labels == [None, "ROOT"]
Example #15
0
 def train_recognizer(self):
     """Train a new 'ner' pipe on the files listed in ``self.data.data_train``.

     Each training item is ``(file_name, annotations)``. Only the first line
     of the file is used as text. Annotations whose first element equals "T"
     are split on non-word characters into ``(start, end, label)`` entities.
     The pipeline is written to disk after every training document.
     """
     # Try a pattern-based NER technique using the spaCy library.
     comp = self.nlp.create_pipe('ner')
     self.nlp.add_pipe(comp)
     for label in ("Task", "Material", "Process"):
         comp.add_label(label)
     # nlp.from_disk('C:/Users/Lobar/Desktop/TP3_NLP/spacy_models')
     optimizer = self.nlp.begin_training()
     losses = {}
     for training in self.data.data_train:
         path = self.repertory + self.train_file + "/" + training[0]
         # The context manager releases the handle even if a later step
         # (GoldParse, update) raises; the original leaked it in that case.
         with open(path, 'r', encoding="utf-8") as f:
             text = f.readlines()[0]
         entities = []
         for ent in training[1]:
             if ent[0] == "T":
                 splitted = re.split(r'\W+', ent[1])
                 entities.append(
                     (int(splitted[1]), int(splitted[2]), splitted[0]))
         doc = self.nlp.make_doc(text)
         gold = GoldParse(doc, entities=entities)
         self.nlp.update([doc], [gold],
                         drop=0.5,
                         losses=losses,
                         sgd=optimizer)
         # TODO: the output path below is hard-coded and must be adapted.
         # NOTE(review): the model is saved once per training document;
         # confirm whether saving once after the loop was intended.
         self.nlp.to_disk(
             "C:\\Users\\Lobar\\Desktop\\TP3_NLP\\spacy_models")
    def _from_json_to_crf(self, message, entity_offsets):
        # type: (Message, List[Tuple[int, int, Text]]) -> List[Tuple[Text, Text, Text, Text]]
        """Takes the json examples and switches them to a format which crfsuite likes.

        The entity tags are read from element 5 of each ``gold.orig_annot``
        entry. A '-' tag triggers a misalignment warning. When
        ``self.BILOU_flag`` is falsy the two-character ``B-``/``I-``/``U-``/``L-``
        prefix is stripped from every tag. The result of
        ``self._from_text_to_crf(message, ents)`` is returned.
        """
        from spacy.gold import GoldParse

        doc = message.get("spacy_doc")
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [annot[5] for annot in gold.orig_annot]
        if '-' in ents:
            # warning() replaces the deprecated warn() alias
            # (assumes ``logger`` is a stdlib logging.Logger).
            logger.warning(
                "Misaligned entity annotation in sentence '{}'. ".format(
                    doc.text) +
                "Make sure the start and end values of the annotated training "
                +
                "examples end at token boundaries (e.g. don't include trailing whitespaces)."
            )
        if not self.BILOU_flag:
            bilou_prefixes = ('B-', 'I-', 'U-', 'L-')
            for i, entity in enumerate(ents):
                if entity.startswith(bilou_prefixes):
                    ents[i] = entity[2:]  # removes the BILOU tags

        return self._from_text_to_crf(message, ents)
Example #17
0
    def _from_json_to_crf(self, json_eg, spacy_nlp):
        # type: (Tuple[Text, List[Tuple[int, int, Text]]], Language) -> List[Tuple[Text, Text, Text]]
        """Takes the json examples and switches them to a format which crfsuite likes.

        ``json_eg[0]`` is the text and ``json_eg[1]`` the entity offsets.
        Returns one ``(token text, token tag, entity tag)`` tuple per token of
        the processed document. When ``self.BILOU_flag`` is falsy the
        ``B-``/``I-``/``U-``/``L-`` prefix is stripped from the entity tag.
        """
        from spacy.language import Language
        from spacy.gold import GoldParse

        doc = spacy_nlp(json_eg[0])
        entity_offsets = json_eg[1]
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [annot[5] for annot in gold.orig_annot]
        if '-' in ents:
            # warning() replaces the deprecated warn() alias
            # (assumes ``logger`` is a stdlib logging.Logger).
            logger.warning("Misaligned entity annotation in sentence '{}'. ".format(doc.text) +
                           "Make sure the start and end values of the annotated training " +
                           "examples end at token boundaries (e.g. don't include trailing whitespaces).")

        keep_bilou = self.BILOU_flag
        bilou_prefixes = ('B-', 'I-', 'U-', 'L-')

        def ent_clean(entity):
            # Tags are passed through untouched when BILOU tagging is enabled.
            if keep_bilou or not entity.startswith(bilou_prefixes):
                return entity
            return entity[2:]

        crf_format = [(doc[i].text, doc[i].tag_, ent_clean(ents[i])) for i in range(len(doc))]
        return crf_format
def main(output_dir=None):
    """Train a ``Tagger`` on ``DATA`` for 25 passes and print tags for a sample.

    When ``output_dir`` is given, the directories ``output_dir``,
    ``output_dir/pos`` and ``output_dir/vocab`` are ensured up front, and the
    model plus the vocab strings are dumped there after training.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        for directory in (output_dir, output_dir / "pos", output_dir / "vocab"):
            ensure_dir(directory)

    vocab = Vocab(tag_map=TAG_MAP)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger(vocab)
    for _epoch in range(25):
        for words, tags in DATA:
            doc = Doc(vocab, words=words)
            tagger.update(doc, GoldParse(doc, tags=tags))
        # DATA is shuffled in place after each pass.
        random.shuffle(DATA)
    tagger.model.end_training()

    sample_words = ["I", "like", "blue", "eggs"]
    doc = Doc(vocab, orths_and_spaces=zip(sample_words, [True] * 4))
    tagger(doc)
    for word in doc:
        print(word.text, word.tag_, word.pos_)

    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        strings_path = output_dir / 'vocab' / 'strings.json'
        with strings_path.open('w') as file_:
            tagger.vocab.strings.dump(file_)
def main(n_iter=10):
    """Train a blank English NER pipe with a multitask objective on ``TRAIN_DATA``.

    Runs ``n_iter`` shuffled passes, printing the 'nn_labeller' and 'ner'
    losses after each, then prints entities and token attributes for every
    training text.
    """
    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)

    print("Create data", len(TRAIN_DATA))
    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for _ in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, (annotations, _brackets) in TRAIN_DATA:
            doc = nlp.make_doc(text)
            gold = GoldParse.from_annot_tuples(doc, annotations[0])
            nlp.update(
                [doc],  # batch of texts
                [gold],  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses.get('nn_labeller', 0.0), losses['ner'])

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
        token_triples = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
        print('Entities', entity_pairs)
        print('Tokens', token_triples)
Example #20
0
def train_NER(train_data, t1Files):
    """Train a new ``EntityRecognizer`` for five passes and print token attributes.

    ``train_data`` holds ``(raw_text, entity_offsets)`` pairs and is shuffled
    in place on each pass. After training, every sentence found at
    ``t1Files[i][1][1]`` is processed and its tokens are printed.
    """
    nlp = spacy.load('en', entity=False, parser=False)
    entity_types = [
        'ID', 'INCIDENT', 'WEAPON', 'PERP INDIV',
        'PERP ORG', 'TARGET', 'VICTIM'
    ]
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)

    for _ in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offset in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offset)

            # The doc is tagged before the recogniser is updated on it.
            nlp.tagger(doc)
            ner.update(doc, gold)

    ner.model.end_training()

    for file_entry in t1Files:
        for sentence in file_entry[1][1]:
            doc = nlp(unicode(sentence), entity=False)
            ner(doc)
            print("Entites on fine tuned NER:")
            for word in doc:
                print(word.text, word.orth, word.lower, word.tag_,
                      word.ent_type_, word.ent_iob)
Example #21
0
def train_model(labels, examples, epochs=10, verbose=False):
    """Train a blank Russian pipeline with an NER component and return it.

    ``examples`` are dicts with a 'text' string and a 'labels' list of
    ``(start, end, tag)`` triples. Training runs ``epochs`` passes over the
    examples in minibatches of size 1 with dropout 0.5. With ``verbose`` the
    labelled substrings are printed first.
    """
    nlp = spacy.blank('ru')
    ner = create_ner(nlp)
    nlp.add_pipe(ner, last=True)
    for label in labels:
        print("Label:", label)
        ner.add_label(label)

    optimizer = nlp.begin_training()

    if verbose:
        print("Training data:")
        for example in examples:
            for start, end, tag in example['labels']:
                print('{} : "{}"'.format(tag, example['text'][start: end]))

    for _epoch in tqdm(range(epochs)):
        for batch in minibatch(list(examples), size=1):
            docs = [nlp.tokenizer(item['text']) for item in batch]
            golds = [
                GoldParse(doc, entities=item['labels'])
                for doc, item in zip(docs, batch)
            ]
            losses = {}
            nlp.update(docs, golds, drop=0.5, losses=losses, sgd=optimizer)

    return nlp
 def _update_ner_model(self, ner, nlp, train_data):
     """Run five shuffled passes over ``train_data``, updating ``ner`` per example.

     ``train_data`` holds ``(raw_text, entity_offsets)`` pairs and is shuffled
     in place at the start of each pass.
     """
     passes = 5
     for _ in range(passes):
         random.shuffle(train_data)
         for raw_text, entity_offsets in train_data:
             doc = nlp.make_doc(raw_text)
             ner.update(doc, GoldParse(doc, entities=entity_offsets))
def extract_docs_and_golds_from_opencorpora(nlp, opencorpora_file):
    """Parse an OpenCorpora file into processed docs and matching GoldParse objects.

    For every ``text/paragraphs/paragraph/sentence`` element, the text of its
    first ``source`` child is run through ``nlp``, and a GoldParse is built
    from the ``text`` attributes of its ``tokens/token`` children with every
    entity tag set to '-'.

    :param nlp: callable pipeline that also exposes ``vocab``
    :param opencorpora_file: path of the corpus file
    :return: ``(parsed_sentences, gold_sentences)``, two parallel lists
    """
    parsed_sentences = []
    gold_sentences = []

    # Decode explicitly as UTF-8 instead of the platform default encoding;
    # the content is re-encoded as UTF-8 for the parser right below.
    # NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
    with open(opencorpora_file, "r", encoding="utf-8") as f:
        opencorpora = f.read().encode('utf-8')

    page_tree = html.fromstring(opencorpora)

    # Distinct names per level: the original rebound the outer loop variable
    # ``text`` to the sentence string inside the innermost loop.
    for text_node in page_tree.xpath('//text'):
        for paragraphs in text_node.xpath('./paragraphs'):
            for paragraph in paragraphs.xpath('./paragraph'):
                for sentence in paragraph.xpath('./sentence'):
                    source_text = sentence.xpath('./source')[0].text
                    parsed_sentences.append(nlp(source_text))
                    sent_words = [
                        token.attrib['text']
                        for token in sentence.xpath('./tokens/token')
                    ]
                    gold = GoldParse(
                        Doc(nlp.vocab, words=sent_words),
                        words=sent_words,  # heads=sent_heads,
                        # tags=sent_tags, deps=sent_deps,
                        entities=['-'] * len(sent_words))
                    gold_sentences.append(gold)
    return parsed_sentences, gold_sentences
Example #24
0
    def evaluate(self, verbose=1):
        """Score the pipeline's entity predictions on ``self.data``.

        Parameters
        ----------
        verbose : int
            when equal to 1, print each prediction whose scoring lowered the
            running entity F-score

        Returns
        -------
        the ``scores`` of the Scorer after all examples were scored
        """
        scorer = Scorer()
        wrong_case = 0
        for text, annot in self.data:
            gold = GoldParse(self.nlp.make_doc(text), entities=annot['entities'])
            prediction = self.nlp(text)

            # A drop in the running F-score marks this example as a wrong case.
            f_before = scorer.ents_f
            scorer.score(prediction, gold)
            if f_before > scorer.ents_f:
                wrong_case += 1
                if verbose == 1:
                    print_beauty_NER(prediction_to_IOB(prediction, gold))

        return scorer.scores  #, wrong_case, len(self.data)
Example #25
0
def evaluate(Language,
             gold_tuples,
             model_dir,
             gold_preproc=False,
             verbose=False,
             beam_width=None,
             cand_preproc=None):
    """Score a model loaded from ``model_dir`` against ``gold_tuples``.

    For language 'de' the lemmatizer is replaced by one returning the input
    string unchanged. With ``gold_preproc`` the tokens come from
    ``annot_tuples[1]`` and tagger, parser and entity recogniser are run in
    that order; otherwise sentences are merged with ``_merge_sents`` and the
    raw text goes through ``nlp``. ``cand_preproc`` is accepted but not used
    here. Returns the populated ``Scorer``.
    """
    nlp = Language(data_dir=model_dir)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string, pos: {string}
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, _brackets in sents:
            if raw_text is not None:
                tokens = nlp(raw_text)
            else:
                # No raw text: start from the gold tokenisation.
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.parser(tokens)
                nlp.entity(tokens)
            scorer.score(tokens, GoldParse(tokens, annot_tuples), verbose=verbose)
    return scorer
Example #26
0
def test_textcat_learns_multilabel():
    """A TextCategorizer learns to predict the second of two letters in a doc.

    Each doc is three "d" tokens, two letters from a/b/c, then three more "d"
    tokens; the category equal to the second letter is 1.0, the others 0.0.
    After 30 updates every score must fall on the correct side of 0.5.
    """
    random.seed(5)
    numpy.random.seed(5)
    nlp = Language()
    letters = ["a", "b", "c"]
    padding = ["d"] * 3
    docs = []
    for w1 in letters:
        for w2 in letters:
            cats = {letter: float(w2 == letter) for letter in letters}
            words = padding + [w1, w2] + padding
            docs.append((Doc(nlp.vocab, words=words), cats))
    random.shuffle(docs)
    model = TextCategorizer(nlp.vocab, width=8)
    for letter in letters:
        model.add_label(letter)
    optimizer = model.begin_training()
    for _ in range(30):
        losses = {}
        golds = [GoldParse(doc, cats=cats) for doc, cats in docs]
        inputs = [doc for doc, _cats in docs]
        model.update(inputs, golds, sgd=optimizer, losses=losses)
        random.shuffle(docs)
    for w1 in letters:
        for w2 in letters:
            doc = Doc(nlp.vocab, words=padding + [w1, w2] + padding)
            truth = {letter: w2 == letter for letter in letters}
            model(doc)
            for cat, score in doc.cats.items():
                if truth[cat]:
                    assert score > 0.5
                else:
                    assert score < 0.5
Example #27
0
def main(n_iter=10):
    """Train a blank English NER pipe with a multitask objective on ``TRAIN_DATA``.

    Each training entry is ``(text, sentences)``; every sentence contributes
    one update built from its annotation tuples. After ``n_iter`` shuffled
    passes, entities and token attributes are printed for each entry whose
    text is not None.
    """
    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)
    print(nlp.pipeline)

    print("Create data", len(TRAIN_DATA))
    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for _ in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for _text, sentences in TRAIN_DATA:
            for annotations, _brackets in sentences:
                # The doc is built from the words at index 1 of the annotation tuples.
                doc = Doc(nlp.vocab, words=annotations[1])
                gold = GoldParse.from_annot_tuples(doc, annotations)
                nlp.update(
                    [doc],  # batch of texts
                    [gold],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses,
                )
        print(losses.get("nn_labeller", 0.0), losses["ner"])

    # test the trained model
    for text, _ in TRAIN_DATA:
        if text is None:
            continue
        doc = nlp(text)
        entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
        token_triples = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
        print("Entities", entity_pairs)
        print("Tokens", token_triples)
def test_tokenizer():
    """Compare the count of non-"O" gold tags with the spans found by ``char_span``.

    With offsets (0, 4) and (9, 11) the counts must match; with (9, 12), which
    also covers the space after "un", more tags than valid spans are expected.
    """
    doc: Doc = pytest.nlp.make_doc("Ceci est un test.")

    first_offsets = [(0, 4, "PERS"), (9, 11, "PERS")]
    gold: GoldParse = GoldParse(doc, entities=first_offsets)
    spans = [doc.char_span(start, end) for start, end, _label in first_offsets]
    tagged = sum(1 for tag in gold.ner if tag != "O")
    assert tagged == len(spans)

    second_offsets = [(0, 4, "PERS"), (9, 12, "PERS")]
    gold = GoldParse(doc, entities=second_offsets)
    # Only offsets for which char_span returns a span are counted here.
    candidates = [
        doc.char_span(start, end) for start, end, _label in second_offsets
    ]
    spans = [span for span in candidates if span is not None]
    tagged = sum(1 for tag in gold.ner if tag != "O")
    assert tagged > len(spans)
Example #29
0
    def _from_json_to_crf(self, json_eg, spacy_nlp):
        # type: (Tuple[Text, List[Tuple[int, int, Text]]], Language) -> List[Tuple[Text, Text, Text]]
        """Convert one json example into the tuple format handed to crfsuite.

        ``json_eg[0]`` is the text and ``json_eg[1]`` the entity offsets.
        Returns one ``(token text, token tag, entity tag)`` tuple per token.
        When ``self.BILOU_flag`` is falsy the ``B-``/``I-``/``U-``/``L-``
        prefix is stripped from the entity tag.
        """
        from spacy.language import Language
        from spacy.gold import GoldParse

        text, entity_offsets = json_eg[0], json_eg[1]
        doc = spacy_nlp(text)
        gold = GoldParse(doc, entities=entity_offsets)
        # Element 5 of each original annotation is the entity tag.
        ents = [annot[5] for annot in gold.orig_annot]

        keep_bilou = self.BILOU_flag
        bilou_prefixes = ('B-', 'I-', 'U-', 'L-')

        def ent_clean(entity):
            if keep_bilou or not entity.startswith(bilou_prefixes):
                return entity
            return entity[2:]

        crf_format = []
        for i in range(len(doc)):
            token = doc[i]
            crf_format.append((token.text, token.tag_, ent_clean(ents[i])))
        return crf_format
    def evaluate_ner(self, model, eval_set, ent_types):
        """Score a Named Entity model on an evaluation set.

        Arguments:
            model (spacy model object) -- trained Named Entity model to evaluate
            eval_set (list) -- items of the form
                                [["<doc_text>",{"entities:[[<start_pos>,<end_pos>,"<ENTITY_TYPE>"],
                                                        [<start_pos>,<end_pos>,"<ENTITY_TYPE>"]]}]]
            ent_types (list) -- entity types to keep; gold entities whose last
                                element is not in this list are ignored

        Returns:
            the ``scores`` of the Scorer after all items were scored
        """
        scorer = Scorer()

        for text, expected in eval_set:
            wanted = [
                ent for ent in expected.get('entities')
                if ent[-1] in ent_types
            ]
            gold = GoldParse(model.make_doc(text), entities=wanted)
            scorer.score(model(text), gold)

        return scorer.scores
    def predict(self, list_data):
        """
        Score the model stored at ``self.output_dir`` on the given dataset.

        :param list_data: list of ``(text, annotations)`` pairs in spaCy format,
                          E.g. [(This is a nice summer, {"entities":(15, 21, SEASON)})]
        :return: dict with keys 'precision', 'recall' and 'F1' taken from the
                 scorer's 'ents_p', 'ents_r' and 'ents_f'; the same dict is
                 also stored on ``self.dict_performance``
        """
        # Load the customised NER model and score every example with it.
        nlp_custom = spacy.load(self.output_dir)
        scorer = Scorer()
        for text, annotation in list_data:
            gold = GoldParse(nlp_custom.make_doc(text),
                             entities=annotation['entities'])
            scorer.score(nlp_custom(text), gold)

        scores = scorer.scores
        self.dict_performance = {
            'precision': scores['ents_p'],
            'recall': scores['ents_r'],
            'F1': scores['ents_f'],
        }
        return self.dict_performance
Example #32
0
def evaluate(tokenizer, nlp, valid_data, labels):
    """Return ``(accuracy, loss)`` of the text categorizer on ``valid_data``.

    ``valid_data`` holds ``(text, annotation)`` pairs where
    ``annotation["cats"]`` maps each label to its gold value. A prediction
    counts as correct when ``score > 0.5`` agrees with the truthiness of the
    gold value. Raises ValueError if the model predicts a label that is
    missing from the gold categories. ``tokenizer`` is not used here.
    """
    texts, cats = zip(*valid_data)

    score_matrix = np.zeros((len(cats), len(labels)), dtype="f")
    pipe_name = PIPES.textcat if is_transformer(nlp) else "textcat"
    textcat = nlp.get_pipe(pipe_name)
    # Use the model's ops module to make sure this is compatible
    # with GPU (cupy array) or without (numpy array).
    score_matrix = textcat.model.ops.asarray(score_matrix)

    golds = []
    num_correct = 0
    for row, doc in enumerate(nlp.pipe(texts)):
        gold_cats = cats[row]["cats"]
        for col, (label, score) in enumerate(doc.cats.items()):
            if label not in gold_cats:
                raise ValueError(f"Prediction for unexpected label: {label}")

            score_matrix[row, col] = score
            if (score > 0.5) == bool(gold_cats[label]):
                num_correct += 1

        golds.append(GoldParse(doc, cats=gold_cats))

    # The small epsilon keeps the division defined when there is no data.
    total = len(texts) * len(labels)
    accuracy = num_correct / (total + 1e-8)
    loss, _ = textcat.get_loss(texts, golds, score_matrix)

    return accuracy, loss
Example #33
0
    def _from_json_to_crf(
            self,
            message,  # type: Message
            entity_offsets  # type: List[Tuple[int, int, Text]]
    ):
        # type: (...) -> List[Tuple[Text, Text, Text, Text]]
        """Convert json examples to format of underlying crfsuite.

        The entity tags are read from element 5 of each ``gold.orig_annot``
        entry. A '-' tag triggers a misalignment warning. When the
        "BILOU_flag" component setting is falsy, labels whose BILOU part is
        B, I, U or L are reduced to the bare entity name. The result of
        ``self._from_text_to_crf(message, ents)`` is returned.
        """
        from spacy.gold import GoldParse

        doc = message.get("spacy_doc")
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [annot[5] for annot in gold.orig_annot]
        if '-' in ents:
            # warning() replaces the deprecated warn() alias
            # (assumes ``logger`` is a stdlib logging.Logger).
            logger.warning("Misaligned entity annotation in sentence '{}'. "
                           "Make sure the start and end values of the "
                           "annotated training examples end at token "
                           "boundaries (e.g. don't include trailing "
                           "whitespaces).".format(doc.text))
        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents)
Example #34
0
def train(nlp, data, ents, num_iterations=20):
    """
    Train a new ``EntityRecognizer`` on character-offset annotations.

    :param nlp: nlp instance; ``make_doc`` and ``vocab`` are used
    :param data: list of ``(sentence, entity_offsets)`` pairs, shuffled in
                 place on every iteration, e.g.
                 [('Who is Shaka Khan?',
                   [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]), ...]
    :param ents: list of entity types handed to the recognizer
    :param num_iterations: number of passes over ``data``
    :return: trained NER tagger
    """
    # Look up every token of the training sentences in the vocab first
    # (presumably so the entries exist before training — confirm).
    for sentence, _offsets in data:
        for word in nlp.make_doc(sentence):
            nlp.vocab[word.orth]

    result_NER = EntityRecognizer(nlp.vocab, entity_types=ents)
    for _ in range(num_iterations):
        random.shuffle(data)
        for sentence, entity_offsets in data:
            doc = nlp.make_doc(sentence)
            result_NER.update(doc, GoldParse(doc, entities=entity_offsets))
    return result_NER