class CRFsuiteEntityRecognizer:
    """Sentence-level entity recognizer backed by a CRF tagger.

    Token strings are converted to features by ``feature_extractor``, gold
    labels for training come from ``encoder``, and predicted label sequences
    are converted back to entities with ``decode_bilou``.

    NOTE(review): ``Tagger``/``Trainer`` look like the python-crfsuite
    classes -- confirm against this module's imports.
    """

    def __init__(
        self, feature_extractor: WindowedTokenFeatureExtractor, encoder: EntityEncoder
    ) -> None:
        """Store the collaborators and create a tagger with no model loaded."""
        self.tagger = Tagger()
        self._encoder = encoder
        self.feature_extractor = feature_extractor

    @property
    def encoder(self) -> EntityEncoder:
        """The encoder used to produce training labels (read-only)."""
        return self._encoder

    def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
        """Train a model on ``docs``, write it to ``path`` and load it.

        Every sentence of every document contributes one training sequence:
        features are extracted from the token strings and labels are encoded
        from the token objects.

        Args:
            docs: Documents to train on; each must expose ``sents``.
            algorithm: Forwarded to ``Trainer`` as ``algorithm``.
            params: Forwarded to ``Trainer`` as ``params``.
            path: Where the trained model is written and then re-opened from.
        """
        trainer = Trainer(algorithm=algorithm, params=params, verbose=False)
        for doc in docs:
            for sentence in doc.sents:
                sentence_tokens = list(sentence)
                token_texts = [str(tok) for tok in sentence_tokens]
                x_seq = self.feature_extractor.extract(token_texts)
                y_seq = self.encoder.encode(sentence_tokens)
                trainer.append(x_seq, y_seq)
        trainer.train(path)
        # Drop whatever model the tagger currently holds before loading the
        # freshly trained one.
        self.tagger.close()
        self.tagger.open(path)

    def __call__(self, doc: Doc) -> Doc:
        """Annotate ``doc`` with predicted entities and return it.

        Each sentence is tagged on its own; the entities decoded from all
        sentences are collected and assigned to ``doc.ents`` in one step.
        """
        found = []
        for sentence in doc.sents:
            sentence_tokens = list(sentence)
            predicted = self.predict_labels([str(tok) for tok in sentence_tokens])
            found.extend(decode_bilou(predicted, sentence_tokens, doc))
        doc.ents = found
        return doc

    def predict_labels(self, tokens: Sequence[str]) -> List[str]:
        """Return the tagger's label sequence for one sentence of token strings."""
        return self.tagger.tag(self.feature_extractor.extract(tokens))
Example #2
0
# -- Start classification ------------------------------------------------
# For every sequence index, decode once with the crfsuite tagger and once with
# flexcrf's Viterbi decoder, then print both label sequences and whether they
# agree.
# NOTE(review): `tagger`, `model`, `data`, `dataset`, `thetas`, `np` and the
# flexcrf helpers are set up outside this excerpt -- confirm there.
# NOTE: this is Python 2 code (print statements).

for seq in range(len(dataset)):
    # -- with crfsuite
    # Tagger output for sequence `seq` (printed below as "labels").
    s_ = tagger.tag(data['X'][seq])
    # Look each tagger label up in model.labels and convert the result to int.
    y_ = np.array([int(model.labels[s]) for s in s_])
    # Probability reported by the tagger for the label sequence it just produced.
    prob_ = tagger.probability(s_)

    print "\n-- With crfsuite:"
    print "labels:\n", s_, "\n", y_
    print "probability:\t %f" % prob_

    # -- with flexcrf
    # `y` and `f_m_xy` are unpacked but not used in the visible part of the loop.
    f_xy, y = dataset[seq]
    theta = thetas[seq]

    m_xy, f_m_xy = _compute_all_potentials(f_xy, theta)

    y_pred = viterbi_decoder(m_xy)

    # ADD CODE TO COMPUTE POSTERIOR PROBABILITY WITH FLEXCRF HERE ....

    print "-- With flexcrf:"
    print "labels:\n", y_pred
    # Element-wise comparison reduced with all(); presumably y_pred and y_ are
    # 1-D arrays of equal length -- TODO confirm.
    print "equal predictions: ", all(y_pred == y_)
    # Disabled until the posterior probability above is implemented: `prob` is
    # not defined anywhere in this loop.
    #print "probability:\t %f" % prob
    #print "delta:\t %f" % abs(prob-prob_)

tagger.close()