class CRFsuiteEntityRecognizer:
    """Entity recognizer that labels sentences with a CRFsuite ``Tagger``.

    Token strings are turned into features by ``feature_extractor``; gold
    labels for training come from ``encoder``; predicted labels are turned
    back into entities with ``decode_bilou``.
    """

    def __init__(
        self, feature_extractor: WindowedTokenFeatureExtractor, encoder: EntityEncoder
    ) -> None:
        """Keep the feature extractor and encoder and create a new ``Tagger``.

        No model is opened here; ``train`` opens one on the tagger.
        """
        self.feature_extractor = feature_extractor
        self._encoder = encoder
        self.tagger = Tagger()

    @property
    def encoder(self) -> EntityEncoder:
        """The entity encoder passed to the constructor (read-only)."""
        return self._encoder

    def train(self, docs: Iterable[Doc], algorithm: str, params: dict, path: str) -> None:
        """Train on every sentence of ``docs`` and open the result in the tagger.

        Args:
            docs: Documents whose ``sents`` supply the training sequences.
            algorithm: Forwarded to ``Trainer`` as ``algorithm``.
            params: Forwarded to ``Trainer`` as ``params``.
            path: Passed to ``Trainer.train`` and then to ``Tagger.open``
                (presumably the model file location -- confirm against the
                CRFsuite bindings in use).
        """
        crf_trainer = Trainer(algorithm=algorithm, params=params, verbose=False)
        for document in docs:
            for sent in document.sents:
                sent_tokens = list(sent)
                token_texts = [str(tok) for tok in sent_tokens]
                # Features are built from the token strings, labels from the
                # token objects themselves.
                sent_features = self.feature_extractor.extract(token_texts)
                sent_labels = self.encoder.encode(sent_tokens)
                crf_trainer.append(sent_features, sent_labels)
        crf_trainer.train(path)
        # Close whatever the tagger currently holds before opening the model
        # that was just trained.
        self.tagger.close()
        self.tagger.open(path)

    def __call__(self, doc: Doc) -> Doc:
        """Predict entities sentence by sentence and store them on ``doc.ents``.

        Returns:
            The same ``doc`` object that was passed in.
        """
        found_entities = []
        for sent in doc.sents:
            sent_tokens = list(sent)
            token_texts = [str(tok) for tok in sent_tokens]
            predicted = self.predict_labels(token_texts)
            found_entities.extend(decode_bilou(predicted, sent_tokens, doc))
        doc.ents = found_entities
        return doc

    def predict_labels(self, tokens: Sequence[str]) -> List[str]:
        """Extract features for ``tokens`` and return the tagger's labels for them."""
        return self.tagger.tag(self.feature_extractor.extract(tokens))
# -- Start classification ------------------------------------------------
# Decode every sequence with both crfsuite and flexcrf and print the results
# one after the other. print() is always given a single pre-formatted string,
# so these lines are valid on Python 3 and print the same text on Python 2.

for seq in range(len(dataset)):
    # -- with crfsuite
    s_ = tagger.tag(data['X'][seq])
    # Map each predicted label to an int through model.labels.
    y_ = np.array([int(model.labels[s]) for s in s_])
    prob_ = tagger.probability(s_)

    print("\n-- With crfsuite:")
    print("labels:\n%s\n%s" % (s_, y_))
    print("probability:\t %f" % prob_)

    # -- with flexcrf
    f_xy, y = dataset[seq]
    theta = thetas[seq]

    m_xy, f_m_xy = _compute_all_potentials(f_xy, theta)
    y_pred = viterbi_decoder(m_xy)

    # TODO: compute the posterior probability with flexcrf here (as `prob`),
    # then report it together with its gap to the crfsuite value:
    #     "probability:\t %f" % prob
    #     "delta:\t %f" % abs(prob - prob_)

    print("-- With flexcrf:")
    print("labels:\n%s" % (y_pred,))
    # array_equal gives one bool for the whole comparison and returns False,
    # rather than raising, when the two label sequences differ in length.
    print("equal predictions:  %s" % np.array_equal(y_pred, y_))

tagger.close()