def crf_predict(
    tagger: "pycrfsuite.Tagger",
    gp_data: list,
    mode: str = 'raw',
    exclude_labels: Union[list, tuple] = ('NOL', 'NAT', 'NEE'),
) -> Union[list, Tuple[list, pd.DataFrame]]:
    """Return predictions for the test data, grouped by file.

    Three return modes are supported:

    * ``raw``: the tagger's own predictions (``tagger.tag``) per sequence.
    * ``exclude_ool``: per position, the label with the highest marginal
      probability among the labels NOT listed in ``exclude_labels``.
    * ``rt_proba``: the ``exclude_ool`` predictions plus a DataFrame of the
      marginal probability of every label at every position.

    Predictions are returned unflattened (one list per input sequence).

    https://python-crfsuite.readthedocs.io/en/latest/pycrfsuite.html

    Args:
        tagger: An opened tagger exposing ``tag``, ``labels``, ``set`` and
            ``marginal``.
        gp_data: Feature sequences, one entry per file.
        mode: One of ``'raw'``, ``'exclude_ool'`` or ``'rt_proba'``.
        exclude_labels: Labels that may never be predicted in the
            ``exclude_ool`` / ``rt_proba`` modes. Their probabilities are
            still reported in the ``rt_proba`` DataFrame.

    Returns:
        ``raw`` / ``exclude_ool``: a list of label lists.
        ``rt_proba``: a tuple ``(predictions, probabilities)`` where
        ``probabilities`` has one column per label plus ``file_id`` (the
        index of the sequence in ``gp_data``). It is an empty frame with
        those columns when ``gp_data`` is empty.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode not in ('raw', 'exclude_ool', 'rt_proba'):
        raise ValueError(
            f"mode must be one of raw|exclude_ool|rt_proba; currently {mode}")

    if mode == 'raw':
        return [tagger.tag(xseq) for xseq in gp_data]

    labels = tagger.labels()
    # The set of candidate labels does not depend on the file, so compute it
    # once instead of re-filtering the columns for every sequence.
    kept_labels = [label for label in labels if label not in exclude_labels]

    res = []
    y_pred = []
    for fi, xseq in enumerate(gp_data):
        # marginal() reports probabilities for the sequence most recently
        # passed to set().
        tagger.set(xseq)
        positions = range(len(xseq))
        file_proba = pd.DataFrame({
            label: [tagger.marginal(label, i) for i in positions]
            for label in labels
        })
        y_pred.append(file_proba[kept_labels].idxmax(axis=1).tolist())
        file_proba['file_id'] = fi
        res.append(file_proba)

    if mode == 'rt_proba':
        if not res:
            # pd.concat raises ValueError on an empty list; return an empty
            # frame with the columns a non-empty result would have.
            return y_pred, pd.DataFrame(columns=list(labels) + ['file_id'])
        return y_pred, pd.concat(res, axis=0)
    return y_pred
class PassageTagger(object):
    """Tag sequences of clauses with either a CRF or a structured SVM.

    With ``algorithm == "crf"`` the object wraps a ``Trainer`` (training) or
    a ``Tagger`` (prediction). Any other value selects a ``ChainCRF`` model
    fitted with ``FrankWolfeSSVM``; that model and its feature/label indices
    are persisted with pickle.
    NOTE(review): Trainer/Tagger/ItemSequence look like the python-crfsuite
    API and ChainCRF/FrankWolfeSSVM like pystruct -- confirm against the
    file's imports.
    """

    def __init__(self, do_train=False, trained_model_name="passage_crf_model",
                 algorithm="crf"):
        """Create the trainer or tagger for the chosen algorithm.

        Args:
            do_train: If true, prepare for training; otherwise for prediction.
            trained_model_name: Path the model is written to / read from.
            algorithm: ``"crf"`` for the CRF backend; anything else selects
                the structured-SVM backend.
        """
        self.trained_model_name = trained_model_name
        self.fp = FeatureProcessing()
        self.do_train = do_train
        self.algorithm = algorithm
        if algorithm == "crf":
            if do_train:
                self.trainer = Trainer()
            else:
                self.tagger = Tagger()
        elif do_train:
            model = ChainCRF()
            self.trainer = FrankWolfeSSVM(model=model)
            self.feat_index = {}
            self.label_index = {}
        else:
            # NOTE: unpickling executes code embedded in the file; only load
            # model/index files that come from a trusted source.
            with open(self.trained_model_name, "rb") as model_file:
                self.tagger = pickle.load(model_file)
            with open("ssvm_feat_index.pkl", "rb") as feat_file:
                self.feat_index = pickle.load(feat_file)
            with open("ssvm_label_index.pkl", "rb") as label_file:
                label_index = pickle.load(label_file)
            # Predictions come back as label indices; invert the mapping.
            self.rev_label_index = {i: x for x, i in label_index.items()}

    def read_input(self, filename):
        """Read clause sequences (and labels when training) from a file.

        The file is UTF-8 text with one clause per line; blank lines separate
        sequences. When ``self.do_train`` is true, each line must be
        ``clause<TAB>label``.

        Returns:
            A tuple ``(str_seqs, feat_seqs, label_seqs)`` of parallel lists,
            one entry per sequence. Each feature item is a dict mapping a
            feature to its count within the clause. ``label_seqs`` holds
            empty lists when not training.
        """
        str_seqs = []
        feat_seqs = []
        label_seqs = []
        str_seq = []
        feat_seq = []
        label_seq = []
        with codecs.open(filename, "r", "utf-8") as infile:
            for line in infile:
                lnstrp = line.strip()
                if lnstrp == "":
                    # Sequence boundary; consecutive blank lines are ignored.
                    if str_seq:
                        str_seqs.append(str_seq)
                        feat_seqs.append(feat_seq)
                        label_seqs.append(label_seq)
                        str_seq = []
                        feat_seq = []
                        label_seq = []
                    continue
                if self.do_train:
                    clause, label = lnstrp.split("\t")
                    label_seq.append(label)
                else:
                    clause = lnstrp
                str_seq.append(clause)
                feat_dict = {}
                for f in self.fp.get_features(clause):
                    feat_dict[f] = feat_dict.get(f, 0) + 1
                feat_seq.append(feat_dict)
        # Flush the last sequence when the file does not end in a blank line.
        if str_seq:
            str_seqs.append(str_seq)
            feat_seqs.append(feat_seq)
            label_seqs.append(label_seq)
        return str_seqs, feat_seqs, label_seqs

    def _vectorize(self, feat_seqs):
        """Turn feature-dict sequences into dense arrays via ``self.feat_index``.

        Features missing from the index are skipped.
        """
        n_feats = len(self.feat_index)
        Xs = []
        for fs in feat_seqs:
            X = []
            for feat_dict in fs:
                x = [0] * n_feats
                for f, value in feat_dict.items():
                    if f in self.feat_index:
                        x[self.feat_index[f]] = value
                X.append(x)
            Xs.append(numpy.asarray(X))
        return Xs

    def predict(self, feat_seqs):
        """Tag every feature sequence.

        Returns:
            For the CRF backend, one list per sequence of
            ``(label, marginal probability)`` pairs. For the SVM backend, one
            list of labels per sequence.
        """
        sys.stderr.write("Tagging %d sequences\n" % len(feat_seqs))
        if self.algorithm == "crf":
            self.tagger.open(self.trained_model_name)
            preds = []
            for feat_seq in feat_seqs:
                pred = self.tagger.tag(ItemSequence(feat_seq))
                marginals = [self.tagger.marginal(p, i)
                             for i, p in enumerate(pred)]
                # Materialize the pairs so callers can iterate them more
                # than once (zip is a one-shot iterator on Python 3).
                preds.append(list(zip(pred, marginals)))
        else:
            pred_ind_seqs = self.tagger.predict(self._vectorize(feat_seqs))
            preds = [[self.rev_label_index[pred_ind] for pred_ind in ps]
                     for ps in pred_ind_seqs]
        return preds

    def train(self, feat_seqs, label_seqs, cv=True):
        """Train a model on the given sequences and persist it.

        Args:
            feat_seqs: Feature-dict sequences as produced by ``read_input``.
            label_seqs: Label sequences parallel to ``feat_seqs``.
            cv: CRF backend only. If true, first run five training passes,
                each holding out one of five groups, before the final fit.
        """
        sys.stderr.write("Training on %d sequences\n" % len(feat_seqs))
        if self.algorithm == "crf":
            # shuffle() needs a mutable sequence, so materialize the pairs.
            feat_label_pairs = list(zip(feat_seqs, label_seqs))
            shuffle(feat_label_pairs)
            for i, (feat_seq, label_seq) in enumerate(feat_label_pairs):
                self.trainer.append(ItemSequence(feat_seq), label_seq,
                                    group=i % 5)
            if cv:
                for hl in range(5):
                    # NOTE(review): the empty model name presumably means
                    # "evaluate only, store nothing" -- confirm against the
                    # Trainer.train documentation.
                    self.trainer.train('', holdout=hl)
            self.trainer.train(self.trained_model_name)
        else:
            for fs in feat_seqs:
                for feat_dict in fs:
                    for f in feat_dict:
                        if f not in self.feat_index:
                            self.feat_index[f] = len(self.feat_index)
            Xs = self._vectorize(feat_seqs)
            for ls in label_seqs:
                for label in ls:
                    if label not in self.label_index:
                        self.label_index[label] = len(self.label_index)
            Ys = [numpy.asarray([self.label_index[label] for label in ls])
                  for ls in label_seqs]
            self.trainer.fit(Xs, Ys)
            with open(self.trained_model_name, "wb") as model_file:
                pickle.dump(self.trainer, model_file)
            with open("ssvm_feat_index.pkl", "wb") as feat_file:
                pickle.dump(self.feat_index, feat_file)
            with open("ssvm_label_index.pkl", "wb") as label_file:
                pickle.dump(self.label_index, label_file)