def train_crf(train_path, model_path, _log, _run, dev_path=None):
    train_reader = read_corpus(train_path)
    _log.info('Extracting features from train corpus')
    train_itemseq = ItemSequence([
        fs for sent in train_reader.sents() for fs in extract_crf_features(sent)
    ])
    train_labels = [tag for _, tag in train_reader.tagged_words()]

    trainer = make_crf_trainer()
    trainer.append(train_itemseq, train_labels, group=0)

    if dev_path is not None:
        dev_reader = read_corpus(dev_path, name='dev')
        _log.info('Extracting features from dev corpus')
        dev_itemseq = ItemSequence([
            fs for sent in dev_reader.sents() for fs in extract_crf_features(sent)
        ])
        dev_labels = [tag for _, tag in dev_reader.tagged_words()]
        trainer.append(dev_itemseq, dev_labels, group=1)

    _log.info('Begin training; saving model to %s', model_path)
    holdout = -1 if dev_path is None else 1
    trainer.train(model_path, holdout=holdout)
    if SACRED_OBSERVE_FILES:
        _run.add_artifact(model_path)
def train(model_path, _log, _run, window=2):
    """Train a CRF model."""
    train_reader = read_train_corpus()
    sents, tags = separate_tagged_sents(train_reader.tagged_sents())
    sents = preprocess(sents)
    _log.info('Extracting features from train corpus')
    train_itemseq = ItemSequence(
        [fs for sent in sents for fs in extract_crf_features(sent)])
    train_labels = [tag for tags_ in tags for tag in tags_]

    trainer = make_crf_trainer()
    trainer.append(train_itemseq, train_labels, group=0)

    dev_reader = read_dev_corpus()
    if dev_reader is not None:
        _log.info('Extracting features from dev corpus')
        sents, tags = separate_tagged_sents(dev_reader.tagged_sents())
        sents = preprocess(sents)
        dev_itemseq = ItemSequence(
            [fs for sent in sents for fs in extract_crf_features(sent)])
        dev_labels = [tag for tags_ in tags for tag in tags_]
        trainer.append(dev_itemseq, dev_labels, group=1)

    _log.info('Begin training; saving model to %s', model_path)
    holdout = -1 if dev_reader is None else 1
    trainer.train(model_path, holdout=holdout)
    if SACRED_OBSERVE_FILES:
        _run.add_artifact(model_path)
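# A minimal, self-contained sketch of the train/dev pattern used in the two
# functions above. The corpus readers and extract_crf_features() are
# project-specific, so the toy features below are illustrative only; the
# group/holdout mechanics are plain pycrfsuite.
import pycrfsuite
from pycrfsuite import ItemSequence

train_xseq = ItemSequence([{'word.lower': 'hello'}, {'word.lower': 'world'}])
dev_xseq = ItemSequence([{'word.lower': 'bye'}])

trainer = pycrfsuite.Trainer(verbose=False)
trainer.append(train_xseq, ['B', 'I'], group=0)  # group 0: training data
trainer.append(dev_xseq, ['B'], group=1)         # group 1: held-out dev data
trainer.train('toy.model', holdout=1)            # holdout=1 evaluates on group 1;
                                                 # holdout=-1 disables the holdout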
def build_model_features(sample, window_size, label_data):
    """
    :param sample: a DataSample object
    :param window_size: size of the character window
    :param label_data:
    :return: the ItemSequence corresponding to ``sample``
    """
    features = []
    chars = list(sample.sentence)
    for index, word in enumerate(chars):
        # Build the window of character indices around ``index``, padding
        # out-of-range positions with -1.
        windows_words = [i if i >= 0 else -1
                         for i in range(index - window_size // 2, index)]
        windows_words.extend([i if i < len(chars) else -1
                              for i in range(index, index + window_size // 2 + 1)])
        feature = word_feature(sample, windows_words, label_data)
        features.append(feature.calWord_features(index))
    # print(features)
    item_sequence = ItemSequence(features)
    return item_sequence
def train(self, training_data, classifier_path="classifier/cache/label_crf_classifier",
          c1=0, c2=10, period=300, minfreq=5):
    self.preprocess(training_data)
    train = Trainer()
    for i1, i in enumerate(self.x):
        train.append(ItemSequence(i), self.y[i1])
    params = {
        "c1": c1,
        "c2": c2,
        "period": period,
        "feature.minfreq": minfreq,
        "max_iterations": 1000,
        # "calibration.eta": 0.05,
        # "calibration_samples": 400,
    }
    # train.select(algorithm="l2sgd")
    train.set_params(params)
    train.train(classifier_path)
    self.tagger = Tagger()
    self.tagger.open(classifier_path)
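# A note on the parameters above: in crfsuite, 'c1' (the L1 penalty) is only
# supported by the default 'lbfgs' algorithm, which is presumably why the
# select(algorithm="l2sgd") call is commented out. The valid names and current
# values for the selected algorithm can be inspected directly:
from pycrfsuite import Trainer

t = Trainer(algorithm='lbfgs', verbose=False)
print(t.params())      # names of all tunable parameters for this algorithm
print(t.get_params())  # their current values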
def make_preds(tagger, sents, _log):
    sents = preprocess(sents)
    _log.info('Extracting features')
    itemseq = ItemSequence(
        [fs for sent in sents for fs in extract_crf_features(sent)])
    _log.info('Making predictions with the model')
    return tagger.tag(itemseq)
def train(features: pd.Series, labels: pd.Series) -> None:
    trainer = Trainer(verbose=False)
    features = features.tolist()
    labels = labels.tolist()
    for idx in range(len(features)):
        trainer.append(ItemSequence(features[idx]), literal_eval(labels[idx]))
    trainer.train('crf.model')
def train(features, labels):
    print("Training..")
    trainer = Trainer(verbose=False)
    features = features.tolist()
    labels = labels.tolist()
    for idx in range(len(features)):
        trainer.append(ItemSequence(features[idx]), literal_eval(labels[idx]))
    trainer.train('crf.model')
def predict_crf(reader, model_path, _log, _run):
    _log.info('Loading model from %s', model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    tagger = Tagger()
    tagger.open(model_path)
    _log.info('Extracting features from test corpus')
    itemseq = ItemSequence(
        [fs for sent in reader.sents() for fs in extract_crf_features(sent)])
    _log.info('Making predictions with the model')
    return tagger.tag(itemseq)
def predict(self, test_data):
    """Tag test_data["crf_feature"] (a sequence of feature items) and store
    the predicted labels in test_data["section_label"]."""
    result = self.tagger.tag(ItemSequence(test_data["crf_feature"]))
    test_data["section_label"] = result
    return test_data

# def score(self, training_data, classifier_path="classifier/cache/label_crf_classifier",
#           portion=0.8, c1=0, c2=10, period=300, minfreq=10):
#     # split resume_id
#     resume_ids = np.unique([resume['resume_id'] for resume in training_data])
#     length = len(resume_ids)
#     shuffle(resume_ids)
#     train_ids = resume_ids[:int(length * portion)]
#     test_ids = resume_ids[int(length * portion):]
#     train_df = [resume for resume in training_data if resume['resume_id'] in train_ids]
#     test_df = [resume for resume in training_data if resume['resume_id'] in test_ids]
#     # train model on train_ids
#     self.train(train_df, classifier_path=classifier_path, c1=c1, c2=c2, period=period, minfreq=minfreq)
#     test_pred = self.predict_all(test_df)
#     train_pred = self.predict_all(train_df)
#     # print out result
#     return train_pred, test_pred

# if __name__ == "__main__":
#     data = MongoRetriveData()
#     resumes = data.get_data_mongo()
#     # pickle.dump(resumes, open('./resume_data.pkl', 'wb'))
#     # resumes = pickle.load(open('./resume_data.pkl', 'rb'))
#     stopword_path = './stopword.txt'
#     model_path = './model.txt'
#     resume_data = resumes
#     clf = Crf(stopword_path, model_path, resume_data)
#     clf.CleanData()
#     clf.Fit()
#     clf.Score()
#     # result = clf.Predict(clf.data)
#     # print result
def train(self, feat_seqs, label_seqs):
    print("Training on %d sequences" % len(feat_seqs), file=sys.stderr)
    if self.algorithm == "crf":
        for feat_seq, label_seq in zip(feat_seqs, label_seqs):
            self.trainer.append(ItemSequence(feat_seq), label_seq)
        self.trainer.train(self.trained_model_name)
    else:
        # SSVM path: build a dense feature index, then vectorize each sequence.
        for fs in feat_seqs:
            for feat_dict in fs:
                for f in feat_dict:
                    if f not in self.feat_index:
                        self.feat_index[f] = len(self.feat_index)
        Xs = []
        for fs in feat_seqs:
            X = []
            for feat_dict in fs:
                x = [0] * len(self.feat_index)
                for f in feat_dict:
                    x[self.feat_index[f]] = feat_dict[f]
                X.append(x)
            Xs.append(numpy.asarray(X))
        for ls in label_seqs:
            for label in ls:
                if label not in self.label_index:
                    self.label_index[label] = len(self.label_index)
        Ys = []
        for ls in label_seqs:
            Y = []
            for label in ls:
                Y.append(self.label_index[label])
            Ys.append(numpy.asarray(Y))
        self.trainer.fit(Xs, Ys)
        pickle.dump(self.trainer, open(self.trained_model_name, "wb"))
        pickle.dump(self.feat_index, open("ssvm_feat_index.pkl", "wb"))
        pickle.dump(self.label_index, open("ssvm_label_index.pkl", "wb"))
def predict(self, feat_seqs):
    print("Tagging %d sequences" % len(feat_seqs), file=sys.stderr)
    if self.algorithm == "crf":
        self.tagger.open(self.trained_model_name)
        preds = [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
    else:
        # SSVM path: vectorize with the training-time feature index, then map
        # predicted label indices back to label strings.
        Xs = []
        for fs in feat_seqs:
            X = []
            for feat_dict in fs:
                x = [0] * len(self.feat_index)
                for f in feat_dict:
                    if f in self.feat_index:
                        x[self.feat_index[f]] = feat_dict[f]
                X.append(x)
            Xs.append(numpy.asarray(X))
        pred_ind_seqs = self.tagger.predict(Xs)
        preds = []
        for ps in pred_ind_seqs:
            pred = []
            for pred_ind in ps:
                pred.append(self.rev_label_index[pred_ind])
            preds.append(pred)
    return preds
def one_hot_encoding(self, features):
    encoding = ItemSequence(features[0])
    self.encoding = encoding.items()
    return self.encoding
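# ItemSequence.items(), used above, normalizes the accepted feature formats
# (lists of strings, dicts with numeric or boolean values) into a flat list
# of {feature_name: weight} dicts. For example:
from pycrfsuite import ItemSequence

seq = ItemSequence([['walk', 'shop'], {'walk': 2.0, 'clean': 1.0}])
print(seq.items())
# -> [{'walk': 1.0, 'shop': 1.0}, {'walk': 2.0, 'clean': 1.0}]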
def _get_xseq(model, matrix):
    xseq = [{
        'feat{}'.format(i): float(w) for i, w in enumerate(list(features))
    } for features in model.predict(matrix)]
    return ItemSequence(xseq)
def test_tag_item_sequence(model_filename, xseq, yseq):
    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(ItemSequence(xseq)) == yseq
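# Tagger.open() returns the tagger itself, which is what the context-manager
# form in the test above relies on; the imperative form is equivalent
# (the model path here is illustrative):
from pycrfsuite import ItemSequence, Tagger

tagger = Tagger()
tagger.open('crf.model')
labels = tagger.tag(ItemSequence([{'bias': 1.0}]))
tagger.close()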
def feed_trainer(self, features_list, labels):
    features = ItemSequence(features_list)
    self.trainer.append(features, labels)
def tag_tweets(self, tweet_features_list):
    features = ItemSequence(tweet_features_list)
    labels = self.tagger.tag(features)
    return labels
def sent2features(sent):
    return ItemSequence([word2features(sent, i) for i in range(len(sent))])
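# sent2features assumes a word2features(sent, i) helper that returns the
# feature dict for position i. A hypothetical minimal version, for
# illustration only (boolean values are accepted and coerced to 1.0/0.0):
def word2features(sent, i):
    word = sent[i]
    return {
        'bias': 1.0,
        'word.lower': word.lower(),
        'word.istitle': word.istitle(),
        'BOS': i == 0,              # beginning of sentence
        'EOS': i == len(sent) - 1,  # end of sentence
    }

xseq = sent2features(['John', 'lives', 'here'])  # ItemSequence of 3 items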