Example #1
def train_crf(train_path, model_path, _log, _run, dev_path=None):
    train_reader = read_corpus(train_path)
    _log.info('Extracting features from train corpus')
    train_itemseq = ItemSequence([
        fs for sent in train_reader.sents()
        for fs in extract_crf_features(sent)
    ])
    train_labels = [tag for _, tag in train_reader.tagged_words()]

    trainer = make_crf_trainer()
    trainer.append(train_itemseq, train_labels, group=0)

    if dev_path is not None:
        dev_reader = read_corpus(dev_path, name='dev')
        _log.info('Extracting features from dev corpus')
        dev_itemseq = ItemSequence([
            fs for sent in dev_reader.sents()
            for fs in extract_crf_features(sent)
        ])
        dev_labels = [tag for _, tag in dev_reader.tagged_words()]
        trainer.append(dev_itemseq, dev_labels, group=1)

    _log.info('Begin training; saving model to %s', model_path)
    holdout = -1 if dev_path is None else 1
    trainer.train(model_path, holdout=holdout)
    if SACRED_OBSERVE_FILES:
        _run.add_artifact(model_path)
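The snippet above relies on python-crfsuite's group/holdout mechanism: sequences appended with group=1 are excluded from parameter estimation and only used to evaluate the model when train() is called with holdout=1, while holdout=-1 disables the holdout split. A minimal, self-contained sketch of that mechanism with invented toy features and labels:

import pycrfsuite
from pycrfsuite import ItemSequence

# Toy per-token feature dicts and labels; real features would come from
# something like the extract_crf_features() helper used above.
train_xseq = ItemSequence([{'w': 'john'}, {'w': 'runs'}])
train_yseq = ['NOUN', 'VERB']
dev_xseq = ItemSequence([{'w': 'mary'}, {'w': 'sleeps'}])
dev_yseq = ['NOUN', 'VERB']

trainer = pycrfsuite.Trainer(verbose=False)
trainer.append(train_xseq, train_yseq, group=0)  # training group
trainer.append(dev_xseq, dev_yseq, group=1)      # held-out group

# Fit on the non-holdout data and evaluate on group 1
# (holdout=-1 would disable the holdout evaluation entirely).
trainer.train('toy.crfsuite', holdout=1)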
Example #2
def train(model_path, _log, _run, window=2):
    """Train a CRF model."""
    train_reader = read_train_corpus()
    sents, tags = separate_tagged_sents(train_reader.tagged_sents())
    sents = preprocess(sents)
    _log.info('Extracting features from train corpus')
    train_itemseq = ItemSequence(
        [fs for sent in sents for fs in extract_crf_features(sent)])
    train_labels = [tag for tags_ in tags for tag in tags_]

    trainer = make_crf_trainer()
    trainer.append(train_itemseq, train_labels, group=0)

    dev_reader = read_dev_corpus()
    if dev_reader is not None:
        _log.info('Extracting features from dev corpus')
        sents, tags = separate_tagged_sents(dev_reader.tagged_sents())
        sents = preprocess(sents)
        dev_itemseq = ItemSequence(
            [fs for sent in sents for fs in extract_crf_features(sent)])
        dev_labels = [tag for tags_ in tags for tag in tags_]
        trainer.append(dev_itemseq, dev_labels, group=1)

    _log.info('Begin training; saving model to %s', model_path)
    holdout = -1 if dev_reader is None else 1
    trainer.train(model_path, holdout=holdout)
    if SACRED_OBSERVE_FILES:
        _run.add_artifact(model_path)
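Neither snippet defines extract_crf_features; whatever it does, ItemSequence only needs it to yield one dict of string or numeric features per token. A hypothetical sketch of such a helper (all feature names here are invented for illustration):

def extract_crf_features(sent, window=2):
    # sent is a list of token strings; yield one feature dict per token.
    for i, word in enumerate(sent):
        fs = {
            'word.lower': word.lower(),
            'word.istitle': word.istitle(),
            'word.isdigit': word.isdigit(),
        }
        for d in range(1, window + 1):
            if i - d >= 0:
                fs['word[-%d]' % d] = sent[i - d].lower()
            if i + d < len(sent):
                fs['word[+%d]' % d] = sent[i + d].lower()
        yield fs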
Example #3
def build_model_features(sample, window_size, label_data):
    """
    :param sample:  一个DataSample对象
    :param window_size:
    :param label_data:
    :return: sample对应的ItemSequence
    """
    features = []

    chars = list(sample.sentence)
    for index, word in enumerate(chars):
        # Build the indices of the context window; -1 marks a position
        # that falls outside the sentence.
        windows_words = [i if i >= 0 else -1 for i in range(index - window_size // 2, index)]
        windows_words.extend([i if i < len(chars) else -1 for i in range(index, index + window_size // 2 + 1)])

        feature = word_feature(sample, windows_words, label_data)

        features.append(feature.calWord_features(index))

    # print(features)
    item_sequence = ItemSequence(features)

    return item_sequence
Example #4
 def train(self,
           training_data,
           classifier_path="classifier/cache/label_crf_classifier",
           c1=0,
           c2=10,
           period=300,
           minfreq=5):
     self.preprocess(training_data)
     train = Trainer()
     for i1, i in enumerate(self.x):
         train.append(ItemSequence(i), self.y[i1])
     params = {
         "c1": c1,
         "c2": c2,
         "period": period,
         "feature.minfreq": minfreq,
         "max_iterations": 1000
         # "calibration.eta": 0.05,
         # "calibration_samples": 400,
     }
     # train.select(algorithm = "l2sgd")
     train.set_params(params)
     train.train(classifier_path)
     self.tagger = Tagger()
     self.tagger.open(classifier_path)
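The keys passed to set_params above are crfsuite training options: c1 and c2 are the L1 and L2 regularization coefficients, period controls the stopping-criterion window, and feature.minfreq drops features seen fewer than that many times. A quick way to inspect which options the selected algorithm accepts, using python-crfsuite's introspection calls:

import pycrfsuite

trainer = pycrfsuite.Trainer(verbose=False)
trainer.select('lbfgs')        # 'l2sgd', 'ap', 'pa', 'arow' are the other algorithms
print(trainer.params())        # names of the tunable options for this algorithm
print(trainer.get_params())    # their current values
print(trainer.help('c2'))      # description of a single option
trainer.set_params({'c1': 0.1, 'c2': 1.0, 'max_iterations': 100})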
Example #5
def make_preds(tagger, sents, _log):
    sents = preprocess(sents)
    _log.info('Extracting features')
    itemseq = ItemSequence(
        [fs for sent in sents for fs in extract_crf_features(sent)])

    _log.info('Making predictions with the model')
    return tagger.tag(itemseq)
Example #6
def train(features: pd.Series, labels: pd.Series) -> None:
    trainer = Trainer(verbose=False)
    features = features.tolist()
    labels = labels.tolist()

    for idx in range(len(features)):
        trainer.append(ItemSequence(features[idx]), literal_eval(labels[idx]))
    trainer.train('crf.model')
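This variant assumes each row of features already holds the list of per-token feature dicts for one sequence, and each row of labels holds a stringified list of tags (hence literal_eval). A hedged sketch of data in that shape, assuming the snippet's own imports (Trainer, ItemSequence, literal_eval) are in scope:

import pandas as pd

# One sequence per row: a list of feature dicts and a stringified tag list.
features = pd.Series([
    [{'w': 'john'}, {'w': 'runs'}],
    [{'w': 'mary'}, {'w': 'sleeps'}],
])
labels = pd.Series(["['NOUN', 'VERB']", "['NOUN', 'VERB']"])

train(features, labels)  # writes crf.model to the current directory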
Example #7
def train(features, labels):
    print("Training..")
    trainer = Trainer(verbose=False)
    features = features.tolist()
    labels = labels.tolist()

    for idx in range(0, len(features)):
        trainer.append(ItemSequence(features[idx]), literal_eval(labels[idx]))
    trainer.train('crf.model')
Example #8
def predict_crf(reader, model_path, _log, _run):
    _log.info('Loading model from %s', model_path)
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    tagger = Tagger()
    tagger.open(model_path)

    _log.info('Extracting features from test corpus')
    itemseq = ItemSequence(
        [fs for sent in reader.sents() for fs in extract_crf_features(sent)])

    _log.info('Making predictions with the model')
    return tagger.tag(itemseq)
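Beyond the 1-best sequence returned by tag(), a python-crfsuite Tagger can also report confidences for whatever it last tagged; a minimal sketch, assuming a trained model exists at model_path and using toy features:

from pycrfsuite import ItemSequence, Tagger

tagger = Tagger()
tagger.open(model_path)             # as in predict_crf above

xseq = ItemSequence([{'w': 'john'}, {'w': 'runs'}])
yseq = tagger.tag(xseq)             # 1-best label sequence
print(tagger.probability(yseq))     # probability of that whole sequence
print(tagger.marginal(yseq[0], 0))  # marginal of the first label at position 0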
Example #9
    def predict(self, test_data):
        """Tag test_data["crf_feature"] and store the predictions in test_data["section_label"]."""
        result = self.tagger.tag(ItemSequence(test_data["crf_feature"]))
        test_data["section_label"] = result
        return test_data

    # def score(self, training_data, classifier_path="classifier/cache/label_crf_classifier", portion=0.8, c1=0, c2=10, period=300, minfreq=10):
    #     # split resume_id
    #     resume_ids = np.unique([resume['resume_id'] for resume in training_data])
    #     length = len(resume_ids)
    #     shuffle(resume_ids)
    #     train_ids = resume_ids[:int(length*portion)]
    #     test_ids = resume_ids[int(length*portion):]

    #     train_df = [resume for resume in training_data if resume['resume_id'] in train_ids]
    #     test_df = [resume for resume in training_data if resume['resume_id'] in test_ids]

    #     # train model on train_ids
    #     self.train(train_df, classifier_path=classifier_path, c1=c1, c2=c2, period=period, minfreq=minfreq)
    #     test_pred = self.predict_all(test_df)
    #     train_pred = self.predict_all(train_df)

    #     # print out result
    #     return train_pred, test_pred


# if __name__ == "__main__":
#     data = MongoRetriveData()
#     resumes = data.get_data_mongo()
#     # pickle.dump(resumes, open('./resume_data.pkl', 'wb'))
#     # resumes = pickle.load(open('./resume_data.pkl', 'rb'))
#     stopword_path = './stopword.txt'
#     model_path = './model.txt'
#     resume_data = resumes
#     clf = Crf(stopword_path, model_path, resume_data)
#     clf.CleanData()
#     clf.Fit()
#     clf.Score()
#     # result = clf.Predict(clf.data)
#     # print result
Example #10
  def train(self, feat_seqs, label_seqs):
    print("Training on %d sequences" % len(feat_seqs), file=sys.stderr)
    if self.algorithm == "crf":
      for feat_seq, label_seq in zip(feat_seqs, label_seqs):
        self.trainer.append(ItemSequence(feat_seq), label_seq)
      self.trainer.train(self.trained_model_name)
    else:
      for fs in feat_seqs:
        for feat_dict in fs:
          for f in feat_dict:
            if f not in self.feat_index:
              self.feat_index[f] = len(self.feat_index)
      Xs = []
      for fs in feat_seqs:
        X = []
        for feat_dict in fs:
          x = [0] * len(self.feat_index)
          for f in feat_dict:
            x[self.feat_index[f]] = feat_dict[f]
          X.append(x)
        Xs.append(numpy.asarray(X))

      for ls in label_seqs:
        for label in ls:
          if label not in self.label_index:
            self.label_index[label] = len(self.label_index)

      Ys = []
      for ls in label_seqs:
        Y = []
        for label in ls:
          Y.append(self.label_index[label])
        Ys.append(numpy.asarray(Y))

      self.trainer.fit(Xs, Ys)
      pickle.dump(self.trainer, open(self.trained_model_name, "wb"))
      pickle.dump(self.feat_index, open("ssvm_feat_index.pkl", "wb"))
      pickle.dump(self.label_index, open("ssvm_label_index.pkl", "wb"))
Example #11
 def predict(self, feat_seqs):
   print("Tagging %d sequences" % len(feat_seqs), file=sys.stderr)
   if self.algorithm == "crf":
     self.tagger.open(self.trained_model_name)
     preds = [self.tagger.tag(ItemSequence(feat_seq)) for feat_seq in feat_seqs]
   else:
     Xs = []
     for fs in feat_seqs:
       X = []
       for feat_dict in fs:
         x = [0] * len(self.feat_index)
         for f in feat_dict:
           if f in self.feat_index:
             x[self.feat_index[f]] = feat_dict[f]
         X.append(x)
       Xs.append(numpy.asarray(X))
     pred_ind_seqs = self.tagger.predict(Xs)
     preds = []
     for ps in pred_ind_seqs:
       pred = []
       for pred_ind in ps:
         pred.append(self.rev_label_index[pred_ind])
       preds.append(pred)
   return preds
Example #12
 def one_hot_encoding(self, features):
     encoding = ItemSequence(features[0])
     self.encoding = encoding.items()
     return self.encoding
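ItemSequence.items() returns the features converted into crfsuite's internal {attribute: weight} form, which is what the method above stores; a small sketch:

from pycrfsuite import ItemSequence

seq = ItemSequence([
    {'word': 'john', 'length': 4, 'is_title': True},
    {'word': 'runs', 'length': 4.0},
])
# One {str: float} dict per token: numeric and boolean values become
# weights, while string values are folded into the attribute names.
for item in seq.items():
    print(item)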
Example #13
def _get_xseq(model, matrix):
    xseq = [{
        'feat{}'.format(i): float(w)
        for i, w in enumerate(list(features))
    } for features in model.predict(matrix)]
    return ItemSequence(xseq)
Example #14
def test_tag_item_sequence(model_filename, xseq, yseq):
    with Tagger().open(model_filename) as tagger:
        assert tagger.tag(ItemSequence(xseq)) == yseq
Example #15
 def feed_trainer(self, features_list, labels):
     features = ItemSequence(features_list)
     self.trainer.append(features, labels)
Example #16
 def tag_tweets(self, tweet_features_list):
     features = ItemSequence(tweet_features_list)
     labels = self.tagger.tag(features)
     return labels
Example #17
def sent2features(sent):
    return ItemSequence([word2features(sent, i) for i in range(len(sent))])
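Putting the sent2features pattern together with the Trainer and Tagger calls from the earlier examples, an end-to-end toy run could look like the sketch below (word2features is a stand-in defined here for illustration; the tiny corpus is invented):

import pycrfsuite
from pycrfsuite import ItemSequence, Tagger

def word2features(sent, i):
    # Minimal stand-in feature extractor for illustration.
    return {'bias': 1.0, 'word.lower': sent[i].lower()}

def sent2features(sent):
    return ItemSequence([word2features(sent, i) for i in range(len(sent))])

train_sents = [(['John', 'runs'], ['NOUN', 'VERB']),
               (['Mary', 'sleeps'], ['NOUN', 'VERB'])]

trainer = pycrfsuite.Trainer(verbose=False)
for words, tags in train_sents:
    trainer.append(sent2features(words), tags)
trainer.train('toy.crfsuite')

tagger = Tagger()
tagger.open('toy.crfsuite')
print(tagger.tag(sent2features(['John', 'sleeps'])))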