Example #1
def infer_classic(model_save_path,
                  test_data_path,
                  thresholds=0.5,
                  pred_save_path=None,
                  vectorizer_path=None,
                  col_sep=',',
                  num_classes=2,
                  feature_type='tf'):
    # load model
    model = load_pkl(model_save_path)
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()

    if num_classes == 2:
        # binary classification
        label_pred_probas = model.predict_proba(data_feature)[:, 1]
        label_pred = label_pred_probas > thresholds
    else:
        label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
Example #2
def train_xgboost_lr(data_path,
                     vectorizer_path=None,
                     xgblr_xgb_model_path=None,
                     xgblr_lr_model_path=None,
                     feature_encoder_path=None,
                     feature_type='tfidf_char',
                     col_sep='\t'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    model = XGBLR(xgblr_xgb_model_path, xgblr_lr_model_path,
                  feature_encoder_path)
    # fit
    model.train_model(X_train, y_train)
    # evaluate
    label_pred = model.predict(X_val)
    simple_evaluate(y_val, label_pred)
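
XGBLR itself is defined elsewhere in this repo. For reference, here is a minimal, self-contained sketch of the XGBoost-leaves-into-LR stacking that XGBLR presumably implements; everything below (data, names, hyperparameters) is illustrative, not the repo's API.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# toy data standing in for the text features above
X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=0)

# 1) fit the gradient-boosted trees
xgb = XGBClassifier(n_estimators=50, max_depth=3)
xgb.fit(X_train, y_train)

# 2) encode each sample by the leaf it lands in per tree, one-hot encoded
encoder = OneHotEncoder(handle_unknown='ignore')
train_leaves = encoder.fit_transform(xgb.apply(X_train))
val_leaves = encoder.transform(xgb.apply(X_val))

# 3) fit a logistic regression on the leaf encoding
lr = LogisticRegression(max_iter=1000)
lr.fit(train_leaves, y_train)
print('val accuracy: %.4f' % lr.score(val_leaves, y_val))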
Example #3
def infer_classic(model_type='xgboost_lr',
                  model_save_path='',
                  label_vocab_path='',
                  test_data_path='',
                  pred_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data=data_set,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)

    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}

    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    # NB: the keyword is spelled 'ture_labels' to match the misspelled
    # parameter name in save_predict_result()
    save_predict_result(pred_output,
                        ture_labels=None,
                        pred_save_path=pred_save_path,
                        data_set=data_set)

    # evaluate
    if true_labels:
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
        except Exception:
            print("error: evaluation failed")

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight_dict = load_dict(config.lr_feature_weight_path)
        pred_labels = cal_multiclass_lr_predict(data_set, feature_weight_dict,
                                                id_label)
        print(pred_labels[:5])
Example #4
def infer_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     label_vocab_path='',
                     max_len=300,
                     batch_size=128,
                     col_sep='\t',
                     pred_save_path=None):
    from keras.models import load_model
    # load data content
    data_set, true_labels = data_reader(data_path, col_sep)
    # init feature
    # the HAN model needs a [doc, sentence] feature (rank 3); other models take a [sentence] feature (rank 2)
    if model_type == 'han':
        feature_type = 'doc_vectorize'
    else:
        feature_type = 'vectorize'
    feature = Feature(data_set,
                      feature_type=feature_type,
                      is_infer=True,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    # load model
    model = load_model(model_save_path)
    # predict; in Keras, predict_proba gives the same result as predict
    pred_label_probs = model.predict(data_feature, batch_size=batch_size)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [prob.argmax() for prob in pred_label_probs]
    pred_labels = [id_label[i] for i in pred_labels]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save_predict_result(pred_output,
                        ture_labels=None,  # sic
                        pred_save_path=pred_save_path,
                        data_set=data_set)
    if true_labels:
        # evaluate
        assert len(pred_labels) == len(true_labels)
        for label, prob in zip(true_labels, pred_label_probs):
            logger.debug('label_true:%s\tprob_label:%s\tprob:%s' %
                         (label, id_label[prob.argmax()], prob.max()))

        print('total eval:')
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
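
The decoding step above (argmax over each softmax row, then mapping ids back to label strings) in isolation, on toy data; the vocab dict is made up for illustration.

import numpy as np

probs = np.array([[0.1, 0.7, 0.2],
                  [0.8, 0.1, 0.1]])
id_label = {0: 'sports', 1: 'tech', 2: 'finance'}  # made-up vocab

pred_labels = [id_label[p.argmax()] for p in probs]
pred_output = ['%s\t%f' % (id_label[p.argmax()], p.max()) for p in probs]
print(pred_labels)  # ['tech', 'sports']
print(pred_output)  # ['tech\t0.700000', 'sports\t0.800000']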
Example #5
def _get_feature(self, word_vocab):
    # extract features
    print("feature_type : %s" % self.feature_type)
    print("seg_contents:")
    print(self.seg_contents[:2])
    feature = Feature(data=self.seg_contents,
                      feature_type=self.feature_type,
                      feature_vec_path=self.feature_vec_path,
                      word_vocab=word_vocab)
    # get data feature
    return feature.get_feature()
Example #6
def infer_xgboost_lr(test_data_path,
                     vectorizer_path=None,
                     xgblr_xgb_model_path=None,
                     xgblr_lr_model_path=None,
                     feature_encoder_path=None,
                     col_sep='\t',
                     pred_save_path=None,
                     feature_type='tfidf_char'):
    # load data content
    data_set, test_ids = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    model = XGBLR(xgblr_xgb_model_path, xgblr_lr_model_path,
                  feature_encoder_path)
    # predict
    label_pred = model.predict(data_feature)
    save(label_pred, test_ids, pred_save_path)
    print("finish prediction.")
Example #7
def train_classic(model_type,
                  data_path=None,
                  pr_figure_path=None,
                  model_save_path=None,
                  vectorizer_path=None,
                  col_sep=',',
                  thresholds=0.5,
                  num_classes=2,
                  feature_type='tfidf_char'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model,
         X_val,
         y_val,
         thresholds=thresholds,
         num_classes=num_classes,
         model_type=model_type,
         pr_figure_path=pr_figure_path)
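
eval() is defined elsewhere in the repo. Here is a hedged sketch of what a binary-case evaluation helper taking these parameters (thresholds, pr_figure_path) might look like, built on scikit-learn's precision_recall_curve; names and behavior are assumptions, not the repo's implementation.

import matplotlib
matplotlib.use('Agg')  # save figures without a display
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, precision_recall_curve


def eval_binary(model, X_val, y_val, thresholds=0.5, pr_figure_path=None):
    # threshold the positive-class probability, as in infer_classic above
    probas = model.predict_proba(X_val)[:, 1]
    y_pred = probas > thresholds
    print(classification_report(y_val, y_pred))
    if pr_figure_path:
        precision, recall, _ = precision_recall_curve(y_val, probas)
        plt.plot(recall, precision)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.savefig(pr_figure_path)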
Example #8
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature type error, use tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show each category top features
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(),
                                        weight),
                                    key=lambda k: k[1],
                                    reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)

    # evaluate
    eval(model,
         X_val,
         y_val,
         num_classes=num_classes,
         pr_figure_path=pr_figure_path)
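
The "top features per category" analysis above depends on the repo's vectorizer and config. An equivalent self-contained sketch using only scikit-learn and a toy corpus (newer scikit-learn spells the accessor get_feature_names_out):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

docs = ['good game great match', 'stock market price fund',
        'team wins cup final', 'bank raises interest rate']
labels = [0, 1, 0, 1]  # made-up: 0=sports, 1=finance

vec = TfidfVectorizer()
X = vec.fit_transform(docs)
clf = LogisticRegression().fit(X, labels)

names = vec.get_feature_names_out()
# binary LR has a single coef_ row; multiclass has one row per category
for idx, weight in enumerate(clf.coef_):
    top = sorted(zip(names, weight), key=lambda k: k[1], reverse=True)[:5]
    print('category_%d: %s' % (idx, top))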
Example #9
def train_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     word_vocab_path='',
                     label_vocab_path='',
                     min_count=1,
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10,
                     embedding_dim=128,
                     hidden_dim=128,
                     col_sep='\t',
                     num_filters=512,
                     filter_sizes='3,4,5',
                     dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s' % str(data_label.shape))

    # init feature
    # the HAN model needs a [doc, sentence] feature (rank 3); other models take a [sentence] feature (rank 2)
    if model_type == 'han':
        logger.info(
            'Hierarchical Attention Network model feature_type must be: doc_vectorize'
        )
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      word_vocab=word_vocab,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          num_filters=num_filters,
                          filter_sizes=filter_sizes,
                          num_classses=num_classes,  # sic: matches the misspelled keyword in cnn_model()
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    cp = ModelCheckpoint(model_save_path,
                         monitor='val_acc',
                         verbose=1,
                         save_best_only=True)
    # fit and save model
    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=nb_epoch,
                        validation_data=(X_val, y_val),
                        callbacks=[cp])
    logger.info('save model:%s' % model_save_path)
    plt_history(history, model_name=model_type)
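
cnn_model, rnn_model, fasttext_model and han_model are defined elsewhere in the repo. As a reference point, here is a hedged sketch of a Kim-style text CNN matching the parameters passed above (parallel Conv1D branches over filter_sizes, concatenated, then dropout and a softmax head); the repo's actual implementation may differ, and the misspelled keyword is normalized to num_classes here.

from keras.layers import (Concatenate, Conv1D, Dense, Dropout, Embedding,
                          GlobalMaxPooling1D, Input)
from keras.models import Model


def cnn_model(max_len, vocabulary_size, embedding_dim=128, num_filters=512,
              filter_sizes='3,4,5', num_classes=2, dropout=0.5):
    inputs = Input(shape=(max_len,))
    embed = Embedding(vocabulary_size, embedding_dim)(inputs)
    # one convolution branch per filter size, max-pooled over time
    pooled = []
    for size in map(int, filter_sizes.split(',')):
        conv = Conv1D(num_filters, size, activation='relu')(embed)
        pooled.append(GlobalMaxPooling1D()(conv))
    merged = Concatenate()(pooled) if len(pooled) > 1 else pooled[0]
    merged = Dropout(dropout)(merged)
    outputs = Dense(num_classes, activation='softmax')(merged)
    model = Model(inputs, outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model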
Example #10
    plt.legend(loc="best")
    plt.savefig(figure_path)
    return plt


if __name__ == "__main__":
    sys.path.append("../")
    from models.feature import Feature
    from models.reader import data_reader

    data_content, data_lbl = data_reader('../data/train_words.txt', '\t')
    # init feature
    feature = Feature(feature_type='tfidf_word',
                      feature_vec_path='../output/temp')
    # get data feature
    data_feature = feature.get_feature(data_content)
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.2)
    search_cv(X_train, y_train, X_val, y_val, model=SVC())

    # test plot_learning_curve
    title = "Learning Curves (Random Forest, n_estimators = 30)"
    estimator = SVC()
    plot_learning_curve(estimator,
                        title,
                        X_train,
                        y_train,
                        cv=5,
                        n_jobs=4,
                        figure_path='../output/curve.png')
    plt.show()
Example #11
def infer_classic(model_type='xgboost_lr',
                  model_save_path='',
                  label_vocab_path='',
                  test_data_path='',
                  pred_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)

    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}

    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save(pred_output,
         ture_labels=None,  # sic
         pred_save_path=pred_save_path,
         data_set=data_set)
    if 'logistic_regression' in model_save_path and config.is_debug:
        count = 0
        features = load_pkl('output/lr_features.pkl')
        for line in data_set:
            if count > 5:
                break
            count += 1
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
                logger.debug('=' * 43)
    if true_labels:
        # evaluate
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
Example #12
def train_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     word_vocab_path='',
                     label_vocab_path='',
                     min_count=1,
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10,
                     embedding_dim=128,
                     hidden_dim=128,
                     col_sep='\t',
                     num_filters=2,
                     filter_sizes='3,4,5',
                     dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split(" "))

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s' % str(data_label.shape))

    # init feature
    # the HAN model needs a [doc, sentence] feature (rank 3); other models take a [sentence] feature (rank 2)
    if model_type == 'han':
        logger.info(
            'Hierarchical Attention Network model feature_type must be: doc_vectorize'
        )
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'

    word_dic = {}
    count = 1
    for word in word_vocab:
        word_dic[word] = count
        count += 1
    data_filter = []
    for line in data_content:
        line_filter = " ".join(
            list(filter(lambda x: x in word_dic, line.split(" "))))
        data_filter.append(line_filter)
    feature = Feature(data=data_filter,
                      feature_type=feature_type,
                      word_vocab=word_vocab,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        from keras.models import load_model  # reuse a previously trained CNN for evaluation
        model = load_model(model_save_path)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    # loss, accuracy = model.evaluate(X_val, y_val)
    # print(loss, accuracy)
    pre_label = model.predict(X_val, batch_size=32, verbose=0, steps=None)
    print(y_val)
    print(type(y_val))
    with open("./output/result", "w") as f:
        for i in range(len(y_val)):
            f.write("%s\t%f\n" % (y_val[i][2], pre_label[i][2]))
    f.close()
Example #13
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.savefig(figure_path)
    return plt


if __name__ == "__main__":
    from models.feature import Feature
    from models.reader import data_reader

    data_content, data_lbl = data_reader('../data/training_new_seg_10k.txt', '\t')
    # init feature
    feature = Feature(data_content, feature_type='tfidf_word', feature_vec_path='../output/temp')
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.2)
    search_cv(X_train, y_train, X_val, y_val, model=SVC())

    # test plot_learning_curve
    title = "Learning Curves (Random Forest, n_estimators = 30)"
    estimator = SVC()
    plot_learning_curve(estimator, title, X_train, y_train, cv=5, n_jobs=4,
                        figure_path='../output/curve.png')
    plt.show()
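
Examples #10 and #13 show only the tail of plot_learning_curve. Here is a hedged reconstruction following the classic scikit-learn learning_curve recipe, consistent with the tail shown (the repo's version may differ in details):

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=1,
                        train_sizes=np.linspace(0.1, 1.0, 5),
                        figure_path='curve.png'):
    plt.figure()
    plt.title(title)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    # cross-validated train/test scores at increasing training-set sizes
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.grid()
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.savefig(figure_path)
    return plt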
Example #14
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    logger.info("train classic model, model_type:{}, feature_type:{}".format(model_type, feature_type))
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    word_id = load_vocab(word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    print(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    logger.info('data size:%d' % len(data_content))
    logger.info('label size:%d' % len(data_lbl))

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.error('feature type error, use tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab, is_infer=False)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        save_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight = {}
        word_dict_rev = sorted(word_id.items(), key=lambda x: x[1])
        for word, index in word_dict_rev:  # 'word' avoids shadowing the Feature instance above
            feature_weight[word] = list(map(float, model.coef_[:, index]))
        save_dict(feature_weight, config.lr_feature_weight_path)
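
Example #3 loads this feature_weight dict via load_dict and scores documents with cal_multiclass_lr_predict. A hedged sketch of how that scoring presumably works (sum each category's word weights over the document, then argmax); the toy weights and usage below are illustrative only.

def cal_multiclass_lr_predict(data_set, feature_weight_dict, id_label):
    pred_labels = []
    num_classes = len(next(iter(feature_weight_dict.values())))
    for line in data_set:
        # accumulate each category's score over the words in the line
        scores = [0.0] * num_classes
        for w in line.split():
            if w in feature_weight_dict:
                for idx, weight in enumerate(feature_weight_dict[w]):
                    scores[idx] += weight
        best = max(range(num_classes), key=lambda i: scores[i])
        pred_labels.append(id_label[best])
    return pred_labels


# toy usage
weights = {'good': [1.2, -0.3], 'bad': [-0.8, 0.9]}
print(cal_multiclass_lr_predict(['good good bad'], weights, {0: 'pos', 1: 'neg'}))
# -> ['pos']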