# Example #1
def train_mentioned_model(train_data, train_segs, validate_data, validate_segs,
                          vectorizer, train_model):
    """Train and validate a binary "mentioned" classifier for one aspect.

    ``train_model`` is a ``(name, start_column, end_column)`` triple.  The
    label columns start..end are collapsed into a single 0/1 flag: when all
    of them equal -2 the item is considered "mentioned" (flag 1), otherwise 0.
    The fitted classifier is saved to disk when its macro-F1 on the
    validation set exceeds 0.8, and is returned either way.
    """
    name, first_col, last_col = train_model
    logger.info("start train %s mentioned", name)
    n_train = config.train_data_size
    # All labels equal to -2 gives an absolute sum of (span * 2); floor
    # division by that value therefore maps "mentioned" rows to 1, others to 0.
    label_span = (last_col - first_col + 1) * 2
    cols = range(first_col, last_col + 1)
    train_label = (train_data.iloc[0:n_train, cols]
                   .T.sum().abs() // label_span)
    logger.debug("begin to train data")
    clf = TextClassifier(vectorizer=vectorizer, class_weight="balanced")
    clf.fit(train_segs, train_label)

    logger.debug("begin to validate %s mentioned model", name)
    # Same label collapse on the full validation frame.
    validate_labels = (validate_data.iloc[0:, cols]
                       .T.sum().abs() // label_span)
    predictions = clf.predict(validate_segs)
    report(validate_labels, predictions)
    score = f1_score(validate_labels, predictions, average="macro")
    logger.info("validate done! %s mentioned model score:%s", name,
                str(score))

    if score > 0.8:
        logger.info("save %s mentioned model", name)
        save_dir = config.model_save_path
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        # NOTE(review): assumes model_save_path ends with a path separator.
        joblib.dump(clf,
                    save_dir + name + "_mentioned.pkl",
                    compress=3)
    return clf
# Example #2
def train_specific_model(train_data):
    """Train one sentiment classifier per label column of *train_data*.

    Segments the training content, loads the shared vectorizer, then fits
    and persists a ``TextClassifier`` for every column except the last one
    (assumed to be the raw text column -- TODO confirm).  Each model's
    macro-F1 on the validation set is logged, along with the mean score.
    """
    columns = train_data.columns.values.tolist()
    logger.debug("begin to seg train content")
    content_segments = seg_words(
        train_data.content.iloc[0:config.train_data_size])
    logger.debug("seg train content done")
    # NOTE(review): assumes model_save_path ends with a path separator.
    vectorizer = joblib.load(config.model_save_path + vec_name)
    logger.debug("load vectorizer")
    validate_data_df = load_data_from_csv(config.validate_data_path)
    validate_segs = seg_words(validate_data_df.content)
    logger.debug("seg validate content")

    # Candidate class-weight grid for the four sentiment labels.  It is
    # loop-invariant, so build it once instead of once per column.
    # NOTE(review): a *list* of weight dicts is passed as class_weight --
    # presumably TextClassifier grid-searches over it; verify against its
    # implementation.
    cw = [{
        -2: a,
        -1: b,
        0: w,
        1: x
    } for a in range(1, 3) for b in range(5, 8) for w in range(8, 12)
          for x in range(5, 8)]

    scores = dict()
    for model_name in columns[:-1]:
        logger.info("begin to train %s model", model_name)
        positive_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
        y_label = train_data[model_name].iloc[0:config.train_data_size]
        positive_clf.fit(content_segments, y_label)

        y_pre = positive_clf.predict(validate_segs)
        y_true = validate_data_df[model_name].iloc[0:]
        report(y_true, y_pre)
        score = f1_score(y_true, y_pre, average="macro")
        logger.info("score for model:%s is %s ", model_name, str(score))
        scores[model_name] = score
        joblib.dump(positive_clf,
                    config.model_save_path + model_name + ".pkl",
                    compress=True)

    # Lazy %-args instead of eager string interpolation in the logging call.
    score = np.mean(list(scores.values()))
    logger.info("f1_scores: %s", score)
                 [train_label6[l] for l in range(len(config.class_group[5]))],
                 Validation_seq, [
                     Validation_label[config.class_group[5][i]]
                     for i in range(len(config.class_group[5]))
                 ])

    # 保存模型
    model1.save('6lstm_1.npy')
    model2.save('6lstm_2.npy')
    model3.save('6lstm_3.npy')
    model4.save('6lstm_4.npy')
    model5.save('6lstm_5.npy')
    model6.save('6lstm_6.npy')

    # 评估模型
    valid_pred1 = model1.predict(Validation_seq)
    valid_pred2 = model2.predict(Validation_seq)
    valid_pred3 = model3.predict(Validation_seq)
    valid_pred4 = model4.predict(Validation_seq)
    valid_pred5 = model5.predict(Validation_seq)
    valid_pred6 = model6.predict(Validation_seq)
    valid_pred = np.concatenate((valid_pred1, valid_pred2, valid_pred3,
                                 valid_pred4, valid_pred5, valid_pred6),
                                axis=0)

    y_pred = [np.argmax(valid_pred[i], axis=1) for i in range(20)]
    y_true = [np.argmax(Validation_label[i], axis=1) for i in range(20)]
    f1 = [
        metrics.f1_score(y_true[i], y_pred[i], average='micro')
        for i in range(len(valid_pred))
    ]