Example #1
def predict():
    prefix = 'test'
    hist_pred_n = prefix + "hist_pred.jpeg"

    # Loading from .pkl files
    pkl_hnd = store(app.config['static_path'], app.root_path)
    clf = pkl_hnd.load('model')
    n_labels = pkl_hnd.load('n_labels')
    enc = pkl_hnd.load('enc')

    # Feature extraction
    data = utils.file_parser_test(
        os.path.join(app.config['upload_path'], "test.txt"))
    features = utils.feature_extractor(data['text'], 5000)

    # Preprocessing features
    data_x = utils.preprocess_features(features, 2500)

    # Predicting
    pr = predict_model(data_x)
    pred_enc = pr.predict_model(clf)

    # Decoding the encoded prediction
    pred = utils.label_encoder(pred_enc, True, enc)
    # Saving predicted values and data into a .csv file
    pkl_hnd.save_pred(data_x, pred)

    # Plotting histogram of prediction
    pkl_hnd.plot_hist(pred, hist_pred_n)

    return render_template(
        "predict_result.html",
        img_hist_pred=url_for(app.config['static_path'], filename=hist_pred_n),
    )
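
Example #1 calls utils.label_encoder with a boolean flag and a pickled encoder; Example #6 below fits and stores that encoder with the same helper. The helper itself is not shown (and the DataFrame-level utils.label_encoder in Example #3 is a different function despite the name), but a minimal sketch consistent with both call sites, assuming it wraps scikit-learn's LabelEncoder, could be:

from sklearn.preprocessing import LabelEncoder

def label_encoder(labels, inverse, enc):
    if inverse:
        # decode integer predictions back to the original label strings
        return enc.inverse_transform(labels)
    # fit a fresh encoder and return the encoded labels plus the encoder itself
    enc = LabelEncoder()
    return enc.fit_transform(labels), enc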
Example #2
def main():
    args = params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])

    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    # category labels: sports, health, military, education, cars
    tag2id = {"体育": 0, "健康": 1, "军事": 2, "教育": 3, "汽车": 4}
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]
    # load data
    data, label = load_data(args["data_file"], tag2id)
    logger.info("total data size: {}".format(len(data)))
    logger.info("total label size: {}".format(len(label)))
    # shuffle the data randomly
    data, label = random_shuffle(data, label)
    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    total_label = label_encoder(label, len(tag2id))

    # get train test data
    train_data, dev_data, train_label, dev_label = train_test_split(
        data, total_label, test_size=0.2)
    logger.info("train data size: {}".format(len(train_data)))
    logger.info("dev data size: {}".format(len(dev_data)))
    # bert tokenizer
    tokenizer = get_tokenizer()
    # tokenizer = get_roberta_tokenizer()
    # prepare the model inputs
    train_x, train_y = create_inputs_targets(train_data, train_label, max_len,
                                             tokenizer)
    dev_x, dev_y = create_inputs_targets(dev_data, dev_label, max_len,
                                         tokenizer)

    # create model bert
    # model = create_model(len(tag2id))
    model = create_model(args["bert_model_name"], len(tag2id))
    # model.summary()
    model.fit(train_x,
              train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size)  # , validation_split=0.1

    # model save
    model_path = os.path.join(args["output_path"], "classification_model.h5")
    model.save_weights(model_path, overwrite=True)

    # save pb model
    tf.keras.models.save_model(model,
                               args["pb_path"],
                               save_format="tf",
                               overwrite=True)
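
In Example #2, label_encoder receives the raw labels plus len(tag2id), which suggests it one-hot encodes integer class ids for the classifier head. A minimal sketch under that assumption:

import numpy as np

def label_encoder(labels, num_classes):
    # e.g. label_encoder([0, 2, 1], 5) returns a (3, 5) one-hot matrix
    one_hot = np.zeros((len(labels), num_classes), dtype=np.float32)
    one_hot[np.arange(len(labels)), labels] = 1.0
    return one_hot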
Example #3
def get_train_test(path, num_rows=None):
    """Preprocess and extract features from application train and test files.

    Both files are combined in a single Dataframe for preprocessing, aggregation
    and feature engineering. This approach is NOT recommended on real-world models,
    however it improves the score in this competition since we can consider
    the test dataset features distribution.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to load; None to read all (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    train = pd.read_csv(os.path.join(path, 'application_train.csv'), nrows=num_rows)
    test = pd.read_csv(os.path.join(path, 'application_test.csv'), nrows=num_rows)
    df = pd.concat([train, test])  # check function notes; DataFrame.append was removed in pandas 2.0
    del train, test
    gc.collect()

    # Data cleaning
    df = df[df['CODE_GENDER'] != 'XNA']  # 4 people with XNA code gender
    df = df[df['AMT_INCOME_TOTAL'] < 20000000]  # Max income in test is 4M
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)

    # Flag_document features - count and kurtosis
    docs = [f for f in df.columns if 'FLAG_DOC' in f]
    df['DOCUMENT_COUNT'] = df[docs].sum(axis=1)
    df['NEW_DOC_KURT'] = df[docs].kurtosis(axis=1)
    # Categorical age - based on target plot
    df['AGE_RANGE'] = df['DAYS_BIRTH'].apply(lambda x: _get_age_label(x, [27, 40, 50, 65, 99]))

    # New features based on External sources
    df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['EXT_SOURCES_WEIGHTED'] = df.EXT_SOURCE_1 * 2 + df.EXT_SOURCE_2 * 1 + df.EXT_SOURCE_3 * 3
    # np.warnings was removed in NumPy 1.24; use the warnings module directly
    warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
    for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
        feature_name = 'EXT_SOURCES_{}'.format(function_name.upper())
        df[feature_name] = getattr(np, function_name)(
            df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)

    # Credit ratios
    df['CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    # Income ratios
    df['ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['INCOME_TO_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']
    df['INCOME_TO_BIRTH_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_BIRTH']
    # Time ratios
    df['EMPLOYED_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['ID_TO_BIRTH_RATIO'] = df['DAYS_ID_PUBLISH'] / df['DAYS_BIRTH']
    df['CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['CAR_TO_EMPLOYED_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']

    # Groupby 1: Statistics for applications with the same education, occupation and age range
    group = ['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_RANGE']
    df = utils.do_median(df, group, 'EXT_SOURCES_MEAN', 'GROUP1_EXT_SOURCES_MEDIAN')
    df = utils.do_std(df, group, 'EXT_SOURCES_MEAN', 'GROUP1_EXT_SOURCES_STD')
    df = utils.do_median(df, group, 'AMT_INCOME_TOTAL', 'GROUP1_INCOME_MEDIAN')
    df = utils.do_std(df, group, 'AMT_INCOME_TOTAL', 'GROUP1_INCOME_STD')
    df = utils.do_median(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP1_CREDIT_TO_ANNUITY_MEDIAN')
    df = utils.do_std(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP1_CREDIT_TO_ANNUITY_STD')
    df = utils.do_median(df, group, 'AMT_CREDIT', 'GROUP1_CREDIT_MEDIAN')
    df = utils.do_std(df, group, 'AMT_CREDIT', 'GROUP1_CREDIT_STD')
    df = utils.do_median(df, group, 'AMT_ANNUITY', 'GROUP1_ANNUITY_MEDIAN')
    df = utils.do_std(df, group, 'AMT_ANNUITY', 'GROUP1_ANNUITY_STD')

    # Groupby 2: Statistics for applications with the same credit duration, income type and education
    df['CREDIT_TO_ANNUITY_GROUP'] = df['CREDIT_TO_ANNUITY_RATIO'].apply(lambda x: _group_credit_to_annuity(x))
    group = ['CREDIT_TO_ANNUITY_GROUP', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE']
    df = utils.do_median(df, group, 'EXT_SOURCES_MEAN', 'GROUP2_EXT_SOURCES_MEDIAN')
    df = utils.do_std(df, group, 'EXT_SOURCES_MEAN', 'GROUP2_EXT_SOURCES_STD')
    df = utils.do_median(df, group, 'AMT_INCOME_TOTAL', 'GROUP2_INCOME_MEDIAN')
    df = utils.do_std(df, group, 'AMT_INCOME_TOTAL', 'GROUP2_INCOME_STD')
    df = utils.do_median(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP2_CREDIT_TO_ANNUITY_MEDIAN')
    df = utils.do_std(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP2_CREDIT_TO_ANNUITY_STD')
    df = utils.do_median(df, group, 'AMT_CREDIT', 'GROUP2_CREDIT_MEDIAN')
    df = utils.do_std(df, group, 'AMT_CREDIT', 'GROUP2_CREDIT_STD')
    df = utils.do_median(df, group, 'AMT_ANNUITY', 'GROUP2_ANNUITY_MEDIAN')
    df = utils.do_std(df, group, 'AMT_ANNUITY', 'GROUP2_ANNUITY_STD')

    # Encode categorical features (LabelEncoder)
    df, _ = utils.label_encoder(df, None)
    # Drop some features
    df = _drop_application_columns(df)
    return df
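
The docstring flags the train/test concatenation as a competition-specific shortcut; the caller (not shown here) presumably separates the two frames again via the TARGET column, which only application_train.csv carries. A sketch of that split, under that assumption:

def split_back(df):
    # rows that came from application_test.csv have NaN TARGET after the concat
    train = df[df['TARGET'].notnull()].copy()
    test = df[df['TARGET'].isnull()].drop(columns=['TARGET'])
    return train, test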
Example #4
def main():
    # This snippet is truncated in the source: `args`, `X`, and `d` are
    # assumed to be initialized roughly as below.
    args = get_args()  # hypothetical parser exposing inputdir, dims, testsize
    X = []
    Y = []
    d = {}
    for infile in glob.glob(args.inputdir + '/*/*/*'):
        instance = os.path.split(os.path.dirname(infile))[-1]
        with open(infile, 'r') as f:
            review_file = f.read()
        X.append(review_file)
        Y.append(instance)
        if instance not in d:
            d[instance] = []
        d[instance].append(review_file)
    X, _ = read_text(X)
    df = pd.DataFrame(X)
    df = df.fillna(0)
    original_author_names = Y.copy()

    Y = label_encoder(Y)
    # Do what you need to read the documents here.

    print("Constructing table with {} feature dimensions and {}% test instances...".format(args.dims, args.testsize))
    # Build the table here.
    X = reduce_dim(df, args.dims)

    train_X, test_X, train_Y, test_Y, tag = shuffle_split(X, Y, test_split=args.testsize)
    train_X = pd.DataFrame(train_X)
    test_X = pd.DataFrame(test_X)
    train_Y = pd.DataFrame(train_Y)
    test_Y = pd.DataFrame(test_Y)
    full_dataset_X = pd.concat([train_X, test_X])
    full_dataset_Y = pd.concat([train_Y, test_Y])
    full_dataset_Y = full_dataset_Y.rename(columns={0: "labels"})
    combined_X_Y = pd.concat([full_dataset_X, full_dataset_Y], axis=1)
Example #5
def main():
    args = model_params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])

    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]
    # load data
    train_data, train_label_ori, tag2id, train_len = load_data(
        args["train_file"])
    print("train data size: ", len(train_data))
    print("train label size: ", len(train_label_ori))
    print("label dict: ", tag2id)
    dev_data, dev_label_ori, _, dev_len = load_data(args["dev_file"])  # keep the tag2id built from the training set
    print("dev data size: ", len(dev_data))
    print("dev label size: ", len(dev_label_ori))
    print("label dict: ", tag2id)
    # load test data

    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    train_label = label_encoder(train_label_ori, tag2id)
    print("train label: ", train_label[:3])
    dev_label = label_encoder(dev_label_ori, tag2id)
    print("dev label: ", dev_label[:3])
    # get tokenizer
    # bert tokenizer
    tokenizer = get_tokenizer(args["pretrain_model_path"])
    # tokenizer = get_roberta_tokenizer()
    # prepare the model inputs
    train_x, train_y = create_inputs_targets(train_data, train_label, tag2id,
                                             max_len, tokenizer)
    dev_x, dev_y = create_inputs_targets(dev_data, dev_label, tag2id, max_len,
                                         tokenizer)

    # create model bert
    model = create_model(args["pretrain_model_path"], len(tag2id),
                         args["dropout"])
    model.summary()
    model.fit(train_x,
              train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size)  # , validation_split=0.1

    # model save
    model_file = os.path.join(args["output_path"], "ner_model.h5")
    model.save_weights(model_file, overwrite=True)

    # save pb model
    tf.keras.models.save_model(model, args["pb_path"], save_format="tf")

    # model evaluation
    precision, recall, f1 = model_evaluate(model, dev_x, dev_label_ori, tag2id,
                                           batch_size, dev_len)
    logger.info("model precision:{} recall:{} f1:{}".format(
        precision, recall, f1))
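
In this example label_encoder maps per-token NER tags to ids through the tag2id dict (Example #8 calls the same helper). A plausible sketch, assuming one tag sequence per sentence:

def label_encoder(label_seqs, tag2id):
    # e.g. label_encoder([["B-PER", "O"]], {"B-PER": 0, "O": 1}) -> [[0, 1]]
    return [[tag2id[tag] for tag in seq] for seq in label_seqs]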
Example #6
def train():

    clf = request.form['train']
    if allowed_classifier(clf):
        prefix = 'train'
        hist_n = prefix + "hist.jpeg"
        cnmt_n = prefix + "cnmt.jpeg"
        pkl_hnd = store(app.config['static_path'], app.root_path)

        # Feature extraction
        data = utils.file_parser(
            os.path.join(app.config['upload_path'], "data.txt"))
        features = utils.feature_extractor(data['text'], 5000).todense()
        sh = data.shape

        # Preprocessing features and labels
        data_x = utils.preprocess_features(features, 2500)
        data_y, enc = utils.label_encoder(data['label'], False, None)
        pkl_hnd.dump(enc, 'enc')  # storing encoder

        # Splitting data into training set and validation set
        train_x, train_y, valid_x, valid_y = utils.train_valid(
            data_x, data_y, 0.2)

        # Balancing data with SMOTE
        text, label = utils.balance_data(train_x, train_y)

        # Selecting model and tuning hyperparameters
        tr = model(clf, text[:sh[0], :], label[:sh[0]], valid_x, valid_y)
        comb_mod = tr.model_selection()

        # Fitting model and predicting
        mod = tr.build_model(comb_mod)
        pkl_hnd.dump(mod, 'model')  # storing the model
        pr = predict_model(valid_x)
        pred = pr.predict_model(mod)

        # Training statistics
        st = stats(pred, valid_y)
        acc, f1 = st.train_stats()

        # Plotting histogram and confusion matrix
        pkl_hnd.plot_hist(data['label'], hist_n)
        n_labels = np.unique(np.asarray(data['label']))
        pkl_hnd.dump(n_labels, 'n_labels')  # storing labels
        cnf_matrix = st.cnf_mtx()
        pkl_hnd.plot_confusion_matrix(
            cnf_matrix,
            n_labels,
            cnmt_n,
            normalize=True,
            title='Confusion matrix',
            cmap=plt.cm.Blues,
        )

        return render_template("train_result.html",
                               accuracy=acc,
                               img_hist=url_for(app.config['static_path'],
                                                filename=hist_n),
                               img_cfmt=url_for(app.config['static_path'],
                                                filename=cnmt_n),
                               f1=f1)
    else:
        flash('Please enter a valid classifier')
        return redirect(url_for('index'))
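
Note that train() pickles the fitted encoder ('enc'), the model ('model'), and the label set ('n_labels') that predict() in Example #1 reloads, so both routes share a single label encoding.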
Example #7
def main():
    args = get_args()
    train_df = pd.read_csv(args["train_file"])
    train_df = shuffle(train_df)
    train_datas = train_df["content"].tolist()

    train_label_total = train_df["label"].unique().tolist()
    print("total data size: {}".format(len(train_datas)))
    # get label dict
    if not os.path.exists(args["labeldict"]):
        # build the label list from the training labels and persist it
        label_list = []
        for label in train_label_total:
            if "|" in label:
                for item in label.split("|"):
                    if item not in label_list:
                        label_list.append(item)
            else:
                if label not in label_list:
                    label_list.append(label)
        print("label cate size: {}".format(len(label_list)))
        label_dict = {"label": label_list}
        with open(args["labeldict"], "w", encoding="utf-8") as f:
            f.write(json.dumps(label_dict, ensure_ascii=False, indent=4))
    else:
        label_list = read_dict(args["labeldict"])["label"]

    # label encoder
    train_labels = label_encoder(train_df["label"].tolist(), label_list)

    train_data, val_data, train_label, val_label = train_test_split(
        train_datas, train_labels, test_size=0.2, random_state=0)
    print("train data size: {}".format(len(train_data)))
    print("val data size: {}".format(len(val_data)))

    tokenizer = get_tokenizer(args["bert_model_name"],
                              args["pretrain_model_path"])

    train_x, train_y = get_model_data(train_data, train_label, tokenizer,
                                      args["max_length"])

    val_x, val_y = get_model_data(val_data, val_label, tokenizer,
                                  args["max_length"])
    model = create_model(args["bert_model_name"], len(label_list))

    if not os.path.exists(args["model_path"]):
        os.makedirs(args["model_path"])

    if not os.path.exists(args["pbmodel_path"]):
        os.makedirs(args["pbmodel_path"])

    # save the best model during training; the checkpoint is written as a pb (SavedModel)
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            # Path where to save the model
            # The two parameters below mean that we will overwrite
            # the current checkpoint if and only if
            # the `val_loss` score has improved.
            # The saved model name will include the current epoch.
            filepath=args["model_path"],  # {epoch}
            save_best_only=True,  # Only save a model if `val_loss` has improved.
            monitor='val_auc',  # 'accuracy',
            verbose=1,
            mode='max')
    ]

    model.fit(train_x,
              train_y,
              epochs=args["epoch"],
              verbose=1,
              batch_size=args["batch_size"],
              callbacks=callbacks,
              validation_data=(val_x, val_y),
              validation_batch_size=args["batch_size"])

    model_path = os.path.join("./output/model/", "mulclassifition.h5")
    model.save_weights(model_path)

    tf.keras.models.save_model(model,
                               args["pbmodel_path"],
                               save_format="tf",
                               overwrite=True)
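
Example #7 splits '|'-separated label strings and encodes them against label_list, and its checkpoint monitors val_auc, both of which point to multi-hot targets. A sketch of such a label_encoder, assuming exactly that:

import numpy as np

def label_encoder(labels, label_list):
    # multi-hot encode '|'-separated label strings against label_list
    index = {name: i for i, name in enumerate(label_list)}
    y = np.zeros((len(labels), len(label_list)), dtype=np.float32)
    for row, label in enumerate(labels):
        for item in label.split("|"):
            y[row, index[item]] = 1.0
    return y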
Example #8
def main():
    args = model_params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])

    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]
    # load data
    train_data, train_label_ori, tag2id, train_len = load_data(args["train_file"])
    print("train data size: ", len(train_data))
    print("train label size: ", len(train_label_ori))
    print("label dict: ", tag2id)
    dev_data, dev_label_ori, _, dev_len = load_data(args["dev_file"])
    print("dev data size: ", len(dev_data))
    print("dev label size: ", len(dev_label_ori))

    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    train_label = label_encoder(train_label_ori, tag2id)
    print("train label: ", train_label[:3])
    dev_label = label_encoder(dev_label_ori, tag2id)
    print("dev label: ", dev_label[:3])
    # get tokenizer
    tokenizer = get_tokenizer(args["pretrain_model_path"])
    # tokenizer = get_roberta_tokenizer()
    # prepare the model inputs
    train_x, train_y = create_inputs_targets_roberta(train_data, train_label,
                                                     tag2id, max_len, tokenizer)
    dev_x, dev_y = create_inputs_targets_roberta(dev_data, dev_label,
                                                 tag2id, max_len, tokenizer)

    # create model bert
    model = TFBertForTokenClassification.from_pretrained(args["pretrain_model_path"],
                                                         from_pt=True,
                                                         num_labels=len(tag2id))
    # optimizer Adam
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-08)
    # the labels are not one-hot vectors, so sparse categorical cross-entropy and accuracy apply
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    model.summary()
    model.fit(train_x,
              train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size)  # , validation_split=0.1

    # model save
    model_file = os.path.join(args["output_path"], "ner_model.h5")
    model.save_weights(model_file, overwrite=True)

    # save pb model
    tf.keras.models.save_model(model, args["pb_path"], save_format="tf")

    # model evaluation
    precision, recall, f1 = model_evaluate_roberta(model, dev_x, dev_label_ori,
                                                   tag2id, batch_size, dev_len)
    logger.info("model precision:{} recall:{} f1:{}".format(precision, recall, f1))
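
Both NER examples export a SavedModel directory alongside the .h5 weights; reloading it for inference is a one-liner. A minimal sketch (the path is hypothetical, standing in for args["pb_path"]):

import tensorflow as tf

# reload the exported SavedModel directory for inference
model = tf.keras.models.load_model("output/pb_model")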