Example #1
# Shared context for the snippets on this page (assumptions: they are excerpts
# from main.py in the ifuding/TC project, so FLAGS, models_eval, nfold_train,
# and the data loaders come from that module's surroundings).
import os
import pickle
import shutil
import time

import numpy as np
import pandas as pd

def sub(models, stacking_data = None, stacking_label = None, stacking_test_data = None, test = None, \
        scores_text = None, tid = None, sub_re = None, col = None, leak_target = None, aug_data_target = None):
    tmp_model_dir = "./model_dir/"
    if not os.path.isdir(tmp_model_dir):
        os.makedirs(tmp_model_dir, exist_ok=True)
    if FLAGS.stacking:
        np.save(os.path.join(tmp_model_dir, "stacking_train_data.npy"), stacking_data)
        np.save(os.path.join(tmp_model_dir, "stacking_train_label.npy"), stacking_label)
        np.save(os.path.join(tmp_model_dir, "stacking_test_data.npy"), stacking_test_data)
    elif FLAGS.model_type == 'v':
        np.save(os.path.join(tmp_model_dir, "vae_data.npy"), stacking_data)
    else:
        # if FLAGS.load_stacking_data:
        #     sub2[coly] = sub_re
        # else:
        sub_re = pd.DataFrame(models_eval(models, test), columns=["target"], index=tid)
        sub_re["target"] = np.expm1(sub_re["target"].values)  # invert the log1p presumably applied to the target at train time
        # sub_re["target"][leak_target.index] = leak_target
        # blend = sub2 #blend[sub2.columns]
        if FLAGS.predict_feature:
            time_label = "_" + col + time.strftime('_%Y_%m_%d_%H', time.gmtime())
            sub_name = tmp_model_dir + time_label + ".csv"
        elif FLAGS.aug_data:
            time_label = "_" + aug_data_target + time.strftime('_%Y_%m_%d_%H', time.gmtime())
            sub_name = tmp_model_dir + time_label + ".csv"
        else:
            time_label = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
            sub_name = tmp_model_dir + "sub" + time_label + ".csv"
        sub_re.to_csv(sub_name)

        # save model to file
        for i, model in enumerate(models):
            if (model[1] == 'l'):
                model_name = tmp_model_dir + "model_" + str(i) + time_label + ".txt"
                model[0].save_model(model_name)
            elif (model[1] == 'k' or model[1] == 'r'):
                model_name = tmp_model_dir + "model_" + str(i) + time_label + ".h5"
                model[0].model.save(model_name)

        scores_text_frame = pd.DataFrame(scores_text, columns = ["score_text"])
        score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
        scores_text_frame.to_csv(score_text_file, index=False)
        scores = scores_text_frame["score_text"]
        for i in range(FLAGS.epochs):
            # trailing space so 'epoch:1' does not also match 'epoch:10' and up
            scores_epoch = scores.loc[scores.str.startswith('epoch:{0} '.format(i + 1))].map(lambda s: float(s.split()[1]))
            print("Epoch{0} mean:{1} std:{2} min:{3} max:{4} median:{5}".format(i + 1, \
                scores_epoch.mean(), scores_epoch.std(), scores_epoch.min(), scores_epoch.max(), scores_epoch.median()))

    if not os.path.isdir(FLAGS.output_model_path):
        os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for fileName in os.listdir(tmp_model_dir):
        dst_file = os.path.join(FLAGS.output_model_path, fileName)
        if os.path.exists(dst_file):
            os.remove(dst_file)
        shutil.move(os.path.join(tmp_model_dir, fileName), FLAGS.output_model_path)
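
Aside: the epoch-score bookkeeping at the end of this example is easiest to see
with synthetic data. A minimal sketch, assuming score lines of the form
"epoch:N value" as the parsing code implies (the real strings come from
nfold_train and are not shown here):

import pandas as pd

scores = pd.Series(["epoch:1 0.482", "epoch:1 0.479", "epoch:2 0.455", "epoch:2 0.451"])
for i in range(2):
    per_epoch = scores.loc[scores.str.startswith('epoch:{0} '.format(i + 1))].map(lambda s: float(s.split()[1]))
    print("Epoch{0} mean:{1:.4f} std:{2:.4f}".format(i + 1, per_epoch.mean(), per_epoch.std()))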
Example #2
File: main.py Project: ifuding/TC
def sub(models, stacking_data = None, stacking_label = None, stacking_test_data = None, test = None, \
        scores_text = None, tid = None, sub_re = None, col = None, leak_target = None, aug_data_target = None):
    tmp_model_dir = "./model_dir/"
    if not os.path.isdir(tmp_model_dir):
        os.makedirs(tmp_model_dir, exist_ok=True)
    if FLAGS.stacking:
        np.save(os.path.join(tmp_model_dir, "stacking_train_data.npy"),
                stacking_data)
        np.save(os.path.join(tmp_model_dir, "stacking_train_label.npy"),
                stacking_label)
        np.save(os.path.join(tmp_model_dir, "stacking_test_data.npy"),
                stacking_test_data)
    elif FLAGS.model_type == 'v':
        np.save(os.path.join(tmp_model_dir, "vae_data.npy"), stacking_data)
    else:
        sub_re = pd.DataFrame(models_eval(models, test), index=tid)
        time_label = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
        sub_name = tmp_model_dir + "sub" + time_label + ".csv"
        sub_re.to_csv(sub_name)

        # save model to file
        for i, model in enumerate(models):
            if (model[1] == 'l'):
                model_name = tmp_model_dir + "model_" + str(i) + time_label + ".txt"
                model[0].save_model(model_name)
            elif (model[1] == 'k' or model[1] == 'r'):
                model_name = tmp_model_dir + "model_" + str(i) + time_label + ".h5"
                model[0].save(model_name)

        # scores_text_frame = pd.DataFrame(scores_text, columns = ["score_text"])
        score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
        scores_text_df = pd.concat(scores_text)
        scores_text_df.groupby(scores_text_df.index).agg(
            ['max', 'min', 'mean', 'median', 'std']).T.to_csv(score_text_file, index=True)
        # scores = scores_text_frame["score_text"]
        # for i in range(FLAGS.epochs):
        #     scores_epoch = scores.loc[scores.str.startswith('epoch:{0}'.format(i + 1))].map(lambda s: float(s.split()[1]))
        #     print ("Epoch{0} mean:{1} std:{2} min:{3} max:{4} median:{5}".format(i + 1, \
        #         scores_epoch.mean(), scores_epoch.std(), scores_epoch.min(), scores_epoch.max(), scores_epoch.median()))

    if not os.path.isdir(FLAGS.output_model_path):
        os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for fileName in os.listdir(tmp_model_dir):
        dst_file = os.path.join(FLAGS.output_model_path, fileName)
        if os.path.exists(dst_file):
            os.remove(dst_file)
        shutil.move(os.path.join(tmp_model_dir, fileName),
                    FLAGS.output_model_path)
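
Aside: unlike Example #1, this version aggregates structured per-fold scores. A
minimal sketch of the pd.concat/groupby/agg pipeline with toy data, assuming
scores_text is a list of one pd.Series per fold indexed by metric name (the
shape the code implies):

import pandas as pd

scores_text = [
    pd.Series({'auc': 0.91, 'logloss': 0.34}),  # fold 1
    pd.Series({'auc': 0.93, 'logloss': 0.31}),  # fold 2
]
df = pd.concat(scores_text)  # duplicated index: one entry per fold per metric
print(df.groupby(df.index).agg(['max', 'min', 'mean', 'median', 'std']).T)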
Example #3
def sub(models, stacking_data = None, stacking_label = None, stacking_test_data = None, test = None, \
        scores_text = None, tid = None, sub_re = None):
    tmp_model_dir = "./model_dir/"
    if not os.path.isdir(tmp_model_dir):
        os.makedirs(tmp_model_dir, exist_ok=True)
    if False:  # was: FLAGS.stacking
        np.save(os.path.join(tmp_model_dir, "stacking_train_data.npy"),
                stacking_data)
        np.save(os.path.join(tmp_model_dir, "stacking_train_label.npy"),
                stacking_label)
        np.save(os.path.join(tmp_model_dir, "stacking_test_data.npy"),
                stacking_test_data)
    else:
        # if FLAGS.load_stacking_data:
        #     sub2[coly] = sub_re
        # else:
        sub_re = pd.DataFrame(tid, columns=['click_id'])
        sub_re['is_attributed'] = models_eval(models, test)
        # sub2[c] = sub2[c].clip(0 + 1e-12, 1 - 1e-12)
        # blend = sub2 #blend[sub2.columns]
        time_label = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
        sub_name = tmp_model_dir + "sub" + time_label + ".csv"
        sub_re.to_csv(sub_name, index=False)

        # save model to file
        if (models[0][1] == 'l'):
            model_name = tmp_model_dir + "model" + ".txt"
            models[0][0].save_model(model_name)
        elif (models[0][1] == 'k'):
            model_name = tmp_model_dir + "model" + ".h5"
            models[0][0].model.save(model_name)

        scores_text_frame = pd.DataFrame(scores_text, columns=["score_text"])
        score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
        scores_text_frame.to_csv(score_text_file, index=False)
        scores = scores_text_frame["score_text"]
        for i in range(FLAGS.epochs):
            # trailing space so 'epoch:1' does not also match 'epoch:10' and up
            scores_epoch = scores.loc[scores.str.startswith('epoch:{0} '.format(i + 1))].map(lambda s: float(s.split()[1]))
            print("Epoch{0} mean:{1} std:{2} min:{3} max:{4} median:{5}".format(i + 1, \
                scores_epoch.mean(), scores_epoch.std(), scores_epoch.min(), scores_epoch.max(), scores_epoch.median()))

    if not os.path.isdir(FLAGS.output_model_path):
        os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for fileName in os.listdir(tmp_model_dir):
        dst_file = os.path.join(FLAGS.output_model_path, fileName)
        if os.path.exists(dst_file):
            os.remove(dst_file)
        shutil.move(os.path.join(tmp_model_dir, fileName),
                    FLAGS.output_model_path)
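
Aside: the files saved above reload with the matching loaders. A hedged sketch
(paths are illustrative; the project wraps its Keras models, hence
model[0].model.save above, and may use standalone keras rather than
tensorflow.keras):

import lightgbm as lgb
from tensorflow.keras.models import load_model

bst = lgb.Booster(model_file="./model_dir/model.txt")  # LightGBM text dump written by save_model
net = load_model("./model_dir/model.h5")               # Keras HDF5 written by model.save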
Example #4
def sub(models, stacking_data = None, stacking_label = None, stacking_test_data = None, test = None, \
        scores_text = None, coly = None, tid = None, sub_re = None):
    tmp_model_dir = "./model_dir/"
    if not os.path.isdir(tmp_model_dir):
        os.makedirs(tmp_model_dir, exist_ok=True)
    if FLAGS.stacking:
        np.save(os.path.join(tmp_model_dir, "stacking_train_data.npy"),
                stacking_data)
        np.save(os.path.join(tmp_model_dir, "stacking_train_label.npy"),
                stacking_label)
        np.save(os.path.join(tmp_model_dir, "stacking_test_data.npy"),
                stacking_test_data)
    else:
        sub2 = pd.DataFrame(np.zeros((test.shape[0], len(coly))), columns=coly)
        if FLAGS.load_stacking_data:
            sub2[coly] = sub_re
        else:
            sub2[coly] = models_eval(models, test)
        sub2['id'] = tid
        for c in coly:
            sub2[c] = sub2[c].clip(0 + 1e-12, 1 - 1e-12)  # keep probabilities an epsilon away from 0 and 1
        blend = sub2  #blend[sub2.columns]
        time_label = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())  # qualified with time. for consistency with the other snippets
        sub_name = tmp_model_dir + "sub" + time_label + ".csv"
        blend.to_csv(sub_name, index=False)

        scores_text_frame = pd.DataFrame(scores_text, columns=["score_text"])
        score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
        scores_text_frame.to_csv(score_text_file, index=False)
        scores = scores_text_frame["score_text"]
        for i in range(FLAGS.epochs):
            # trailing space so 'epoch:1' does not also match 'epoch:10' and up
            scores_epoch = scores.loc[scores.str.startswith('epoch:{0} '.format(i + 1))].map(lambda s: float(s.split()[1]))
            print("Epoch{0} mean:{1} std:{2} min:{3} max:{4} median:{5}".format(i + 1, \
                scores_epoch.mean(), scores_epoch.std(), scores_epoch.min(), scores_epoch.max(), scores_epoch.median()))

    if not os.path.isdir(FLAGS.output_model_path):
        os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for fileName in os.listdir(tmp_model_dir):
        dst_file = os.path.join(FLAGS.output_model_path, fileName)
        if os.path.exists(dst_file):
            os.remove(dst_file)
        shutil.move(os.path.join(tmp_model_dir, fileName),
                    FLAGS.output_model_path)
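
Aside: the clip bounds fixed above are the standard log-loss guard: predicted
probabilities are pulled a tiny epsilon away from exact 0 and 1 so that log(p)
and log(1 - p) stay finite. Sketch:

import numpy as np

eps = 1e-12
p = np.array([0.0, 0.5, 1.0])
print(np.clip(p, 0 + eps, 1 - eps))  # [1e-12, 0.5, 1 - 1e-12]; the last entry prints as 1.0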
Example #5
def train_sub(col):
    scores_text = []
    aug_data_target = None
    if FLAGS.aug_data:
        aug_data_target = 'pred_nz_min'
        train_data, train_label, test_data, tid, valide_data, valide_label, weight, leak_target = LoadAugDdata(aug_data_target)
    else:
        train_data, train_label, test_data, tid, valide_data, valide_label, weight, leak_target = load_data(col)
    if not FLAGS.load_stacking_data:
        models, stacking_data, stacking_label, stacking_test_data = nfold_train(train_data, train_label, flags = FLAGS, \
                model_types = [FLAGS.model_type], scores = scores_text, test_data = test_data, \
                valide_data = valide_data, valide_label = valide_label, cat_max = None, emb_weight = None, leak_target = leak_target)
    else:
        # sub_re and emb_weight were used below without being defined; give them
        # safe defaults here (assuming test_data is an ndarray in this branch)
        emb_weight = None
        sub_re = np.zeros((test_data.shape[0], train_label.shape[1]))
        for i in range(train_label.shape[1]):
            models, stacking_data, stacking_label, stacking_test_data = nfold_train(train_data, train_label[:, i], flags = FLAGS, \
                model_types = [FLAGS.model_type], scores = scores_text, emb_weight = emb_weight, test_data = test_data \
                # , valide_data = train_data[:100], valide_label = train_label[:100, i]
                )
            sub_re[:, i] = models_eval(models, test_data)
    sub(models, stacking_data = stacking_data, stacking_label = stacking_label, stacking_test_data = stacking_test_data, \
        test = test_data, scores_text = scores_text, tid = tid, col = col, leak_target = leak_target, aug_data_target = aug_data_target)
Example #6
    if not os.path.isdir(FLAGS.output_model_path):
        os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for fileName in os.listdir(tmp_model_dir):
        dst_file = os.path.join(FLAGS.output_model_path, fileName)
        if os.path.exists(dst_file):
            os.remove(dst_file)
        shutil.move(os.path.join(tmp_model_dir, fileName),
                    FLAGS.output_model_path)


if __name__ == "__main__":
    scores_text = []
    train_data, train_label, test_data, tid, valide_data, valide_label, weight = load_data()
    if not FLAGS.load_only_singleCnt and FLAGS.model_type == 'k':
        test_data = list(test_data.transpose())
    if not FLAGS.load_stacking_data:
        models, stacking_data, stacking_label, stacking_test_data = nfold_train(train_data, train_label, flags = FLAGS, \
                model_types = [FLAGS.model_type], scores = scores_text, test_data = test_data, \
                valide_data = valide_data, valide_label = valide_label)
    else:
        # as in Example #5, sub_re and emb_weight get safe defaults here; the
        # excerpt used them without defining them (test_data assumed ndarray)
        emb_weight = None
        sub_re = np.zeros((test_data.shape[0], train_label.shape[1]))
        for i in range(train_label.shape[1]):
            models, stacking_data, stacking_label, stacking_test_data = nfold_train(train_data, train_label[:, i], flags = FLAGS, \
                model_types = [FLAGS.model_type], scores = scores_text, emb_weight = emb_weight, test_data = test_data \
                # , valide_data = train_data[:100], valide_label = train_label[:100, i]
                )
            sub_re[:, i] = models_eval(models, test_data)
    sub(models, stacking_data = stacking_data, stacking_label = stacking_label, stacking_test_data = stacking_test_data, \
            test = test_data, scores_text = scores_text, tid = tid)
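
Aside: the loop that closes every example implements move-with-overwrite:
shutil.move raises shutil.Error when the destination file already exists inside
a destination directory, so existing targets are removed first. A
self-contained sketch with temp dirs:

import os
import shutil
import tempfile

src, dst = tempfile.mkdtemp(), tempfile.mkdtemp()
open(os.path.join(src, "a.txt"), "w").close()
open(os.path.join(dst, "a.txt"), "w").close()  # pre-existing file at the destination
for name in os.listdir(src):
    target = os.path.join(dst, name)
    if os.path.exists(target):
        os.remove(target)  # without this, shutil.move raises shutil.Error
    shutil.move(os.path.join(src, name), dst)
print(os.listdir(dst))  # ['a.txt']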
Example #7
    train_label,
    flags=FLAGS,
    model_types=['cnn'],
    tokenizer=tokenizer
)  #, valide_data = train_data, valide_label = train_label)
# exit(0)
# for c in coly:
#     print("------Label: {0}".format(c))
#     label = train_label[c].values
#     models, _, _, _ = nfold_train(train_data, label, fold = 5, model_types = ['k']) #, valide_label = train_label)
#     multi_label_models.append(models)
#     sub2[c] = models_eval(models, test_data)
#model = ensemble.ExtraTreesClassifier(n_jobs=-1, random_state=3)
#model.fit(data[:nrow], y[:nrow])
# print(1- model.score(data[:nrow], y[:nrow]))
sub2[coly] = models_eval(models, test_data)
# sub2 = pd.DataFrame([[c[1] for c in sub2[row]] for row in range(len(sub2))]).T
# sub2.columns = coly
sub2['id'] = tid
for c in coly:
    sub2[c] = sub2[c].clip(0 + 1e-12, 1 - 1e-12)  # keep probabilities an epsilon away from 0 and 1
''' #blend 1
sub2.columns = [x+'_' if x not in ['id'] else x for x in sub2.columns]
blend = pd.merge(sub1, sub2, how='left', on='id')
for c in coly:
    blend[c] = blend[c] * 0.8 + blend[c+'_'] * 0.2
    blend[c] = blend[c].clip(0+1e-12, 1-1e-12)
blend = blend[sub1.columns]

#blend 2
sub2 = blend[:]
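
Aside: the quoted "blend 1" block above is a weighted average of two submission
frames. A runnable sketch under the same assumptions (a shared 'id' column,
probability columns coly, 0.8/0.2 weights as in the original; the data here is
illustrative):

import pandas as pd

coly = ['toxic']
sub1 = pd.DataFrame({'id': [1, 2], 'toxic': [0.90, 0.10]})
sub2 = pd.DataFrame({'id': [1, 2], 'toxic': [0.80, 0.20]})
sub2.columns = [x + '_' if x not in ['id'] else x for x in sub2.columns]
blend = pd.merge(sub1, sub2, how='left', on='id')
for c in coly:
    blend[c] = blend[c] * 0.8 + blend[c + '_'] * 0.2
    blend[c] = blend[c].clip(0 + 1e-12, 1 - 1e-12)
blend = blend[sub1.columns]
print(blend)  # toxic becomes 0.88 and 0.12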
Example #8
File: main.py Project: ifuding/TC
def sub(models, stacking_data = None, stacking_label = None, stacking_test_data = None, test_data = None, \
        scores_text = None, tid = None, sub_re = None, col = None, leak_target = None, aug_data_target = None, \
        train_part_img_id = None, validate_part_img_id = None, train_data = None):
    tmp_model_dir = "./model_dir/"
    time_label = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
    # tmp_model_dir = "./model_dir/" + time_label
    if not os.path.isdir(tmp_model_dir):
        os.makedirs(tmp_model_dir, exist_ok=True)
    if FLAGS.stacking:
        # np.save(os.path.join(tmp_model_dir, "stacking_train_data.npy"), stacking_data)
        # np.save(os.path.join(tmp_model_dir, "stacking_train_label.npy"), stacking_label)
        # np.save(os.path.join(tmp_model_dir, "stacking_test_data.npy"), stacking_test_data)
        # stacking_data.to_csv(tmp_model_dir + '/stacking_train_data' + time_label + '.csv', index = False)
        # stacking_label.to_csv(tmp_model_dir + '/stacking_train_label' + time_label + '.csv', index = False)
        with open(tmp_model_dir + '/stacking_train_data' + time_label + '.pickle', 'wb+') as handle:
            pickle.dump(stacking_data, handle)
        with open(tmp_model_dir + '/stacking_train_label' + time_label + '.pickle', 'wb+') as handle:
            pickle.dump(stacking_label, handle)
    elif FLAGS.predict:
        with open(tmp_model_dir + '/train_data' + time_label + '.pickle', 'wb+') as handle:
            pickle.dump(stacking_data, handle)
        with open(tmp_model_dir + '/test_data' + time_label + '.pickle', 'wb+') as handle:
            pickle.dump(stacking_test_data, handle)
    else:
        # pass
        flat_models = [
            (Model(inputs=m[0].model.inputs,
                   outputs=m[0].model.get_layer(name='avg_pool').output), 'k')
            for m in models
        ]
        flat_train_re = models_eval(flat_models, preprocess_img(train_data['img']))
        flat_test_re = models_eval(flat_models, preprocess_img(test_data['img']))
        with open(tmp_model_dir + '/flat_train_re' + time_label + '.pickle', 'wb+') as handle:
            pickle.dump(flat_train_re, handle)
        with open(tmp_model_dir + '/flat_test_re' + time_label + '.pickle', 'wb+') as handle:
            pickle.dump(flat_test_re, handle)
        # save model to file
        for i, model in enumerate(models):
            if (model[1] == 'l'):
                model_name = tmp_model_dir + "model_" + str(i) + time_label + ".txt"
                model[0].save_model(model_name)
            elif (model[1] == 'k' or model[1] == 'r'):
                model_name = tmp_model_dir + "model_" + str(i) + time_label + ".h5"
                model[0].model.save(model_name)
                train_part_img_id[i].to_csv(
                    tmp_model_dir + 'train_part_img_id_' + str(i) + '.csv', index=False)
                validate_part_img_id[i].to_csv(
                    tmp_model_dir + 'validate_part_img_id_' + str(i) + '.csv', index=False)

        # scores_text_frame = pd.DataFrame(scores_text, columns = ["score_text"])
        # write the fold-score summary once, after all models are saved
        score_text_file = tmp_model_dir + "score_text" + time_label + ".csv"
        scores_text_df = pd.concat(scores_text)
        scores_text_df.groupby(scores_text_df.index).agg(
            ['max', 'min', 'mean', 'median', 'std']).T.to_csv(score_text_file, index=True)
        # scores = scores_text_frame["score_text"]
        # for i in range(FLAGS.epochs):
        #     scores_epoch = scores.loc[scores.str.startswith('epoch:{0}'.format(i + 1))].map(lambda s: float(s.split()[1]))
        #     print ("Epoch{0} mean:{1} std:{2} min:{3} max:{4} median:{5}".format(i + 1, \
        #         scores_epoch.mean(), scores_epoch.std(), scores_epoch.min(), scores_epoch.max(), scores_epoch.median()))

    if not os.path.isdir(FLAGS.output_model_path):
        os.makedirs(FLAGS.output_model_path, exist_ok=True)
    for fileName in os.listdir(tmp_model_dir):
        dst_file = os.path.join(FLAGS.output_model_path, fileName)
        if os.path.exists(dst_file):
            os.remove(dst_file)
        shutil.move(os.path.join(tmp_model_dir, fileName),
                    FLAGS.output_model_path)
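
Aside: the flat_models construction above is the usual Keras feature-extraction
trick: re-wrap a trained network so an intermediate layer becomes the output. A
hedged sketch with ResNet50, whose pooled layer really is named 'avg_pool' (the
project's actual architecture is not shown):

import numpy as np
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model

base = ResNet50(weights=None)  # weights=None keeps the sketch offline
flat = Model(inputs=base.inputs, outputs=base.get_layer(name='avg_pool').output)
print(flat.predict(np.zeros((1, 224, 224, 3))).shape)  # (1, 2048)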
Example #9
def load_data():
    print("\nData Load Stage")
    if FLAGS.debug:
        nrow = 10000
    else:
        nrow = None
    training = pd.read_csv(path + '/train.csv',
                           index_col="item_id",
                           parse_dates=["activation_date"],
                           nrows=nrow)
    traindex = training.index
    testing = pd.read_csv(path + '/test.csv',
                          index_col="item_id",
                          parse_dates=["activation_date"],
                          nrows=nrow)
    testdex = testing.index

    ntrain = training.shape[0]
    ntest = testing.shape[0]

    y = training.deal_probability.copy()
    training.drop("deal_probability", axis=1, inplace=True)
    print('Train shape: {} Rows, {} Columns'.format(*training.shape))
    print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

    print("Combine Train and Test")
    df = pd.concat([training, testing], axis=0)
    del training, testing
    gc.collect()
    print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

    print("Feature Engineering")
    df["price"] = np.log(df["price"] + 0.001)
    df["price"].fillna(-999, inplace=True)
    df["image_top_1"].fillna(-999, inplace=True)

    print("\nCreate Time Variables")
    df["Weekday"] = df['activation_date'].dt.weekday
    df["WeekdOfYear"] = df['activation_date'].dt.week
    df["DayOfMonth"] = df['activation_date'].dt.day

    # Create Validation Index and Remove Dead Variables
    # training_index = df.loc[df.activation_date<=pd.to_datetime('2017-04-07')].index
    # validation_index = df.loc[df.activation_date>=pd.to_datetime('2017-04-08')].index
    df.drop(["activation_date", "image"], axis=1, inplace=True)

    print("\nEncode Variables")
    categorical = [
        "user_id", "region", "city", "parent_category_name", "category_name",
        "user_type", "image_top_1", "param_1", "param_2", "param_3"
    ]
    print("Encoding :", categorical)

    # Encoder:
    lbl = preprocessing.LabelEncoder()
    for col in categorical:
        df[col] = df[col].fillna('Unknown')  # assign back so the fill actually sticks
        df[col] = lbl.fit_transform(df[col].astype(str))
    cat_max = df[keras_train.USED_CATEGORY_FEATURES].max().astype('int64')
    print(cat_max)

    textfeats = ["description", "title"]
    # print(df.head)
    # exit(0)

    if FLAGS.lgb_boost_dnn:
        models = []
        for i in range(FLAGS.lgb_ensemble_nfold):
            bst = lgb.Booster(model_file=FLAGS.input_previous_model_path +
                              '/model_' + str(i) + '_2018_05_31_04_04_47.txt')  # one file per fold (assumed; fold files share the run timestamp)
            models.append((bst, 'l'))
        df['lgb_pred'] = models_eval(models, df)
        keras_train.USED_FEATURE_LIST += ['lgb_pred']

    emb_weight = None
    if FLAGS.model_type == 'k':
        print('Tokenizer...')
        for cols in textfeats:
            df[cols] = df[cols].fillna('missing').astype(str)  # fill first; astype(str) would turn NaN into the literal 'nan'
        data = df[textfeats].apply(lambda x: ' '.join(x), axis=1).values
        tokenizer = Tokenizer(num_words=FLAGS.vocab_size)
        tokenizer.fit_on_texts(data)
        for i, cols in enumerate(textfeats):
            data = pad_sequences(tokenizer.texts_to_sequences(df[cols]),
                                 maxlen=FLAGS.max_len[i])
            df[cols] = data.tolist()

        if FLAGS.load_wv_model:
            emb_weight = get_word2vec_embedding(location = FLAGS.input_training_data_path + FLAGS.wv_model_file, \
                    tokenizer = tokenizer, nb_words = FLAGS.vocab_size, embed_size = FLAGS.gram_embedding_dim, \
                    model_type = FLAGS.wv_model_type, uniform_init_emb = FLAGS.uniform_init_emb)
        else:
            if FLAGS.uniform_init_emb:
                emb_weight = np.random.uniform(
                    0, 1, (FLAGS.vocab_size, FLAGS.emb_dim))
            else:
                emb_weight = np.zeros((FLAGS.vocab_size, FLAGS.emb_dim))

    # df.drop(textfeats, axis=1,inplace=True)
    print(df.info())
    # df.to_pickle('lgb_pred.pickle')
    # exit(0)

    train_data = df.loc[traindex, keras_train.USED_FEATURE_LIST]
    train_label = y.values
    test_data = df.loc[testdex, keras_train.USED_FEATURE_LIST]
    test_id = testdex
    valide_data = None
    valide_label = None
    weight = None
    return train_data, train_label, test_data, test_id, valide_data, valide_label, weight, cat_max, emb_weight
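
Aside: a minimal sketch of the text pipeline in the model_type == 'k' branch
above: fit one Tokenizer over the text, then pad each column to a fixed length.
num_words and maxlen stand in for FLAGS.vocab_size and FLAGS.max_len:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ["blue sofa good condition", "selling blue bike"]
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(texts)
print(pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=6))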