Example #1
0
def nb():
    """Train a Naive Bayes text classifier and persist it to disk.

    Builds a bag-of-words -> tf-idf -> MultinomialNB pipeline, fits it on
    the full training corpus, and saves the fitted pipeline to
    config.SAVE_NB_PATH.
    """
    corpus = load_data.load_train_data()
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
    pipeline = Pipeline(steps)
    # Pipeline.fit returns the pipeline itself, so saving it afterwards
    # persists the fitted estimator.
    pipeline.fit(corpus.get_data(), corpus.get_target())
    save_model(pipeline, config.SAVE_NB_PATH)
Example #2
0
def predict():
    """K-fold train/evaluate loop for the document classifier.

    Loads embeddings, vocabularies and data, shuffles the training arrays
    in lock-step, then for each of config.KFOLD splits trains a fresh
    DCModel, copies the best epoch's result file to ./Data/result/best_<fold>,
    and prints the fold's best (precision, recall, f1).
    """
    word_weights, tag_weights = load_embedding()
    word_voc, tag_voc, label_voc = load_voc()

    # train data
    # Re-seeding with the same value before each shuffle permutes the three
    # arrays identically, keeping sentences/tags/labels aligned.
    sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc)
    seed = 137
    np.random.seed(seed)
    np.random.shuffle(sentences)
    np.random.seed(seed)
    np.random.shuffle(tags)
    np.random.seed(seed)
    np.random.shuffle(labels)

    # load data
    sentences_test, tags_test = load_test_data(word_voc, tag_voc, label_voc)
    labels_test = None  # test labels are unknown here; DCModel.fit accepts None

    # clear result files left over from a previous run
    command = 'rm ./Data/result/*'
    os.popen(command)

    # Split into train / dev / test sets (K-fold)
    kf = KFold(n_splits=config.KFOLD)
    train_indices, dev_indices = [], []
    for train_index, dev_index in kf.split(labels):
        train_indices.append(train_index)
        dev_indices.append(dev_index)
    for num in range(config.KFOLD):
        train_index, dev_index = train_indices[num], dev_indices[num]
        sentences_train, sentences_dev = sentences[train_index], sentences[dev_index]
        tags_train, tags_dev = tags[train_index], tags[dev_index]
        labels_train, labels_dev = labels[train_index], labels[dev_index]

        # init model (a fresh model per fold)
        model = DCModel(
            config.MAX_LEN, word_weights, tag_weights, result_path='./Data/result/result.txt',
            label_voc=label_voc)

        # fit model
        model.fit(
            sentences_train, tags_train, labels_train,
            sentences_dev, tags_dev, labels_dev,
            sentences_test, tags_test, labels_test,
            config.BATCH_SIZE, config.NB_EPOCH, keep_prob=config.KEEP_PROB,
            word_keep_prob=config.WORD_KEEP_PROB, tag_keep_prob=config.TAG_KEEP_PROB)
        print(model.get_best_score())
        # get_best_score() returns ([precision, recall, f1], best_epoch_index)
        # per the unpacking below.
        [p_test, r_test, f_test], nb_epoch = model.get_best_score()
        # Keep the per-epoch result file of the best epoch for this fold
        # (epoch files are 1-based on disk, hence nb_epoch+1).
        command = 'cp ./Data/result/epoch_%d.csv ./Data/result/best_%d' % (nb_epoch+1, num)
        print(command)
        os.popen(command)
        print(p_test, r_test, f_test, '\n')
        # evaluate
        # result_path_k = result_path % k
        # p_test, r_test, f_test = model.evaluate(sentences_test, tags_test, positions_test,
        #    labels_test, simple_compute=False, ignore_label=IGNORE_LABEL,
        #    label_voc=relation_voc, result_path=result_path_k)
        # clear model so the next fold starts from scratch
        model.clear_model()
        del model
Example #3
0
def train_no_pruning(model, epochs, device):
    """Train `model` for `epochs` epochs with plain SGD (no pruning).

    Returns whatever the project-level train() returns (plot data).
    """
    loader = load_train_data(batch_size=4)
    loss_fn = nn.CrossEntropyLoss()
    sgd = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    return train(loader, epochs, model, sgd, loss_fn, device)
Example #4
0
def load():
    """Load the training features, labels and CV splitter, then drop
    unwanted columns.

    Every ``*_id`` column is removed except the four join keys, followed by
    any column listed in DROP_FEATURE. Returns (x_train, y_train, cv).
    """
    logger.info('load start')

    x_train, y_train, cv = load_train_data()
    logger.info('merges')

    # Join keys that must survive the id-column purge.
    keep_ids = {'o_user_id', 'o_product_id', 'p_aisle_id', 'p_department_id'}
    id_cols = [
        c for c in x_train.columns.values
        if re.search('_id$', c) is not None and c not in keep_ids
    ]
    logger.debug('id_cols {}'.format(id_cols))
    x_train.drop(id_cols, axis=1, inplace=True)

    # Sort for a deterministic drop order.
    dropcols = sorted(set(x_train.columns.values.tolist()) & set(DROP_FEATURE))
    x_train.drop(dropcols, axis=1, inplace=True)
    logger.info('drop')

    gc.collect()
    """
    # x_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    usecols = x_train.columns.values

    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)

    fillna_mean = x_train.mean()
    with open(DIR + 'fillna_mean.pkl', 'wb') as f:
        pickle.dump(fillna_mean, f, -1)
    x_train.fillna(fillna_mean, inplace=True)
    """
    return x_train, y_train, cv
Example #5
0
def main():
    """Train linear regression on housing data, test it, and plot results.

    Loads train/test matrices, fits theta with gradient descent (`train`),
    prints theta / final cost / predictions, then draws a 3-D scatter of
    train (red) and test (green) points with the fitted plane, plus a 2-D
    cost curve over epochs.
    """
    train_file              = "data_train.txt"
    test_file               = "data_test.txt"
    epoches                 = 100
    # NOTE(review): very small learning rate — presumably because features
    # are not scaled; confirm against train()'s update rule.
    alpha                   = 0.000000001
    data_array, label_array = load_train_data(train_file)
    test_array              = load_test_data(test_file)
    data_matrix             = np.mat(data_array)
    label_matrix            = np.mat(label_array)
    test_matrix             = np.mat(test_array)
    theta, cost_vector      = train(data_matrix, label_matrix, epoches, alpha)
    test_result             = test(theta, test_matrix)
    print(theta)
    print(cost_vector[np.size(cost_vector)-1])
    print(test_matrix, test_result)

    # Plot Result
    # Column 1 and the last column are plotted; per the axis labels below
    # they correspond to Area and Rooms (column 0 is presumably the bias).
    m,n       = np.shape(data_array)
    plot_x = []
    plot_y = []
    plot_z = []

    for i in range(m):
        plot_x.append(data_matrix[i,1])
        plot_y.append(data_matrix[i,n-1])
        plot_z.append(label_matrix[i,0])

    test_m, test_n       = np.shape(test_matrix)
    plot_testx = []
    plot_testy = []
    plot_testz = []
    for i in range(test_m):
        plot_testx.append(test_matrix[i,1])
        plot_testy.append(test_matrix[i,test_n-1])
        plot_testz.append(test_result[i,0])

    figure = plt.figure("Result")
    fig_plot = figure.add_subplot(111, projection='3d')
    fig_plot.scatter(plot_x, plot_y, plot_z, s=5, c='red', marker='s') # plot train points
    fig_plot.scatter(plot_testx, plot_testy, plot_testz, s=30, c='green', marker='s') # plot test predictions
    # Sample the fitted plane z = theta0 + theta1*x + theta2*y over a grid
    # of random (Area, Rooms) values to visualize the regression surface.
    x = np.random.randint(1000, 5000, size=[10000])
    y = np.random.randint(2, 5, size=[10000])
    z = theta[0,0] + theta[1,0] * x + theta[2,0] * y
    fig_plot.plot(x,y,z)
    fig_plot.set_title("The Result Linear Regression")
    fig_plot.set_xlabel('Area')
    fig_plot.set_ylabel('Rooms')
    fig_plot.set_zlabel('Price')

    # Plot Cost
    cost_fig  = plt.figure("Cost")
    cost_plot = cost_fig.add_subplot(111)
    # epoch has epoches+1 points; assumes cost_vector includes the initial
    # cost before the first update — TODO confirm against train().
    epoch   = np.arange(0, epoches+1, 1)
    cost_plot.plot(epoch, cost_vector)
    plt.title("The Cost")
    plt.xlabel('Epoch')
    plt.ylabel('Cost')

    plt.show()
Example #6
0
def load():
    """Load training features/labels/CV splitter and enrich with reorder stats.

    Drops every ``*_id`` column except the join keys plus all DROP_FEATURE
    columns, left-merges four precomputed float32 CSV feature tables,
    persists the used column list and per-column means under DIR, and
    mean-fills NaNs in place.

    Returns:
        (x_train, y_train, cv) as produced by load_train_data(), with
        x_train cleaned and enriched.
    """
    logger.info('load start')
    x_train, y_train, cv = load_train_data()

    logger.info('merges')
    # x_train['stack1'] = get_stack('result_0727/')
    # init_score = np.log(init_score / (1 - init_score))

    id_cols = [
        col for col in x_train.columns.values
        if re.search('_id$', col) is not None and col not in set(
            ['o_user_id', 'o_product_id', 'p_aisle_id', 'p_department_id'])
    ]
    logger.debug('id_cols {}'.format(id_cols))
    x_train.drop(id_cols, axis=1, inplace=True)

    dropcols = sorted(
        list(set(x_train.columns.values.tolist()) & set(DROP_FEATURE)))
    x_train.drop(dropcols, axis=1, inplace=True)
    logger.info('drop')

    # The four auxiliary feature tables are merged identically except for
    # the file name and join key, so the logic is shared in one helper.
    x_train = _merge_feature_csv(x_train, 'user_reorder_item_num.csv',
                                 'user_id', 'o_user_id')
    x_train = _merge_feature_csv(x_train, 'item_reorder_user_num.csv',
                                 'product_id', 'o_product_id')
    x_train = _merge_feature_csv(x_train, 'item_reorder_user_num_train.csv',
                                 'product_id', 'o_product_id')
    x_train = _merge_feature_csv(x_train, 'item_reorder_train.csv',
                                 'product_id', 'o_product_id')

    gc.collect()

    # x_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    usecols = x_train.columns.values

    # Persist the column list and fill means so prediction-time code can
    # reproduce the exact same preprocessing.
    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)

    fillna_mean = x_train.mean()
    with open(DIR + 'fillna_mean.pkl', 'wb') as f:
        pickle.dump(fillna_mean, f, -1)
    x_train.fillna(fillna_mean, inplace=True)
    return x_train, y_train, cv


def _merge_feature_csv(df, path, raw_key, join_key):
    """Left-merge the float32 feature table at `path` onto `df`.

    The table's `raw_key` column is renamed to `join_key` before merging so
    it matches the prefixed key naming used in `df`.
    """
    feats = pd.read_csv(path).astype(np.float32).rename(
        columns={raw_key: join_key})
    return df.merge(feats, how='left', on=join_key, copy=False)
Example #7
0
def iterative_pruning(model, iters, epochs, device):
    """Iteratively train, prune and re-initialize `model`.

    Each round trains for `epochs` epochs, saves the training curves under
    plots/<percentage-of-weights-before-this-round>, prunes 20% of the
    remaining weights, and re-initializes the survivors via reinit_net().

    Args:
        model: network exposing parameters(), prune_net() and reinit_net().
        iters: number of prune/retrain rounds.
        epochs: training epochs per round.
        device: device handle forwarded to train().
    """
    trainloader = load_train_data(batch_size=4)
    criterion = nn.CrossEntropyLoss()

    # Loop variable renamed from `iter`, which shadowed the builtin.
    for round_idx in tqdm(range(iters)):
        # A fresh optimizer each round resets SGD momentum state.
        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
        plot_data = train(trainloader, epochs, model, optimizer, criterion, device)
        # With 20% pruned per round, 0.8**round_idx of the weights remain
        # going into this round; the file name records that percentage.
        torch.save(plot_data, "plots/%.3f" % ((0.8 ** round_idx)*100))
        model.prune_net(20)
        model.reinit_net()
Example #8
0
def train():
    """Train the tf.estimator model, evaluate it, and optionally evaluate
    every saved checkpoint.

    Relies on module-level configuration defined elsewhere in this file:
    in_height, in_width, num_rows, train_ratio, batch_size, train_epoch,
    num_steps, ckpt_steps, max_ckpt.
    """
    # Load input data and label
    # x: input;     y: label
    x_train, y_train, x_eval, y_eval = load_train_data(in_height, in_width,
                                                       num_rows, train_ratio)

    model = build_model()

    # Define the input function for training
    input_fn_t = tf.estimator.inputs.numpy_input_fn(x={'file': x_train},
                                                    y=y_train,
                                                    batch_size=batch_size,
                                                    num_epochs=train_epoch,
                                                    shuffle=True)

    # Train the Model
    model.train(input_fn_t, steps=num_steps)

    # Define the input function for evaluating
    input_fn_e = tf.estimator.inputs.numpy_input_fn(x={'file': x_eval},
                                                    y=y_eval,
                                                    batch_size=batch_size,
                                                    shuffle=False)

    # Evaluate the Model
    e = model.evaluate(input_fn_e)
    total_steps = e['global_step']
    print('Evaluation Accuracy = ', e['accuracy'], "Loss = ", e['loss'],
          "global_step = ", total_steps)

    # Evaluate Checkpoints
    # all checkpoints have to be saved locally for evaluation, otherwise the evaluation is skipped
    # max_ckpt and ckpt_steps need to be properly tuned to enable checkpoints evaluation
    total_ckpts = total_steps // ckpt_steps
    print("Total number of checkpoints required = ", total_ckpts)

    if total_ckpts <= max_ckpt:  # all checkpoints are saved
        # Rows of (step, accuracy, loss) — one per checkpoint.
        eval_results = np.zeros((total_ckpts, 3))

        for i in range(total_ckpts):
            # Checkpoint step numbers are ckpt_steps apart, offset by one,
            # and clamped to the final global step.
            j = np.min([(i + 1) * ckpt_steps + 1, total_steps])
            ckpt_path = './model/model.ckpt-' + str(j)
            print(ckpt_path)
            e = model.evaluate(input_fn_e, checkpoint_path=ckpt_path)
            eval_results[i, :] = [j, e['accuracy'], e['loss']]

        df = pd.DataFrame(eval_results)
        header = ["step", "accuracy", "loss"]
        df.to_csv('./eval_ckpts.csv', header=header, index=None)
        print(
            "Checkpoints Evaluation is completed. The results can be found at ./eval_ckpts.csv"
        )
    else:
        print("Checkpoints Evaluation is skipped.")
Example #9
0
def svm():
    """Train a linear SVM text classifier (SGD, hinge loss) and save it
    to config.SAVE_SVM_PATH."""
    corpus = load_data.load_train_data()
    classifier = SGDClassifier(loss='hinge',
                               penalty='l2',
                               alpha=1e-3,
                               n_iter=5,
                               random_state=42)
    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', classifier)])
    pipeline.fit(corpus.get_data(), corpus.get_target())
    save_model(pipeline, config.SAVE_SVM_PATH)
Example #10
0
def load():
    """Load training data as a float32 numpy matrix plus labels and CV splitter.

    Drops id columns (except join keys) and DROP_FEATURE columns, persists
    the used column list and per-column means under DIR, mean-fills NaNs,
    converts to a float32 ndarray, then replaces any remaining NaN with
    -100 and inf with 999.
    """
    logger.info('load start')
    x_train, y_train, cv = load_train_data()

    logger.info('merges')
    #x_train['stack1'] = get_stack('result_0727/')
    #init_score = np.log(init_score / (1 - init_score))

    id_cols = [
        col for col in x_train.columns.values
        if re.search('_id$', col) is not None and col not in set(
            ['o_user_id', 'o_product_id', 'p_aisle_id', 'p_department_id'])
    ]
    logger.debug('id_cols {}'.format(id_cols))
    x_train.drop(id_cols, axis=1, inplace=True)

    dropcols = sorted(
        list(set(x_train.columns.values.tolist()) & set(DROP_FEATURE)))
    x_train.drop(dropcols, axis=1, inplace=True)
    logger.info('drop')
    #cols_ = pd.read_csv('result_0728_18000/feature_importances.csv')
    #cols_ = cols_[cols_.imp == 0]['col'].values.tolist()
    #cols_ = cols_['col'].values.tolist()[250:]
    #dropcols = sorted(list(set(x_train.columns.values.tolist()) & set(cols_)))
    #x_train.drop(dropcols, axis=1, inplace=True)

    #imp = pd.read_csv('result_0731_xentropy/feature_importances.csv')['col'].values
    #x_train = x_train[imp]

    # Persist the column list so prediction-time code can reuse it.
    usecols = x_train.columns.values
    #logger.debug('all_cols {}'.format(usecols))
    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)
    gc.collect()

    #x_train.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Persist the fill means too, then mean-fill in place.
    fillna_mean = x_train.mean()
    with open(DIR + 'fillna_mean.pkl', 'wb') as f:
        pickle.dump(fillna_mean, f, -1)
    x_train.fillna(fillna_mean, inplace=True)
    x_train = x_train.values.astype(np.float32)

    logger.info('data end')
    # x_train[np.isnan(x_train)] = -10
    gc.collect()
    # Sentinel pass: NaNs can survive fillna (e.g. a column whose mean is
    # itself NaN) — presumably why this exists; confirm with the data.
    x_train[np.isnan(x_train)] = -100
    x_train[np.isinf(x_train)] = 999

    logger.info('load end {}'.format(x_train.shape))
    return x_train, y_train, cv
Example #11
0
    def get_datas(self):
        """Load the dataset, one-hot encode labels, and return a shuffled
        70/30 train/test split as (d_train, l_train, d_test, l_test)."""
        samples, raw_labels = load_train_data()
        onehot = np_utils.to_categorical(raw_labels, self.n_classes)

        # First 70% of rows become the training split.
        split_at = int(0.7 * samples.shape[0])

        train_x, test_x = samples[:split_at], samples[split_at:]
        train_y, test_y = onehot[:split_at], onehot[split_at:]

        d_train, l_train = self.shuffleing(train_x, train_y)
        d_test, l_test = self.shuffleing(test_x, test_y)
        return d_train, l_train, d_test, l_test
Example #12
0
def main():
    """End-to-end LightGBM workflow: tune hyperparameters with hyperopt,
    pick the best iteration count, train final models, and predict the
    test set.
    """
    # Console handler at INFO; file handler at DEBUG keeps the full trace.
    log_fmt = Formatter(
        '%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s '
    )
    handler = StreamHandler()
    handler.setLevel(INFO)
    handler.setFormatter(log_fmt)
    logger.addHandler(handler)

    handler = FileHandler(DIR + 'train_lgb_clf_hyperopt.py.log', 'a')
    handler.setLevel(DEBUG)
    handler.setFormatter(log_fmt)
    logger.setLevel(DEBUG)
    logger.addHandler(handler)

    logger.info('start')

    logger.info("start exploring best params")

    logger.info("start exploring best params without iteration")
    df_train = load_train_data()
    # Feature columns are the label-based slice 'ABC'..'2047' — assumes
    # that exact column ordering in the training frame; TODO confirm.
    x_train = df_train.loc[:, 'ABC':'2047']
    y_train = df_train['Active_Nonactive'].values
    best_params = lgb_opt_params(x_train, y_train)
    logger.info("end exploring best params without iteration")

    logger.info("start optimizing iteration")
    best_iter = opt_iter(x_train, y_train, best_params)
    logger.info("end optimizing iteration")

    logger.info("end exploring best params")

    logger.info("start best params train")
    best_model_No, cutoff = create_models(x_train, y_train, best_params,
                                          best_iter)
    logger.info("end best params train")

    logger.info("start predict unknown data(test data)")
    df_test = load_test_data().sort_values('Name')
    use_cols = x_train.columns.values
    # x_test = df_test[use_cols]
    # Train and test are concatenated so the prediction covers all rows,
    # sorted by Name for a stable order.
    df_all = pd.concat([df_train, df_test], axis=0,
                       sort=False).sort_values('Name')
    x_all = df_all[use_cols]
    predict_test(x_all, best_model_No, cutoff)
    logger.info("end predict unknown data(test data)")

    logger.info("end")
Example #13
0
 def validation(self, num_iterations):
     """Run `num_iterations` validation batches and print accuracy/loss.

     NOTE: uses Python 2 print statements — this module targets Python 2.
     Pulls balanced random batches from the validation split returned by
     load_train_data and reports total wall time at the end.
     """
     _, val_ext = load_train_data(self.batch_size, self.num_classes)
     start_time = time.time()
     for i in range(num_iterations):
         print "validating:", i
         #get batch
         x_batch, y_true_batch = val_ext.get_random_batch_balanced()
         #set feed_dict
         feed_dict_train = {self.x: x_batch, self.y_true: y_true_batch}
         #run session
         # y_pred is fetched but discarded; only accuracy and cost are used.
         _, acc, loss = self.sess.run(
             [self.y_pred, self.accuracy, self.cost],
             feed_dict=feed_dict_train)
         print 'acc: ', acc * 100, 'loss: ', loss
     end_time = time.time()
     time_dif = end_time - start_time
     print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))
Example #14
0
    def optimize(self, num_iterations, classes=3, save=False):
        """Train for `num_iterations` additional optimizer steps (Python 2).

        Pulls balanced random batches from the training split, runs the
        optimizer, optionally checkpoints every 500 iterations plus one
        final save, and advances self.total_iterations.

        Args:
            num_iterations: number of optimizer steps to run.
            classes: class count embedded in the checkpoint file name.
            save: when True, write checkpoints under a timestamped dir.
        """
        train_ext, _ = load_train_data(self.batch_size, self.num_classes)

        checkpoint = "ckpt"
        logfile = "train_log"  # NOTE(review): never used in this method
        # log/training report
        log = {}
        # log['start_time'] = time.ctime()
        # log['alpha'] = alpha
        # log['batch_size'] = batch_size
        # log['steps'] = steps
        log['checkpoint'] = checkpoint
        log['loss'] = []

        # Each run gets its own timestamped checkpoint directory.
        ckpt_dir = 'checkpoints/' + str(datetime.datetime.now()) + '/'

        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)

        start_time = time.time()
        # Continue counting from self.total_iterations so repeated calls
        # produce monotonically increasing iteration numbers.
        for i in range(self.total_iterations,
                       self.total_iterations + num_iterations):
            print "training:", i, "/", self.total_iterations
            #get batch
            x_batch, y_true_batch = train_ext.get_random_batch_balanced()
            #set feed_dict
            feed_dict_train = {self.x: x_batch, self.y_true: y_true_batch}
            #run session
            _, acc, loss = self.sess.run(
                [self.optimizer, self.accuracy, self.cost],
                feed_dict=feed_dict_train)

            print 'acc: ', acc * 100, 'loss: ', loss
            if save and i % 500 == 0:
                self.save(ckpt_dir + 'c-' + str(classes) + '-itt-' +
                          str(self.total_iterations))

        self.total_iterations += num_iterations
        end_time = time.time()
        time_dif = end_time - start_time
        print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))
        # save final model
        if save:
            self.save(ckpt_dir + 'c-' + str(classes) + '-itt-' +
                      str(self.total_iterations))
def post_predict(test_path, score_path, entity_path, alpha=0.75):
    """Map each mention to its best-scoring candidate entity and compute accuracy.

    For mentions with several candidates the highest-scoring one is taken;
    if that score is below `alpha` the mention is mapped to 'cui-less'
    (single-candidate mentions bypass the threshold, as before). Wrong
    predictions are appended to ../checkpoints/post_predict_result.txt.

    Args:
        test_path: path passed to load_train_data(); the second return value
            yields (doc_id, mention, label) rows.
        score_path: candidate score file read by load_candidates2().
        entity_path: entity dictionary file read by load_entity().
        alpha: minimum score to accept the best of multiple candidates.

    Returns:
        float accuracy over all rows; 0.0 when there are no rows
        (the original raised ZeroDivisionError in that case).
    """
    candidate_dict = load_candidates2(score_path)
    test_data, all_data = load_train_data(test_path)
    entity_dict, _ = load_entity(entity_path)

    # Resolve each mention to a (entity_id, entity_name) prediction.
    predict_dict = dict()
    for mention, candidates in candidate_dict.items():
        if len(candidates) == 1:
            predict_dict[mention] = (candidates[0][0], candidates[0][1])
            continue
        # Each candidate is (entity_id, entity_name, score); max() keeps the
        # first maximum, matching the original strict-> comparison loop.
        e_id, e_name, e_score = max(candidates, key=lambda c: c[2])
        if e_score < alpha:
            e_id, e_name = 'cui-less', 'cui-less'
        predict_dict[mention] = (e_id, e_name)

    acc_cnt, w_l = 0, ''
    for doc_id, mention, label in all_data:
        # Normalize the label's casing for the cui-less sentinel.
        if label.lower() == 'cui-less':
            label = 'cui-less'
        pred_label, pred_entity_name = predict_dict[mention]
        if pred_label == label:
            acc_cnt += 1
        else:
            entity_name = entity_dict[label][0] if label in entity_dict else 'None'
            w_l += doc_id + '\t' + mention + '\t' + label + '\t' + \
                   entity_name + '\t' + pred_label + '\t' + pred_entity_name + '\n'

    with open('../checkpoints/post_predict_result.txt', 'w') as f:
        f.write(w_l)

    total_cnt = len(all_data)
    # Guard against empty input instead of dividing by zero.
    return 1.0 * acc_cnt / total_cnt if total_cnt else 0.0
Example #16
0
def model_selection_and_evaluation():
    """
    Pick the best candidate model on a validation split, then report its
    accuracy on the held-out test set after training on all training data.
    :return: tuple: best model, list of feature sets it uses
    """
    # Load train and test sets.
    train_df = load_data.load_train_data()
    test_df = load_data.load_test_data()

    # Carve a validation set out of the training data for model selection.
    fit_df, val_df = utils.split_train_validation(train_df)

    # Score every candidate on the validation set and keep the winner.
    best_model, best_model_feats = model_selection(fit_df, val_df)

    print('Best scoring model is: {}, Using feature sets: {}'.format(
        best_model.name, best_model_feats))

    # Final evaluation uses the full training set and the untouched test set.
    test_set_evaluation(train_df, test_df, {best_model: best_model_feats})

    return best_model, best_model_feats
Example #17
0
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from load_data import load_train_data, load_predict_data
from sklearn.datasets.base import Bunch
from sklearn.svm import SVC

import jieba


def jieba_tokenizer(x):
    """Tokenize Chinese text `x`; returns the generator from jieba.cut."""
    return jieba.cut(x)


# Module-level: the labelled training corpus is loaded once at import time
# so every call to predict() reuses the same data.
train_data = load_train_data(r"cuhk.csv")


def predict(n, x_test, y_test):
    #print(train_data)
    x_train, _, y_train, _ = train_test_split(train_data['data'],
                                              train_data['target'][n],
                                              test_size=0.5)
    #print(y_train)
    words_tfidf_vec = TfidfVectorizer(binary=False, tokenizer=jieba_tokenizer)
    X_train = words_tfidf_vec.fit_transform(x_train)
    print(train_data['types'][n])
    clf = SVC().fit(X_train, y_train)

    # 测试样本数据调用的是transform接口
        else:
            return True

    processed_texts = []
    for line, l in zip(tweets_list, tweets_labels):
        if isEnglish(line):
            processed_texts.append((l, preprocessor(line)))
        # else: # print or not ?
        #     print(line)

    os_name = get_os_name()
    if os_name == 'windows':
        file_dir = 'C:/Corpus/'
    elif os_name == 'ubuntu':
        file_dir = '/home/hs/Data/'
    else:
        return
    csv_save(processed_texts, file_dir + filename)


if __name__ == '__main__':
    # from load_data import load_test_data
    # test_texts, test_labels =load_test_data()
    # preprocess_tweeets(test_texts, test_labels, 'preprocessed_test_data_nostem_359.csv')
    # exit()

    # Preprocess the full training corpus and write it out as CSV.
    from load_data import load_train_data
    texts, labels = load_train_data()
    processed_texts = []  # NOTE(review): appears unused here — confirm and remove
    preprocess_tweeets(texts, labels, 'preprocessed_training_data_nostem_160000.csv')
Example #19
0
        '%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s '
    )
    handler = StreamHandler()
    handler.setLevel('INFO')
    handler.setFormatter(log_fmt)
    logger.addHandler(handler)

    handler = FileHandler(DIR + 'train.py.log', 'a')
    handler.setLevel(DEBUG)
    handler.setFormatter(log_fmt)
    logger.setLevel(DEBUG)
    logger.addHandler(handler)

    logger.info('start')

    df = load_train_data()

    x_train = df.drop('target', axis=1)
    y_train = df['target'].values

    use_cols = x_train.columns.values

    logger.debug('train columns: {} {}'.format(use_cols.shape, use_cols))

    logger.info('data preparation end {}'.format(x_train.shape))

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    all_params = {
        'C': [10**i for i in range(-1, 2)],
        'fit_intercept': [True, False],
        'penalty': ['l2', 'l1'],
Example #20
0
from sklearn.model_selection import train_test_split
from keras.callbacks import History
from keras.metrics import categorical_accuracy
from utils import convert_arrays_to_accuracy, TimeHistory
import augmentation as aug
import numpy as np

# m is the number of examples to load from the training dataset.
# Reducing m is especially useful when debugging to allow
# rapid training runs. For the full dataset, use m=60000.
m = 60000
epochs = 150
# n_transforms is the number of transformations to create for each image
n_transforms = 10

X, y = load_train_data(m)

# Ensure that we always use the same training and cross-validation sets
# by always using 1 as the seed for the PRNG.
np.random.seed(1)
Xtr, Xval, ytr, yval = train_test_split(X, y, train_size=0.6, test_size=0.4)
Xtr, ytr = aug.augment_dataset(Xtr, ytr, n_transforms, fixed_seeds=True)

model = model_build_dense()
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=[categorical_accuracy])
my_hist = History()
time_hist = TimeHistory()
# Because we are feeding an already augmented dataset to model.fit,
# the training categorical accuracy returned by the model will be
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('SVM classifier training complete, saved predict labels to pickle')
    return


def logit(train_data, train_labels, test):
    """Fit a logistic-regression classifier on the training set and pickle
    its predictions for `test` to ./data/predict_labels/predict_labels.p."""
    log_state('Use logistic regression classifier')
    model = linear_model.LogisticRegression(C=1e5)
    model.fit(train_data, train_labels)
    dump_picle(model.predict(test), './data/predict_labels/predict_labels.p')
    logger.info('MaxEnt classifier training complete, saved predict labels to pickle')
    return


def kNN(train_data, train_labels, test):
    """Fit a 5-nearest-neighbour classifier on the training set and pickle
    its predictions for `test` to ./data/predict_labels/predict_labels.p."""
    log_state('Use kNN classifier')
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(train_data, train_labels)
    predictions = model.predict(test)
    dump_picle(predictions, './data/predict_labels/predict_labels.p')
    logger.info('kNN classifier training complete, saved predict labels to pickle')
    return

if __name__ == "__main__":
    # Load the pre-vectorized train/test matrices and the raw training
    # labels, then train/predict with the multinomial Naive Bayes helper.
    train_data = load_pickle('./data/transformed_data/transformed_train.p')
    test = load_pickle('./data/transformed_data/transformed_test.p')
    _, train_labels = load_train_data()
    mNB(train_data, train_labels, test)
Example #22
0
import numpy as np


###############################
# Untar data
def untar_data(name, outdir='./data'):
    """Extract ./Indoor-scene-recognition/<name> into `outdir`.

    Args:
        name: file name of the tar archive inside ./Indoor-scene-recognition/.
        outdir: destination directory for the extracted members.
    """
    # `with` guarantees the archive is closed even if extraction raises
    # (the original leaked the file handle on error).
    # NOTE(review): extractall on an untrusted archive can write outside
    # `outdir` (path traversal); these are local dataset files.
    with tarfile.open('./Indoor-scene-recognition/' + name) as my_tar:
        my_tar.extractall(outdir)


# Uncomment to untar data
# untar_data("indoorCVPR_09annotations.tar")
# untar_data("indoorCVPR_09.tar")
###############################

###############################
# Load data
test_data = load_data.load_test_data()
train_data = load_data.load_train_data()

# Show the data: print array shapes and display one randomly chosen image
# from each split as a sanity check.
print(test_data.shape)
print(train_data.shape)
train_i = np.random.choice(train_data.shape[0])
test_i = np.random.choice(test_data.shape[0])
cv2.imshow("example in train", train_data[train_i])
cv2.imshow("example in test", test_data[test_i])
cv2.waitKey(0)  # blocks until a key is pressed in the image window

###############################
Example #23
0
        df['description_' + str(i)] = trans_desc[:, i]

    title = vec.fit_transform(df['title'])
    title = np.array(title.todense(), dtype=np.float32)
    pca.fit(title)
    trans_title = pca.fit_transform(title)
    print(pca.explained_variance_ratio_)
    print(np.cumsum(pca.explained_variance_ratio_))
    for i in range(19):
        df['title_' + str(i)] = trans_title[:, i]
    return df


if __name__ == '__main__':
    logger.debug('start load train data')
    df_train = load_train_data()
    #df_train = df_train.iloc[:10000,:]
    X_train = df_train.drop(['deal_probability'], axis=1)
    y_train = df_train['deal_probability']

    logger.debug('start load test data')
    X_test = load_test_data()

    logger.debug('start fill null')
    X_train = fill_null(X_train)
    X_test = fill_null(X_test)

    X_train["Weekday"] = X_train['activation_date'].dt.weekday
    X_train["Weekd of Year"] = X_train['activation_date'].dt.week
    X_train["Day of Month"] = X_train['activation_date'].dt.day
    X_test["Weekday"] = X_test['activation_date'].dt.weekday
__author__ = 'NLP-PC'
__author__ = 'NLP-PC'
import feature_generating
import classifiers
import analysis
from load_data import load_train_data
from load_data import load_test_data
from save_data import dump_picle
from vectorizers import TFIDF_estimator, anew_estimator
from analysis import analysis_result
from classifiers import mNB

print('Start')
# Fit the tf-idf vectorizer on the training texts, transform the test
# texts with the same fitted vocabulary, classify with multinomial NB,
# and report accuracy against the true test labels.
vectorizer = TFIDF_estimator()
train_type = 'Sentiment140'
texts, train_labels = load_train_data(train_type)
transformed_train = vectorizer.fit_transform(texts)
testdata, true_labels = load_test_data()
transformed_test = vectorizer.transform(testdata)

predict = mNB(transformed_train, train_labels, transformed_test)

analysis_result(predict, true_labels)
Example #25
0
                    "--train_or_predict",
                    type=bool,
                    default=True,
                    help="train_or_predict")
parser.add_argument("-l", "--layer1", type=int, default=1000)
parser.add_argument("-ll", "--layer2", type=int, default=200)
args = parser.parse_args()

####################################################
##Load data and set up training hyperparameters
####################################################
DATA_DIR = './data/'
pro_dir = os.path.join(DATA_DIR, 'pro_sg_tag')
# The tag vocabulary size is the number of unique ids in unique_sid.txt.
unique_sid = load_pro_data(os.path.join(pro_dir, 'unique_sid.txt'))
n_tags = len(unique_sid)
train_data = load_train_data(os.path.join(pro_dir, 'train.csv'), n_tags)
vad_data_tr, vad_data_te = load_tr_te_data(
    os.path.join(pro_dir, 'validation_tr.csv'),
    os.path.join(pro_dir, 'validation_te.csv'), n_tags)
test_data_tr, test_data_te = load_tr_te_data(
    os.path.join(pro_dir, 'test_tr.csv'), os.path.join(pro_dir, 'test_te.csv'),
    n_tags)

# Row counts per split.
N = train_data.shape[0]
N_vad = vad_data_tr.shape[0]
N_test = test_data_tr.shape[0]

# NOTE(review): idxlist is a list (mutable, e.g. shuffleable) while the
# vad/test index lists stay as range objects — presumably never shuffled.
idxlist = list(range(N))
idxlist_vad = range(N_vad)
idxlist_test = range(N_test)
Example #26
0
    log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
    handler = StreamHandler()
    handler.setLevel('INFO')
    handler.setFormatter(log_fmt)
    logger.addHandler(handler)

    handler = FileHandler(DIR + 'train.py.log', 'a')
    handler.setLevel(DEBUG)
    handler.setFormatter(log_fmt)
    logger.setLevel(DEBUG)
    logger.addHandler(handler)

    logger.info('start')

    df_train0 = load_train_data()
    df_test0 = load_test_data()

    logger.info('concat train and test datasets: {} {}'.format(df_train0.shape, df_test0.shape))

    df_train0['train'] = 1
    df_test0['train'] = 0
    df = pd.concat([df_train0, df_test0], axis=0, sort=False)

    logger.info('Data preprocessing')

    # Drop PoolQC, MiscFeature, Alley and Fence features
    # because they have more than 80% of missing values.
    df = df.drop(['Alley','PoolQC','Fence','MiscFeature'],axis=1)

    object_columns_df = df.select_dtypes(include=['object'])
Example #27
0
        train_x.shape, train_y.shape))

    test_x = nan_train_x.drop(nan_column, axis=1)
    logger.info('create test data from nan_train_x:{}'.format(test_x.shape))

    lr = LinearRegression().fit(train_x, train_y)
    logger.info('lr fitted')

    test_y = lr.predict(test_x)
    logger.info('lr predicted:{}'.format(test_y.shape))
    test_x['Age'] = test_y
    logger.info('test_x.shape:{}  test_y.shape:{}'.format(
        test_x.shape, test_y.shape))
    df_x = pd.concat([test_x, non_nan_train_x])
    logger.info('df_temp.shape:{}  non_nan_train_x.shape:{}'.format(
        test_x.shape, non_nan_train_x.shape))

    return df_x


if __name__ == '__main__':
    # Impute missing 'Age' values: split rows into those with and without
    # Age, fit a regression on the complete rows (inside nan_data_predict),
    # and predict Age for the incomplete ones.
    logger.info('enter')
    train_x, train_y = load_train_data()
    nan_train_x, non_nan_train_x = load_data_nan(train_x, 'Age')
    logger.info('load_data_nan loaded')
    logger.info('nan_train_x.shape:{}'.format(nan_train_x.shape))
    logger.info('non_nan_train_x.shape:{}'.format(non_nan_train_x.shape))
    df_x = nan_data_predict(nan_train_x, non_nan_train_x, 'Age')
    logger.info('result:{}'.format(df_x.shape))
    logger.info('end')
def train(argv=None):
    """Build and train a multi-branch CNN sentence classifier (legacy TF API).

    Loads the pre-embedded train/test sentence matrices from ``load_data``,
    reshuffles them with a fixed seed and re-splits (first 1000 shuffled
    examples become the test set).  The graph is one convolution +
    1-max-pooling branch per filter size, concatenated, followed by dropout
    and a single fully connected softmax layer.  The training loop logs
    TensorBoard summaries and decays the learning rate in place when the
    evaluation loss stops improving.
    """
    # load data
    print("Loading data ... ")
    x_train, y_train = load_data.load_train_data()
    x_test, y_test = load_data.load_test_data()

    # Concatenate both original splits and reshuffle deterministically so the
    # new train/test partition below is reproducible across runs.
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # Re-split: hold out the first 1000 shuffled examples as the test set.
    x_train = x_shuffled[1000:]
    y_train = y_shuffled[1000:]
    x_test = x_shuffled[:1000]
    y_test = y_shuffled[:1000]

    print(x_train.shape)
    print(x_test.shape)

    # expand (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE) to
    # (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE,1): one input channel.
    x_train = numpy.expand_dims(x_train, -1)
    x_test = numpy.expand_dims(x_test, -1)

    # One convolution branch per n-gram width; branch idx yields
    # filter_numbers[idx] feature maps.
    filter_sizes = [2, 3, 4, 5]
    filter_numbers = [300, 200, 100, 50]

    # Graph inputs: the sentence tensor, its one-hot labels, and the dropout
    # keep probability (fed as 1.0 at evaluation time).
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(None, max_document_length,
                                            EMBEDDING_SIZE, NUM_CHANNELS))

    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))

    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # Fully connected softmax layer mapping the concatenated pooled features
    # (one slab of filter_numbers[i] values per branch) to class scores.
    fc1_weights = tf.Variable(
        tf.truncated_normal([sum(filter_numbers), NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))

    fc1_biases = tf.Variable(
        tf.constant(0.1, shape=[NUM_CLASSES], dtype=tf.float32))

    # model
    def model(data):
        """Conv -> 1-max-pool per filter size, concat, dropout, FC layer."""
        pooled_outputs = []
        for idx, filter_size in enumerate(filter_sizes):
            # BUG FIX: convolve the function argument `data` instead of always
            # reading the outer placeholder `train_data_node`; the parameter
            # was silently ignored before.  Behavior is unchanged for the
            # single call site, which passes train_data_node.
            conv = conv2d(data,
                          filter_numbers[idx],
                          filter_size,
                          EMBEDDING_SIZE,
                          name="kernel%d" % idx)
            # 1-max pooling,leave a tensor of shape[batch_size,1,1,num_filters]
            pool = tf.nn.max_pool(
                conv,
                ksize=[1, max_document_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID')
            # NOTE(review): tf.squeeze drops *every* size-1 dimension, so a
            # batch of exactly one example would also lose its batch axis --
            # confirm batches are always larger than 1.
            pooled_outputs.append(tf.squeeze(pool))

        if len(filter_sizes) > 1:
            cnn_output = tf.concat(1, pooled_outputs)
        else:
            cnn_output = pooled_outputs[0]

        # add dropout
        reshape = tf.nn.dropout(cnn_output, dropout_keep_prob)
        # fc1 layer
        fc1_output = tf.matmul(reshape, fc1_weights) + fc1_biases
        return fc1_output

    # Training computation
    logits = model(train_data_node)
    # BUG FIX: feed the raw logits to softmax_cross_entropy_with_logits.  The
    # op applies softmax internally and expects unbounded scores; the previous
    # tf.clip_by_value(logits, 1e-10, 1.0) squashed every score into
    # [1e-10, 1], flattening the loss surface and destroying the gradients.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits, train_labels_node))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases))
    loss += 0.05 * regularizers

    tf.scalar_summary('loss', loss)

    # Optimizer: Adam driven by a mutable learning-rate Variable so that
    # dev_step below can decay it in place when the evaluation loss plateaus.
    global_step = tf.Variable(0, name="global_step", trainable=False)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    # learning_rate=tf.train.exponential_decay(start_learning_rate,global_step*BATCH_SIZE,train_size,0.9,staircase=True)

    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # Evaluate model: predicted vs. true class indices and batch accuracy.
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        """Print accuracy/recall/F1 under macro, micro and weighted averaging."""
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro (this comment previously said "macro" by mistake)
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))

        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        """Evaluate on (x_batch, y_batch) without applying gradients.

        Logs summaries/metrics and, past step 100 and near each check
        boundary, decays the learning rate whenever the evaluation loss
        failed to improve by at least decay_delta.  Returns the updated
        best loss.
        """
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0  # no dropout at evaluation time
        }
        # Run the graph and fetch some of the nodes.
        # test dont apply train_op (train_op is update gradient).
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g} ,acc {:g}".format(
            time_str, step, losses, lr, acc))
        # compute index
        compute_index(y_label, y_predict)

        new_best_test_loss = best_test_loss
        # Decide if the learning rate needs decaying: only within the first
        # 100 steps after each check boundary, and never before step 100.
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print(
                    'validation loss did not improve enough, decay learning rate'
                )
                current_learning_rate = min_learning_rate if lr * learning_rate_decay < min_learning_rate else lr * learning_rate_decay
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update the best loss seen so far
                new_best_test_loss = losses

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # Generate shuffled (x, y) batches over NUM_EPOCHS epochs.
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # Training loop.  Every EVAL_FREQUENCY-th batch is an evaluation
        # step; every META_FREQUENCY-th (mod 99) batch additionally records
        # full trace metadata; the rest are plain training steps.
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes, tracing the
                    # full execution so TensorBoard can show per-op timings.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g},acc {:g}".format(
                        time_str, step, losses, acc))
                else:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes.
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))

        train_writer.close()
        test_writer.close()
Example #29
0
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xg

from load_data import load_test_data, load_train_data
from data_cleaning import clean_data

# Load the cleaned training set.
data = clean_data(load_train_data())

# Stratified 70/30 train/test split on the 'Survived' label.
train, test = train_test_split(
    data, test_size=0.3, random_state=0, stratify=data['Survived'])

# First column holds the label; the remaining columns are features.
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X = test[test.columns[1:]]
test_Y = test[test.columns[:1]]
X = data[data.columns[1:]]
Y = data['Survived']

# Radial-basis-function SVM baseline.
model = svm.SVC(kernel='rbf', C=1, gamma=0.1)
model.fit(train_X, train_Y)
prediction1 = model.predict(test_X)
print('Accuracy for rbf SVM is ', metrics.accuracy_score(prediction1, test_Y))
Example #30
0
    min_params = xgb_gs(xg_params, xg_trn, trn_y, xg_val, val_y, wl=watchlist)

    model = xgb.train(min_params,
                      xg_trn,
                      num_boost_round=5000,
                      evals=watchlist,
                      early_stopping_rounds=100,
                      verbose_eval=50)

    return model


if __name__ == '__main__':
    logger.info('Start')

    train_df = load_train_data(nrows=100)
    logger.info('train load end {}'.format(train_df.shape))

    test_df = load_test_data(nrows=100)
    logger.info('test load end {}'.format(test_df.shape))

    # Labels
    train_y = train_df["deal_probability"].values
    test_id = test_df["item_id"].values

    # Feature Weekday
    train_df["activation_weekday"] = train_df["activation_date"].dt.weekday
    test_df["activation_weekday"] = test_df["activation_date"].dt.weekday

    # Label encode the categorical variables
    cat_vars = [
Example #31
0
# Keyword arguments forwarded to the TF-IDF vectorizer; the tunable values
# come from the module-level `parameters` dict.
vectorizer_param = {
    'preprocessor': preprocessor,
    'ngram_range': parameters['ngram_range'],
    'analyzer': 'word',
    'min_df': parameters['min_df'],
    'max_df': parameters['max_df'],
    'binary': parameters['TF_binary'],
    'norm': parameters['norm'],
    'sublinear_tf': parameters['sublinear_tf'],
    'max_features': parameters['max_features'],
}

if __name__ == "__main__":
    # Instantiate every candidate feature extractor; only one combination is
    # wired into the FeatureUnion below.
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()

    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion(
        [('unigram', unigram), ('avg_strength', avg_strength)])
    # Earlier experiments paired the unigram features with the 'strength',
    # 'anew' or punctuation ('pct') extractors instead of 'avg_strength'.

    # Fit on the training texts, then reuse the fitted union on the test set.
    corpus, _ = load_train_data('Sentiment140')
    transformed_train = combined_features.fit_transform(corpus)

    held_out, _ = load_test_data()
    transformed_test = combined_features.transform(held_out)

    # Persist the feature names and both transformed matrices for later stages.
    dump_picle(combined_features.get_feature_names(), './data/features/feature_names.p')
    dump_picle(transformed_train, "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test, "./data/transformed_data/transformed_test.p")
Example #32
0
    df_tmp['feat_i'] = df_tmp['feat_i'] / df_tmp['feat_i'].sum()
    
    # for i in range(len(df_tmp.index)):
    for i in range(15):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
                            df_tmp.ix[i, 0], df_tmp.ix[i, 1]))
    return model

# Script entry point: load the row-limited (module-level ROW) train and test
# frames.  The commented-out blocks are earlier data-source experiments kept
# for reference.
if __name__ == '__main__':
    logger.info('Start')

    # Earlier variant: merge an external city-population table into train.
    # temp1_df = load_train_data(nrows=ROW)
    # temp2_df = pd.read_csv('../input/city_population_wiki_v3.csv')
    # train_df = pd.merge(temp1_df, temp2_df, on='city', how='left')
    # del temp1_df, temp2_df
    train_df = load_train_data(nrows=ROW)
    logger.info('Train Data load end {}'.format(train_df.shape))

    test_df = load_test_data(nrows=ROW)
    logger.info('test load end {}'.format(test_df.shape))

    # Alternative inputs that were tried: the "period" train/test files, and
    # the active-train file joined with previously saved predictions.
    # test_df = load_period_train_data(nrows=ROW)
    # logger.info('period train load end {}'.format(test_df.shape))

    # pr_test_df = load_period_test_data(nrows=ROW)
    # logger.info('period test load end {}'.format(pr_test_df.shape))

    # test_df = load_train_act_data(nrows=ROW)
    # tmp_df = pd.read_csv(TRN_PRED_FILE, index_col=['item_id'])
    # trn_act_df = load_train_act_data(nrows=ROW)
    # trn_act_df = trn_act_df.join(tmp_df, how='left')
Example #33
0
def train():
    df = load_train_data(
    )  # .sample(10000000, random_state=42).reset_index(drop=True)

    logger.info('train data size {}'.format(df.shape))
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)

    train, test = next(cv.split(df, df.is_attributed))

    x_train = df.drop(['is_attributed', 'click_id'], axis=1).astype(
        np.float32)  # .loc[train].reset_index(drop=True)
    y_train = df.is_attributed.astype(int)  # .values[train]

    df = load_valid_data(
    )  # .sample(x_train.shape[0], random_state=42).reset_index(drop=True)
    logger.info('valid data size {}'.format(df.shape))
    x_valid = df.drop(['is_attributed', 'click_id'], axis=1).astype(
        np.float32)  # .loc[test].reset_index(drop=True)
    y_valid = df.is_attributed.astype(int)  # .values[test]

    del df
    gc.collect()
    usecols = x_train.columns.values
    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)
    # {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_bin': 255, 'max_depth': -1, 'metric': 'auc', 'min_child_weight': 20, 'min_split_gain': 0, 'num_leaves': 127, 'objective': 'binary', 'reg_alpha': 0, 'scale_pos_weight': 1, 'seed': 114, 'subsample': 1.0, 'subsample_freq': 1, 'verbose': -1}
    all_params = {
        'min_child_weight': [20],
        'subsample': [1],
        'subsample_freq': [1],
        'seed': [114],
        'colsample_bytree': [0.9],
        'learning_rate': [0.1],
        'max_depth': [-1],
        'min_split_gain': [0],
        'reg_alpha': [0],
        'max_bin': [255],
        'num_leaves': [127],
        'objective': ['binary'],
        'metric': ['auc'],
        'scale_pos_weight': [1],
        'verbose': [-1],
        #'device': ['gpu'],
        'drop': [None] + list(range(0, len(usecols)))
    }
    use_score = 0
    min_score = (100, 100, 100)
    drop_cols = []
    import copy
    for params in tqdm(list(ParameterGrid(all_params))):
        cnt = -1
        list_score = []
        list_score2 = []
        list_best_iter = []
        all_pred = np.zeros(y_train.shape[0])
        if 1:
            cnt += 1
            trn_x = x_train.copy()
            val_x = x_valid.copy()
            trn_y = y_train
            val_y = y_valid

            _params = copy.deepcopy(params)
            drop_idx = _params.pop('drop')
            if drop_idx is not None:
                drop_col = drop_cols + [usecols[drop_idx]]
            else:
                drop_col = []
            params['drop'] = drop_col

            trn_x.drop(drop_col, axis=1, inplace=True)
            val_x.drop(drop_col, axis=1, inplace=True)
            cat_feat = CAT_FEAT
            cols = trn_x.columns.values.tolist()
            train_data = lgb.Dataset(
                trn_x.values.astype(np.float32),
                label=trn_y,
                # categorical_feature=cat_feat,
                feature_name=cols)
            test_data = lgb.Dataset(
                val_x.values.astype(np.float32),
                label=val_y,
                # categorical_feature=cat_feat,
                feature_name=cols)
            del trn_x
            gc.collect()

            clf = lgb.train(
                _params,
                train_data,
                10,  # params['n_estimators'],
                early_stopping_rounds=30,
                valid_sets=[test_data],
                # feval=cst_metric_xgb,
                # callbacks=[callback],
                verbose_eval=10)
            pred = clf.predict(val_x.values.astype(np.float32))

            # all_pred[test] = pred

            _score2 = log_loss(val_y, pred)
            _score = -roc_auc_score(val_y, pred)
            logger.info(f'drop: {drop_col}')
            logger.info('   _score: %s' % _score)
            logger.info('   _score2: %s' % _score2)

            list_score.append(_score)
            list_score2.append(_score2)
        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2),
                  np.max(list_score2))
        if min_score[use_score] > score[use_score]:
            min_score = score
            min_params = params
            drop_cols = drop_col
        logger.info('best score: {} {}'.format(min_score[use_score],
                                               min_score))
        logger.info('best params: {}'.format(min_params))