Example #1
    def run_experiment(self, exp_list):
        p = multiprocessing.Pool()
        for e in exp_list:
            nn = self.log_dir + '/' + e['exp_name'] + '_' + str(
                random.random()) + '.txt'
            self.log_files.append(nn)
            e['nn'] = nn

        result = p.map(self.csdnn_helper, exp_list)
        p.close()
        p.join()
        # Serial fallback, useful for debugging:
        # result = [self.csdnn_helper(ps) for ps in exp_list]
        eval_result = []
        predict_probs_test = []
        # Each worker returns ((train_result, test_result, p_dict), y_pred_score_test)
        for r in result:
            eval_result.append(r[0])
            predict_probs_test.append(r[1])

        # Ensemble prediction: sum per-model class probabilities, then argmax
        sum_probabilities = np.sum(predict_probs_test, axis=0)
        y_pred = np.argmax(sum_probabilities, axis=1)

        test_path = exp_list[0]['test_path']
        _, y = load_data_from_path(test_path)
        y = np.argmax(y.eval(), axis=1)
        TPR, TNR = get_tpr_tnr(y, y_pred)
        best_confusion_matrix = confusion_matrix(y, y_pred, labels=[0, 1])
        combine_log(path=self.log_dir + '/combine_logs.txt',
                    file_list=self.log_files)
        return eval_result, [TPR, TNR], best_confusion_matrix
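
A minimal sketch of the probability-summing ensemble step above, with hypothetical per-model score matrices standing in for the workers' outputs (summing is equivalent to averaging as far as the argmax is concerned):

import numpy as np

# Hypothetical class-probability matrices from two models, shape (n_samples, 2)
probs_a = np.array([[0.9, 0.1], [0.4, 0.6], [0.2, 0.8]])
probs_b = np.array([[0.7, 0.3], [0.3, 0.7], [0.1, 0.9]])

# Element-wise sum across models, then argmax over classes
sum_probabilities = np.sum([probs_a, probs_b], axis=0)
y_pred = np.argmax(sum_probabilities, axis=1)
print(y_pred)  # [0 1 1]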
Example #2
def compute_result_from_pretrain_model(main_model_path, train_path, test_path):
    train_result = []
    test_result = []
    model = load_model(main_model_path)
    train_set_x, train_set_y = load_data_from_path(train_path)
    test_set_x, test_set_y = load_data_from_path(test_path)
    # Evaluate on the training set
    ff_train = model.test_model([train_set_x, train_set_y])
    [y_pred, y_pred_score_train] = ff_train()
    y = np.argmax(train_set_y.eval(), axis=1)
    TPR, TNR = get_tpr_tnr(y, y_pred)
    AUC = metrics.roc_auc_score(y, y_pred_score_train[:, 1])
    train_result.append([TPR, TNR, AUC])

    # Evaluate on the test set
    ff_test = model.test_model([test_set_x, test_set_y])
    [y_pred, y_pred_score_test] = ff_test()
    y = np.argmax(test_set_y.eval(), axis=1)
    TPR, TNR = get_tpr_tnr(y, y_pred)
    AUC = metrics.roc_auc_score(y, y_pred_score_test[:, 1])
    test_result.append([TPR, TNR, AUC])
    return train_result, test_result
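
get_tpr_tnr is not defined in these snippets. Reconstructed from its call sites (binary labels [0, 1], returning a (TPR, TNR) pair), a plausible sketch is below; the actual helper may differ in detail.

from sklearn.metrics import confusion_matrix

def get_tpr_tnr(y_true, y_pred):
    # Sensitivity (TPR) and specificity (TNR) from a 2x2 confusion matrix.
    # Assumed signature, inferred from the examples above.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    tpr = float(tp) / (tp + fn) if (tp + fn) > 0 else 0.0
    tnr = float(tn) / (tn + fp) if (tn + fp) > 0 else 0.0
    return tpr, tnr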
Example #3
def test_SdA(finetune_lr=0.1,
             pretraining_epochs=1,
             pretrain_lr=0.001,
             training_epochs=1,
             batch_size=10,
             h=[10, 10],
             cl=[.1, .2],
             cost_vec=[1, 1.2],
             beta=30,
             logger=None):

    logger.info(
        'pre-epoch:%d\ntrain_epoch:%d\npre_lr:%lf\nfine_lr:%lf\nhidden_layer:%s\nCorruption level:%s\nCost_vec:%s\nBeta:%s'
        % (pretraining_epochs, training_epochs, pretrain_lr, finetune_lr, h,
           cl, cost_vec, beta))
    cost_vec = numpy.array(cost_vec, dtype="float32")
    train_result = []
    test_result = []
    auc_list = []
    test_lift = [[], []]
    hidden_l_size = h
    num_of_fold = 1
    for mm in range(num_of_fold):
        logger.info("Trial K=%d" % (mm + 1))
        logger.info('Load data from %s', data_dir)
        datasets = load_data_lift(mm + 1, data_dir)

        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]
        profit_train, profit_test = datasets[3]
        profit_train = get_cost_vector(profit_train, beta, cost_vec[1])
        datasets[3] = (profit_train, profit_test)
        # compute number of minibatches for training, validation and testing
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_train_batches //= batch_size

        # numpy random generator
        numpy_rng = numpy.random.RandomState(89677)
        logger.info('... building the model')
        # construct the stacked denoising autoencoder class
        sda = SdA(
            numpy_rng=numpy_rng,
            n_ins=train_set_x.eval().shape[1],
            hidden_layers_sizes=hidden_l_size,
            n_outs=2,
            costVec=cost_vec,
        )
        corruption_levels = cl

        #########################
        # PRETRAINING THE MODEL #
        #########################
        logger.info('... getting the pretraining functions')
        pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,
                                                    batch_size=batch_size)

        logger.info('... pre-training the model')

        ## Pre-train layer-wise

        for i in range(sda.n_layers):
            # go through pretraining epochs
            for epoch in range(pretraining_epochs):
                # go through the training set
                c = []
                for batch_index in range(n_train_batches):
                    c.append(pretraining_fns[i](
                        index=batch_index,
                        corruption=corruption_levels[i],
                        lr=pretrain_lr))
                logger.info('Pre-training layer %i, epoch %d, cost %f' %
                            (i, epoch, numpy.mean(c, dtype='float64')))

        ########################
        # FINETUNING THE MODEL #
        ########################

        # get the training, validation and testing function for the model

        logger.info('... getting the finetuning functions')
        train_fn, validate_model, test_model = sda.build_finetune_functions(
            datasets=datasets,
            batch_size=batch_size,
            learning_rate=finetune_lr,
        )

        logger.info('... fine-tuning the model')
        # early-stopping parameters
        patience = 10 * n_train_batches  # look at this many examples regardless
        patience_increase = 2.  # wait this much longer when a new best is found
        improvement_threshold = 0.995  # a relative improvement of this much is
        # considered significant
        validation_frequency = min(n_train_batches, patience // 2)
        # go through this many minibatches before checking the network on the
        # validation set; in this case we check every epoch

        best_validation_loss = numpy.inf
        best_iter = 0
        test_score = 0.
        # Defaults in case validation never improves before patience runs out
        best_train = [0, 0, 0]
        best_test = [0, 0, 0]
        best_response_lift = []
        best_profit_lift = []

        done_looping = False
        epoch = 0

        while (epoch < training_epochs) and (not done_looping):
            epoch += 1
            for minibatch_index in range(n_train_batches):
                minibatch_avg_cost = train_fn(minibatch_index)
                iter = (epoch - 1) * n_train_batches + minibatch_index

                if (iter + 1) % validation_frequency == 0:
                    validation_losses = validate_model()
                    ff_test = sda.test_model(datasets[1])
                    [y_pred, y_pred_score] = ff_test()
                    y = np.argmax(datasets[1][1].eval(), axis=1)
                    TPR, TNR = get_tpr_tnr(y, y_pred)
                    # alternative criterion: this_validation_loss = numpy.abs(TPR - TNR)
                    this_validation_loss = numpy.mean(validation_losses,
                                                      dtype='float64')
                    logger.info(
                        'epoch %i, minibatch %i/%i, validation partial error %f %%'
                        % (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))

                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:

                        # improve patience if loss improvement is good enough
                        if (this_validation_loss <
                                best_validation_loss * improvement_threshold):
                            patience = max(patience, iter * patience_increase)

                        # save best validation score and iteration number
                        best_validation_loss = this_validation_loss
                        best_iter = iter

                        # test it on the test set
                        test_losses = test_model()
                        test_score = numpy.mean(test_losses, dtype='float64')
                        logger.info(
                            ('     epoch %i, minibatch %i/%i, test error of '
                             'best model %f %%') %
                            (epoch, minibatch_index + 1, n_train_batches,
                             test_score * 100.))
                        logger.info("========new best with testset=========")
                        ff_test = sda.test_model(datasets[2])
                        [y_pred, y_pred_score] = ff_test()
                        y = np.argmax(datasets[2][1].eval(), axis=1)

                        response_lift, profit_lift = evaluate_decile(
                            prob=y_pred_score,
                            label=y,
                            actual_profit=profit_test,
                            isPlot=True)
                        logger.info('response_lift:%s', response_lift)

                        logger.info('profit lift:%s', profit_lift)
                        best_confusion_matrix = confusion_matrix(y,
                                                                 y_pred,
                                                                 labels=[0, 1])
                        logger.info(best_confusion_matrix)
                        best_test = [0, 0, 0]
                        TPR, TNR = get_tpr_tnr(y, y_pred)
                        best_test[0] = TPR
                        best_test[1] = TNR
                        best_test[2] = metrics.roc_auc_score(
                            y, y_pred_score[:, 1])
                        best_response_lift = response_lift
                        best_profit_lift = profit_lift
                        logger.info('AUC=%s', best_test[2])
                        logger.info('TPR=%s', TPR)
                        logger.info('TNR=%s', TNR)

                        # Evaluate the new best model on the training set
                        ff_test = sda.test_model(datasets[0])
                        [y_pred, y_pred_score] = ff_test()
                        y = np.argmax(datasets[0][1].eval(), axis=1)
                        best_train = [0, 0, 0]
                        TPR, TNR = get_tpr_tnr(y, y_pred)
                        best_train[0] = TPR
                        best_train[1] = TNR
                        best_train[2] = metrics.roc_auc_score(
                            y, y_pred_score[:, 1])

                if patience <= iter:
                    done_looping = True
                    break

        # Report the best results for this fold

        logger.info("========Test with training set=========")
        logger.info('AUC=%s', best_train[2])
        logger.info('TPR=%s', best_train[0])
        logger.info('TNR=%s', best_train[1])

        train_result.append(best_train)
        # evaluate
        logger.info("========Test with test set=========")
        logger.info('Response_lift=%s', best_response_lift)
        logger.info('Profit_lift=%s', best_profit_lift)
        logger.info('AUC=%s', best_test[2])
        logger.info('TPR=%s', best_test[0])
        logger.info('TNR=%s', best_test[1])

        test_result.append(best_test)
        test_lift[0].append(best_response_lift)
        test_lift[1].append(best_profit_lift)
        logger.info(
            ('Optimization complete with best validation score of %f %%, '
             'on iteration %i, '
             'with test performance %f %%') %
            (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    logger.info(
        'pre-epoch:%d\ntrain_epoch:%d\npre_lr:%lf\nfine_lr:%lf\nhidden_layer:%s\nCorruption level:%s\nCost_vec:%s\nBeta:%s'
        % (pretraining_epochs, training_epochs, pretrain_lr, finetune_lr,
           hidden_l_size, cl, cost_vec, beta))
    logger.info("====Overall: trainset====")
    for p, n, a in train_result:
        logger.info("%f\t%f\t%f" % (p, n, a))
    mean = np.mean(train_result, axis=0)
    logger.info('Mean=%f\t%f\t%f' % (mean[0], mean[1], mean[2]))
    logger.info("====Overall: testset====")
    for p, n, a in test_result:
        logger.info("%f\t%f\t%f" % (p, n, a))

    mean = np.mean(test_result, axis=0)

    logger.info('Mean=%f\t%f\t%f' % (mean[0], mean[1], mean[2]))
    mean_response_lift = np.mean(test_lift[0], axis=0)
    mean_profit_lift = np.mean(test_lift[1], axis=0)
    logger.info('Mean Response lift=%s', mean_response_lift)
    logger.info('Mean Profit lift=%s', mean_profit_lift)
    # logger.info("====Overall: AUC for testset====")
    # for a in auc_list:
    #     logger.info("%f" % a)
    #train_all.append(train_result)
    # test_all.append(test_result)
    #  test_lift_all.append((test_lift[0], test_lift[1]))
    return train_result, test_result, test_lift
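
The patience-based early stopping used in both training loops follows the classic Theano deep-learning-tutorials pattern. A stripped-down, runnable sketch of just that control flow, with a hypothetical validate() stub standing in for validate_model():

import numpy as np

def early_stopping_demo(n_train_batches=50, training_epochs=20, seed=0):
    rng = np.random.RandomState(seed)
    validate = lambda: rng.uniform(0.1, 0.5)  # stand-in for validate_model()

    patience = 10 * n_train_batches  # minimum number of iterations to run
    patience_increase = 2.0          # extend patience on a clear improvement
    improvement_threshold = 0.995    # relative improvement counted as significant
    validation_frequency = min(n_train_batches, patience // 2)
    best_validation_loss = np.inf
    epoch, done_looping = 0, False

    while epoch < training_epochs and not done_looping:
        epoch += 1
        for minibatch_index in range(n_train_batches):
            it = (epoch - 1) * n_train_batches + minibatch_index
            if (it + 1) % validation_frequency == 0:
                loss = validate()
                if loss < best_validation_loss:
                    if loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, it * patience_increase)
                    best_validation_loss = loss
            if patience <= it:  # patience exhausted: stop training
                done_looping = True
                break
    return best_validation_loss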
Example #4
def csdnn_classifier(log_dir, logger, p_dict):

    finetune_lr = p_dict['finetune_lr']
    pretraining_epochs = p_dict['pretraining_epochs']
    pretrain_lr = p_dict['pretrain_lr']
    training_epochs = p_dict['training_epochs']
    batch_size = p_dict['batch_size']
    h = p_dict['h']
    cl = p_dict['cl']
    cost_vec = p_dict['cost_vec']
    beta = p_dict['beta']
    reg_coef = p_dict['reg_coef']
    pretrain_batchsize = p_dict['pretrain_batchsize']
    drop_ps = p_dict['drop_ps']
    exp_name = p_dict['exp_name']
    train_path = p_dict['train_path']
    test_path = p_dict['test_path']
    valid_path = p_dict['valid_path']
    logger.info(
        'pre-epoch:%d\ntrain_epoch:%d\npre_lr:%lf\nfine_lr:%lf\nhidden_layer:%s\nCorruption level:%s\nCost_vec:%s\nBeta:%s'
        % (pretraining_epochs, training_epochs, pretrain_lr, finetune_lr, h,
           cl, cost_vec, beta))
    cost_vec = np.array(cost_vec, dtype="float32")
    train_result = []
    test_result = []
    hidden_l_size = h
    logger.info('Load data from %s', train_path)
    train_set_x, train_set_y = load_data_from_path(train_path)
    test_set_x, test_set_y = load_data_from_path(test_path)
    valid_set_x, valid_set_y = load_data_from_path(valid_path)
    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_pretrain_batches = int(n_train_batches / pretrain_batchsize)
    n_train_batches //= batch_size

    # numpy random generator
    numpy_rng = np.random.RandomState(89677)
    logger.info('... building the model')
    # construct the stacked denoising autoencoder class
    sda = SdA_Ensemble(numpy_rng=numpy_rng,
                       n_ins=train_set_x.eval().shape[1],
                       hidden_layers_sizes=hidden_l_size,
                       n_outs=2,
                       costVec=cost_vec,
                       reg_coef=reg_coef,
                       drop_ps=drop_ps)
    corruption_levels = cl

    #########################
    # PRETRAINING THE MODEL #
    #########################
    if pretraining_epochs > 0:
        logger.info('... getting the pretraining functions')
        pretraining_fns = sda.pretraining_functions(
            train_set_x=train_set_x, batch_size=pretrain_batchsize)
        logger.info('... pre-training the model')

    ## Pre-train layer-wise

    for i in range(sda.n_layers):
        # go through pretraining epochs
        for epoch in range(pretraining_epochs):
            # go through the training set
            c = []
            for batch_index in range(n_pretrain_batches):
                c.append(pretraining_fns[i](index=batch_index,
                                            corruption=corruption_levels[i],
                                            lr=pretrain_lr))
            logger.info('Pre-training layer %i, epoch %d, cost %f' %
                        (i, epoch, np.mean(c, dtype='float64')))

    ########################
    # FINETUNING THE MODEL #
    ########################

    # get the training, validation and testing function for the model

    logger.info('... getting the finetuning functions')
    train_fn, validate_model, test_model = sda.build_finetune_functions(
        datasets=[(train_set_x, train_set_y), (valid_set_x, valid_set_y),
                  (test_set_x, test_set_y)],
        batch_size=batch_size,
        learning_rate=finetune_lr,
    )

    logger.info('... fine-tuning the model')
    # early-stopping parameters
    patience = 10 * n_train_batches  # look at this many examples regardless
    patience_increase = 3  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    logger.info('patience:%s, validation_freq:%s', patience,
                validation_frequency)
    best_validation_loss = np.inf
    test_score = 0.

    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch += 1
        for minibatch_index in range(n_train_batches):
            minibatch_avg_cost = train_fn(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                validation_losses = validate_model()
                ff_test = sda.test_model([valid_set_x, valid_set_y])
                [y_pred, y_pred_score] = ff_test()
                y = np.argmax(valid_set_y.eval(), axis=1)
                # Validation criterion: 1 - AUC on the validation set
                # (alternatives tried: |TPR - TNR|, mean validation loss)
                this_validation_loss = 1 - metrics.roc_auc_score(
                    y, y_pred_score[:, 1])
                logger.info(
                    'epoch %i, minibatch %i/%i, validation partial error %f %%'
                    % (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        new_patience = max(patience, iter * patience_increase)
                        logger.info('increase patience from %s to %s',
                                    patience, new_patience)
                        patience = new_patience
                    # Save model
                    logger.info("Save best model by pickle")
                    model_name = log_dir + '/tmp_model/' + exp_name + '_'
                    with open(model_name, 'wb') as save_file:
                        cPickle.dump(sda, save_file)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    logger.info("========new best with testset=========")
                    ff_test = sda.test_model([test_set_x, test_set_y])
                    [y_pred, y_pred_score] = ff_test()
                    y = np.argmax(test_set_y.eval(), axis=1)

                    best_confusion_matrix = confusion_matrix(y,
                                                             y_pred,
                                                             labels=[0, 1])
                    logger.info(best_confusion_matrix)
                    TPR, TNR = get_tpr_tnr(y, y_pred)
                    AUC = metrics.roc_auc_score(y, y_pred_score[:, 1])
                    logger.info('AUC=%s', AUC)
                    logger.info('TPR=%s', TPR)
                    logger.info('TNR=%s', TNR)

            if patience <= iter:
                done_looping = True
                break
    # load best model
    model = load_model(model_name)
    logger.info("========Test with training set=========")
    ff_test = model.test_model([train_set_x, train_set_y])
    [y_pred, y_pred_score_train] = ff_test()
    y = np.argmax(train_set_y.eval(), axis=1)
    TPR, TNR = get_tpr_tnr(y, y_pred)
    AUC = metrics.roc_auc_score(y, y_pred_score_train[:, 1])
    logger.info('AUC=%s', AUC)
    logger.info('TPR=%s', TPR)
    logger.info('TNR=%s', TNR)
    train_result.append([TPR, TNR, AUC])

    # evaluate
    logger.info("========Test with test set=========")
    ff_test = model.test_model([test_set_x, test_set_y])
    [y_pred, y_pred_score_test] = ff_test()
    y = np.argmax(test_set_y.eval(), axis=1)
    TPR, TNR = get_tpr_tnr(y, y_pred)
    AUC = metrics.roc_auc_score(y, y_pred_score_test[:, 1])
    logger.info('AUC=%s', AUC)
    logger.info('TPR=%s', TPR)
    logger.info('TNR=%s', TNR)
    test_result.append([TPR, TNR, AUC])

    return train_result, test_result, model_name, y_pred_score_test
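
load_model is not shown either. Since csdnn_classifier saves the best model with cPickle.dump, a matching loader would presumably just unpickle the file; a sketch under that assumption:

try:
    import cPickle  # Python 2, as used in the snippet above
except ImportError:
    import pickle as cPickle  # Python 3 fallback

def load_model(model_path):
    # Assumed counterpart of the cPickle.dump call in csdnn_classifier.
    with open(model_path, 'rb') as f:
        return cPickle.load(f)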