Example #1
def test_fine_tune_all_nodes(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Chain composition
    chain = get_class_chain()

    # Before tuning prediction
    chain.fit(train_data, use_cache=False)
    before_tuning_predicted = chain.predict(test_data)

    # root node tuning
    chain.fine_tune_all_nodes(train_data,
                              max_lead_time=timedelta(minutes=1),
                              iterations=30)
    chain.fit_from_scratch(train_data)
    after_tun_root_node_predicted = chain.predict(test_data)

    bfr_tun_roc_auc = round(
        roc(y_true=test_data.target, y_score=before_tuning_predicted.predict),
        2)
    aft_tun_roc_auc = round(
        roc(y_true=test_data.target,
            y_score=after_tun_root_node_predicted.predict), 2)

    print(f'Before tune test {bfr_tun_roc_auc}')
    print(f'After tune test {aft_tun_roc_auc}', '\n')

    assert aft_tun_roc_auc >= bfr_tun_roc_auc
Example #2
def test_tune_certain_node_with_tune_class_correctly(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    chain = create_four_depth_chain()
    chain.fit(train_data, use_cache=False)
    before_tuning_predicted = chain.predict(test_data)

    model_id_to_tune = 4

    tuned_chain = Tune(chain).fine_tune_certain_node(
        model_id=model_id_to_tune,
        input_data=train_data,
        max_lead_time=timedelta(minutes=1),
        iterations=30)

    tuned_chain.fit_from_scratch(train_data)
    after_tun_root_node_predicted = tuned_chain.predict(test_data)

    bfr_tun_roc_auc = round(
        roc(y_true=test_data.target, y_score=before_tuning_predicted.predict),
        1)
    aft_tun_roc_auc = round(
        roc(y_true=test_data.target,
            y_score=after_tun_root_node_predicted.predict), 1)

    print(f'Before tune test {bfr_tun_roc_auc}')
    print(f'After tune test {aft_tun_roc_auc}', '\n')

    assert aft_tun_roc_auc >= bfr_tun_roc_auc
Example #3
def roc_auc(y_true, y_score, pos_label=None, ascending_score=True):
    """Computes ROC AUC score

    Parameters
    ----------
        y_true : array, shape=[n_samples]
            True binary labels, in range {0,1} or {-1,1}. If positive label is different than 1, it must be explicitly defined.

        y_score : array, shape=[n_samples]
            Scores for tested series of samples

        pos_label: int
            Positive label of samples (if other than 1)

        ascending_score: bool (default=True)
            Indicates if the score is ascending. An ascending score increases with decreasing activity; in other words, it ascends the ranking list (where actives are on top).

    Returns
    -------
        roc_auc : float
            ROC AUC in range 0:1
    """
    if ascending_score:
        y_score = -y_score
    fpr, tpr, thresholds = roc(y_true, y_score, pos_label=pos_label)
    return auc(fpr, tpr)  # `reorder` omitted; it was removed from sklearn.metrics.auc
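
# A minimal usage sketch for roc_auc (added, not part of the original example). It assumes, as the
# other snippets here do, that roc is sklearn.metrics.roc_curve and auc is sklearn.metrics.auc;
# the data is purely illustrative.
import numpy as np
from sklearn.metrics import roc_curve as roc, auc

y_true = np.array([1, 1, 1, 0, 0, 0])
y_score = np.array([0.1, 0.2, 0.4, 0.7, 0.8, 0.9])  # ascending: lower score = more active
print(roc_auc(y_true, y_score, ascending_score=True))  # 1.0 for this perfect ranking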
Example #4
def roc_log_auc(y_true, y_score, pos_label=None, log_min=0.001, log_max=1.):
    """Computes area under semi-log ROC for random distribution.
    
    Parameters
    ----------
        y_true : array, shape=[n_samples]
            True binary labels, in range {0,1} or {-1,1}. If positive label is different than 1, it must be explicitly defined.
        
        y_score : array, shape=[n_samples]
            Scores for tested series of samples
        
        pos_label: int
            Positive label of samples (if other than 1)
        
        log_min : float (default=0.001)
            Minimum logarithm value for estimating AUC
        
        log_max : float (default=1.)
            Maximum logarithm value for estimating AUC.
        
    Returns
    -------
        auc : float
            semi-log ROC AUC
    """
    fpr, tpr, t = roc(y_true, y_score, pos_label=pos_label)
    idx = (fpr >= log_min) & (fpr <= log_max)
    log_fpr = 1-np.log10(fpr[idx])/np.log10(log_min)
    return auc(log_fpr, tpr[idx])
Example #6
def roc_log_auc(y_true, y_score, pos_label=None, ascending_score=True, log_min=0.001, log_max=1.):
    """Computes area under semi-log ROC for random distribution.

    Parameters
    ----------
        y_true : array, shape=[n_samples]
            True binary labels, in range {0,1} or {-1,1}. If positive label is different than 1, it must be explicitly defined.

        y_score : array, shape=[n_samples]
            Scores for tested series of samples

        pos_label: int
            Positive label of samples (if other than 1)

        ascending_score: bool (default=True)
            Indicates if the score is ascending. An ascending score increases with decreasing activity; in other words, it ascends the ranking list (where actives are on top).

        log_min : float (default=0.001)
            Minimum logarithm value for estimating AUC

        log_max : float (default=1.)
            Maximum logarithm value for estimating AUC.

    Returns
    -------
        auc : float
            semi-log ROC AUC
    """
    if ascending_score:
        y_score = -y_score
    fpr, tpr, t = roc(y_true, y_score, pos_label=pos_label)
    idx = (fpr >= log_min) & (fpr <= log_max)
    log_fpr = 1 - np.log10(fpr[idx]) / np.log10(log_min)
    return auc(log_fpr, tpr[idx])  # `reorder` omitted; it was removed from sklearn.metrics.auc
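
# A hedged usage sketch for the semi-log variant above (added, not part of the original example).
# The transform log_fpr = 1 - log10(fpr)/log10(log_min) maps fpr = log_min to 0 and fpr = 1 to 1,
# so the area is computed on a log-scaled FPR axis. Assumes roc = sklearn.metrics.roc_curve and
# auc = sklearn.metrics.auc; the data is illustrative.
import numpy as np
from sklearn.metrics import roc_curve as roc, auc

y_true = np.array([1, 0, 1, 0, 1, 0, 0, 0])
y_score = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])  # lower score = more active
print(roc_log_auc(y_true, y_score, ascending_score=True))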
Example #7
def auc_plot_multi(XY,
                   flag="",
                   file="Noname",
                   cutoff=None,
                   show=None,
                   linestyle='rx-.',
                   include_baseline=True,
                   equal_aspect=True):
    """ Method that generates a plot of the ROC curve
             Parameters:
                 title: Title of the chart
                 include_baseline: Add the baseline plot line if it's True
                 equal_aspect: Aspects to be equal for all plot
         """
    import pylab
    from matplotlib import pyplot
    pyplot.figure()
    pylab.clf()
    color_list = ["b", "g", "r", "c", "m", "y", "k", "w"]
    colorid = 0
    for xy in XY:
        if colorid >= len(color_list):
            colorid = 0

        x, y = xy[:, 0], xy[:, 1]
        fpr, tpr, thresholds_roc = roc(x, y)
        aucvalue = round(auc(fpr, tpr), 3)

        pylab.plot([x1 for x1 in np.hstack((0, fpr))],
                   [y1 for y1 in np.hstack((0, tpr))],
                   color=color_list[colorid],
                   linewidth=1.0)
        #   pylab.plot([x1 for x1 in precision], [y1 for y1 in recall],color='blue',linewidth=2.0)
        if include_baseline:
            pylab.plot([0.0, 1.0], [0.0, 1.0], 'k--')
        pylab.ylim((0, 1))
        pylab.xlim((0, 1))
        pylab.xticks(pylab.arange(0, 1.1, .1), fontsize=10)
        pylab.yticks(pylab.arange(0, 1.1, .1), fontsize=10)

        #pylab.grid(False,alpha=0.5)
        if equal_aspect:
            cax = pylab.gca()
            cax.set_aspect('equal')
        #pylab.xlabel('1 - Specificity(red)/Precision(blue)')
        pylab.xlabel('1 - Specificity', fontsize=10)
        pylab.ylabel('Sensitivity', fontsize=10)

        pylab.title(flag + ' AUC=' + "%4.3f" % aucvalue + ' N=' +
                    '%1.0f' % len(x))
        colorid += 1
    ax = pylab.gca()
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.get_yaxis().set_tick_params(direction='out')
    ax.get_xaxis().set_tick_params(direction='out')
    pyplot.savefig(file + '_' + flag + '_aucplot.pdf', format='pdf')
    pylab.show()
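
# A hedged usage sketch for auc_plot_multi (added, not part of the original example). It assumes
# roc = sklearn.metrics.roc_curve and auc = sklearn.metrics.auc, uses synthetic data, and writes
# demo_test_aucplot.pdf to the working directory before showing the figure.
import numpy as np
from sklearn.metrics import roc_curve as roc, auc

rng = np.random.default_rng(0)
labels = rng.integers(0, 2, size=200)                     # column 0: true binary labels
scores = labels * 0.5 + rng.normal(scale=0.5, size=200)   # column 1: noisy scores
XY = [np.column_stack([labels, scores])]                  # one ROC curve per array in XY
auc_plot_multi(XY, flag="test", file="demo")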
Example #8
def all_stats(labels, scores, cutoff=None):
    if np.unique(labels).shape[0] > 1:
        #print np.unique(labels)
        if np.unique(labels).shape[0] == 2:
            #print len(np.unique(labels))
            fpr, tpr, thresholds_roc = roc(labels, scores)
            precision, recall, thresholds = precision_recall_curve(
                labels, scores)
            precision[np.where(precision == 0)] = 0.000000001
            recall[np.where(recall == 0)] = 0.000000001
            if len(thresholds) > 1:
                F_score = 2 * (precision * recall) / (precision + recall)
                try:
                    if cutoff is None:
                        #cutoff=round(thresholds_roc[np.where(abs(tpr-0.95)==min((abs(tpr-0.95))))][0],5)
                        #print "Calculation cutoff of maximum F-score"
                        cutoff = round(
                            thresholds[np.where(F_score == max(F_score))][0],
                            5)
                    else:
                        print("Using cutoff from previous calculations",
                              cutoff)
                    aucvalue = round(auc(fpr, tpr), 3)
                    cutoff_id = np.where(
                        abs(thresholds_roc -
                            cutoff) == min(abs(thresholds_roc - cutoff)))
                    cutoff_pre_id = np.where(
                        abs(thresholds -
                            cutoff) == min(abs(thresholds - cutoff)))
                    TPR = round(tpr[cutoff_id][0], 5)
                    FPR = round(fpr[cutoff_id][0], 5)
                    PRE = round(precision[cutoff_pre_id][0], 5)
                    stats = aucvalue, TPR, 1 - FPR, len(
                        labels), PRE, cutoff, max(F_score)
                except:
                    stats = float('NaN'), float('NaN'), float('NaN'), len(
                        labels), float('NaN'), float('NaN')
            else:
                stats = float('NaN'), float('NaN'), float('NaN'), len(
                    labels), float('NaN'), float('NaN')
        else:
            gradient, intercept, r_value, p_value, std_err = linregress(
                labels, scores)
            std_err = np.std((labels - scores))
            stats = r_value**2, std_err, gradient, len(labels), p_value, float(
                'NaN')

    else:
        stats = [
            float('NaN'),
            float('NaN'),
            float('NaN'),
            len(labels),
            float('NaN'),
            float('NaN')
        ]
    return np.array(stats)
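
# A hedged usage sketch for all_stats (added, not part of the original example). It assumes the
# imports used implicitly above: roc = sklearn.metrics.roc_curve, auc, precision_recall_curve and
# scipy.stats.linregress; the data is synthetic.
import numpy as np
from scipy.stats import linregress
from sklearn.metrics import roc_curve as roc, auc, precision_recall_curve

rng = np.random.default_rng(1)
labels = rng.integers(0, 2, size=100)
scores = labels + rng.normal(scale=0.8, size=100)
# For binary labels the return order is: AUC, TPR, 1-FPR, n, precision, cutoff, max F-score.
print(all_stats(labels, scores))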
Example #9
def self_roc(X, y):
    # sklearn.cross_validation has been removed; train_test_split now lives in model_selection
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve as roc
    # gs3 is assumed to be a fitted classifier defined elsewhere in the original script
    fpr, tpr, thresholds = roc(y_test, gs3.predict_proba(X_test)[:, 1])
    plt.plot(fpr, tpr)
    plt.plot([0, 1])
Example #10
def calc_RH04_auc_train_test(rh03_only=False):
    current_dir = os.getcwd()
    os.chdir(os.path.join('..', 'init_models'))
    models = dict()
    for mod_type in ("ord", "enrich", "bin", "rf"):
        with open("%s_model" % mod_type, 'rb') as model_file:
            models[mod_type] = pickle.load(model_file)
    os.chdir(current_dir)

    model_names = {
        "ord": "ordinal regression",
        "bin": "classification neural network",
        "enrich": "enrichment neural network",
        "rf": "random forest classifier"
    }
    colors = ("red", "blue", "green", "orange")
    for i, mod_type in enumerate(("ord", "enrich", "bin", "rf")):
        current_data = load_data(model_type=mod_type,
                                 return_validation_set=False,
                                 return_rh04=True,
                                 rh03_only=rh03_only)
        xtrain, rh04train, xtest, rh04test = current_data[0], current_data[1], current_data[2],\
                                        current_data[3]
        trainpreds, testpreds = gen_model_spec_preds(models[mod_type],
                                                     mod_type, xtrain, xtest)
        print('Training set %s AUC: %s' %
              (mod_type, auc(rh04train, trainpreds)))
        print('Test set %s AUC: %s' % (mod_type, auc(rh04test, testpreds)))
        fpr_ord, tpr_ord, _ = roc(rh04test, testpreds)
        equal_spec = np.arange(1, -0.1, -0.1)

        plt.plot(fpr_ord,
                 tpr_ord,
                 color=colors[i],
                 label=model_names[mod_type])
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
    if rh03_only:
        plt.title(
            'AUC-ROC for RH04 prediction on the held-out test set'
            '\nusing models trained on RH01 - RH03 data only;\nAUC-ROC of ranking for RH03 sequences'
        )
    else:
        plt.title(
            'AUC-ROC for RH04 prediction on the held-out test set'
            '\nusing models trained on RH01 - RH03 data only;\nAUC-ROC of ranking for RH01 - RH03 sequences'
        )

    plt.legend()
    plt.plot(equal_spec, equal_spec, color='black', linestyle='--')
    plt.show()
Example #11
def get_results(ins, oos):  # in/out of sample
    """
    returns AOROC, AOPR (success), AOPR (failure)
    """
    rval = []
    y_true = np.hstack((np.ones(len(ins)), np.zeros(len(oos))))
    y_score = np.hstack((ins, oos))
    rval += [round(roc(y_true, y_score)*100, 2),
             round(pr(y_true, y_score)*100, 2)]
    y_true = np.hstack((np.zeros(len(ins)), np.ones(len(oos))))
    y_score = -y_score
    rval += [#round(roc(y_true, y_score)*100, 2),
             round(pr(y_true, y_score)*100, 2)]
    return rval
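
# A hedged usage sketch for get_results (added, not part of the original example). It assumes
# roc = sklearn.metrics.roc_auc_score and pr = sklearn.metrics.average_precision_score, with
# ins/oos holding per-sample confidence scores for in- and out-of-sample data.
import numpy as np
from sklearn.metrics import roc_auc_score as roc, average_precision_score as pr

rng = np.random.default_rng(2)
ins = rng.normal(loc=1.0, size=50)   # in-sample scores (higher = more confident)
oos = rng.normal(loc=0.0, size=50)   # out-of-sample scores
print(get_results(ins, oos))         # [AUROC %, AUPR (success) %, AUPR (failure) %]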
Example #12
def evaluate(y_test, y_pred):
    # Making the Confusion Matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    #f1 score
    from sklearn.metrics import f1_score
    fscore = f1_score(y_test, y_pred)
    print("F1-Score : ", fscore)

    # Finding Area Under ROC curve
    from sklearn.metrics import roc_auc_score as roc
    a = roc(y_test, y_pred, average='micro')
    print("ROC-AUC Score : ", a)
    return (cm, fscore, a)
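
# A minimal usage sketch for evaluate (added, not part of the original example); the label and
# prediction arrays are illustrative binary vectors.
import numpy as np

y_test = np.array([0, 1, 1, 0, 1, 0, 1, 0])
y_pred = np.array([0, 1, 0, 0, 1, 1, 1, 0])
cm, fscore, auc_score = evaluate(y_test, y_pred)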
Example #13
def analyze(classifier, X_val, y_val, prc_ax, roc_ax, **params):

    # y_predict = classifier.predict(X_val)
    if params['model'] == 'svm' or params['model'] == 'logistic':
        y_predict = classifier.decision_function(X_val)
    else:
        y_predict = classifier.predict_proba(X_val)[:, 1]

    # Accuracy
    accuracy = classifier.score(X_val, y_val)

    # Precision-Recall
    auprc = prc_score(y_val, y_predict)
    precision, recall, thresholds = prc(y_val, y_predict)
    prc_ax.plot(recall, precision, label='AUC={}'.format(auprc))

    # Receiver Operating Characteristics
    fpr, tpr, thr = roc(y_val, y_predict, pos_label=1)
    auroc = roc_score(fpr, tpr)
    roc_ax.plot(fpr, tpr, label='AUC={}'.format(auroc))

    return accuracy, auprc, auroc
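
# A hedged usage sketch for analyze (added, not part of the original example). It assumes the
# aliases the snippet relies on: prc_score = average_precision_score, prc = precision_recall_curve,
# roc = roc_curve, roc_score = auc; the classifier and data are illustrative.
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (average_precision_score as prc_score,
                             precision_recall_curve as prc,
                             roc_curve as roc, auc as roc_score)

X, y = make_classification(n_samples=300, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X[:200], y[:200])
fig, (prc_ax, roc_ax) = plt.subplots(1, 2)
accuracy, auprc, auroc = analyze(clf, X[200:], y[200:], prc_ax, roc_ax, model='logistic')
print(accuracy, auprc, auroc)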
Example #14
def get_AOC(ins, oos):  #in/out of sample
    # TODO: order of the last 2???
    """
    returns AOROC, AOPR (success), AOPR (failure) 
    """
    rval = []
    y_true = np.hstack((np.ones(len(ins)), np.zeros(len(oos))))
    y_score = np.vstack((ins, oos))
    # TODO: use different scores (e.g. entropy, acq fns)
    y_score = y_score.max(axis=1)
    #print y_score
    #import ipdb; ipdb.set_trace()
    rval += [
        round(roc(y_true, y_score) * 100, 2),
        round(pr(y_true, y_score) * 100, 2)
    ]
    y_true = np.hstack((np.zeros(len(ins)), np.ones(len(oos))))
    y_score = -y_score
    rval += [  #round(roc(y_true, y_score)*100, 2),
        round(pr(y_true, y_score) * 100, 2)
    ]
    return rval
Example #15
def compute_eval_stats(classifier, y_data, rankings, threshold):
    ''' Takes: classifier object, true target data, predicted score rankings, 
                ranking threshold cutoff
        Returns: accuracy, precision, recall of predictions of classifier on x for y
    '''

    predicted_test = np.where(rankings < threshold, 1, 0)

    # print(threshold)
    # print(predicted_test.sum())
    # print(predicted_test[0:10])
    # print("num unique ranks: ", pd.DataFrame(pred_scores)[0].unique().shape)
    # print("eval stats rankings are: ", rankings[0:10])

    stats = [
        accuracy(y_data, predicted_test),
        precision(y_data, predicted_test),
        recall(y_data, predicted_test),
        f1(y_data, predicted_test),
        roc(y_data, predicted_test)
    ]

    return stats
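
# A hedged usage sketch for compute_eval_stats (added, not part of the original example). It
# assumes the metric aliases implied above: accuracy/precision/recall/f1 = the sklearn *_score
# functions and roc = roc_auc_score. The classifier argument is unused in the body, so None is
# passed here; rankings and threshold are illustrative.
import numpy as np
from sklearn.metrics import (accuracy_score as accuracy, precision_score as precision,
                             recall_score as recall, f1_score as f1, roc_auc_score as roc)

y_data = np.array([1, 0, 1, 1, 0, 0, 1, 0])
rankings = np.array([2, 7, 1, 3, 6, 8, 4, 5])   # lower rank = predicted positive
print(compute_eval_stats(None, y_data, rankings, threshold=5))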
Example #16
def train():
    data_x, data_y = load_tensors()

    sizedata = len(data_x)
    print("Data of size:", sizedata)
    # Split dataset into 5 sub-datasets
    splitted_x = list(split(data_x, 5))
    splitted_y = list(split(data_y, 5))
    print("Available GPU :", torch.cuda.is_available())
    torch.cuda.set_device(0)
    k = ARGS.kFold

    # Prepare array of scores
    precision_list = []
    recall_list = []
    # valloss_list = []
    AUC_list = []
    for ind_i in range(0,k):
        # Prepare X_train Y_train X_test Y_test
        X_test = splitted_x[ind_i]
        Y_test = splitted_y[ind_i]
        # Deep copy, otherwise iteration problem
        copysplitX = list(splitted_x)
        copysplitY = list(splitted_y)
        del copysplitX[ind_i]
        del copysplitY[ind_i]
        X_train = copysplitX
        Y_train = copysplitY
        model = Network().cuda()
        # XAVIER Init
        model.apply(init_weights)
        with torch.cuda.device(0):
            # Hyperparameters :
            epochs = ARGS.nEpochs
            batchsize = ARGS.batchSize
            learning_rate = ARGS.lr
            log_interval = 2
            criterion = nn.BCEWithLogitsLoss()
            # criterion = nn.BCELoss()
            # criterion = nn.CrossEntropyLoss()
            optimizer = optim.SGD(model.parameters(), lr=learning_rate)
            # optimizer = optim.Adam(model.parameters(), lr=learning_rate)

            # Train loader
            numplist = np.array(X_train)
            arrX = np.concatenate(numplist).tolist()
            tensor_x = torch.Tensor(arrX).cuda()
            numplist = np.array(Y_train)
            arrY = np.concatenate(numplist).tolist()
            tensor_y = torch.Tensor(arrY).cuda()
            print("Shape X:", np.shape(arrX), "Shape Y:", np.shape(arrY))
            # tensor_x = torch.Tensor(np.array(X_train).tolist()).cuda()  # transform to torch tensor
            # tensor_y = torch.Tensor(np.array(Y_train).tolist()).cuda()
            dataset = dt.TensorDataset(tensor_x, tensor_y)  # create your dataset
            # train_size = int(len(dataset))
            # print("train_size =", train_size)
            train_loader = dt.DataLoader(
                dataset,
                batch_size=batchsize,
                shuffle=True)

            # Test loader
            tensor_x = torch.Tensor(np.array(X_test).tolist()).cuda()  # transform to torch tensor
            tensor_y = torch.Tensor(np.array(Y_test).tolist()).cuda()
            dataset = dt.TensorDataset(tensor_x, tensor_y)  # create your dataset
            test_loader = dt.DataLoader(
                dataset,
                batch_size=batchsize,
                shuffle=True)

            # Training
            for epoch in range(epochs):
                for batch_idx, (data, target) in enumerate(train_loader):
                    data, target = Variable(data), Variable(target)
                    optimizer.zero_grad()
                    net_out = model(data)
                    loss = criterion(net_out, target)
                    loss.backward()
                    optimizer.step()
                    if batch_idx % log_interval == 0:
                        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: '.format(
                            epoch, batch_idx * len(data), len(train_loader.dataset),
                                   100. * batch_idx / len(train_loader)))
                        print(loss.data)

            # saving model
            # torch.save(model.state_dict(), ARGS.outFile + str(ind_i))

            # Testing and save score
            total = 0
            correct = 0
            model.eval()
            # Validation loss
            # loss_values = []
            itr_ctr = 0
            for batch_idx, (data, target) in enumerate(test_loader):
                #with torch.no_grad():
                itr_ctr += 1
                data, target = Variable(data, volatile=True), Variable(target, volatile=True)
                net_out = model(data)
                loss = criterion(net_out, target)
                # loss_values.append(loss)

            # Validation Loss in the list
            # valloss_list.append(np.mean(loss_values))

            P = list()
            R = list()
            # Precisions
            for i in range(1,4):
                for data in test_loader:
                    x, labels = data
                    outputs = model(Variable(x)).detach() # output is a tensor of size [BATCHSIZE][ARGS.numberOfOutputCodes]
                    _, predicted = torch.topk(outputs.data, i)
                    for y_predlist, y in zip(predicted, labels):
                        for y_pred in y_predlist:
                            total += 1
                            if y[y_pred] == 1:
                                correct += 1

                precision = correct / total
                P.append(precision)
                correct = 0
                total = 0

            # Number of diagnostic for each sample (mean of 12 codes, max of 30 codes, R@10 - R@20 - R@30 seems appropriate)
            total_true_list = list()
            for data in test_loader:
                x, labels = data
                for y in labels :
                    total_true = 0
                    for val in y :
                        if val == 1:
                            total_true += 1
                    total_true_list.append(total_true)

            # Recalls
            for i in range(10,40,10):
                total_true_list_cpy = list(total_true_list)
                for data in test_loader:
                    x, labels = data
                    outputs = model(Variable(x)).detach()
                    _, predicted = torch.topk(outputs.data, i)
                    for y_predlist, y in zip(predicted, labels):
                        total += total_true_list_cpy.pop(0)
                        for y_pred in y_predlist:
                            if y[y_pred] == 1:
                                correct += 1

                recall = correct / total
                R.append(recall)
                correct = 0
                total = 0
            precision_list.append(P)
            recall_list.append(R)

            # AUROC
            YTRUE = None
            YPROBA = None
            for data in test_loader:
                x, labels = data
                x, labels = Variable(x), Variable(labels)
                outputs = model(x).detach().cpu().numpy()
                labels = labels.detach().cpu().numpy()
                for batch_true, batch_prob in zip(labels, outputs):
                    YTRUE = np.concatenate((YTRUE, [batch_true]), axis=0) if YTRUE is not None else [batch_true]
                    YPROBA = np.concatenate((YPROBA, [batch_prob]), axis=0) if YPROBA is not None else [batch_prob]
            ROC_avg_score=roc(YTRUE, YPROBA, average='micro', multi_class='ovr')
            AUC_list.append(ROC_avg_score)

    # Output score of each fold + average
    print("Scores for each fold:")
    print("Precision:", precision_list)
    print("Recall:", recall_list)
    print("AUROC:", AUC_list)
    # print("Loss:", valloss_list)
    print("Average scores:")
    P1 = sum(precision_list[fold][0] for fold in range(k)) / k
    P2 = sum(precision_list[fold][1] for fold in range(k)) / k
    P3 = sum(precision_list[fold][2] for fold in range(k)) / k
    R10 = sum(recall_list[fold][0] for fold in range(k)) / k
    R20 = sum(recall_list[fold][1] for fold in range(k)) / k
    R30 = sum(recall_list[fold][2] for fold in range(k)) / k
    AUROC = sum(AUC_list[fold] for fold in range(k)) / k
    # loss_avg=sum(valloss_list)/len(valloss_list)
    print("Precision@1:", P1)
    print("Precision@2:", P2)
    print("Precision@3:", P3)
    print("Recall@10:", R10)
    print("Recall@20:", R20)
    print("Recall@30:", R30)
    print("AUROC:", AUROC)
Example #17
cm = confusion_matrix(y_test, y_pred)

# Finding Area Under ROC curve
#from sklearn.metrics import roc_auc_score as roc
#a = roc(y_test, y_pred, average='micro')

from sklearn.metrics import roc_auc_score as roc
from sklearn.preprocessing import label_binarize
print(y_test)
y_test= label_binarize(y_test,classes=[0,1])
y_pred= label_binarize(y_pred,classes=[0,1])
print(y_test)
#y_test=y_test.reshape(-1, 1)
#y_pred=y_test.reshape(1, -1)
#print(y_test)
a = roc(y_test, y_pred, average='micro')

####################################################################

#Processing data for plotting graph

from sklearn.preprocessing import label_binarize
y = label_binarize(y, classes=[0, 1])
n_classes = y.shape[1]

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [0])
y = onehotencoder.fit_transform(y).toarray()

y_test = label_binarize(y_test, classes=[0, 1])
onehotencoder = OneHotEncoder(categorical_features = [0])
Example #18
def main():
    X, y = rr('kddcup.data_10_percent')
    X, X_test, y, y_test = train_test_split(X, y, random_state=75)
    y = np.array([0 if i != 11 else 1 for i in y])
    yy = np.array([0 if i != 11 else 1 for i in y_test])
    random_state = np.random.RandomState(0)
    kdd3 = SVC()
    kdd2 = C4_5(random_state=0)
    kdd1 = NB()
    kdd = RF(n_estimators=100, random_state=0, max_features=2)
    p3 = kdd3.fit(X, y)
    p2 = kdd2.fit(X, y)
    p1 = kdd1.fit(X, y)
    p = kdd.fit(X, y)
    t = p.predict(X_test)
    t1 = p1.predict(X_test)
    t2 = p2.predict(X_test)
    t3 = p3.predict(X_test)
    print("Random Forest")
    rn(t, yy)
    print("Native Base")
    rn(t1, yy)
    print("C4.5")
    rn(t2, yy)
    print("SVC")
    rn(t3, yy)

    # Build the confusion matrices
    cm3 = confusion_matrix(yy, t3)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm3)
    disp.plot()
    plt.show()
    cm2 = confusion_matrix(yy, t2)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm2)
    disp.plot()
    plt.show()
    cm1 = confusion_matrix(yy, t1)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm1)
    disp.plot()
    plt.show()
    cm = confusion_matrix(yy, t)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.show()

    # Plot the AUC-ROC curves
    pr3, rc3, _ = precision_recall_curve(yy, t3)
    fpr, tpr, _ = roc(yy, p3.decision_function(X_test))
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('SVM', roc_auc))

    pr2, rc2, _ = precision_recall_curve(yy, t2)
    fpr, tpr, _ = roc(yy, p2.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('C4.5', roc_auc))

    pr1, rc1, _ = precision_recall_curve(yy, t1)
    fpr, tpr, _ = roc(yy, p1.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('NB', roc_auc))

    pr, rc, _ = precision_recall_curve(yy, t)
    fpr, tpr, _ = roc(yy, p.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             linestyle='--',
             label='%s ROC (area = %0.2f)' % ('RF', roc_auc))

    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.legend(loc=0, fontsize='small')
    plt.show()
    plt.plot(rc3, pr3, label='SVM')
    plt.plot(rc2, pr2, label='C4.5')
    plt.plot(rc1, pr1, label='NB')
    plt.plot(rc, pr, label='RF')
    plt.legend(loc=0, fontsize='small')
    plt.show()
Example #19
def main():
    # Prepare data
    df_even = pd.read_csv('DNN/adversary_even_log.csv', index_col=0)
    df_even = df_even.loc[df_even['Class']==0]
    df_even = df_even.loc[df_even['generator']!=2]
    df_even = df_even.loc[df_even['nTags']==2]

    df_odd = pd.read_csv('DNN/adversary_odd_log.csv', index_col=0)
    df_odd = df_odd.loc[df_odd['Class']==0]
    df_odd = df_odd.loc[df_odd['generator']!=2]
    df_odd = df_odd.loc[df_odd['nTags']==2]
    
    df_odd[variables] = scale(df_odd[variables])
    df_even[variables] = scale(df_even[variables])
    
    # construction of bdt
    bdt_even = AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=0.01),
                                          learning_rate=learning_rate,
                                          algorithm="SAMME",
                                          n_estimators=n_estimators
                                          )
    bdt_even.n_classes_ = 2
    bdt_odd = AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=0.01),
                                         learning_rate=0.15,
                                         algorithm="SAMME",
                                         n_estimators=n_estimators
                                         )
    bdt_odd.n_classes_ = 2
    
    # Convert generator class to categorical
    z_even = to_categorical(df_even['generator'], num_classes=2)
    z_odd = to_categorical(df_odd['generator'], num_classes=2)
    
    # fitting to generators
    bdt_even.fit(df_even[variables], df_even['generator'], sample_weight=df_even['EventWeight'])
    bdt_odd.fit(df_odd[variables], df_odd['generator'], sample_weight=df_odd['EventWeight'])
    
    # Scoring
    df_odd['bdt_outcome'] = bdt_odd.decision_function(df_odd[variables]).tolist()
    df_even['bdt_outcome'] = bdt_even.decision_function(df_even[variables]).tolist()

    # score() requires data; report accuracy on the generator labels the odd BDT was fitted to
    print(bdt_odd.score(df_odd[variables], df_odd['generator']))
    df = pd.concat([df_odd,df_even])
    
    # plotting BDT outcome for different generators
    gen1 = df.loc[df['generator']==0]
    gen2 = df.loc[df['generator']==1]
    
#    plt.hist(gen1['bdt_outcome'],bins=70,color='red',alpha=0.5,density=True)
#    plt.hist(gen2['bdt_outcome'],bins=70,color='blue',alpha=0.5,density=True)
#    
#    plt.show()
    
    # calculating fpr, tpr and auc for roc curve
    fpr = dict()
    tpr = dict()
    area = dict()
    
    z_all = to_categorical(df['generator'], num_classes=2)

    fpr[0], tpr[0], _ = roc(z_all[:,0], df['bdt_outcome'],sample_weight=df['EventWeight'])
#    area[0] = auc(fpr[0], tpr[0])
    
    # plotting the roc curve
#    plt.plot(fpr[0], tpr[0], color='darkorange',lw=1,label='PYTHIA, ROC curve (area = %0.2f)' % area[0])
#    plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
#    plt.legend()
#    plt.savefig('roc.png', bbox_inches='tight')
#    plt.show(block=True)

    print('It runs. ')
Example #20
def test():
    # Load the test data
    X_test = pickle.load(open(ARGS.Xinputdata, 'rb'))
    Y_test = pickle.load(open(ARGS.Yinputdata, 'rb'))
    ARGS.inputdim = len(X_test[0][0])
    ARGS.numberOfOutputCodes = len(Y_test[0][0])
    print("Dataset with :", len(X_test), "patients. Y:", len(Y_test))
    print("Each patient has :", len(X_test[0]), "admissions. Y:", len(Y_test[0]))
    X_test=np.array([np.array([np.array(unelist, dtype=np.uint8) for unelist in xi]) for xi in X_test])
    Y_test=np.array([np.array([np.array(unelist, dtype=np.uint8) for unelist in xi]) for xi in Y_test])
    tensor_x = torch.from_numpy(X_test)
    tensor_y = torch.from_numpy(Y_test)
    print("X_dataset_shape=",tensor_x.shape)
    print("Y_dataset_shape=",tensor_y.shape)
    dataset = dt.TensorDataset(tensor_x, tensor_y) # create your dataset
    batchsize = ARGS.batchSize
    test_loader = dt.DataLoader(
        dataset,
        batch_size=batchsize,
        shuffle=False)

    # Load the model
    model = Network()
    model.load_state_dict(torch.load(ARGS.inputModel))
    # for param_tensor in model.state_dict():
    #     print(param_tensor, "\t", model.state_dict()[param_tensor].size())
    total = 0
    correct = 0
    model.eval()
    h = model.init_hidden()
    P = list()
    R = list()
    # Precisions
    for i in range(1,4):
        for (data, targets) in test_loader:
            x, labels = Variable(data.float()), Variable(targets.float())
            # output is a tensor of size [BATCHSIZE][#ADMISSIONS][ARGS.numberOfOutputCodes]
            if (x.size(0) != ARGS.batchSize):
                continue
            outputs, h = model(x, h)
            _, predicted = torch.topk(outputs.data, i)
            for y_predlist_adm, y_adm in zip(predicted, targets):
                for y_predlist, y in zip(y_predlist_adm, y_adm):
                    # If y is a tensor with only zeros (padding), break this loop
                    if torch.max(y) != 1 :
                        break
                    for y_pred in y_predlist:
                        total += 1
                        if y[y_pred] == 1:
                            correct += 1

        precision = correct / total
        print("P@", i, "=", precision)
        P.append(precision)
        correct = 0
        total = 0

    # Number of diagnostic for each sample (mean of 12 codes, max of 30 codes, R@10 - R@20 - R@30 seems appropriate)
    total_true_list = list()
    h = model.init_hidden()
    for (data, targets) in test_loader:
        x, labels = Variable(data.float()), Variable(targets.float())
        if (x.size(0) != ARGS.batchSize):
            continue
        outputs, h = model(x, h)
        for y_adm in targets :
            for y in y_adm :
                if torch.max(y) != 1 :
                    break
                total_true = 0
                for val in y :
                    if val == 1:
                        total_true += 1
                total_true_list.append(total_true)

    # recall
    h = model.init_hidden()
    for i in range(10,40,10):
        total_true_list_cpy = list(total_true_list)
        for (data, targets) in test_loader:
            x, labels = Variable(data.float()), Variable(targets.float())
            if (x.size(0) != ARGS.batchSize):
                continue
            outputs, h = model(x,h)
            _, predicted = torch.topk(outputs.data, i)
            for y_predlist_adm, y_adm in zip(predicted, targets):
                for y_predlist, y in zip(y_predlist_adm, y_adm):
                    if torch.max(y) != 1 :
                        break
                    total += total_true_list_cpy.pop(0)
                    for y_pred in y_predlist:
                        if y[y_pred] == 1:
                            correct += 1

        recall = correct / total
        print("R@", i, "=", recall)
        R.append(recall)
        correct = 0
        total = 0

    # AUROC
    YTRUE = None
    YPROBA = None
    h = model.init_hidden()
    for (data, targets) in test_loader:
        x, labels = Variable(data.float()), Variable(targets.float())
        if x.size(0) != ARGS.batchSize:
            continue
        outputs, h = model(x, h)
        outputs = outputs.detach().cpu().numpy()
        x = x.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        for batch_true, batch_prob in zip(labels, outputs):
            for adm_true, adm_prob in zip(batch_true, batch_prob):
                if torch.max(torch.from_numpy(adm_true)) != 1:
                    break
                YTRUE = np.concatenate((YTRUE, [adm_true]), axis=0) if YTRUE is not None else [adm_true]
                YPROBA = np.concatenate((YPROBA, [adm_prob]), axis=0) if YPROBA is not None else [adm_prob]
    ROC_avg_score = roc(YTRUE, YPROBA, average='micro', multi_class='ovr')
    print("ROC Average Score:", ROC_avg_score)
Example #21
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 21 15:43:55 2017

@author: ViniciusPantoja
"""

#%%

# After running the data_adjustments file, this code will deliver the final
# predictions

from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import roc_auc_score as roc

model = lr(penalty='l2', fit_intercept=False, n_jobs=-1)

resultado = model.fit(X_train, Y_train)

prediction = resultado.predict(X_test)

roc(Y_test, prediction)

final_prediction = resultado.predict(Y)
Example #22
xgtrain = xgb.DMatrix(Xdata,label=Ydata)
xgmodel = XGBClassifier_new(n_estimators=550,seed=1,learning_rate=0.1,objective='binary:logistic',nthread=-1)
xgb_param = xgmodel.get_xgb_params()
cvresult = xgb.cv(xgb_param,xgtrain, num_boost_round=xgmodel.get_params()['n_estimators'],nfold=4,metrics=['auc'],
                 early_stopping_rounds=100, show_progress=True)

#cross_validation = StratifiedKFold(Ydata, n_folds=4, shuffle=True,random_state=0)
#predicted = cross_val_predict(xgmodel, Xdata, Ydata, cv = cross_validation, verbose= 1, n_jobs = 1)

X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, test_size=0.25, random_state=1)

xgmodel.fit(X_train, y_train)
ypred = xgmodel.predict_proba(X_test)

print(roc(y_test, ypred[:,1]))

#print ypred[:,1]

xgmodel.fit(Xdata, Ydata)

test_df_old['predicted'] = xgmodel.predict_proba(Xtest)[:,1]

#print test_df_old

test_df_old.to_csv('submission_xgb.csv',index=False)

merged_df['predicted'] = xgmodel.predict_proba(Xdata)[:,1]
merged_df['Ytrue'] = ytrue

#print roc(ytrue, merged_df['predicted'])
Example #23
#view the results
#Usage: python results.py resultfile

import pandas as pd
import sys
from sklearn.metrics import accuracy_score, roc_auc_score as roc, auc, classification_report, balanced_accuracy_score, matthews_corrcoef as mcc
df = pd.read_csv(
    sys.argv[1],
    header=None,
    sep=',',
)
y_true = df[2]
y_pred = df[3]
y_predprob = df[4]
#print(y_test)
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_true, y_pred))
#print(auc(y_true, y_predprob))
print(classification_report(y_true, y_pred))
print("MCC:", mcc(y_true, y_pred))
print("ROC_AUC:", roc(y_true, y_predprob))
Example #24
def test():
    # Load the test data
    X_test = pickle.load(open(ARGS.Xinputdata, 'rb'))
    Y_test = pickle.load(open(ARGS.Yinputdata, 'rb'))
    criterion = nn.BCEWithLogitsLoss()
    ARGS.inputdim = len(X_test[0])
    ARGS.numberOfOutputCodes = len(Y_test[0])
    print("X_test of len:", len(X_test), "and Y_test of len:", len(Y_test))
    print("Samples of X's len:", len(X_test[0]), "and samples of Y's len:",
          len(Y_test[0]))
    tensor_x = torch.Tensor(np.array(X_test))  # transform to torch tensor
    print("X_dataset_shape=", tensor_x.shape)
    tensor_y = torch.Tensor(np.array(Y_test))
    print("Y_dataset_shape=", tensor_y.shape)
    dataset = dt.TensorDataset(tensor_x, tensor_y)  # create your dataset
    test_size = int(len(dataset))
    print("test_size =", test_size)
    batchsize = ARGS.batchSize
    test_loader = dt.DataLoader(dataset, batch_size=batchsize, shuffle=False)

    # Load the model
    model = Network()
    model.load_state_dict(
        torch.load(ARGS.inputModel, map_location=torch.device('cpu')))
    # for param_tensor in model.state_dict():
    #     print(param_tensor, "\t", model.state_dict()[param_tensor].size())
    total = 0
    correct = 0
    model.eval()

    # validation loss
    loss_values = []
    itr_ctr = 0
    for batch_idx, (data, target) in enumerate(test_loader):
        with torch.no_grad():
            itr_ctr += 1
            data, target = Variable(data), Variable(target)
            net_out = model(data)
            loss = criterion(net_out, target)
            loss_values.append(loss)

    print("Validation loss :", np.mean(loss_values))

    P = list()
    R = list()
    # Precisions
    for i in range(1, 4):
        for data in test_loader:
            x, labels = data
            outputs = model(
                x
            )  # output is a tensor of size [BATCHSIZE][ARGS.numberOfOutputCodes]
            # _, predicted = torch.max(outputs.data, 1)
            _, predicted = torch.topk(outputs.data, i)
            for y_predlist, y in zip(predicted, labels):
                for y_pred in y_predlist:
                    total += 1
                    if y[y_pred] == 1:
                        correct += 1

        precision = correct / total
        print("P@", i, "=", precision)
        P.append(precision)
        correct = 0
        total = 0

    # Number of diagnostic for each sample (mean of 12 codes, max of 30 codes, R@10 - R@20 - R@30 seems appropriate)
    total_true_list = list()
    for data in test_loader:
        x, labels = data
        outputs = model(x)
        for y in labels:
            total_true = 0
            for val in y:
                if val == 1:
                    total_true += 1
            total_true_list.append(total_true)

    # Recalls
    for i in range(10, 40, 10):
        total_true_list_cpy = list(total_true_list)
        for data in test_loader:
            x, labels = data
            outputs = model(x)
            _, predicted = torch.topk(outputs.data, i)
            for y_predlist, y in zip(predicted, labels):
                total += total_true_list_cpy.pop(0)
                for y_pred in y_predlist:
                    if y[y_pred] == 1:
                        correct += 1

        recall = correct / total
        print("R@", i, "=", recall)
        R.append(recall)
        correct = 0
        total = 0

    # AUC score
    AUC_list = list()
    YTRUE = None
    YPROBA = None
    for data in test_loader:
        x, labels = data
        x, labels = Variable(x), Variable(labels)
        outputs = model(x).detach().numpy()
        labels = labels.detach().numpy()
        # roc_score=roc(labels, outputs, average='micro', multi_class='ovr')
        # AUC_list.append(roc_score)
        for batch_true, batch_prob in zip(labels, outputs):
            YTRUE = np.concatenate(
                (YTRUE,
                 [batch_true]), axis=0) if YTRUE is not None else [batch_true]
            YPROBA = np.concatenate(
                (YPROBA, [batch_prob]),
                axis=0) if YPROBA is not None else [batch_prob]
    # ROC_avg_score=sum(AUC_list)/len(AUC_list)
    ROC_avg_score = roc(YTRUE, YPROBA, average='micro', multi_class='ovr')
    print("ROC Average Score:", ROC_avg_score)