Example #1
# Imports inferred from usage; the sklearn metric aliases are an assumption.
import sys

import numpy as np
import torch
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import precision_score as pr
from sklearn.metrics import recall_score as rc
from tqdm import tqdm


def evaluate(model,
             iterator_function,
             _batch_count,
             cuda_device,
             output_buffer=sys.stderr):
    if output_buffer is not None:
        print(_batch_count, file=output_buffer)
    model.eval()
    with torch.no_grad():
        predictions = []
        expectations = []
        batch_generator = range(_batch_count)
        if output_buffer is not None:
            batch_generator = tqdm(batch_generator)
        for _ in batch_generator:
            features, targets = iterator_function()
            if cuda_device != -1:
                features = features.cuda(device=cuda_device)
            probs, _, _ = model(example_batch=features)
            batch_pred = np.argmax(probs.detach().cpu().numpy(),
                                   axis=-1).tolist()
            batch_tgt = targets.detach().cpu().numpy().tolist()
            predictions.extend(batch_pred)
            expectations.extend(batch_tgt)
    model.train()
    return (acc(expectations, predictions) * 100,
            pr(expectations, predictions) * 100,
            rc(expectations, predictions) * 100,
            f1(expectations, predictions) * 100)
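# A minimal usage sketch (an assumption, not part of the source): the
# contract inferred from the loop is that iterator_function() returns one
# (features, targets) pair of tensors per call and the model returns a
# (probs, _, _) triple. make_batch_iterator is a hypothetical helper.
def make_batch_iterator(features, targets, batch_size):
    state = {"offset": 0}

    def next_batch():
        i = state["offset"]
        state["offset"] = (i + batch_size) % max(len(features), 1)
        return features[i:i + batch_size], targets[i:i + batch_size]

    return next_batch

# accuracy, precision, recall, f1_value = evaluate(
#     model, make_batch_iterator(X, y, 32), _batch_count=len(X) // 32,
#     cuda_device=-1, output_buffer=None)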
Example #2
# Metric aliases assumed, as in Example #1 (sklearn accuracy/precision/recall/F1).
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import precision_score as pr
from sklearn.metrics import recall_score as rc


def build_classifier_and_test(train_X,
                              train_y,
                              test_X,
                              test_y,
                              clf,
                              print_train_result=True):
    clf.fit(train_X, train_y)
    if print_train_result:
        p_tr = clf.predict(train_X)
        print("Train Accuracy:\t", acc(train_y, p_tr))
        print("Train Precision:\t", pr(train_y, p_tr))
        print("Train Recall_score:\t", rc(train_y, p_tr))
        print("Train F-score:\t", f1(train_y, p_tr))
    predicted = clf.predict(test_X)
    print("Accuracy:\t", acc(test_y, predicted))
    print("Precision:\t", pr(test_y, predicted))
    print("Recall_score:\t", rc(test_y, predicted))
    print("F-score:\t", f1(test_y, predicted))
Example #3
# Imports inferred from usage; metric aliases assumed as in Example #1.
import json

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import precision_score as pr
from sklearn.metrics import recall_score as rc
from sklearn.model_selection import train_test_split


def clone_analysis(data_paths):
    code = []
    labels = []
    positives = 0
    for file_name in data_paths:
        with open(file_name) as f:
            data = json.load(f)
        for example in data:
            code.append(example['tokenized'])
            l = 0
            # the dataset uses several misspelled variants of the label key
            if 'label' in example:
                l = int(example['label'])
            elif 'lebel' in example:
                l = int(example['lebel'])
            elif 'leble' in example:
                l = int(example['leble'])
            elif 'lable' in example:
                l = int(example['lable'])
            if l > 1:
                l = 1
            positives += l
            labels.append(l)
    print(len(code), len(labels), positives, len(labels) - positives)
    # TfidfVectorizer's input parameter selects 'content'/'file'/'filename';
    # the corpus goes to fit_transform, so the stray input=code is dropped
    vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1, 3))
    X = vectorizer.fit_transform(code)
    model = KMeans(n_clusters=10, max_iter=100)
    model.fit(X)
    y = model.predict(X)
    cluster_to_positive = [0] * 10
    cluster_to_negative = [0] * 10
    for pred, label in zip(y, labels):
        if label == 1:
            cluster_to_positive[pred] += 1
        else:
            cluster_to_negative[pred] += 1
    print(cluster_to_positive)
    print(cluster_to_negative)
    percentages = [
        float(p) / (p + n)
        for p, n in zip(cluster_to_positive, cluster_to_negative)
    ]
    for p in percentages:
        print(p)
    for _ in range(5):
        XTrain, XTest, YTrain, YTest = train_test_split(X,
                                                        labels,
                                                        test_size=0.2)
        model = RandomForestClassifier()
        model.fit(XTrain, YTrain)
        predicted = model.predict(XTest)
        print('%.3f\t%.3f\t%.3f\t%.3f' %
              (acc(YTest, predicted) * 100, pr(YTest, predicted) * 100,
               rc(YTest, predicted) * 100, f1(YTest, predicted) * 100))
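# A sketch of the input format the loop above implies (field values are
# invented for illustration): each JSON file holds a list of records with a
# 'tokenized' string and one of the misspelled label-key variants.
records = [
    {"tokenized": "int main ( ) { return 0 ; }", "label": "0"},
    {"tokenized": "int main ( ) { return 1 ; }", "lable": "1"},  # typo'd key still accepted
]
with open("clones.json", "w") as f:
    json.dump(records, f)
# clone_analysis(["clones.json"])  # needs at least 10 records for the 10-cluster KMeans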
Example #4
# Imports inferred from usage; note that here rc is sklearn's roc_curve,
# not recall_score as in the earlier examples.
from sklearn.metrics import auc
from sklearn.metrics import roc_curve as rc
from sklearn.preprocessing import StandardScaler


def fit_test(clf, train_tuple, test_tuple):
    '''
    Fits a classifier on train_tuple and reports the AUC on test_tuple.
    The tuples should be given as (data, label).
    '''
    data_train, labels_train = train_tuple
    data_test, labels_test = test_tuple
    scaler = StandardScaler()
    scaler.fit(data_train)
    data_train = scaler.transform(data_train)
    data_test = scaler.transform(data_test)

    clf.fit(data_train, labels_train)
    fpr, tpr, _ = rc(labels_test, clf.predict_proba(data_test)[:, 1])
    return auc(fpr, tpr)
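# A usage sketch following the (data, label) tuple convention from the
# docstring; the dataset and classifier here are illustrative assumptions.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
print(fit_test(LogisticRegression(max_iter=1000), (X_tr, y_tr), (X_te, y_te)))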
Example #5
def fit_test(clf, train_tuple, test_tuple):
    '''
    Fits a classifier on train_tuple and reports the AUC on test_tuple.
    The tuples should be given as (data, label).
    '''
    data_train, labels_train = train_tuple
    data_test, labels_test = test_tuple
    scaler = StandardScaler()
    scaler.fit(data_train)
    data_train = scaler.transform(data_train)
    data_test = scaler.transform(data_test)

    clf.fit(data_train, labels_train)
    # predict() returns 1-D class labels that cannot be sliced with [:, 1];
    # roc_curve needs the positive-class scores from predict_proba
    fpr, tpr, _ = rc(labels_test, clf.predict_proba(data_test)[:, 1])
    return auc(fpr, tpr)
Example #6
# Imports inferred from usage:
import matplotlib.pyplot as plt
import numpy as np
import torch


def roc_curve(output, target):
    try:
        from sklearn.metrics import roc_curve as rc
    except ImportError:
        raise RuntimeError("ROC Curve requires scikit-learn to be installed.")

    with torch.no_grad():
        pred = torch.argmax(output, dim=1)
        assert pred.shape[0] == len(target)
        fpr, tpr, _ = rc(target.cpu().numpy(), output[:, 1].cpu().numpy())

    fig = plt.figure()
    plt.plot(fpr, tpr)
    fig.canvas.draw()

    buf = np.asarray(fig.canvas.buffer_rgba(), dtype=np.uint8)[:, :, :3]
    image = torch.from_numpy(buf).permute(2, 0, 1)

    plt.close(fig)

    return image
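# A quick sketch (shapes are assumptions based on the output[:, 1] indexing
# above): two-class scores in, CHW uint8 image of the ROC curve out.
scores = torch.randn(64, 2)           # column 1 is treated as the positive-class score
labels = torch.randint(0, 2, (64,))   # binary ground truth
img = roc_curve(scores, labels)
print(img.shape, img.dtype)           # torch.Size([3, H, W]) torch.uint8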
Example #7
# ROC curves
# Imports inferred from usage (assumed): rc is sklearn's roc_curve, and
# xtrain/ytrain are pandas objects defined earlier in the script.
import random

import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import roc_curve as rc

ind_val = random.sample(range(12000), 2000)

xvalid = xtrain.iloc[ind_val, :]
yvalid = ytrain.iloc[ind_val]

xtrain = xtrain.drop(ind_val)
ytrain = ytrain.drop(ind_val)

xtrain = xtrain.to_numpy()

svc = svm.SVC(C=10, probability=True)
svc.fit(xtrain, ytrain)
y_score = svc.predict_proba(xvalid)
fpr_svm, tpr_svm, thresholds_svm = rc(yvalid, y_score[:, 1])

y_score = svc.predict_proba(xtrain)
fpr_svm_tr, tpr_svm_tr, thresholds_svm = rc(ytrain, y_score[:, 1])

plt.figure(0)
plt.title('ROC Curves, Held-out Set')
plt.plot(fpr_svm, tpr_svm, label='Support Vector Machine')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.savefig('ROC_Curves.jpg')

###########################################################################
# Print out a few examples of photos that were misclassified by my model,
# in order to learn from them
Example #8
    Returns
    -------
    WRITEME
    """

    assert classifier is not None, "Why would you pass no classifier?"

    # Data scaling based on training set
    scaler = StandardScaler()
    scaler.fit(data[train_idx])
    data_train = scaler.transform(data[train_idx])
    data_test = scaler.transform(data[test_idx])

    classifier.fit(data_train, labels[train_idx])

    fpr, tpr, thresholds = rc(labels[test_idx],
                              classifier.predict_proba(data_test)[:, 1])

    return auc(fpr, tpr)


def load_data(source_dir, data_pattern):
    """
    Loads the data from multiple sources if provided.

    Parameters
    ----------
    source_dir: str
    data_pattern: str

    Returns
    -------
Example #9
    # test model
    features_t = features[test_index]
    test_op = label[test_index]
    #true_op.append(test_op)
    pred_hsv = model_hsv.predict(features_t)
    score_hsv = model_hsv.score(features_t, test_op)
    pred_ssv = model_ssv.predict(features_t)
    score_ssv = model_ssv.score(features_t, test_op)
    acc_h.append(score_hsv)
    acc_s.append(score_ssv)
    print('Time spent in each fold:')
    print(time.time() - start_time)

# plot ROC
y_score_hsv = model_hsv.decision_function(features_t)
fpr_h, tpr_h, _ = rc(test_op, y_score_hsv)
y_score_ssv = model_ssv.decision_function(features_t)
fpr_s, tpr_s, _ = rc(test_op, y_score_ssv)
fig1 = plt.figure()
lw = 1
plt.plot(fpr_h, tpr_h, color='darkorange',
         lw=lw, label='ROC curve (Soft SVM)')
plt.plot(fpr_s, tpr_s, color='deeppink', lw=lw,
         label='ROC curve (Hard SVM)')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
Example #10
                                                            negative_batch=x_n)
        repr_np = representation.detach().cpu().numpy()  # avoid shadowing built-in repr
        prediction_classes = np.argmax(prediction_prob.detach().cpu().numpy(),
                                       axis=-1)
        # print(
        #     "Epoch %3d, Loss: %10.4f, Accuracy: %5.2f, Precision: %5.2f, Recall: %5.2f, F1: %5.2f" % (
        #         epoch, batch_loss.detach().cpu().item(),
        #         acc(targets, prediction_classes), pr(targets, prediction_classes),
        #         rc(targets, prediction_classes), f1(targets, prediction_classes)
        #     )
        # )
        if epoch % 1 == 0:  # interval of 1, i.e. evaluate every epoch
            prediction_prob, representation, batch_loss = model(
                example_batch=test_x, targets=test_y)
            repr_np = representation.detach().cpu().numpy()
            prediction_classes = np.argmax(
                prediction_prob.detach().cpu().numpy(), axis=-1)
            print('=' * 100)
            print("Test  %3d, Loss: %10.4f, Accuracy: %5.2f, "
                  "Precision: %5.2f, Recall: %5.2f, F1: %5.2f"
                  % (epoch, batch_loss.detach().cpu().item(),
                     acc(test_y, prediction_classes),
                     pr(test_y, prediction_classes),
                     rc(test_y, prediction_classes),
                     f1(test_y, prediction_classes)))
            print('=' * 100)
            plot_embedding(repr_np, test_y, title='Epoch %d' % epoch)
        batch_loss.backward()
        optimizer.step()
Example #11
# it's double the original size (2,125,350), which is what we want

# Aliases inferred from usage (assumed): tts = train_test_split,
# lr = LogisticRegression, rc = roc_curve; roc_auc_score and pyplot are
# added for the scoring/plotting code below.
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve as rc
from sklearn.model_selection import train_test_split as tts


# generate score - has 2 modes: by default it returns class probabilities;
# passing any other value for y returns hard predictions instead
def gs(x, y="prob"):
    #70/30 train test split
    x_train, x_test, y_train, y_test = tts(x, x.label, test_size=0.3)
    data = x_train.iloc[:, :32]
    test_data = x_test.iloc[:, :32]

    #train model
    classifier = lr(random_state=0).fit(data, y_train)
    if y == "prob":
        pred = classifier.predict_proba(test_data)
    else:
        pred = classifier.predict(test_data)
    return pred, y_test.values


print(gs(finalSet))

# AUC

a = gs(finalSet, "pred")
# sklearn's auc() expects (x, y) curve coordinates, not predictions and
# labels; roc_auc_score(y_true, y_score) is the direct scorer
print(roc_auc_score(a[1], a[0]))

# Plot it: roc_curve returns (fpr, tpr, thresholds) in that order,
# so the original truePos/falsePos names were swapped
falsePos, truePos, thresholds = rc(a[1], a[0])
plt.plot(falsePos, truePos)
plt.show()
Example #12
# Imports inferred from usage (aliases assumed); Node, build_tree,
# class_finder, node_err_cal, max_class, accuracy, class_list_gen and the
# globals class_column, formula_input, formula_measure and file_name are
# defined elsewhere in the original script.
import random
from copy import deepcopy as dc
from statistics import mean

import matplotlib.pyplot as plt
from sklearn.metrics import auc
from sklearn.metrics import classification_report as cr
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import roc_curve as rc


def start_split_data(data_list):
    random_list = dc(data_list)
    random.shuffle(random_list)
    predicted_list = []
    mark = 0
    acc_list = []
    act_class_list = []
    for i in range(10):  # fold range
        test_list = []
        training_list = []
        while mark < len(random_list):
            # for/else: this loop contains no break, so the else block
            # always runs after the training slice is copied
            for train_ele in range(0, mark):
                training_list.append(random_list[train_ele])
            else:
                index = mark
                mark = int(len(random_list) / 10) + index
                for test_element in range(index, mark):
                    test_list.append(random_list[test_element])
                for training_element in range(mark, len(random_list)):
                    training_list.append(random_list[training_element])
                    # print(training_list)
                    # fold completion
                Node.children = []
                Node.leaf_children = []
                Node.temp_children = []
                Node.new_children = []
                Node.len_training_list = len(training_list)
                Node.old_pessi_err = (node_err_cal(training_list, max_class(
                    training_list, class_column), class_column) + 1) / \
                                     Node.len_training_list
                root = Node(training_list)
                # print(root.data)
                root.node_type = 'root'
                build_tree(root)
                predicted_temp_list = []
                actual_list = []
                temp_root = dc(root)
                for test_element in test_list:
                    actual_list.append(int(test_element[class_column]))
                    found = int(class_finder(test_element, temp_root))
                    predicted_temp_list.append(found)
                    predicted_list.append(found)
                acc_list.append(
                    accuracy(actual_list, predicted_temp_list, class_column))
                break
    print(mean(acc_list))
    act_class_list = class_list_gen(random_list)
    # print(len(act_class_list),len(predicted_list))
    while len(act_class_list) > len(predicted_list):
        del act_class_list[-1]
    c_matrix = cm(act_class_list, predicted_list)
    print('Confusion matrix\n', c_matrix)
    c_report = cr(act_class_list, predicted_list)
    print("All Measures required for this data set \n", c_report)
    fpr, tpr, thd = rc(act_class_list, predicted_list)
    roc_auc = auc(fpr, tpr)
    if formula_input == 2:
        plt.title('ROC for %s with information gain(red) and gini(blue)'
                  % file_name[0])
        plt.plot(fpr, tpr,
                 label='%s  AUC = %0.2f' % (formula_measure, roc_auc))
        plt.legend(loc='lower right')
    else:
        plt.title('ROC for %s' % file_name[0])
        # the original plotted the same curve twice with two labels;
        # one call is enough for the legend
        plt.plot(fpr, tpr,
                 label='%s  AUC = %0.2f' % (formula_measure, roc_auc))
        plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
Example #13
    Returns
    -------
    WRITEME
    """

    assert classifier is not None, "Why would you pass no classifier?"

    # Data scaling based on training set
    scaler = StandardScaler()
    scaler.fit(data[train_idx])
    data_train = scaler.transform(data[train_idx])
    data_test = scaler.transform(data[test_idx])

    classifier.fit(data_train, labels[train_idx])

    fpr, tpr, thresholds = rc(labels[test_idx],
                              classifier.predict_proba(data_test)[:, 1])

    return auc(fpr, tpr)

def load_data(source_dir, data_pattern):
    """
    Loads the data from multiple sources if provided.

    Parameters
    ----------
    source_dir: str
    data_pattern: str

    Returns
    -------
    data: array_like