def train(self, X, y, val_size=0.2):
     from sklearn.model_selection import train_test_split
     X_train, X_val, y_train, y_val = train_test_split(X,
                                                       y,
                                                       test_size=val_size)
     self.model.fit(X_train, y_train)
     if self.model_type.split("_")[-1] == "Regressor":
         from sklearn.metrics import mean_squared_error as mse
         from sklearn.metrics import r2_score as r2
         y_pred_train = self.model.predict(X_train)
         print("Training Scores:")
         print("MSE : " + str(mse(y_train, y_pred_train)))
         print("R-Squared-Score : " + str(r2(y_train, y_pred_train)))
         if val_size != 0:
             y_pred_val = self.model.predict(X_val)
             print("Validation Scores:")
             print("MSE : " + str(mse(y_val, y_pred_val)))
             print("R-Squared-Score : " + str(r2(y_val, y_pred_val)))
     else:
         from sklearn.metrics import classification_report as cr
         y_pred_train = self.model.predict(X_train)
         print("Training Scores:")
         print(cr(y_train, y_pred_train))
         if val_size != 0:
             y_pred_val = self.model.predict(X_val)
             print("Validation Scores:")
             print(cr(y_val, y_pred_val))
Example #2
        def compute_prf(predictions, true_labels, class_label):

            # file_pred = open('wrong_pred.txt', 'w')
            reverse_label = {}
            for key in class_label:
                reverse_label[class_label[key]] = key

            new_predictions = []
            new_true_labels = []
            try:
                for i in range(len(predictions)):
                    new_predictions.append(reverse_label[predictions[i]])
                    new_true_labels.append(reverse_label[true_labels[i]])
            except (KeyError, IndexError):
                # skip labels missing from the mapping or mismatched lengths
                pass

            # x_text and dev_sample_index are globals in the original script
            utts = x_text[dev_sample_index:]
            #
            # for i, text in enumerate(utts):
            #     new_label = posttopicmerging(text)
            #     if len(new_label) > 0:
            #         new_predictions[i] = new_label

            # for i, text in enumerate(utts):
            #     if new_predictions[i] != new_true_labels[i]:
            #         file_pred.write(text + '\t' + new_predictions[i] + '\t' + new_true_labels[i])
            #         file_pred.write('\n')

            print(cr(new_true_labels, new_predictions, digits=3))
Example #3
def classify(model, featureVectors):
    z = model.predict(featureVectors[:, :-1]).astype(int).reshape(-1).tolist()
    data = featureVectors[:, -1].flatten()
    data = data.astype(int).tolist()
    labels = ['DOS', 'Normal', 'Probing', 'R2L', 'U2R']
    print(cr(data, z, target_names=labels))
Example #4
def classify(model, featureVectors):
	z = model.predict(featureVectors[:, :-1]).astype(int).reshape(-1).tolist()
	data = featureVectors[:, -1].flatten()
	data = data.astype(int).tolist()
	labels = ['DOS', 'Normal', 'Probing', 'R2L', 'U2R']
	print(cr(data, z, target_names=labels, digits=4))
	cm = confusion_matrix(data, z)
	print_cm(cm, labels)
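
print_cm is a project-local helper that is not shown; a minimal sketch consistent with the call above (an assumption, not the original helper):

def print_cm(cm, labels):
    # Pretty-print a confusion matrix with row and column labels.
    print(" " * 10 + "".join("%10s" % l for l in labels))
    for i, l in enumerate(labels):
        print("%-10s" % l + "".join("%10d" % v for v in cm[i]))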
Example #5
    def fit(self, X, y, max_epochs=100):
        # uniform labels
        print ("Training is started")
        self.lb = LabelBinarizer()
        y = self.lb.fit_transform(y)
        #print(y)
        # get all sizes

        n_samples, n_features = X.shape
        self.n_outs = y.shape[1]
        #print(self.n_outs)
        n_iterations = int(max_epochs * n_samples)

        # initialize weights #NOTE smart initialization
        nO = np.sqrt(n_features)
        nH = np.sqrt(self.n_hidden)
        self.weights1_ = np.random.uniform(-1/nO, 1/nO, size=(n_features, self.n_hidden))
        self.bias1_ = np.zeros(self.n_hidden)
        self.weights2_ = np.random.uniform(-1/nH, 1/nH, size=(self.n_hidden, self.n_outs))
        self.bias2_ = np.zeros(self.n_outs)
        if self.SGD:
            # NOTE Stochastic Gradient Descent
            # initialize hidden-layer and output layer matrices 
            x_hidden = np.empty((1, self.n_hidden))
            delta_h = np.empty((1, self.n_hidden))
            x_output = np.empty((1, self.n_outs))
            delta_o = np.empty((1, self.n_outs))

            nrange = list(range(n_samples))
            for it in range(1, max_epochs+1):
                np.random.shuffle(nrange)
                for j in nrange:
                    self._forward(X[j, None], x_hidden, x_output)
                    self._backward(X[j, None], y[j, None], x_hidden, x_output, delta_o, delta_h)
                # NOTE: xtest/ytest are globals defined in the calling script
                pred = self.predict(xtest)
                #print("p:", pred)
                print("1: ", cr(ytest, pred))

        else:
            # NOTE Gradient Descent
            # initialize hidden-layer and output layer matrices 
            x_hidden = np.empty((n_samples, self.n_hidden))
            delta_h = np.empty((n_samples, self.n_hidden))
            x_output = np.empty((n_samples, self.n_outs))
            delta_o = np.empty((n_samples, self.n_outs))

            # adjust weights by a forward pass and a backward error propagation
            for i in range(max_epochs):
                self._forward(X, x_hidden, x_output)
                self._backward(X, y, x_hidden, x_output, delta_o, delta_h)
                pred = self.predict(X)
                # NOTE: y1 is the pre-binarization label vector, a global in the original
                print("2: ", cr(y1, pred))
Example #6
def classify(model, featureVectors):
	true = 0
	total = 0
	z = []
	for feature in featureVectors:
		if feature[-1] == predict(model, feature[:-1]):
			true += 1
		z = z + predict(model, feature[:-1]).astype(int).tolist()
		total += 1
	data = featureVectors[:,-1].flatten()
	data = data.astype(int).tolist()
	print(cr(data, z))
	print("Accuracy:", (true * 100) / total)
Example #7
def classify(model, featureVectors):
    true = 0
    total = 0
    z = []
    for feature in featureVectors:
        if feature[-1] == predict(model, feature[:-1]):
            true += 1
        z = z + predict(model, feature[:-1]).astype(int).tolist()
        total += 1
    data = featureVectors[:, -1].flatten()
    data = data.astype(int).tolist()
    print(z)
    print(cr(data, z))
    print("Accuracy : ", (true * 100) / total)
Example #8
    def print_metrics(self, predicted_output):
        """
        Print some MVP metrics. sklearn is used for calculation of all the
        metric values. Confusion matrix values (true positive, false negative,
        false positive and true negative), precision, recall, f1-score and
        accuracy are calculated. The classification report includes a few
        other metrics, which are omitted here.

        We need the actual labels and the predicted labels to calculate the
        metrics. We can get the actual labels from the class variable and
        the predicted output or predicted labels are passed as a parameter
        after running each algorithm.

        :param predicted_output: Predicted labels

        """

        res = cm(self.y_test, predicted_output)
        # sklearn orders binary confusion-matrix cells as [[TN, FP], [FN, TP]]
        tn = res[0][0]
        fp = res[0][1]
        fn = res[1][0]
        tp = res[1][1]
        print("Accuracy: ", acs(self.y_test, predicted_output))
        print("TP: ", tp, ", FN: ", fn, ", FP: ", fp, "TN: ", tn)
        print(cr(self.y_test, predicted_output))
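
A less error-prone way to unpack a binary confusion matrix is ravel(), which fixes the cell order explicitly (a sketch with illustrative variable names; sklearn orders the cells as [[TN, FP], [FN, TP]]):

from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()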
Example #9
def predictResult(betterN, x_train, y_train, y_test, x_test):
    data2 = pd.read_csv("/tmp/predict_result.csv", header=0)
    # iterate over the file with the values to be tested, grab the columns, and put the values into an array
    cols2 = data2.columns[(data2.columns != columnResultName)]
    fts2 = np.array(data2[cols2])

    # when no betterN value is passed, it means the model was loaded from disk
    if betterN > 0:
        knn.n_neighbors = betterN
        knn.fit(x_train, y_train)

        # dump(knn, 'models/knn_teste.joblib')

        prFit = knn.predict(x_test)
        print("predicao: a", prFit)
        print("Matriz de Confusao NB:")
        print(cfm(y_test, prFit))
        print("F1 score NB:")
        print(f1s(y_test, prFit))
        print("Precision score NB:")
        print(ps(y_test, prFit))
        print("Recall score NB:")
        print(rs(y_test, prFit))
        print("Classification Report")
        print(cr(y_test, prFit))

    pr1 = knn.predict(fts2)
    print("predico unica", int(pr1[0]))
    print("predicao unica score")
    print(pr1)
    return pr1
Example #10
def check(input_file, gold_file):

	original = [line.strip() for line in open(gold_file, 'r')]
	submission = [line.strip() for line in open(input_file, 'r')]
	target_names = ['normal', 'dos', 'r2l', 'u2r', 'probing']
	target_names = ['type1', 'type2', 'type4', 'type5', 'type3']  # overrides the list above; the graded labels use these names

	for i in range(len(submission)):
		if submission[i] not in target_names:
			submission[i] = ''
	if len(submission) < len(original):
		extra = ['' for x in range(len(original) - len(submission))]
		submission += extra
	elif len(submission) > len(original):
		submission = submission[0:len(original)]
	x = cr(original, submission, digits = 4)
	x = x.split()
	ind5 = x.index('type5')
	score = x[ind5 + 2]       # per-class recall for 'type5'
	j = x.index('avg')
	precision = x[j + 3]      # average precision from the 'avg' row
	if (float(precision) > 0 and float(score) > 0):
		return 0,0,(float(score)+float(precision))*50, "Accepted" 
	else:
		return 0,0,0, "Wrong Answer"
Example #11
def optimal_features_scores(model, n):
    
    model.fit(X_train, y_train)
    
    #Get top n features
    features = pd.DataFrame({'feature':X_train.columns.values, 'importance':model.feature_importances_})
    features_sorted = features.sort_values(by = ['importance'], ascending = False)
    
    #Dataset with only top n features
    important_features = features_sorted['feature'].head(n)
    X_train_feat = X_train.loc[:, important_features]
    X_test_feat = X_test.loc[:, important_features]
    
    model.fit(X_train_feat, y_train)
    
    y_predict = model.predict(X_test_feat)
    
    acc = accuracy_score(y_test, y_predict)
    conf_tree = pd.DataFrame(confusion_matrix(y_test, y_predict),
        columns=['Predicted Benign', 'Predicted Malignant'],
        index=['True Benign', 'True Malignant'])
    print(conf_tree, "\n")
    print("Accuracy: ", acc)
    print("\n")
    #Other metrics to show model quality
    target_names = ['Benign', 'Malignant']
    print(cr(y_test, y_predict, target_names=target_names))
Example #12
def predictResult(x_train, y_train, y_test, x_test):
    data2 = pd.read_csv("/tmp/predict_result.csv", header=0)
    # iterate over the file with the values to be tested, grab the columns, and put the values into an array
    cols2 = data2.columns[(data2.columns != columnResultName)]
    fts2 = data2[cols2]
    fts2 = Normalizer().fit_transform(fts2)

    randomForest.fit(x_train, y_train)

    dump(randomForest, 'randomForest.model')

    randomForestLoaded = load('randomForest.model')
    prFit = randomForestLoaded.predict(x_test)
    print("predicao:", prFit)
    print("Matriz de Confusao LR:")
    print(cfm(y_test, prFit))
    print("F1 score LR:")
    print(f1s(y_test, prFit))
    print("Precision score LR:")
    print(ps(y_test, prFit))
    print("Recall score LR:")
    print(rs(y_test, prFit))
    print("Classification Report")
    print(cr(y_test, prFit))

    pr1 = randomForestLoaded.predict(fts2)
    print("predico unica", pr1)
    return pr1
Example #13
def lrw():
	global log  # log is rebound below, so it must be declared global
	lw(str(clfr) + '\n')
	lw(cr(y_test, y_))
	lw('\n\n')
	lw(str(cm(y_test, y_)))
	lw('\n\n')
	log.close()
	log = open(log_file, "a")
Example #14
def predictResult(x_train, y_train, y_test, x_test):
    data2 = pd.read_csv("/tmp/predict_result.csv", header=0)
    # iterate over the file with the values to be tested, grab the columns, and put the values into an array
    cols2 = data2.columns[(data2.columns != columnResultName)]
    fts2 = data2[cols2]
    fts2 = Normalizer().fit_transform(fts2)

    scores = cross_val_score(logisticR, x_train, y_train, n_jobs=30)
    print("scores cross val")
    print(scores)

    logisticR.fit(x_train, y_train)
    dump(logisticR, 'logistic.model')

    logisticLoaded = load('logistic.model')

    prFit = logisticLoaded.predict(x_test)
    print("predicao:", prFit)
    print("Matriz de Confusao LR:")
    print(cfm(y_test, prFit))
    print("F1 score LR:")
    print(f1s(y_test, prFit))
    print("Precision score LR:")
    print(ps(y_test, prFit))
    print("Recall score LR:")
    print(rs(y_test, prFit))
    print("Classification Report")
    print(cr(y_test, prFit))
    print("Accuracy score")
    print(asc(y_test, prFit))

    class_names = [0, 1]  # name  of classes
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    # create heatmap
    sns.heatmap(pd.DataFrame(cfm(y_test, prFit)),
                annot=True,
                cmap="YlGnBu",
                fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    y_pred_proba = logisticLoaded.predict_proba(x_test)[::, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
    plt.legend(loc=4)
    plt.show()

    pr1 = logisticLoaded.predict(fts2)
    print("predico unica", pr1)
    return pr1
Example #15
def describe_video_data(video_data, answer, n=3):
    video_data = add_system_labels(video_data, answer)
    for v_id in video_data:
        a = video_data[v_id]["mturk"]
        video_data[v_id]["mturk"] = find_most_common(a, n=n)
    df_v = pd.DataFrame.from_dict(video_data, orient="index")
    r = {}
    r["Cohen's kappa (mturk and citizen)"] = cks(df_v["mturk"],
                                                 df_v["citizen"])
    r["Cohen's kappa (mturk and researcher)"] = cks(df_v["mturk"],
                                                    df_v["researcher"])
    r["Cohen's kappa (citizen and researcher)"] = cks(df_v["citizen"],
                                                      df_v["researcher"])
    r["Citizen data performance"] = cr(df_v["researcher"],
                                       df_v["citizen"],
                                       output_dict=True)
    r["MTurk data performance"] = cr(df_v["researcher"],
                                     df_v["mturk"],
                                     output_dict=True)
    return r
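
The dict-form reports returned above tabulate cleanly with pandas (a small follow-on sketch using the names above):

# e.g. inspect per-class precision/recall of the citizen labels
df_report = pd.DataFrame(r["Citizen data performance"]).transpose()
print(df_report[["precision", "recall", "f1-score", "support"]])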
Example #16
def logistic_regression(X, X1, Y, Y1):
    global accuracy_models
    lr = LogisticRegression(multi_class='auto', solver='liblinear')
    lr.fit(X, Y)
    pred = lr.predict(X1)
    print(accuracy_score(Y1, pred))
    accuracy_models.append(accuracy_score(Y1, pred))
    cd = confusion_matrix(Y1, pred, labels=list(range(len(Y1.unique()))))
    print(cd)
    print(cr(Y1, pred))
    return lr
Example #17
def trainModel(classifier, lTrX, lTrY, lTeX, lTeY, is_neural_net=False):
    #Fit the training dataset on the classifier
    classifier.fit(lTrX, lTrY)

    #Predict the labels on validation dataset
    lPreds = classifier.predict(lTeX)

    if is_neural_net:
        lPreds = lPreds.argmax(axis=-1)
    else:
        lNames = list(map(str, dictIdToLab.values()))
        print(cr(lTeY, lPreds, target_names=lNames))

    return accuracy_score(lTeY, lPreds)
Example #18
def classifier(file_name):
    review_sparse_vect, rating_sparse_vect = bag_of_words(file_name)
    # support vector classifier one vs all
    clf = SVC(C=1, kernel='linear', gamma=1, verbose=False, probability=False,
              decision_function_shape='ovr')
    clf.fit(review_sparse_vect, rating_sparse_vect)
    # model fitting completed
    # print("Fitting completed")
    predicted = cv.cross_val_predict(clf, review_sparse_vect,
                                     rating_sparse_vect, cv=10)
    # calculation of metrics
    print("accuracy_score\t", acc_score(rating_sparse_vect, predicted))
    print("precision_score\t", pre_score(rating_sparse_vect, predicted))
    print("recall_score\t", rc_score(rating_sparse_vect, predicted))
    print("\nclassification_report:\n\n", cr(rating_sparse_vect, predicted))
    print("\nconfusion_matrix:\n", cm(rating_sparse_vect, predicted))
Example #19
def random_forest_classifier(X, X1, Y, Y1):
    global accuracy_models
    rfc = RandomForestClassifier(
        criterion='gini',
        n_estimators=200,
        random_state=0,
        max_leaf_nodes=1000,
    )
    rfc.fit(X, Y)
    pred = rfc.predict(X1)
    print(accuracy_score(Y1, pred))
    accuracy_models.append(accuracy_score(Y1, pred))
    cd = confusion_matrix(Y1, pred, labels=list(range(len(Y1.unique()))))
    print(cd)
    print(cr(Y1, pred))
    return rfc
Example #20
def classification_report(y_true, y_pred, **kwargs):
    """ Classification report for sequence labeling.

    Parameters
    ----------
    y_true: 2D np.array or list of lists
        The true/gold sequence labels.

    y_pred: 2D np.array or list of lists
        The predicted labels.

    **kwargs:
        The parameters of the sklearn.metrics.classification_report function.
    """
    y_gold, y_hat = flatten(y_true, y_pred)
    return cr(y_gold, y_hat, **kwargs)
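
The flatten helper is not shown; a minimal implementation consistent with the docstring would chain the per-sequence labels into flat lists (an assumption, not the original helper):

from itertools import chain

def flatten(y_true, y_pred):
    # Concatenate the inner label sequences into two flat lists.
    return list(chain.from_iterable(y_true)), list(chain.from_iterable(y_pred))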
Example #21
def main():

    #-- get the data --#
    dfTrain = pd.read_csv('LabelledData (1).txt',
                          sep=' ,,, ',
                          header=None,
                          engine='python')
    dfTrain.columns = ['ques', 'type']
    trainData, trainLabels = dfTrain.ques.to_list(), dfTrain.type.to_list()  #-- the train data
    """
    #dfTest = pd.read_csv('train_1000.label', sep='\s+', header=None)
    #dfTest = pd.read_csv('LabelledData (1).txt', sep=' ,,, ', header=None, engine='python')
    
    f = open('train_1000.label', 'r', errors='ignore').read().split('\n')
    testData, testLabels, testType = [], [], []
    for line in f[:len(f)-1]:
        short = line[:20]
        tlabel = short.split(':')[0]
        testLabels.append(tlabel)
        ttype = short.split(' ')[0].split(':')[1]
        testType.append(ttype)
        tdata = line[len(tlabel):]
        testData.append(tdata)
    #print(testLabels); print(testType)"""

    cut = 500
    trainDataOld, trainLabelsOld = trainData, trainLabels
    trainData = trainDataOld[:cut]  #int(len(trainData)/2)
    trainLabels = trainLabelsOld[:cut]
    testData = trainDataOld[cut:]
    testLabels = trainLabelsOld[cut:]

    #-- vectorizing and training --#
    vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)
    trainVectors = vectorizer.fit_transform(trainData)
    testVectors = vectorizer.transform(testData)

    #-- performing the classification --#
    model = svm.SVC(kernel='linear')
    model.fit(trainVectors, trainLabels)
    prediction = model.predict(testVectors)

    print(cr(testLabels, prediction))
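
The vectorize-then-classify steps above can also be bundled into a sklearn Pipeline, which keeps the TF-IDF vocabulary tied to the fitted model (an equivalent sketch using the same names, not part of the original):

from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(min_df=4, max_df=0.9)),
    ("svc", svm.SVC(kernel="linear")),
])
pipe.fit(trainData, trainLabels)
print(cr(testLabels, pipe.predict(testData)))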
Example #22
	def benchmark(self, clf, X_train, y_train, X_test, y_test):
		output(80 * '_')

		# fit
		output("Training:")
		t0 = time()
		clf.fit(X_train, y_train)
		train_time = time() - t0
		output("train time: %0.3fs" % train_time)

		# predict
		t0 = time()
		pred = clf.predict(X_test)
		try:
			proba = clf.predict_proba(X_test)
		except AttributeError:
			# classifier does not expose probability estimates
			proba = None
		try:
			log_proba = clf.predict_log_proba(X_test)
		except AttributeError:
			log_proba = None
		test_time = time() - t0
		output("test time:  %0.3fs" % test_time)

		# get metrics for the positive class only (heavy class imbalance)
		# p_score = mlu.get_pos_precision(cm(y_test, pred))
		# r_score = mlu.get_pos_recall(cm(y_test, pred))
		# f_measure = mlu.get_f_measure(p_score, r_score)

		# get metrics
		p_scores, r_scores, f_measures, support = get_scores(y_test, pred, self.beta)
		p_score_avg = p_scores.mean()
		r_score_avg = r_scores.mean()
		f_measure_avg = f_measures.mean()
		output("precision:  %0.3f \trecall:  %0.3f" % (p_score_avg, r_score_avg))

		# output results
		output("Classification results:")
		output(cr(y_test, pred))
		output(cm(y_test, pred))

		clf_descr = str(clf).split('(')[0] # get the name of the classifier from its repr()

		return clf_descr, p_score_avg, r_score_avg, f_measure_avg, train_time, test_time, proba
Example #23
def k_nearest_neighbors(X, X1, Y, Y1):
    global accuracy_models
    nn = range(3, 11)
    # Empty list that will hold cv scores
    cv_scores = []
    # Perform 5-fold cross validation
    # ---------------------------------
    for k in nn:
        knn = sk.neighbors.KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, X, Y, cv=5, scoring='accuracy')
        cv_scores.append(scores.mean())
    optimal_k = nn[cv_scores.index(max(cv_scores))]
    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=optimal_k)
    knn.fit(X, Y)
    pred = knn.predict(X1)
    print(accuracy_score(Y1, pred))
    accuracy_models.append(accuracy_score(Y1, pred))
    cd = confusion_matrix(Y1, pred, labels=list(range(len(Y1.unique()))))
    print(cd)
    print(cr(Y1, pred))
    return knn
Example #24
def evaluate(model, iterator, criterion):

    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0
    y_tot = np.array([])
    pred_tot = np.array([])
    model.eval()

    with torch.no_grad():

        for batch in iterator:
            text = batch.text[0]

            predictions = model(text)
            # 	  predictions=predictions.reshape([predictions.shape[0]])
            target = batch.label

            #         target = torch.autograd.Variable(target).long()
            target = target.reshape([target.shape[0], 1])
            loss = criterion(predictions, target)

            acc, f1, y_mini, pred_mini = binary_accuracy(predictions, target)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_f1 += f1
            y_tot = np.concatenate([y_tot, y_mini.flatten()])
            pred_tot = np.concatenate([pred_tot, pred_mini.flatten()])
    f1 = f1_score(y_tot, pred_tot, average='binary')
    f1_macro = f1_score(y_tot, pred_tot, average='macro')
    precision = precision_score(y_tot, pred_tot, average='binary')
    print(len(y_tot))
    print(cr(y_tot, pred_tot))
    print(cm(y_tot, pred_tot))
    return epoch_loss / len(iterator), epoch_acc / len(
        iterator), epoch_f1 / len(iterator), f1, f1_macro, precision
Example #25
cf = confusion_matrix(test_y, pred_y, labels=labels)  # y_true first, per sklearn convention
print(cf)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_x, test_y)))

# confusion matrix with details
# -----------------------------
ty = list(test_y)
py = list(pred_y)
cm1 = ConfusionMatrix(ty, py)  # actual labels first, then predicted
print(cm1)
cm1.print_stats()
cm1.plot()

# Classification report : precision, recall, F-score
# ---------------------------------------------------
print(cr(test_y, pred_y))


#model number 2

# RFE (recursive feature elimination)
# -----------------------------------
logreg = LogisticRegression()

# sklearn.feature_selection.RFE
# (estimator, n_features_to_select=None, step=1, verbose=0)
# get the best 150 features
rfe = RFE(logreg, 150)
rfe = rfe.fit(data[X], data[Y] )
support = rfe.support_
ranking = rfe.ranking_
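
To see which columns survive the elimination, index the feature names with the boolean mask (a small follow-on sketch, assuming data[X] is a DataFrame of the candidate features):

selected = data[X].columns[support]  # support is the boolean mask from rfe.support_
print(list(selected))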
Example #26
def start_split_data(data_list):
    random_list = dc(data_list)
    random.shuffle(random_list)
    predicted_list = []
    mark = 0
    acc_list = []
    act_class_list = []
    for i in range(10):  # fold range
        test_list = []
        training_list = []
        while (mark < int(len(random_list))):
            for train_ele in range(0, mark):
                training_list.append(random_list[train_ele])
            else:
                index = mark
                mark = int(len(random_list) / 10) + index
                for test_element in range(index, mark):
                    test_list.append(random_list[test_element])
                for training_element in range(mark, int(len(random_list))):
                    training_list.append(random_list[training_element])
                    # print(training_list)
                    # fold completion
                Node.children = []
                Node.leaf_children = []
                Node.temp_children = []
                Node.new_children = []
                Node.len_training_list = len(training_list)
                Node.old_pessi_err = (node_err_cal(training_list, max_class(
                    training_list, class_column), class_column) + 1) / \
                                     Node.len_training_list
                root = Node(training_list)
                # print(root.data)
                root.node_type = 'root'
                build_tree(root)
                predicted_temp_list = []
                actual_list = []
                temp_root = dc(root)
                for test_element in test_list:
                    actual_list.append(int(test_element[class_column]))
                    found = int(class_finder(test_element, temp_root))
                    predicted_temp_list.append(found)
                    predicted_list.append(found)
                acc_list.append(
                    accuracy(actual_list, predicted_temp_list, class_column))
                break
    print(mean(acc_list))
    act_class_list = class_list_gen(random_list)
    # print(len(act_class_list),len(predicted_list))
    while (len(act_class_list) > len(predicted_list)):
        del act_class_list[-1]
    c_matrix = cm(act_class_list, predicted_list)
    print('Confusion matrix\n', c_matrix)
    c_report = cr(act_class_list, predicted_list)
    print("All Measures required for this data set \n", c_report)
    fpr, tpr, thd = rc(act_class_list, predicted_list)
    roc_auc = auc(fpr, tpr)
    if formula_input == 2:
        plt.title('ROC for %s with information gain(red) and gini(blue)'
                  % file_name[0])
        plt.plot(fpr, tpr,
                 label='%s  AUC = %0.2f' % (formula_measure, roc_auc))
        plt.legend(loc='lower right')
    else:
        plt.title('ROC for %s ' % file_name[0])
        plt.plot(fpr, tpr, label='%s  AUC = %0.2f' % (formula_measure,
                                                      roc_auc))
        plt.plot(fpr, tpr, label='AUC = %0.2f' % roc_auc)
        plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
Example #27
c1 = len(p1[p1 <= 0.5])
c1
c2 = len(p1[p1 > 0.5])
c2
print("<=0.5  {} , >0.5   {}  ".format(c1, c2))

predy = p1.copy(deep=True)

predy[predy <= 0.5] = 0
predy[predy > 0.5] = 1
predy.value_counts()

#confusion matrix
ConfusionMatrix(testy, predy)
print(cr(testy, predy))

#roc
from sklearn import metrics
fpr, tpr, threshold = metrics.roc_curve(testy, predy)
#auc
roc_auc = metrics.auc(fpr, tpr)
#plot
plt.title('Receiver Operating Characteristics')
plt.plot(fpr, tpr, 'b', label='AUC=%0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
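
Note that predy is thresholded at 0.5 before being passed to roc_curve, so the curve above degenerates to a single operating point; feeding the raw probabilities p1 traces the full curve (a sketch using the variables above):

fpr, tpr, threshold = metrics.roc_curve(testy, p1)  # scores, not 0/1 labels
roc_auc = metrics.auc(fpr, tpr)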
Example #28
for i in range(0, length):
    if y_results[i] <= 0.5:
        y_results[i] = 0
    else:
        y_results[i] = 1

# accuracy score
print(accuracy_score(test_y, y_results) * 100)

# confusion matrix
cm = ConfusionMatrix(list(test_y), list(y_results))  # actual labels first, then predicted
print(cm)
cm.print_stats()

# Classification report : precision, recall, F-score
print(cr(test_y, y_results))

# draw the ROC curve
from sklearn import metrics
import matplotlib.pyplot as plt

fpr, tpr, threshold = metrics.roc_curve(test_y, y_results)
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
Example #29
                tags.extend([list2])
                list1 = []
                list2 = []
                i = 0
if __name__ == '__main__':

    words = []  #matrix of words each row containing sentence
    tags = []   # matrix of tags corresponding to each word

    process_data(sys.argv[1],words,tags)
    W2V = Build_W2V(words,tags)
    #WV.TrainModel()

    data_input = W2V.wordvec
    data_output = W2V.lisvec
    xtrain = data_input[:9000]
    ytrain = data_output[:9000]
    xtest = data_input[9000:]
    ytest = data_output[9000:]
    #X,y1 = collect_data(file_tra)
    #print(X.shape)
    clf = MLPClassifier(n_hidden=50, learning_rate=0.01, SGD=True)
    clf.fit(xtrain, ytrain, max_epochs=200)
    #D,O = collect_data(file_test)
    pred = clf.predict(xtest)
    #print("O: ",O)
    #print("pred: ",pred)
    print("0: ",cr(ytest, pred))

Example #30
# @author: Mukul

import os
os.chdir("C:/Users/Mukul/Documents")
import pandas as pd
import numpy as np
tel_data = pd.read_csv("Telecom_Data.csv")

# set the dependent and independent variables
tel_data.columns
x = tel_data.drop(["phone number", "churn"], axis=1)
y = tel_data[["churn"]]
x = pd.get_dummies(x)

#divide train test
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

#random forest model
from sklearn.ensemble import RandomForestClassifier as rf
model = rf()
model.fit(xtrain, ytrain)
#apply to testset
pred_churn = model.predict(xtest)

from sklearn.metrics import classification_report as cr
print(cr(ytest, pred_churn))

#validation
Example #31
    partial_train_data, validation_data, test_data)

############################### SELECT CNN MODEL ##############################
select_model = 0
while select_model < 1 or select_model > 2:
    select_model = int(input("Select CNN Model [1-2]: "))

if select_model == 1:
    model_1, result_1 = optimize(cnn_model_one())  # Train CNN Model 1

    prediction_1 = model_1.predict(test_data)
    print("Evaluate Test")
    model_1.evaluate(test_data, test_label)
    print(
        cr(test_label.argmax(axis=1),
           prediction_1.argmax(axis=1),
           target_names=label_names))  # Classification Report

    # Training and Validation Curves
    training_and_validation_accuracy(result_1)
    training_and_validation_loss(result_1)

    # Confusion Matrix Visualization
    prediction_class_1 = np.argmax(
        prediction_1, axis=1)  # Convert predictions classes to one hot vectors
    test_label_cfm = np.argmax(
        test_label,
        axis=1)  # Convert validation observations to one hot vectors
    confusion_mtx = cfm(test_label_cfm,
                        prediction_class_1)  # Compute the confusion matrix
    plot_confusion_matrix(confusion_mtx,
Example #32
#
#plt.figure(figsize=(11,6))
#sb.countplot(x="purpose", hue="not.fully.paid", data=df, palette="Set1")
#

#df = df.drop(["purpose"],axis=1)

c = np.arange(0, 19, dtype=int)
c = np.delete(c, 12)

X = final_data.iloc[:, c].values
y = final_data.iloc[:, 12].values

# the DataFrame versions below supersede the NumPy arrays above
X = final_data.drop("not.fully.paid", axis=1)
y = final_data["not.fully.paid"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.2,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=256, n_jobs=-1)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report as cr
classification_report = cr(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
Example #33
		l = line.split("\t")
		if not l[0] == '\n':
			s = str(l[1])
			s = s[0:len(s)-1]
			out.append(s)
label = pd.Series(ans).unique()
print("\nlabels: " + str(label))
prf = pr(ans, out, labels=label, beta=1, average='weighted')
print("\nPrecision: " + str(prf[0]))
print("\nRecall: " + str(prf[1]))
print("\nF Score: " + str(prf[2]))
acp = acc(ans, out, normalize=True)
act = acc(ans, out, normalize=False)
acp = acp * 100
print("\nAccuracy: " + str(acp) + "%      " + str(act) + "/" + str(len(ans)))
report = cr(ans, out, labels=label)
print(str(report))
prf = pr(ans, out, labels=label, beta=1, average=None)
sc = pd.DataFrame(index=['Precision', 'Recall', 'F Score', 'Support'], columns=label)
sc[:] = prf[:]
sc = pd.DataFrame.transpose(sc)
print("\n\n")
print(str(sc))
arr = cm(ans, out, labels=label)
mat = pd.DataFrame(index=label, columns=label)
mat[:] = arr[:]
print("\n\n")
print(u"\u2193" + "Actual/Predicted-->")
print(str(mat))
fA.close()
fO.close()
Example #34
    elif r == 3:
        print("PCW")
    elif r == 4:
        print("Stoppage")


res(10, 8.180509567, 77.418396)

# In[50]:

#Evaluation

from sklearn.metrics import classification_report as cr, confusion_matrix as cm, accuracy_score as acc_s

print("Confusion Matrix : \n", cm(Y_test, Y_pred))
print("\n\nClassification report : \n", cr(Y_test, Y_pred))
print("\n\nAccuracy : ", acc_s(Y_test, Y_pred) * 100)

# In[51]:

res(3, 8.178689957, 77.42429352)

# In[18]:

#Finding suitable k

e = []

for i in range(1, 50):
    knn = KNC(n_neighbors=i)
    knn.fit(X_train, Y_train)
Example #35
def main(train_file, test_file, load_method="csv", opti_method=None, maxiter=100,
		 batch_size=-1, units=None, lmbda=0, alpha=100, beta=1000):
	"""
	Manages files and operations for the neural network model creation, training, and testing.
	@parameters: 
		load_method - the dataset file format, either "csv" or "hdf"
		opti_method - specifies the optimization method to use, "l-bfgs", "cg", or
					   None (defaults to SGD)
		maxiter - the maximum number of iterations allowed for training
		batch_size - the number of instance for each mini-batch, -1 implies batch processing
		units - a sequence of integers separated by '.' such that each integer represents 
				 the number of units in a sequence of hidden layers.
		lmbda - the regularization term
		alpha - the numerator for the learning rate schedule (relevant for SGD only)
		beta - the denominator for the learning rate schedule (relevant for SGD only)
	"""
	# open and load csv files
	if load_method == "csv":
		X_train, y_train = mlu.load_csv(train_file, True) # load and shuffle training set
		X_test, y_test = mlu.load_csv(test_file)
	elif load_method == "hdf":
		X_train, y_train = mlu.loadh(train_file, True) # load and shuffle training set
		X_test, y_test = mlu.loadh(test_file)
	else:
		raise Exception("Dataset file type not recognized: acceptable formats are 'csv' and 'hfd'.")

	# perform feature scaling
	X_train = mlu.scale_features(X_train, 0.0, 1.0)
	X_test = mlu.scale_features(X_test, 0.0, 1.0)

	# create the neural network classifier using the training data
	NNC = NeuralNetClassifier(opti_method, maxiter, batch_size, units, lmbda, alpha, beta)
	print "\nCreated a neural network classifier\n\t", NNC

	# fit the model to the loaded training data
	print "\nFitting the training data..."
	# costs, mags = NNC.fit(X_train, y_train)
	NNC.fit(X_train, y_train)

	# predict the results for the test data
	print "\nGenerating probability prediction for the test data..."
	y_pred = NNC.predict(X_test)

	### output classification results ###
	# output class prediction probability for each instance in the test set
	print("\nThe probabilities for each instance in the test set are:\n")
	for prob in NNC.predict_proba(X_test):
		print(prob)
	# output accuracy
	print('Accuracy: ', mlu.compute_accuracy(y_test, y_pred))

	# output sklearn style results if the module is available
	try:
		from sklearn.metrics import classification_report as cr
		from sklearn.metrics import confusion_matrix as cm
		print()
		print("Classification results:")
		print(cr(y_test, y_pred))
		print(cm(y_test, y_pred))
	except ImportError:
		pass

	# save model parameters as a pickle
	NNC.save_model("NNCModel.p")
Example #36
def classify(model, featureVectors):
    z = model.predict(featureVectors[:, :-1]).astype(int).reshape(-1).tolist()
    data = featureVectors[:, -1].flatten()
    data = data.astype(int).tolist()
    print(cr(data, z, target_names=["DOS", "Normal", "Probing", "R2L", "U2R"], digits=4))
Example #37
    def fit_worker(self, rank, world_size, p_model, save_model_path, save_tensorboard_path, save_log_path,
            p_frame, p_metadata_train, p_metadata_validation, p_metadata_test):
        # Set logger
        save_log_path += str(rank)
        self.create_logger(log_path=save_log_path)
        self.log("="*60)
        self.log("="*60)
        self.log("Use Two-Stream Inflated 3D ConvNet learner")
        self.log("save_model_path: " + save_model_path)
        self.log("save_tensorboard_path: " + save_tensorboard_path)
        self.log("save_log_path: " + save_log_path)
        self.log("p_metadata_train: " + p_metadata_train)
        self.log("p_metadata_validation: " + p_metadata_validation)
        self.log("p_metadata_test: " + p_metadata_test)
        self.log_parameters()

        # Set model
        model = self.set_model(rank, world_size, self.mode, p_model, self.can_parallel, phase="train")
        if model is None: return None

        # Load datasets
        metadata_path = {"train": p_metadata_train, "validation": p_metadata_validation}
        ts = self.get_transform(self.mode, image_size=self.image_size)
        transform = {"train": ts, "validation": ts}
        if self.augment:
            transform["train"] = self.get_transform(self.mode, phase="train", image_size=self.image_size)
        dataloader = self.set_dataloader(rank, world_size, metadata_path, p_frame,
                transform, self.batch_size_train, self.can_parallel)

        # Create tensorboard writer
        writer_t = SummaryWriter(save_tensorboard_path + "/train/")
        writer_v = SummaryWriter(save_tensorboard_path + "/validation/")

        # Set optimizer
        optimizer = optim.SGD(model.parameters(), lr=self.init_lr, momentum=self.momentum, weight_decay=self.weight_decay)
        lr_sche = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.milestones, gamma=self.gamma)

        # Set logging format
        log_fm = "%s step: %d lr: %r loc_loss: %.4f cls_loss: %.4f loss: %.4f"

        # Train and validate
        steps = 0
        epochs = 0
        nspu = self.num_steps_per_update
        nspc = self.num_steps_per_check
        nspu_nspc = nspu * nspc
        accum = {} # counter for accumulating gradients
        tot_loss = {} # total loss
        tot_loc_loss = {} # total localization loss
        tot_cls_loss = {} # total classification loss
        pred_labels = {} # predicted labels
        true_labels = {} # true labels
        for phase in ["train", "validation"]:
            accum[phase] = 0
            tot_loss[phase] = 0.0
            tot_loc_loss[phase] = 0.0
            tot_cls_loss[phase] = 0.0
            pred_labels[phase] = []
            true_labels[phase] = []
        while steps < self.max_steps:
            # Each epoch has a training and validation phase
            for phase in ["train", "validation"]:
                self.log("-"*40)
                self.log("phase " + phase)
                if phase == "train":
                    epochs += 1
                    self.log("epochs: %d steps: %d/%d" % (epochs, steps, self.max_steps))
                    model.train(True) # set model to training mode
                    for param in model.parameters():
                        param.requires_grad = True
                else:
                    model.train(False) # set model to evaluate mode
                    for param in model.parameters():
                        param.requires_grad = False
                optimizer.zero_grad()
                # Iterate over batch data
                for d in tqdm.tqdm(dataloader[phase]):
                    if self.code_testing:
                        if phase == "train" and steps >= self.max_steps: break
                        if phase == "validation" and accum[phase] >= self.max_steps: break
                    accum[phase] += 1
                    # Get prediction
                    frames = self.to_variable(d["frames"])
                    labels = d["labels"]
                    true_labels[phase] += self.labels_to_list(labels)
                    labels = self.to_variable(labels)
                    pred = self.make_pred(model, frames)
                    pred_labels[phase] += self.labels_to_list(pred.cpu().detach())
                    # Compute localization loss
                    loc_loss = F.binary_cross_entropy_with_logits(pred, labels)
                    tot_loc_loss[phase] += loc_loss.data
                    # Compute classification loss (with max-pooling along time, batch x channel x time)
                    cls_loss = F.binary_cross_entropy_with_logits(torch.max(pred, dim=2)[0], torch.max(labels, dim=2)[0])
                    tot_cls_loss[phase] += cls_loss.data
                    # Backprop
                    loss = (0.5*loc_loss + 0.5*cls_loss) / nspu
                    tot_loss[phase] += loss.data
                    if phase == "train":
                        loss.backward()
                    # Accumulate gradients during training
                    if (accum[phase] == nspu) and phase == "train":
                        steps += 1
                        if steps % nspc == 0:
                            # Log learning rate and loss
                            lr = lr_sche.get_lr()[0]
                            tll = tot_loc_loss[phase]/nspu_nspc
                            tcl = tot_cls_loss[phase]/nspu_nspc
                            tl = tot_loss[phase]/nspc
                            self.log(log_fm % (phase, steps, lr, tll, tcl, tl))
                            # Add to tensorboard
                            if rank == 0:
                                writer_t.add_scalar("localization_loss", tll, global_step=steps)
                                writer_t.add_scalar("classification_loss", tcl, global_step=steps)
                                writer_t.add_scalar("loss", tl, global_step=steps)
                                writer_t.add_scalar("learning_rate", lr, global_step=steps)
                            # Reset loss
                            tot_loss[phase] = tot_loc_loss[phase] = tot_cls_loss[phase] = 0.0
                        # Reset gradient accumulation
                        accum[phase] = 0
                        # Update learning rate and optimizer
                        optimizer.step()
                        optimizer.zero_grad()
                        lr_sche.step()
                    # END FOR LOOP
                if phase == "validation":
                    # Log learning rate and loss
                    lr = lr_sche.get_lr()[0]
                    tll = tot_loc_loss[phase]/accum[phase]
                    tcl = tot_cls_loss[phase]/accum[phase]
                    tl = (tot_loss[phase]*nspu)/accum[phase]
                    # Sync losses for validation set
                    if self.can_parallel:
                        tll_tcl_tl = torch.Tensor([tll, tcl, tl]).cuda()
                        dist.all_reduce(tll_tcl_tl, op=dist.ReduceOp.SUM)
                        tll = tll_tcl_tl[0].item() / world_size
                        tcl = tll_tcl_tl[1].item() / world_size
                        tl = tll_tcl_tl[2].item() / world_size
                    self.log(log_fm % (phase, steps, lr, tll, tcl, tl))
                    # Add to tensorboard and save model
                    if rank == 0:
                        writer_v.add_scalar("localization_loss", tll, global_step=steps)
                        writer_v.add_scalar("classification_loss", tcl, global_step=steps)
                        writer_v.add_scalar("loss", tl, global_step=steps)
                        writer_v.add_scalar("learning_rate", lr, global_step=steps)
                        self.save(model, save_model_path + str(steps) + ".pt")
                    # Reset loss
                    tot_loss[phase] = tot_loc_loss[phase] = tot_cls_loss[phase] = 0.0
                    # Reset gradient accumulation
                    accum[phase] = 0
                    # Save precision, recall, and f-score to the log and tensorboard
                    for ps in ["train", "validation"]:
                        # Sync true_labels and pred_labels for validation set
                        if self.can_parallel and ps == "validation":
                            true_pred_labels = torch.Tensor([true_labels[ps], pred_labels[ps]]).cuda()
                            true_pred_labels_list = [torch.ones_like(true_pred_labels) for _ in range(world_size)]
                            dist.all_gather(true_pred_labels_list, true_pred_labels)
                            true_pred_labels = torch.cat(true_pred_labels_list, dim=1)
                            true_labels[ps] = true_pred_labels[0].cpu().numpy()
                            pred_labels[ps] = true_pred_labels[1].cpu().numpy()
                        self.log("Evaluate performance of phase: %s\n%s" % (ps, cr(true_labels[ps], pred_labels[ps])))
                        if rank == 0:
                            result = prfs(true_labels[ps], pred_labels[ps], average="weighted")
                            writer = writer_t if ps == "train" else writer_v
                            writer.add_scalar("precision", result[0], global_step=steps)
                            writer.add_scalar("recall", result[1], global_step=steps)
                            writer.add_scalar("weighted_fscore", result[2], global_step=steps)
                        # Reset
                        pred_labels[ps] = []
                        true_labels[ps] = []

        # Clean processors
        self.clean_mp()

        self.log("Done training")
Example #38
training_curve_title = 'Support vector classifier'
train_val_split_folds = 5
train_sizes = np.linspace(0.04, 1.0, 20)

#plot_learning_curve(estimator, X_train_val, np.ravel(y_train_val), title = training_curve_title, cv=train_val_split_folds,train_sizes = train_sizes)
#plt.show()

#Plot a cross-validation curve
CV_curve_title = 'Support vector classifier'
CV_param_name = 'C'
#CV_param_name = 'gamma'
CV_param_range = np.array([0.001, 0.01, 0.1, 1, 10, 100])

#plot_validation_curve(estimator, X_train_val, np.ravel(y_train_val), CV_param_name, CV_param_range, title = CV_curve_title, xlabel = 'Parameter', ylabel = 'Score')
#plt.show()

#VALIDATION
#Display the error metrics on the training data
class_names = ['Diseased', 'Survived']
class_rep_train = cr(y_train, y_pred_train_self, target_names=class_names)
print("Performance on training data:")
print(class_rep_train)

#Compare the predictions with the real values
class_rep_val = cr(y_val, y_pred_train, target_names=class_names)
print("Performance on validation data:")
print(class_rep_val)

#Generate predictions from the test set
#y_pred_test = estimator.predict(X_test)
Example #39
    def test_worker(self, rank, world_size, p_model, save_log_path, p_frame, save_viz_path, p_metadata_test):
        # Set logger
        save_log_path += str(rank)
        self.create_logger(log_path=save_log_path)
        self.log("="*60)
        self.log("="*60)
        self.log("Use Two-Stream Inflated 3D ConvNet learner")
        self.log("Start testing with mode: " + self.mode)
        self.log("save_log_path: " + save_log_path)
        self.log("save_viz_path: " + save_viz_path)
        self.log("p_metadata_test: " + p_metadata_test)
        self.log_parameters()

        # Set model
        model = self.set_model(rank, world_size, self.mode, p_model, self.can_parallel, phase="test")
        if model is None: return None

        # Load dataset
        metadata_path = {"test": p_metadata_test}
        transform = {"test": self.get_transform(self.mode, image_size=self.image_size)}
        dataloader = self.set_dataloader(rank, world_size, metadata_path, p_frame,
                transform, self.batch_size_test, self.can_parallel)

        # Test
        model.train(False) # set the model to evaluation mode
        file_name = []
        true_labels = []
        pred_labels = []
        true_scores = []
        pred_scores = []
        counter = 0
        with torch.no_grad():
            # Iterate over batch data
            for d in dataloader["test"]:
                if counter % 5 == 0:
                    self.log("Process batch " + str(counter))
                counter += 1
                file_name += d["file_name"]
                frames = self.to_variable(d["frames"])
                labels = d["labels"]
                true_labels += self.labels_to_list(labels)
                true_scores += self.labels_to_score_list(labels)
                labels = self.to_variable(labels)
                pred = self.make_pred(model, frames)
                pred = pred.cpu().detach()
                pred_labels += self.labels_to_list(pred)
                pred_scores += self.labels_to_score_list(pred)

        # Sync true_labels and pred_labels for testing set
        true_labels_all = np.array(true_labels)
        pred_labels_all = np.array(pred_labels)
        true_scores_all = np.array(true_scores)
        pred_scores_all = np.array(pred_scores)

        if self.can_parallel:
            true_pred_labels = torch.Tensor([true_labels, pred_labels, true_scores, pred_scores]).cuda()
            true_pred_labels_list = [torch.ones_like(true_pred_labels) for _ in range(world_size)]
            dist.all_gather(true_pred_labels_list, true_pred_labels)
            true_pred_labels = torch.cat(true_pred_labels_list, dim=1)
            true_labels_all = true_pred_labels[0].cpu().numpy()
            pred_labels_all = true_pred_labels[1].cpu().numpy()
            true_scores_all = true_pred_labels[2].cpu().numpy()
            pred_scores_all = true_pred_labels[3].cpu().numpy()

        # Save precision, recall, and f-score to the log
        self.log("Evaluate performance of phase: test\n%s" % (cr(true_labels_all, pred_labels_all)))

        # Save roc curve and score
        self.log("roc_auc_score: %s" % str(roc_auc_score(true_scores_all, pred_scores_all, average=None)))

        # Generate video summary and show class activation map
        # TODO: this part will cause an error when using multiple GPUs
        try:
            # Video summary
            cm = confusion_matrix_of_samples(true_labels, pred_labels, n=64)
            write_video_summary(cm, file_name, p_frame, save_viz_path + str(rank) + "/")
            # Save confusion matrix
            cm_all = confusion_matrix_of_samples(true_labels, pred_labels)
            for u in cm_all:
                for v in cm_all[u]:
                    for i in range(len(cm_all[u][v])):
                        idx = cm_all[u][v][i]
                        cm_all[u][v][i] = file_name[idx]
            save_json(cm_all, save_viz_path + str(rank) + "/confusion_matrix_of_samples.json")
        except Exception as ex:
            self.log(ex)

        # Clean processors
        self.clean_mp()

        self.log("Done testing")