def main():
    """Train an RBF-kernel SVM on the CSV datasets and plot its evaluation.

    Reads ``../train_dataset.csv`` and ``../test_dataset.csv``, uses column 0
    as the label and columns 2 onward as the features, fits the classifier,
    prints the test score and passes the predictions to the ``stdfunc``
    plotting helpers.
    """
    train_frame = pd.read_csv('../train_dataset.csv')
    test_frame = pd.read_csv('../test_dataset.csv')

    # Column 0 holds the target; column 1 is not used; the rest are features.
    train_features = train_frame.iloc[:, 2:]
    train_labels = train_frame.iloc[:, 0]
    test_features = test_frame.iloc[:, 2:]
    test_labels = test_frame.iloc[:, 0]

    # Sorted list of the distinct labels seen in training, used as axis names.
    label_names = train_labels.unique().tolist()
    label_names.sort()

    print(train_features)
    print(test_features)

    # hyper-parameters inferred by running skopt
    svm_params = {
        'C': 447.81051228628013,
        'coef0': 0.12426850569436687,
        'decision_function_shape': "ovr",
        'degree': 2,
        'gamma': 0.02413100813767344,
        'kernel': "rbf",
        'tol': 0.004948161298923479,
        'verbose': True,
    }
    model = SVC(**svm_params)
    model.fit(train_features, train_labels)

    test_score = model.score(test_features, test_labels)
    print("\n\n{}\n".format(test_score))

    predictions = model.predict(test_features)

    report_args = (test_labels, predictions)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        *report_args,
        ml_name='SVM',
        classes=label_names,
        title='Confusion matrix for SVM evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        *report_args,
        ml_name='SVM',
        classes=label_names,
        title='Classification report for SVM evaluation')
Beispiel #2
0
def main():
    """Train a default decision tree on the CSV datasets and plot its results.

    Column 0 of each CSV is the label and columns 2 onward are the features.
    The test score is printed, then the confusion matrix and classification
    report figures are produced through ``stdfunc``.
    """
    train_frame = pd.read_csv('../train_dataset.csv')
    test_frame = pd.read_csv('../test_dataset.csv')

    train_features = train_frame.iloc[:, 2:]
    train_labels = train_frame.iloc[:, 0]
    test_features = test_frame.iloc[:, 2:]
    test_labels = test_frame.iloc[:, 0]

    # Distinct training labels in sorted order, used to name the plot axes.
    label_names = train_labels.unique().tolist()
    label_names.sort()

    tree = DecisionTreeClassifier()
    tree.fit(train_features, train_labels)

    test_score = tree.score(test_features, test_labels)
    print("\n\n{}\n".format(test_score))

    predictions = tree.predict(test_features)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        test_labels, predictions, ml_name='DT', classes=label_names,
        title='Confusion matrix for Decision Tree evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        test_labels, predictions, ml_name='DT', classes=label_names,
        title='Classification report for Decision Tree evaluation')
def main():
    """Train a gradient-boosting classifier and plot its evaluation figures.

    Reads ``../train_dataset.csv`` and ``../test_dataset.csv``; column 0 is
    the label and columns 2 onward are the features. Prints the test score
    and produces the confusion matrix and classification report figures via
    ``stdfunc``, labelled as Gradient Boosting (they were previously labelled
    ``DT`` / "Decision Tree", a copy/paste slip from the decision-tree script).
    """
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())

    # hyper-parameters inferred by running auto-sklearn
    # NOTE(review): criterion='mse' is rejected by recent scikit-learn
    # releases; confirm the installed version before running.
    clf = GradientBoostingClassifier(
        learning_rate=0.0433556140045585,
        n_estimators=388,
        subsample=0.8291104221904706,
        criterion='mse',
        min_samples_split=13,
        min_samples_leaf=15,
        max_depth=10,
        max_features=0.33000096635982235,
        verbose=True)

    # Alternative hyper-parameters inferred by running hyperopt-sklearn:
    #   criterion="mse", learning_rate=0.28539836866041823, max_depth=9,
    #   max_features=0.3842196341383438, min_samples_leaf=14,
    #   min_samples_split=9, n_estimators=734, subsample=0.7421091918485163

    clf.fit(X_train, y_train)

    print("\n\n{}\n".format(clf.score(X_test, y_test)))

    y_predicted = clf.predict(X_test)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test, y_predicted, ml_name='GB',
        classes=unique_labels,
        title='Confusion matrix for Gradient Boosting evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test, y_predicted, ml_name='GB',
        classes=unique_labels,
        title='Classification report for Gradient Boosting evaluation')
def main():
    """Train a Gaussian process classifier and plot its evaluation figures.

    Column 0 of each CSV is the label and columns 2 onward are the features.
    The test score is printed, then the confusion matrix and classification
    report figures are generated through ``stdfunc``.
    """
    train_frame = pd.read_csv('../train_dataset.csv')
    test_frame = pd.read_csv('../test_dataset.csv')

    train_features = train_frame.iloc[:, 2:]
    train_labels = train_frame.iloc[:, 0]
    test_features = test_frame.iloc[:, 2:]
    test_labels = test_frame.iloc[:, 0]

    # Distinct training labels in sorted order, used to name the plot axes.
    label_names = train_labels.unique().tolist()
    label_names.sort()

    gp_model = GaussianProcessClassifier(
        max_iter_predict=500,
        warm_start=True,
        n_jobs=-1,
    )
    gp_model.fit(train_features, train_labels)

    test_score = gp_model.score(test_features, test_labels)
    print("\n\n{}\n".format(test_score))

    predictions = gp_model.predict(test_features)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        test_labels,
        predictions,
        ml_name='GP',
        classes=label_names,
        title='Confusion matrix for Gaussian Process evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        test_labels,
        predictions,
        ml_name='GP',
        classes=label_names,
        title='Classification report for Gaussian Process evaluation')
Beispiel #5
0
def main():
    """Train a random forest on the CSV datasets and plot its evaluation.

    Reads ``../train_dataset.csv`` and ``../test_dataset.csv``; column 0 is
    the label and columns 2 onward are the features. Prints the test score
    and produces the confusion matrix and classification report figures via
    ``stdfunc``.
    """
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())

    # hyper-parameters inferred from running hyperopt-sklearn
    # `min_impurity_split=None` is intentionally not passed: the parameter was
    # removed from scikit-learn and passing it raises TypeError on current
    # releases, while None was already its default where it still existed.
    clf = RandomForestClassifier(bootstrap=False,
                                 class_weight=None,
                                 criterion='entropy',
                                 max_depth=None,
                                 max_features='sqrt',
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.0,
                                 min_samples_leaf=1,
                                 min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=75,
                                 n_jobs=1,
                                 oob_score=False,
                                 random_state=1,
                                 verbose=False,
                                 warm_start=False)

    clf.fit(X_train, y_train)

    print("\n\n{}\n".format(clf.score(X_test, y_test)))

    y_predicted = clf.predict(X_test)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test,
        y_predicted,
        ml_name='RF',
        classes=unique_labels,
        title='Confusion matrix for Random Forest evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test,
        y_predicted,
        ml_name='RF',
        classes=unique_labels,
        title='Classification report for Random Forest evaluation')
def main():
    """Train an XGBoost classifier on the CSV datasets and plot its results.

    Column 0 of each CSV is the label and columns 2 onward are the features,
    both taken as plain arrays. Labels are integer-encoded with a
    ``LabelEncoder`` fitted on the training labels before training; the
    original label values are kept (sorted) for naming the plot axes.
    """
    train_frame = pd.read_csv('../train_dataset.csv')
    test_frame = pd.read_csv('../test_dataset.csv')

    train_features = train_frame.iloc[:, 2:].values
    raw_train_labels = train_frame.iloc[:, 0].values
    test_features = test_frame.iloc[:, 2:].values
    raw_test_labels = test_frame.iloc[:, 0].values

    # Collected before encoding so the plots show the original label values.
    label_names = sorted(set(raw_train_labels.tolist()))

    encoder = preprocessing.LabelEncoder()
    encoder.fit(raw_train_labels)
    train_targets = encoder.transform(raw_train_labels)
    test_targets = encoder.transform(raw_test_labels)

    # hyper-parameters inferred by running hyperopt-sklearn
    xgb_params = {
        'colsample_bylevel': 0.8737745469231419,
        'colsample_bytree': 1.0,
        'gamma': 4.858229599937319e-07,
        'learning_rate': 0.4853267733199465,
        'max_delta_step': 0,
        'max_depth': 9,
        'min_child_weight': 0,
        'n_estimators': 64,
        'reg_alpha': 2.5693931492543614e-05,
        'reg_lambda': 6.027978487395207e-05,
        'scale_pos_weight': 73.0915750362818,
        'subsample': 0.5410531887103683,
    }
    booster = XGBClassifier(**xgb_params)
    booster.fit(train_features, train_targets)

    test_score = booster.score(test_features, test_targets)
    print("\n\n{}\n".format(test_score))

    predictions = booster.predict(test_features)

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        test_targets, predictions, ml_name='XG', classes=label_names,
        title='Confusion matrix for XGBoost evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        test_targets, predictions, ml_name='XG', classes=label_names,
        title='Classification report for XGBoost evaluation')
Beispiel #7
0
def main():
    """Sweep k for a k-nearest-neighbour classifier and plot the best run.

    Reads ``../train_dataset.csv`` and ``../test_dataset.csv``; column 0 is
    the label and columns 2 onward are the features. For k = 1..19 a
    classifier is fitted and scored on the test set; each new best score is
    printed and the predictions of the best k are passed to the ``stdfunc``
    plotting helpers.

    NOTE(review): k is selected on the same test set that is reported, so the
    reported figures are optimistic; a validation split would avoid that.
    """
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())

    y_true = y_test.values

    # Start below any reachable accuracy so the first k always provides
    # predictions, even when every score is 0.
    curr_pred, curr_score = None, -1.0

    for k in range(1, 20):
        clf = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
        clf.fit(X_train, y_train)

        # Predict once and derive the accuracy from the predictions instead
        # of running the neighbour search twice (score() predicts internally).
        y_predicted = clf.predict(X_test)
        score = float((y_predicted == y_true).mean())

        if score > curr_score:
            print("K = {} -- {}".format(k, score))
            curr_pred = y_predicted
            curr_score = score

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test,
        curr_pred,
        ml_name='Nearest_Neighbor',
        classes=unique_labels,
        title='Confusion matrix for Nearest Neighbor evaluation')

    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test,
        curr_pred,
        ml_name='Nearest_Neighbor',
        classes=unique_labels,
        title='Classification report for Nearest Neighbor evaluation')
Beispiel #8
0
def _evaluate(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode and collect its predictions.

    Every batch is unpacked as an ``(X, sample_hash, labels)`` triple, the
    same shape the training loop unpacks; the hash is not used here but is
    available for per-sample diagnostics.

    Returns:
        A ``(accuracy, predictions, targets)`` tuple where ``predictions``
        and ``targets`` are flat Python lists and ``accuracy`` is 0.0 when
        the loader yields no samples.
    """
    model.eval()  # turns off dropout and batch normalization updates
    predictions, targets = [], []
    correct_cnt = 0

    with torch.no_grad():
        for X, _, labels in loader:
            X, labels = X.float().to(device), labels.to(device)
            outputs = model(X)
            _, pred_label = torch.max(outputs, 1)

            predictions.extend(pred_label.cpu().numpy().tolist())
            targets.extend(labels.cpu().numpy().tolist())
            # .item() keeps the counter a plain int, so an empty loader
            # cannot leave it as a non-tensor that later code calls .cpu() on.
            correct_cnt += (pred_label == labels).sum().item()

    total_cnt = len(targets)
    accuracy = float(correct_cnt) / total_cnt if total_cnt else 0.0
    return accuracy, predictions, targets


def main():
    """Train the multilayer perceptron, checkpointing and evaluating as it goes.

    Loads the train/test datasets, resumes from ``checkpoint-MLP.pt`` when it
    exists, trains until ``num_epochs`` is reached (printing the loss every 30
    batches, the average loss every 5 epochs and the test accuracy every 10
    epochs), saves the trained weights, writes the label index file and
    finally produces the evaluation figures through ``stdfunc``.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    print("\nDevice used : {}".format(device.type))
    print("Pytorch version: {}".format(torch.__version__))

    if use_cuda:
        print(torch.cuda.get_device_name(0))

    project_name = "MLP"

    # hyper parameters
    num_epochs    = 30  # how many passes over the complete training dataset
    learning_rate = 0.003
    batch_size    = 3   # samples per training batch
    layer_size    = [15, 50, 100, 30, 10, 2]

    enable_checkpoint = True
    # model filename
    checkpoint_name = 'checkpoint-{}.pt'.format(project_name)

    # load dataset
    malware_train = LoadDataset(encoded_features_path='../train_dataset.csv')
    malware_test = LoadDataset(encoded_features_path='../test_dataset.csv')

    print("\nSize of training dataset: {}".format(len(malware_train)))
    print("Size of testing dataset: {}\n".format(len(malware_test)))

    # shuffle=True means for every epoch, the data is going to be re-shuffled
    # pin_memory only helps host-to-GPU copies, so enable it only with CUDA
    # ref: https://devblogs.nvidia.com/how-optimize-data-transfers-cuda-cc/
    train_loader = torch.utils.data.DataLoader(
        malware_train, batch_size=batch_size, pin_memory=use_cuda, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        malware_test, batch_size=batch_size, pin_memory=use_cuda, shuffle=False)

    # setup appropriate objects
    mlp = MultilayerPerceptron(layer_size).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=learning_rate)
    epoch = 0

    # load previous checkpoint if it exists
    if enable_checkpoint and os.path.exists(checkpoint_name):
        print("Previous checkpoint model found!\n")
        # map_location lets a checkpoint saved on GPU be resumed on CPU
        checkpoint = torch.load(checkpoint_name, map_location=device)
        mlp.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']

    num_batches = len(train_loader)

    while epoch < num_epochs:
        # evaluation below switches to eval mode, so re-enable training mode
        # at the start of every epoch
        mlp.train()
        avg_loss = 0.0

        for i, (X, _, labels) in enumerate(train_loader):
            # cast like the evaluation path does so both feed the same dtype
            X, labels = X.float().to(device), labels.to(device)
            outputs = mlp(X)
            loss = criterion(outputs, labels)
            avg_loss += loss.item()

            optimizer.zero_grad()  # clear our previous calc
            loss.backward()        # calc all parameters gradient
            optimizer.step()       # apply weight tuning based on calculated gradient

            if (i+1) % 30 == 0:
                # 1-based epoch, consistent with every other progress message
                epoch_fmt = str(epoch+1).rjust(len(str(num_epochs)))
                batch_fmt = str(i+1).rjust(len(str(num_batches)))
                fmt_str = "Epochs [" + epoch_fmt + "/{}], Batch [" + batch_fmt + "/{}], Loss = {:.6f}"
                print(fmt_str.format(num_epochs, num_batches, loss.item()))

        avg_loss /= max(num_batches, 1)  # avoid dividing by zero on an empty loader
        if (epoch+1) % 5 == 0:
            print("\nAverage loss for epochs [{}] = {:.8f}\n".format(epoch+1, avg_loss))

        # test accuracy of model for every 10 epochs
        if (epoch+1) % 10 == 0:
            accuracy, _, _ = _evaluate(mlp, test_loader, device)
            print("Test - Epoch {} -- Accuracy : {}\n".format(epoch+1, accuracy))

        # save model for every 10 epochs -- make sure we don't lose everything
        if enable_checkpoint and (epoch+1) % 10 == 0:
            print("Saving checkpoint model..\n")
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': mlp.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }, checkpoint_name)

        epoch += 1

    torch.save(mlp.state_dict(), '{}-Trained-Model.pt'.format(project_name))

    accuracy, predict_list, label_list = _evaluate(mlp, test_loader, device)
    print("Final Accuracy = {}\n".format(accuracy))

    with open('malware-label-index.txt', 'w') as fo:
        fo.write('[' + ','.join(malware_train.unique_labels) + ']')

    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(label_list, predict_list, ml_name='MLP',
                                  classes=malware_train.unique_labels,
                                  title='Confusion matrix for MLP evaluation')

    # NOTE(review): the sibling scripts call plot_classification_report here;
    # confirm that a clustering report is really intended for this classifier.
    print("Generating clustering report figure... \n")
    stdfunc.plot_clustering_report(label_list, predict_list, ml_name='MLP',
                                   classes=malware_train.unique_labels,
                                   title='Clustering report for MLP evaluation')