Exemple #1
0
def run(max_epoch=50, nfolds=10, batch_size=128,verbose=0):
    """Cross-validated train/test loop for the char-bigram model.

    For each of ``nfolds`` random splits, trains a fresh model with
    AUC-based early stopping (patience 5) on a 5% holdout and records
    the test-set predictions of the best epoch.  Returns one result
    dict per fold.
    """
    indata = data.get_data()

    # Separate raw samples from their labels.
    labels = [rec[0] for rec in indata]
    X = [rec[1] for rec in indata]

    # Character-bigram count features.
    print("vectorizing data")
    vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
    count_vec = vectorizer.fit_transform(X)
    max_features = count_vec.shape[1]

    # Binary target: benign -> 0, everything else -> 1.
    y = [int(lbl != 'benign') for lbl in labels]

    final_data = []
    for fold in range(nfolds):
        print("fold %u/%u" % (fold+1, nfolds))
        split = train_test_split(count_vec, y, labels, test_size=0.2)
        X_train, X_test, y_train, y_test, _, label_test = split

        print('Build model...')
        model = build_model(max_features)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
        best_auc, best_iter, out_data = 0.0, -1, {}

        for ep in range(max_epoch):
            model.fit(X_train.todense(), y_train, batch_size=batch_size, epochs=1,verbose=verbose)

            holdout_probs = model.predict_proba(X_holdout.todense(),verbose=verbose)
            auc = sklearn.metrics.roc_auc_score(y_holdout, holdout_probs)
            print('Epoch %d: auc = %f (best=%f)' % (ep, auc, best_auc))

            if auc > best_auc:
                best_auc, best_iter = auc, ep
                probs = model.predict_proba(X_test.todense(),verbose=verbose)
                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}
                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            elif (ep-best_iter) > 5:
                # Holdout AUC has stopped improving: finish this fold.
                break

        final_data.append(out_data)

    return final_data
Exemple #2
0
def train(max_epoch=25, batch_size=128):
    """Train a character-level model with AUC-based early stopping.

    Encodes each string over a fixed letters/digits/'-.'/ alphabet,
    pads to 256 characters, and trains one epoch at a time until the
    holdout AUC stops improving (patience 2).  Returns the trained model.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters; index 0 is kept for padding.
    all_chars = string.ascii_letters + string.digits + '-.'
    valid_chars = {x:idx+1 for idx, x in enumerate(all_chars)}

    print('Build model...')
    model = build_model()

    maxlen = 256
    # Convert characters to int and pad.  Characters outside the whitelist
    # map to the padding index 0 instead of raising KeyError (the original
    # crashed on any unexpected character, e.g. '_').
    X = [[valid_chars.get(y, 0) for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.05)
    best_iter = -1
    best_auc = 0.0

    for ep in range(max_epoch):
        # 'epochs' replaces the long-deprecated Keras 'nb_epoch' kwarg.
        model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

        t_probs = model.predict_proba(X_holdout)
        t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

        print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

        if t_auc > best_auc:
            best_auc = t_auc
            best_iter = ep
        else:
            # No longer improving...break and calc statistics
            if (ep-best_iter) > 2:
                break

    return model
Exemple #3
0
def run(max_epoch=25, nfolds=10, batch_size=128):
    """Run n-fold train/test on the character-level model and keep the best.

    Each fold re-splits the data, trains with early stopping (patience 2)
    on a 5% holdout AUC, and records test predictions.  The model with the
    best holdout AUC over all folds is saved via save_model.  Returns one
    result dict per fold.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters; index 0 is kept for padding.
    valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    nfolds_best_model = {'best_auc': 0.0,  'best_model': None}
    for fold in range(nfolds):
        print("fold %u/%u" % (fold+1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(X, y, labels,
                                                                           test_size=0.2)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('\nEpoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test)

                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print('\n', sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 2:
                    break

        final_data.append(out_data)
        nfolds_best_auc = nfolds_best_model['best_auc']
        if best_auc > nfolds_best_auc:
            nfolds_best_model['best_auc'] = best_auc
            nfolds_best_model['best_model'] = model
    best_model = nfolds_best_model['best_model']
    # NOTE(review): 'save_path' is not defined in this function -- presumably
    # a module-level constant; confirm it exists before running.
    save_model(best_model, save_path)
    return final_data
def run(max_epoch=50, nfolds=10, batch_size=128):
    """Run n-fold train/test on the multi-target char-bigram model.

    Alongside the main benign/malicious target, auxiliary targets from
    data.expand_labels (presumably one per malware family) are trained
    jointly; only the first output head drives early stopping (patience 5)
    and reporting.  Returns one result dict per fold.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Create feature vectors
    print("vectorizing data")
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
    X = ngram_vectorizer.fit_transform(X)

    max_features = X.shape[1]

    malware_labels = data.get_malware_labels(labels)
    all_Ys = data.expand_labels(labels)

    final_data = []

    for fold in range(nfolds):
        print("fold %u/%u" % (fold+1, nfolds))
        # train_test_split returns a train/test pair for every array passed in.
        train_test = train_test_split(X, labels, *all_Ys, test_size=0.2, stratify=labels)
        X_train, X_test, label_train, label_test, y_train, y_test = train_test[:6]
        dga_training_test = train_test[6:]

        # Keep only the *train* half of each auxiliary target (even indices).
        all_Y_train = [y_train]
        for idx in range(0, len(dga_training_test), 2):
            all_Y_train.append(dga_training_test[idx])

        print('Build model...')
        model = build_model(max_features, num_targets=len(malware_labels) + 1)

        print("Train...")
        train_test = train_test_split(X_train, *all_Y_train, test_size=0.05, stratify=label_train)
        X_train, X_holdout, y_train, y_holdout = train_test[:4]
        dga_training_test = train_test[4:]
        all_Y_train = [y_train]
        for idx in range(0, len(dga_training_test), 2):
            all_Y_train.append(dga_training_test[idx])

        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            model.fit(X_train.todense(), data.y_list_to_dict(all_Y_train), batch_size=batch_size, epochs=1)

            # Output 0 is the main benign/malicious head.
            t_probs = model.predict(X_holdout.todense())[0]
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict(X_test.todense())[0]

                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 5:
                    break

        final_data.append(out_data)

    return final_data
Exemple #5
0
def run(max_epoch=25, nfolds=10, batch_size=128):
    """Run n-fold train/test on the character-level model.

    Uses fixed encoding sizes (max_features=100, maxlen=256), early
    stopping on holdout AUC (patience 2), and returns one result dict
    per fold.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters; index 0 is kept for padding.
    valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}

    # NOTE(review): the data-driven sizes are commented out and replaced by
    # hard-coded constants.  If the corpus ever has 100+ distinct characters,
    # indices in valid_chars exceed max_features -- confirm against build_model.
    #max_features = len(valid_chars) + 1
    #maxlen = np.max([len(x) for x in X])

    max_features = 100
    maxlen = 256

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []

    for fold in range(nfolds):
        print("fold %u/%u" % (fold+1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(X, y, labels,
                                                                           test_size=0.2)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            # 'epochs' replaces the deprecated Keras 'nb_epoch' kwarg.
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test)

                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 2:
                    break

        final_data.append(out_data)

    return final_data
Exemple #6
0
def run(max_epoch=15, nfolds=10, batch_size=128):
    """Cross-validated train/evaluate loop for the character-level model.

    Each fold re-splits the data, trains with AUC-based early stopping
    (patience 2) on a 5% holdout, and records the best epoch's test
    predictions.  Returns one result dict per fold.
    """
    indata = data.get_data()

    # Split records into raw strings and labels in one pass.
    X, labels = zip(*indata)

    # Integer-encode every character in the corpus; 0 stays free for padding.
    valid_chars = {ch: pos + 1 for pos, ch in enumerate(set(''.join(X)))}
    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Encode and pad all samples to a fixed length.
    encoded = [[valid_chars[ch] for ch in s] for s in X]
    X = sequence.pad_sequences(encoded, maxlen=maxlen)

    # benign -> 0, everything else -> 1.
    y = np.asarray([0 if lbl == 'benign' else 1 for lbl in labels])

    final_data = []
    for fold in range(nfolds):
        print('fold {}/{}'.format(fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            X, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print('Train...')
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)

        best_auc, best_iter, out_data = 0.0, -1, {}
        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            holdout_probs = model.predict_proba(X_holdout)
            auc = roc_auc_score(y_holdout, holdout_probs)
            print('Epoch {}: auc = {} (best={})'.format(ep, auc, best_auc))

            if auc > best_auc:
                best_auc, best_iter = auc, ep
                probs = model.predict_proba(X_test)
                out_data = {
                    'y': y_test,
                    'labels': label_test,
                    'probs': probs,
                    'epochs': ep,
                    'confusion_matrix': confusion_matrix(y_test, probs > .5)
                }
                print(confusion_matrix(y_test, probs > .5))
            elif (ep - best_iter) > 2:
                # Holdout AUC has stopped improving; give up on this fold.
                break

        final_data.append(out_data)

    return final_data
Exemple #7
0
def run(max_epoch=2, nfolds=10, batch_size=128):
    """Run train/test on logistic regression model.

    Real top domains are appended to the samples before vectorization so
    the bigram vocabulary covers them, then sliced back off before
    training.  The model is saved to 'bigramMode.h5' after each fold.
    Returns final_data (currently always empty -- result collection is
    commented out below).
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Add real domains to the vectorization processing.  Strip the trailing
    # newlines that readlines() keeps -- otherwise bogus '\n' bigrams
    # pollute the vocabulary.
    with open('realtopdomains.txt', 'r') as f:
        realdomains = [l.strip() for l in f]

    lenofx = len(X)

    X += realdomains

    # Create feature vectors
    print("vectorizing data")
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char',
                                                               ngram_range=(2,
                                                                            2))
    count_vec = ngram_vectorizer.fit_transform(X)
    """
    # get real data's vector matrix to ensure have the same shape with model
    pickle.dump(count_vec[lenofx:, :], open('realtopdomain.pkl', 'w'))
    """

    # Keep only the labelled rows for training/evaluation.
    count_vec = count_vec[:lenofx, :]

    max_features = count_vec.shape[1]

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []

    accuracy = 0.0

    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            count_vec, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features)

        print("Train...")
        # Keep the pre-holdout split for the end-of-fold accuracy report.
        xi = X_train
        yi = y_train
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)

        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):

            # 'epochs' replaces the deprecated Keras 'nb_epoch' kwarg.
            model.fit(X_train.todense(),
                      y_train,
                      batch_size=batch_size,
                      epochs=1)

            t_probs = model.predict_proba(X_holdout.todense())

            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                # NOTE(review): this branch's body is entirely commented out,
                # so best_auc/best_iter never update and early stopping is
                # effectively disabled.  Left as-is since it looks deliberate.
                """
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test.todense())
                pre = model.predict_classes(X_test.todense())


                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}
                """

            else:
                # No longer improving...break and calc statistics
                if (ep - best_iter) > 5:
                    break

        # Report training-set accuracy for this fold and persist the model.
        pre_i = model.predict_classes(xi.todense())
        accuracy = sklearn.metrics.accuracy_score(yi, pre_i)
        print('\n accurracy: %f' % (accuracy))

        #final_data.append(out_data)
        model.save('bigramMode.h5')

    return final_data
Exemple #8
0
def run(max_epoch=50, nfolds=10, batch_size=128):
    """Run n-fold train/test on the char-bigram model and keep the best.

    Each fold trains with AUC-based early stopping (patience 5) on a 5%
    holdout; the model with the best holdout AUC across folds is saved
    via save_model.  Returns one result dict per fold.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Create feature vectors
    print("vectorizing data")
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
    count_vec = ngram_vectorizer.fit_transform(X)

    max_features = count_vec.shape[1]

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []
    nfolds_best_model = {'best_auc': 0.0,  'best_model': None}
    for fold in range(nfolds):
        print("fold %u/%u" % (fold+1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(count_vec, y,
                                                                           labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            model.fit(X_train.todense(), y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout.todense())
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('\nEpoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test.todense())

                out_data = {'y':y_test, 'labels': label_test, 'probs':probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print('\n', sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 5:
                    break

        final_data.append(out_data)
        nfolds_best_auc = nfolds_best_model['best_auc']
        if best_auc > nfolds_best_auc:
            nfolds_best_model['best_auc'] = best_auc
            nfolds_best_model['best_model'] = model
    best_model = nfolds_best_model['best_model']
    # NOTE(review): 'save_path' is not defined in this function -- presumably
    # a module-level constant; confirm it exists before running.
    save_model(best_model, save_path)
    return final_data
Exemple #9
0
def run(max_epoch=25, nfolds=10, batch_size=128, savemodel=False):
    """Run n-fold train/test on the character-level model.

    Pickles the encoding parameters (char dict, pad length) so a deployed
    model can reproduce the preprocessing.  If ``savemodel`` is true, the
    model from the final fold is saved to MODEL_FILE.  Returns one result
    dict per fold.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters; index 0 is kept for padding.
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Pickle encoding params for reuse at prediction time.  Use context
    # managers so the file handles are actually closed (the original left
    # them dangling).
    with open(VALIDCHAR_FILE, 'wb') as f:
        pickle.dump(valid_chars, f)
    with open(MAXLEN_FILE, 'wb') as f:
        pickle.dump(maxlen, f)

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []

    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            X, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            # 'epochs' replaces the deprecated Keras 'nb_epoch' kwarg.
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test)

                out_data = {'y': y_test,
                            'labels': label_test,
                            'probs': probs,
                            'epochs': ep,
                            'confusion_matrix':
                            sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                #                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
                #                print("{} {}".format(__name__, "breaking prematurely"))
                # Deliberately stops after the first improving epoch.
                break
            else:
                # No longer improving...break and calc statistics
                if (ep - best_iter) > 2:
                    break

        final_data.append(out_data)

    if savemodel:
        model.save(MODEL_FILE)
    return final_data
Exemple #10
0
def run(max_epoch=25, nfolds=10, batch_size=128):
    """Run n-fold train/test on the character-level model with TensorBoard.

    Dumps the char-encoding dict to 'dict.json', logs training to
    './logs', and saves the last fold's weights/architecture to
    'model.hdf5' / 'model.json'.  Returns one result dict per fold.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters; index 0 is kept for padding.
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}
    with open('dict.json', 'w') as f:
        f.write(json.dumps(valid_chars))

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []

    # One TensorBoard callback shared by every fold/epoch.
    tb = TensorBoard(log_dir='./logs',
                     histogram_freq=0,
                     batch_size=32,
                     write_graph=True,
                     write_grads=False,
                     write_images=False,
                     embeddings_freq=0,
                     embeddings_layer_names=None,
                     embeddings_metadata=None)

    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            X, y, labels, test_size=0.2)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            # 'epochs' replaces the deprecated Keras 'nb_epoch' kwarg.
            model.fit(X_train,
                      y_train,
                      batch_size=batch_size,
                      epochs=1,
                      callbacks=[tb])

            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)
            y_est = [1 if x > 0.5 else 0 for x in t_probs]
            f1 = sklearn.metrics.f1_score(y_holdout, y_est)
            print('Epoch %d: auc = %f (best=%f) f1 = %f' % (ep, t_auc,
                                                            best_auc, f1))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test)

                out_data = {'y': y_test,
                            'labels': label_test,
                            'probs': probs,
                            'epochs': ep,
                            'confusion_matrix':
                            sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # No longer improving...break and calc statistics
                if (ep - best_iter) > 2:
                    break

        final_data.append(out_data)

    model.save_weights('model.hdf5')
    with open('model.json', 'w') as f:
        f.write(model.to_json())

    return final_data
Exemple #11
0
def run(max_epoch=25, nfolds=10, batch_size=128):
    """Run n-fold train/test on the multi-target character-level model.

    Alongside the main benign/malicious target, auxiliary targets from
    data.expand_labels (presumably one per malware family) are trained
    jointly; only the first output head drives early stopping (patience 2)
    and reporting.  Returns one result dict per fold.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Generate a dictionary of valid characters; index 0 is kept for padding.
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    malware_labels = data.get_malware_labels(labels)
    all_Ys = data.expand_labels(labels)

    final_data = []

    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        # train_test_split returns a train/test pair for every array passed in.
        train_test = train_test_split(X,
                                      labels,
                                      *all_Ys,
                                      test_size=0.2,
                                      stratify=labels)
        X_train, X_test, label_train, label_test, y_train, y_test = train_test[:6]
        dga_training_test = train_test[6:]

        # Keep only the *train* half of each auxiliary target (even indices).
        all_Y_train = [y_train]
        for idx in range(0, len(dga_training_test), 2):
            all_Y_train.append(dga_training_test[idx])

        print('Build model...')
        model = build_model(max_features,
                            maxlen,
                            num_targets=len(malware_labels) + 1)

        print("Train...")
        train_test = train_test_split(X_train,
                                      *all_Y_train,
                                      test_size=0.05,
                                      stratify=label_train)
        X_train, X_holdout, y_train, y_holdout = train_test[:4]
        dga_training_test = train_test[4:]
        all_Y_train = [y_train]
        for idx in range(0, len(dga_training_test), 2):
            all_Y_train.append(dga_training_test[idx])

        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            model.fit(X_train,
                      data.y_list_to_dict(all_Y_train),
                      batch_size=batch_size,
                      epochs=1)

            # Output 0 is the main benign/malicious head.
            t_probs = model.predict(X_holdout)[0]
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict(X_test)[0]

                out_data = {'y': y_test,
                            'labels': label_test,
                            'probs': probs,
                            'epochs': ep,
                            'confusion_matrix':
                            sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                # No longer improving...break and calc statistics
                if (ep - best_iter) > 2:
                    break

        final_data.append(out_data)

    return final_data
Exemple #12
0
def run(max_epoch=30, nfolds=10, batch_size=128):
    """Run n-fold train/test on the CNN char-bigram model.

    Sweeps CNN kernel sizes per fold and appends each kernel's best
    accuracy/recall/f1 to 'f1result.txt'.  Returns one result dict per
    fold/kernel combination.
    """
    indata = data.get_data()

    # Extract data and labels
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Create feature vectors
    print("vectorizing data")
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char',
                                                               ngram_range=(2,
                                                                            2))
    count_vec = ngram_vectorizer.fit_transform(X)

    max_features = count_vec.shape[1]
    #count_vec = np.expand_dims(count_vec, axis=2)

    # Convert labels to 0-1
    y = [0 if x == 'benign' else 1 for x in labels]

    final_data = []

    accuracy = 0.0
    recall = 0.0
    f1 = 0.0

    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            count_vec, y, labels, test_size=0.2)

        for kernelSize in (500, ):
            print('Build model...')
            model = build_model(max_features, kernelSize)

            print("Train...")
            # Split into distinct names so X_train/y_train are NOT clobbered:
            # the original re-split (and shrank) X_train on every kernel-size
            # iteration, which would corrupt the data with >1 kernel size.
            X_tr, X_holdout, y_tr, y_holdout = train_test_split(
                X_train, y_train, test_size=0.05)

            best_iter = -1
            best_auc = 0.0
            out_data = {}

            for ep in range(max_epoch):

                # 'epochs' replaces the deprecated Keras 'nb_epoch' kwarg.
                model.fit(np.expand_dims(X_tr.todense(), axis=2),
                          y_tr,
                          batch_size=batch_size,
                          epochs=1)

                t_probs = model.predict_proba(
                    np.expand_dims(X_holdout.todense(), axis=2))

                t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

                print('Epoch %d: auc = %f (best=%f)\n' % (ep, t_auc, best_auc))

                # Per-epoch test metrics (diagnostic only).
                pre = model.predict_classes(
                    np.expand_dims(X_test.todense(), axis=2))
                accuracyT = sklearn.metrics.accuracy_score(y_test, pre)
                recallT = sklearn.metrics.recall_score(y_test, pre)
                f1T = sklearn.metrics.f1_score(y_test, pre)
                print('evaluate:', str(accuracyT), ' ', str(recallT), ' ',
                      str(f1T), '\n')

                if t_auc > best_auc:
                    best_auc = t_auc
                    best_iter = ep

                    probs = model.predict_proba(
                        np.expand_dims(X_test.todense(), axis=2))
                    pre = model.predict_classes(
                        np.expand_dims(X_test.todense(), axis=2))

                    out_data = {'y': y_test,
                                'labels': label_test,
                                'probs': probs,
                                'epochs': ep,
                                'confusion_matrix':
                                sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                    accuracy = sklearn.metrics.accuracy_score(y_test, pre)
                    recall = sklearn.metrics.recall_score(y_test, pre)
                    f1 = sklearn.metrics.f1_score(y_test, pre)

                else:
                    # No longer improving...break and calc statistics
                    if (ep - best_iter) > 5:
                        break

            final_data.append(out_data)

            # Append this kernel's best metrics to the running results file.
            with open("f1result.txt", 'a') as f:
                f.write("##CNN result: \n")
                f.write("kernelsize: ")
                f.write(str(kernelSize))
                f.write(":accuracy: ")
                f.write(str(accuracy))
                f.write("; recall: ")
                f.write(str(recall))
                f.write("; f1: ")
                f.write(str(f1))
                f.write("\n")

    return final_data
Exemple #13
0
def run(max_epoch=50,
        batch_size=128,
        cata_split=True,
        multi_class=False,
        ratio=None,
        data_cache=False):
    """Run train/test for the LSTM(SLD) + one-hot(TLD) model.

    Trains for up to ``max_epoch`` epochs, logs per-epoch accuracy to
    ``bigram_lstm_fe.log`` and checkpoints the best model to the global
    ``filepath``.

    Args:
        max_epoch: maximum number of training epochs.
        batch_size: mini-batch size passed to ``model.fit``.
        cata_split: when True, split train/test by category via
            ``split.train_test_split_as_catagory``; otherwise use a plain
            random ``train_test_split``.
        multi_class: when True, train a multi-class classifier using the
            label dictionary pickled in ``class_dict.pkl``; otherwise a
            binary benign/malicious classifier.
        ratio: optional fraction of ``class_num``; labels whose index
            exceeds ``class_num * ratio`` are merged into a single bucket.
            Only meaningful together with ``multi_class=True``.
        data_cache: when True and ``bigram_lstm_fe_top_data.npz`` exists,
            reuse the cached train/test split instead of recomputing it.
    """
    # NOTE: the original placed the docstring *after* this statement,
    # turning it into a dead string expression; it is now a real docstring.
    global filepath  # checkpoint path for the best model seen so far
    print('method is LSTM(SLD) + One-Hot(TLD)')
    indata = data.get_data()

    # Deterministic shuffle so repeated runs see the same ordering.
    random.seed(1)
    random.shuffle(indata)
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]
    tops = [x[2] for x in indata]

    # One-hot encode the top-level-domain feature.
    le = LabelEncoder()
    new_tops = le.fit_transform(tops).reshape(-1, 1)
    top_enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    top_enc.fit(new_tops)
    feature_top = top_enc.transform(new_tops)
    tld_feature_dimension = feature_top.shape[1]
    print('tld_feature_dimension(the number of top domain feature)',
          tld_feature_dimension)

    # Dictionary of valid characters; index 0 is reserved for padding.
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}
    char_feature_dimension = len(valid_chars) + 1
    max_seq_len = np.max([len(x) for x in X])

    # Convert characters to ints and pad every sequence to max_seq_len.
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=max_seq_len)
    print('X shape', X.shape)

    # Build the label vector.
    if not multi_class:
        y = [0 if x == 'benign' else 1 for x in labels]
        class_num = 2
    else:
        # BUGFIX: close the pickle file (the original leaked the handle).
        with open('class_dict.pkl', 'rb') as fopen:
            label_dict = pickle.load(fopen)
            print('label_dict', label_dict)
            class_num = pickle.load(fopen)
            print('class_num', class_num)
        if ratio is not None:
            # Collapse all classes above class_num*ratio into one bucket.
            max_class_num = int(class_num * ratio) + 1
            print('max_class_num', max_class_num)
            y = [max_class_num if label_dict[x] > class_num * ratio
                 else label_dict[x] for x in labels]
        else:
            y = [label_dict[x] for x in labels]
        print('y', y[:100])
        print('first_ y', Counter(y))
        y = to_categorical(y, num_classes=class_num)

    # Concatenate the character sequence with the TLD one-hot features.
    X = np.concatenate((X, feature_top), axis=-1)
    print('X shape(after feature concatenate)', X.shape)

    print('cata_split = ', cata_split)

    if data_cache and os.path.isfile('bigram_lstm_fe_top_data.npz'):
        # Reuse the previously saved split.
        np_data = np.load('bigram_lstm_fe_top_data.npz')
        X_train = np_data['X_train']
        y_train = np_data['y_train']
        X_test = np_data['X_test']
        y_test = np_data['y_test']
        label_test = np_data['label_test']
    else:
        if not cata_split:
            print('cata split is false')
            X_train, X_test, y_train, y_test, _, label_test = train_test_split(
                X, y, labels, test_size=0.2, random_state=1)
        else:
            # NOTE(review): when ratio is set without multi_class=True,
            # max_class_num is undefined here (pre-existing behavior).
            cata = max_class_num if ratio is not None else 'symmi'
            X_train, X_test, y_train, y_test, _, label_test = \
                split.train_test_split_as_catagory(X, y, labels, cata=cata)
        print('X_train shape', X_train.shape)
        print('X shape', X.shape)
        np.savez('bigram_lstm_fe_top_data.npz',
                 X_train=X_train,
                 y_train=y_train,
                 X_test=X_test,
                 y_test=y_test,
                 label_test=label_test)
        print('bigram_lstm_fe_top_data.npz has been saved')

    print('type X_train', type(X_train))
    print('Build model...')
    model = build_model(char_feature_dimension, tld_feature_dimension,
                        max_seq_len, multi_class, class_num)
    print("Train...")
    best_iter = -1
    best_acc = 0.0

    # BUGFIX: context manager guarantees the log is closed even if
    # training raises (the original could leak the handle).
    with open('bigram_lstm_fe.log', 'w') as fwrite:
        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)
            if not multi_class:
                t_probs = model.predict_proba(X_test)
                t_acc = sklearn.metrics.accuracy_score(y_test, t_probs > .5)
                print('Epoch %d: acc = %f (best=%f)' % (ep, t_acc, best_acc))
                fwrite.write('Epoch %d: acc = %f (best=%f)\n' %
                             (ep, t_acc, best_acc))
                print('test confusion matrix')
                print(sklearn.metrics.confusion_matrix(y_test, t_probs > .5))
            else:
                score = model.evaluate(X_test, y_test, verbose=0)
                t_acc = score[1]
                print('Epoch %d: acc = %f (best=%f)' % (ep, t_acc, best_acc))
                fwrite.write('Epoch %d: acc = %f (best=%f)\n' %
                             (ep, t_acc, best_acc))
            if t_acc > best_acc:
                # Checkpoint whenever test accuracy improves.
                best_acc = t_acc
                best_iter = ep
                model.save(filepath)
                print('newest model has been saved')
# Example #14
def run(max_epoch=25, nfolds=1, batch_size=1024 * 4):
    """Run train/test on the model and score domains from a proxy log.

    For each fold, trains with early stopping on a 5% holdout (stop after
    3 epochs without AUC improvement) and, whenever the holdout AUC
    improves, prints a prediction for every domain in
    ``proxy_log_04_10.csv``.

    Args:
        max_epoch: maximum training epochs per fold.
        nfolds: number of random train/test folds.
        batch_size: mini-batch size for ``model.fit``.

    Returns:
        List with one dict per fold: test labels, predicted
        probabilities and the best epoch index.
    """
    indata = data.get_data()
    domain_list = pd.read_csv("proxy_log_04_10.csv")
    domain_list = domain_list.dropna()

    # Extract data and labels.
    X = [x[1] for x in indata]
    labels = [x[0] for x in indata]

    # Dictionary of valid characters; index 0 is reserved for padding.
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}
    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Convert characters to ints and pad.
    X = [[valid_chars[y] for y in x] for x in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)
    print(max_features, maxlen)

    # Registered-domain part of every proxy-log entry.
    domain_list['tld'] = domain_list['domain'].apply(
        lambda x: tldextract.extract(x).domain)
    test_data = domain_list['tld'].tolist()
    # BUGFIX: characters unseen during training are mapped to the padding
    # index 0 instead of raising KeyError on real-world proxy domains.
    test_data1 = [[valid_chars.get(c, 0) for c in x] for x in test_data]
    test_data2 = sequence.pad_sequences(test_data1, maxlen=maxlen)

    # Convert labels to 0-1.
    y = [0 if x == 'benign' else 1 for x in labels]
    print('length of y')
    print(len(y))
    final_data = []

    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(
            X, y, labels, test_size=0.2)
        print('Build model...')
        model = build_model(max_features, maxlen)
        print("Train...")
        # Hold out 5% of the training data for early stopping.
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict(X_test)

                out_data = {
                    'y': y_test,
                    'labels': label_test,
                    'probs': probs,
                    'epochs': ep
                }
                # Score the proxy-log domains with the current best model.
                proxy_probs = model.predict(test_data2)
                for i in range(len(proxy_probs)):
                    print(test_data[i], proxy_probs[i, 0])
            else:
                # No longer improving: stop after 3 stagnant epochs.
                if (ep - best_iter) > 2:
                    break
        model.summary()
        final_data.append(out_data)

    return final_data