Example #1
def test_extract_features():
    data = {'id': [1], 'name': [1], 'image': [1], 'thumbnail': [1], 'artists': [1], 'publishers': [1],
            'designers': [1], 'description': [1], 'categories': [1], 'mechanics': [1], 'column': [1]}

    df = pd.DataFrame(data)

    # Only the non-metadata column should remain after feature extraction
    assert list(extract_features(df).columns) == ['column']
Example #2
def test_extract_features_no_feature_columns():
    # Data containing none of the columns extract_features is meant to drop
    data = {'col1': [1], 'col2': [2]}
    df = pd.DataFrame(data)

    # The DataFrame should be returned unchanged
    for col in df.columns:
        assert list(df[col]) == list(extract_features(df)[col])
Example #3
def test_extract_features_missing_columns():
    # Bad data missing the 'id' column, which is supposed to be removed
    data = {'name': [1], 'image': [1], 'thumbnail': [1], 'artists': [1], 'publishers': [1],
            'designers': [1], 'description': [1], 'categories': [1], 'mechanics': [1], 'column': [1]}

    df = pd.DataFrame(data)
    # Expected to return the same df with the same columns
    for col in df.columns:
        assert list(df[col]) == list(extract_features(df)[col])
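Examples #1-#3 together pin down the expected contract of extract_features for this dataset: when every known metadata column is present they are dropped, otherwise the DataFrame is passed through unchanged. A minimal hypothetical sketch consistent with those assertions (the real implementation may differ) is:

import pandas as pd

# Metadata columns extract_features is expected to drop (taken from the test data above)
NON_FEATURE_COLUMNS = ['id', 'name', 'image', 'thumbnail', 'artists', 'publishers',
                       'designers', 'description', 'categories', 'mechanics']

def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    # If any expected metadata column is missing, return the DataFrame unchanged
    if not set(NON_FEATURE_COLUMNS).issubset(df.columns):
        return df
    # Otherwise keep only the remaining feature columns
    return df.drop(columns=NON_FEATURE_COLUMNS)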
Example #4
def classify_domain(domain, model_path='models/model.pickle'):
    data = {'host': [domain], 'domain': [domain.split('.')[0]]}
    df = pd.DataFrame.from_dict(data)
    X = extract_features(df)

    y_pred = pred(X, model_path)

    result = 'dga' if y_pred[0] else 'legit'
    print(f'Prediction for {domain}: {result}')
    return result
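classify_domain delegates scoring to a pred helper whose implementation is not shown here. A minimal sketch, assuming the model is a pickled scikit-learn estimator (the storage format is an assumption based on the .pickle path used above):

import pickle

def pred(X, model_path='models/model.pickle'):
    # Assumed helper: load the pickled estimator and score the feature matrix
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    return model.predict(X)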
Example #5
def train_model(data_path='data/dga_domains.csv',
                model_path='models/model.pickle'):
    df = pd.read_csv(data_path)
    X = extract_features(df)
    y = (df['class'] == 'dga').astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    train(X_train, y_train, model_path)

    y_pred = pred(X_test, model_path)
    print(classification_report(y_test, y_pred))
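The train helper called by train_model is also not shown. A hypothetical sketch, assuming a scikit-learn classifier (the estimator choice here is illustrative, not necessarily the project's actual model) persisted with pickle so that pred can reload it:

import pickle
from sklearn.ensemble import RandomForestClassifier

def train(X_train, y_train, model_path='models/model.pickle'):
    # Illustrative estimator choice; the real project may use a different model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    # Persist the fitted model so pred() can load it from model_path
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)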
Example #6
def test_extract_features():
    data = {'host': ['235ovnaewi.ru'], 'domain': ['235ovnaewi']}
    df = pd.DataFrame.from_dict(data)
    features = extract_features(df)
    assert list(features[0]) == [10, 0.4, 0, 1]
Example #7
unfeaturized_data = ft.load_unfeaturized_data(args.local_filepath)

# Convert the unfeaturized data dictionary to a pandas DataFrame
df = pd.DataFrame(unfeaturized_data)

# Call the wrapper function to create category features
featurized_categories_data = ft.wrapper(df, 'categories', config['featurize']['index_categories'])

# Call the wrapper function to create mechanics features
featurized_mechanics_data = ft.wrapper(featurized_categories_data, 'mechanics', config['featurize']['index_mechanics'])

# Extract relevant information from the 'stats' column (which contains dictionaries) into new columns, then drop it
featurized_data = ft.extract_stats(featurized_mechanics_data)

# Extract relevant feature columns for modelling
features_df = md.extract_features(featurized_data)

# Standardize data and return the feature matrix (numpy array)
X = md.standardize_features(features_df)

# Fit the KMeans model
model = md.fit_kmeans(X, **config['model']['kmeans'])

# Calculate labels for the data
labels = md.model_predict(X, model)

# Calculate the silhouette score for the fitted model and the training labels
silhouette_score_ = md.evaluate_silhouette(X, labels)

# Combine the original df with the labels and drop columns that are no longer needed now that modelling is done
df = md.combine_with_labels(featurized_data, labels)
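The pipeline above calls several md helpers that are not defined in this section. Sketches of standardize_features, fit_kmeans, model_predict, and evaluate_silhouette, assuming they are thin wrappers around scikit-learn (an assumption inferred from how they are called above):

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def standardize_features(features_df: pd.DataFrame):
    # Scale each feature to zero mean and unit variance; returns a numpy feature matrix
    return StandardScaler().fit_transform(features_df)

def fit_kmeans(X, **kwargs):
    # Fit KMeans with the parameters supplied via config['model']['kmeans']
    return KMeans(**kwargs).fit(X)

def model_predict(X, model):
    # Assign each observation to its nearest cluster centre
    return model.predict(X)

def evaluate_silhouette(X, labels):
    # Mean silhouette coefficient over all samples for the given clustering
    return silhouette_score(X, labels)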