def test_extract_features():
    """extract_features drops all known metadata columns, keeping only
    the remaining feature columns."""
    data = {'id': [1], 'name': [1], 'image': [1], 'thumbnail': [1],
            'artists': [1], 'publishers': [1], 'designers': [1],
            'description': [1], 'categories': [1], 'mechanics': [1],
            'column': [1]}
    df = pd.DataFrame(data)
    # Compare as a plain list: `Index == ['column']` is an element-wise
    # comparison returning an array, which makes the assert raise
    # "truth value of an array is ambiguous" whenever more than one
    # column survives. list() gives a deterministic equality check.
    assert list(extract_features(df).columns) == ['column']
def test_standardize_features_empty_df():
    """extract_features is a no-op on a frame with none of the droppable
    metadata columns (NOTE(review): name says 'standardize_features' but
    the body exercises extract_features — confirm intent)."""
    # Bad data: columns unrelated to any known metadata column.
    bad = pd.DataFrame({'col1': [1], 'col2': [2]})
    result = extract_features(bad)
    # Every column should pass through with identical values.
    assert all(list(bad[col]) == list(result[col]) for col in bad.columns)
def test_extract_features_missing_columns():
    """extract_features tolerates a frame missing one of the columns it
    normally removes (here 'id') and returns the data unchanged."""
    # Bad data: missing the 'id' column, which is supposed to be removed.
    frame = pd.DataFrame({
        'name': [1], 'image': [1], 'thumbnail': [1], 'artists': [1],
        'publishers': [1], 'designers': [1], 'description': [1],
        'categories': [1], 'mechanics': [1], 'column': [1],
    })
    result = extract_features(frame)
    # Supposed to return the same values for every surviving column.
    for column in frame.columns:
        assert list(frame[column]) == list(result[column])
def classify_domain(domain, model_path='models/model.pickle'):
    """Classify a single host name as DGA-generated or legitimate.

    Args:
        domain: Full host name, e.g. '235ovnaewi.ru'. Everything before
            the first '.' is used as the bare domain feature.
        model_path: Path to the pickled trained model.

    Returns:
        'dga' or 'legit'. The prediction is also printed, preserving the
        original behavior; returning the label (the original returned
        None) is backward-compatible and lets callers use the result
        programmatically.
    """
    data = {'host': [domain], 'domain': [domain.split('.')[0]]}
    df = pd.DataFrame.from_dict(data)
    X = extract_features(df)
    y_pred = pred(X, model_path)
    # pred returns array-like 0/1 labels; a truthy first entry means DGA.
    result = 'dga' if y_pred[0] else 'legit'
    print(f'Prediction for {domain}: {result}')
    return result
def train_model(data_path='data/dga_domains.csv',
                model_path='models/model.pickle',
                random_state=None):
    """Train the DGA classifier and print a hold-out evaluation report.

    Args:
        data_path: CSV with at least a 'class' column ('dga' or 'legit')
            plus the columns extract_features expects.
        model_path: Where the trained model is persisted (and re-read by
            pred for evaluation).
        random_state: Optional seed forwarded to train_test_split so the
            80/20 split — and therefore the printed report — is
            reproducible. Default None preserves the original behavior.
    """
    df = pd.read_csv(data_path)
    X = extract_features(df)
    # Binary target: 1 for DGA-generated domains, 0 for legitimate ones.
    y = (df['class'] == 'dga').astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)
    train(X_train, y_train, model_path)
    y_pred = pred(X_test, model_path)
    print(classification_report(y_test, y_pred))
def test_extract_features():
    """Feature vector for a known DGA-looking host matches expected values."""
    frame = pd.DataFrame.from_dict({'host': ['235ovnaewi.ru'],
                                    'domain': ['235ovnaewi']})
    features = extract_features(frame)
    # Presumably length, vowel ratio, and two flag features — TODO confirm
    # against the extract_features implementation.
    expected = [10, 0.4, 0, 1]
    # Pairwise comparison, like the original: zip stops at the shorter side.
    assert all(actual == want for actual, want in zip(features[0], expected))
unfeaturized_data = ft.load_unfeaturized_data(args.local_filepath) # Convert the unfeaturized data dictionary to pandas DataFrame df = pd.DataFrame(unfeaturized_data) # Calling wrapper function to create categories features featurized_categories_data = ft.wrapper(df, 'categories', config['featurize']['index_categories']) # Calling wrapper function to create mechanics features featurized_mechanics_data = ft.wrapper(featurized_categories_data, 'mechanics', config['featurize']['index_mechanics']) # Extract relevant information from the 'stats' column (which contains dictionaries) into new columns, then drop it featurized_data = ft.extract_stats(featurized_mechanics_data) # Extract relevant feature columns for modelling features_df = md.extract_features(featurized_data) # Standardize data and return feature matrix (numpy array) X = md.standardize_features(features_df) # Fit KMeans model model = md.fit_kmeans(X, **config['model']['kmeans']) # Calculate labels for data labels = md.model_predict(X, model) # Calculate Silhouette score for fitted model and labels for the training data silhouette_score_ = md.evaluate_silhouette(X, labels) # Combining the original df with the labels and dropping unnecessary columns (now that the modelling is done) df = md.combine_with_labels(featurized_data, labels)