def cmd_select_features():
    """Run feature selection on the training set.

    Loads the ``training_set`` of the dataset named by the
    ``dataset/SelectedDataset`` config entry and stores whatever
    ``train.select_features`` returns in the module-level
    ``selected_features``.
    """
    global selected_features

    print_banner("Selecting features")

    dataset_name = config.get_str('dataset', 'SelectedDataset')
    features, labels = transform.get_xy(dataset_name, 'training_set',
                                        language, vuln_type)
    selected_features = train.select_features(features, labels)
def create_png(graph):
    """Write ``graph`` as a DOT file and render it to PNG with Graphviz.

    Both files are placed in the directory named by the
    ``CFG/GraphDirectory`` config entry and are named after the base name
    of ``graph.name``.

    Args:
        graph: graph object accepted by ``nx.nx_pydot.write_dot``; its
            ``name`` attribute determines the output file names.
    """
    # Local imports: the file-level import block is not visible from here.
    import subprocess
    import sys

    graph_dir = config.get_str('CFG', 'GraphDirectory')
    basename = os.path.basename(graph.name)
    dot_file = os.path.join(graph_dir, '%s.dot' % basename)
    png_file = os.path.join(graph_dir, '%s.png' % basename)

    # Write DOT file
    nx.nx_pydot.write_dot(graph, dot_file)

    # Convert DOT to PNG.
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from breaking, or being interpreted by, the command.
    try:
        subprocess.call(['dot', '-Tpng', dot_file, '-o', png_file])
    except OSError as err:
        # Stay best-effort like the previous os.system call: a missing
        # ``dot`` executable is reported on stderr instead of raising.
        sys.stderr.write("dot: %s\n" % err)
def cmd_calibrate_model():
    """Replace the global model with a calibrated wrapper fitted on the tuning set.

    The current ``model`` is wrapped in a ``CalibratedClassifierCV``
    (``method='isotonic'``, ``cv='prefit'``) and that wrapper is fitted on
    the ``tuning_set`` of the configured dataset after the features have
    been passed through ``sync_features``.
    """
    global model

    dataset_name = config.get_str('dataset', 'SelectedDataset')
    features, labels = transform.get_xy(
        dataset_name, 'tuning_set', language, vuln_type, selected_features)
    features = sync_features(features)

    # The wrapper is bound to the global *before* fitting, as before.
    model = CalibratedClassifierCV(model, method='isotonic', cv='prefit')
    model.fit(features, labels)
def cmd_test_model():
    """Load the testing set into the globals and print the model's metrics.

    ``X_test`` and ``Y_test`` are (re)assigned from the ``testing_set`` of
    the configured dataset; ``X_test`` is passed through ``sync_features``
    before ``print_metrics`` is called with the global ``model``.
    """
    global X_test, Y_test

    print_banner("Testing model")

    dataset_name = config.get_str('dataset', 'SelectedDataset')
    X_test, Y_test = transform.get_xy(
        dataset_name, 'testing_set', language, vuln_type, selected_features)
    X_test = sync_features(X_test)

    print_metrics(model=model, X=X_test, Y=Y_test)
def cmd_store_all():
    """Store the model together with the full testing set.

    Fetches the original records, features and labels of the
    ``testing_set``, passes the features through ``sync_features`` and
    hands everything to ``data.store_data`` with ``just_outliers=False``.
    """
    global model

    print_banner("Store all")

    dataset_name = config.get_str('dataset', 'SelectedDataset')
    originals, features, labels = transform.get_xy_with_orig(
        dataset_name, 'testing_set', language, vuln_type, selected_features)
    features = sync_features(features)

    data.store_data(model, originals, features, labels, just_outliers=False)
def cmd_filter_features():
    """Drop features whose name starts with the configured prefix.

    The prefix comes from the ``model/FeatureFilterStartString`` config
    entry. When ``selected_features`` is still ``None`` it is first
    initialised with every column of the training set. The surviving
    feature names are stored back in ``selected_features`` (as a list)
    and printed as a numbered list starting at 1.
    """
    global selected_features

    print_banner("Filtering features")
    excluded_prefix = config.get_str('model', 'FeatureFilterStartString')

    if selected_features is None:
        dataset_name = config.get_str('dataset', 'SelectedDataset')
        training_features, _labels = transform.get_xy(
            dataset_name, 'training_set', language, vuln_type)
        selected_features = training_features.columns.values

    kept = []
    for name in selected_features:
        if name.startswith(excluded_prefix):
            continue
        kept.append(name)
    selected_features = kept

    for position, name in enumerate(selected_features, start=1):
        print_notice("%d. %s" % (position, name))
def cmd_compare_tools():
    """Print our model's results next to those of the configured tools.

    Steps:
      1. If ``train_features`` has not been set yet, initialise it from the
         (column-sorted) training set.
      2. Determine a decision threshold on the tuning set with
         ``find_best_threshold``.
      3. Print the model's results on the testing set at that threshold.
      4. For every ``(tool, file_name)`` item in the ``tools`` config
         section, call ``compare_results`` against the same testing records.
    """
    global train_features

    print_banner("Comparing results")

    sel_ds = config.get_str('dataset', 'SelectedDataset')
    sel_vt = config.get_str('dataset', 'SelectedVulnerabilityType')

    if train_features is None:
        X, _ = transform.get_xy(sel_ds, 'training_set', language, vuln_type,
                                selected_features)
        X.sort_index(axis=1, inplace=True)
        train_features = X.columns

    orig_tuning, X_tuning, _ = transform.get_xy_with_orig(
        sel_ds, 'tuning_set', language, vuln_type, selected_features)
    X_tuning = sync_features(X_tuning)

    c = find_best_threshold(model, orig_tuning, X_tuning)
    print_notice("Preferred threshold (Y > c): %.2f" % c)

    orig, X, _ = transform.get_xy_with_orig(
        sel_ds, 'testing_set', language, vuln_type, selected_features)
    # Pass the testing features through sync_features just like the tuning
    # set above and like cmd_test_model / cmd_store_all do; previously the
    # raw testing features were handed to the model here.
    # NOTE(review): assumes sync_features is idempotent in case
    # print_model_results already syncs internally — confirm.
    X = sync_features(X)

    print_notice('-' * 55)
    print_notice("Our results")
    print_model_results(model, orig, X, c)

    for tool, file_name in config.get_items('tools'):
        print_notice('-' * 55)
        print_notice('Comparing against tool: %s' % tool)
        compare_results(file_name, orig, sel_vt)
def select_model(language, vuln_type, X, Y):
    """Create the configured model, fit it on ``X``/``Y`` and return it.

    The model type is read from the ``model/Model`` config entry and its
    parameters from the optional ``model/<Model><vuln_type>Params`` entry.
    For a ``DecisionTreeClassifier`` with ``model/GenerateDecisionTreeGraph``
    enabled, a graph titled ``<language>_<vuln_type>`` is also created via
    ``create_dt_graph``.

    Args:
        language: used only in the decision-tree graph title.
        vuln_type: used in the parameter key and in the graph title.
        X: training features; ``X.columns.values`` supplies feature names.
        Y: training labels.

    Returns:
        The fitted model.
    """
    model_type = config.get_str('model', 'Model')
    params_key = model_type + vuln_type + 'Params'
    params = config.get_dict('model', params_key, optional=True)

    model = create_model(model_type, params)
    model.fit(X, Y)

    # Only decision trees can be rendered; the config flag is consulted
    # only for that model type.
    if model_type != "DecisionTreeClassifier":
        return model

    if config.get_boolean('model', 'GenerateDecisionTreeGraph'):
        graph_title = "%s_%s" % (language, vuln_type)
        create_dt_graph(graph_title, model, X.columns.values)

    return model
def cmd_create_model():
    """Train a model on the training set and store it in the global ``model``.

    The training features are column-sorted in place. If ``train_features``
    has not been set yet, it is initialised with those sorted columns.
    """
    global model, train_features

    print_banner("Creating model")

    dataset_name = config.get_str('dataset', 'SelectedDataset')
    features, labels = transform.get_xy(
        dataset_name, 'training_set', language, vuln_type, selected_features)
    features.sort_index(axis=1, inplace=True)

    if train_features is None:
        train_features = features.columns

    model = train.select_model(language, vuln_type, features, labels)
def create_dt_graph(title, model, features):
    """Export a decision tree to DOT and render it to PNG with Graphviz.

    Both files are written to the directory named by the
    ``model/DecisionTreeGraphDirectory`` config entry as ``<title>.dot``
    and ``<title>.png``.

    Args:
        title: base name (without extension) of the generated files.
        model: fitted tree passed to ``tree.export_graphviz``.
        features: feature names passed as ``feature_names``.
    """
    # Local import: the file-level import block is not visible from here.
    import subprocess

    graph_dir = config.get_str('model', 'DecisionTreeGraphDirectory')
    dot_file = os.path.join(graph_dir, '%s.dot' % title)
    png_file = os.path.join(graph_dir, '%s.png' % title)

    print_notice("Creating Decision Tree graph in %s" % png_file)

    # Write DOT file
    tree.export_graphviz(model,
                         out_file=dot_file,
                         feature_names=features,
                         filled=True,
                         rounded=True,
                         proportion=True,
                         node_ids=True)

    # Convert DOT to PNG.
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from breaking, or being interpreted by, the command.
    try:
        subprocess.call(['dot', '-Tpng', dot_file, '-o', png_file])
    except OSError as err:
        # Stay best-effort like the previous os.system call: a missing
        # ``dot`` executable is reported instead of raising.
        print_notice("Could not run dot: %s" % err)
def cmd_tune_params():
    """Run ``train.select_best_model`` on the training and tuning sets.

    The training features are column-sorted in place, and
    ``train_features`` is initialised from them when still ``None``. The
    tuning features are passed through ``sync_features`` before use. The
    return value of ``train.select_best_model`` is not used.
    """
    global model, train_features

    print_banner("Tuning model parameters")

    dataset_name = config.get_str('dataset', 'SelectedDataset')

    train_X, train_Y = transform.get_xy(
        dataset_name, 'training_set', language, vuln_type, selected_features)
    train_X.sort_index(axis=1, inplace=True)
    if train_features is None:
        train_features = train_X.columns

    tuning_X, tuning_Y = transform.get_xy(
        dataset_name, 'tuning_set', language, vuln_type, selected_features)
    tuning_X = sync_features(tuning_X)

    train.select_best_model(train_X, train_Y, tuning_X, tuning_Y)
def cmd_count_sets():
    """Print how many labels equal 0 and 1 in each set, plus the totals.

    The labels of the training, tuning and testing sets are all loaded
    first (with no feature selection), then one line per set and a final
    totals line are printed. Labels equal to 0 are reported as
    non-vulnerable lines, labels equal to 1 as vulnerable lines.
    """
    dataset_name = config.get_str('dataset', 'SelectedDataset')

    # Load every set up front, in this order, before anything is printed.
    _, training_labels = transform.get_xy(dataset_name, 'training_set',
                                          language, vuln_type, None)
    _, tuning_labels = transform.get_xy(dataset_name, 'tuning_set',
                                        language, vuln_type, None)
    _, testing_labels = transform.get_xy(dataset_name, 'testing_set',
                                         language, vuln_type, None)

    labelled_sets = (
        ('training', training_labels),
        ('tuning', tuning_labels),
        ('testing', testing_labels),
    )

    total_non_vuln = 0
    total_vuln = 0
    for set_name, labels in labelled_sets:
        set_non_vuln = len(labels.loc[labels == 0])
        set_vuln = len(labels.loc[labels == 1])
        total_non_vuln += set_non_vuln
        total_vuln += set_vuln
        print_notice("%s set: non-vulnerable lines %d, vulnerable lines %d" %
                     (set_name, set_non_vuln, set_vuln))

    print_notice("total: non-vulnerable lines %d, vulnerable lines %d" %
                 (total_non_vuln, total_vuln))
def __init__(self):
    """Initialise the parent class with the ``analysis/CustomPickle`` config string."""
    pickle_setting = config.get_str('analysis', 'CustomPickle')
    super(CustomDataset, self).__init__(pickle_setting)
def cmd_clean_set():
    """Call ``delete_sets()`` on the dataset named by ``dataset/SelectedDataset``."""
    print_banner("Cleaning sets")

    dataset_name = config.get_str('dataset', 'SelectedDataset')
    dataset = dataset_factory.get_dataset(dataset_name)
    dataset.delete_sets()
def cmd_create_transform():
    """Run ``transform.transform_sets`` for the configured dataset.

    Passes the dataset name from ``dataset/SelectedDataset`` together with
    the module-level ``sets`` and ``language`` values.
    """
    print_banner("Transforming sets")

    dataset_name = config.get_str('dataset', 'SelectedDataset')
    transform.transform_sets(dataset_name, sets, language)
def get_transform_filename(dataset, language, vuln_type):
    """Build the transform file name for a dataset/language/vulnerability type.

    The ``dataset/TransformFilenameFormat`` config entry is used as a
    ``%``-format template and filled with ``(dataset, language, vuln_type)``
    in that order.

    Returns:
        The formatted file name.
    """
    template = config.get_str('dataset', 'TransformFilenameFormat')
    fields = (dataset, language, vuln_type)
    return template % fields
def get_features_filename(dataset, language, vuln_type):
    """Build the features file name for a dataset/language/vulnerability type.

    The ``dataset/FeaturesFilenameFormat`` config entry is used as a
    ``%``-format template and filled with ``(dataset, language, vuln_type)``
    in that order.

    Returns:
        The formatted file name.
    """
    template = config.get_str('dataset', 'FeaturesFilenameFormat')
    fields = (dataset, language, vuln_type)
    return template % fields
def __init__(self):
    """Initialise the parent class with the ``SAMATE/SamatePickle`` config string."""
    pickle_setting = config.get_str('SAMATE', 'SamatePickle')
    super(SamateDataset, self).__init__(pickle_setting)