def getClusterLabelsFamilies(experiment_id, selected_cluster):
    """Return the labels and families of one cluster as JSON."""
    cluster_index = int(selected_cluster)
    exp = updateCurrentExperiment(experiment_id)
    clustering = ClusteringExp.from_json(exp.output_dir())
    labels_families = clustering.getClusterLabelsFamilies(exp, cluster_index)
    return jsonify(labels_families)
def getPredictions(experiment_id, train_test, fold_id, index, label):
    """Return the instance ids and predicted probabilities falling in one
    probability decile, optionally filtered by ground-truth label.

    `index` selects the decile [index * 0.1, (index + 1) * 0.1].
    """
    exp = updateCurrentExperiment(experiment_id)
    directory = exp.getOutputDirectory()
    if fold_id not in ('None', 'all'):
        directory = path.join(directory, fold_id)
    filename = path.join(directory, train_test, 'predictions.csv')
    decile = int(index)
    min_value = decile * 0.1
    max_value = (decile + 1) * 0.1
    with open(filename, 'r') as f:
        data = pd.read_csv(f, header=0, index_col=0)
    selection = data.loc[:, 'predicted_proba'] >= min_value
    data = data.loc[selection, :]
    selection = data.loc[:, 'predicted_proba'] <= max_value
    data = data.loc[selection, :]
    if label != 'all':
        # '== True' / '== False' build pandas boolean masks; keep them.
        if label == 'malicious':
            selection = data.loc[:, 'ground_truth'] == True
        elif label == 'benign':
            selection = data.loc[:, 'ground_truth'] == False
        data = data.loc[selection, :]
    instances = [int(x) for x in list(data.index.values)]
    return jsonify({'instances': instances,
                    'proba': list(data['predicted_proba'])})
def activeLearningSuggestionsMonitoring(experiment_id, iteration):
    """Serve the suggestions-accuracy plot of the previous iteration."""
    prev_iteration = int(iteration) - 1
    exp = updateCurrentExperiment(experiment_id)
    png = path.join(exp.output_dir(), str(prev_iteration),
                    'suggestions_accuracy',
                    'labels_families_high_confidence_suggestions.png')
    return send_file(png)
def getTestExperimentId(experiment_id):
    """Return the associated test experiment id, read from the first
    line of test_experiment.txt."""
    exp = updateCurrentExperiment(experiment_id)
    exp_filename = path.join(exp.getOutputDirectory(), 'test_experiment.txt')
    with open(exp_filename, 'r') as f:
        return f.readline()
def getStatsPlot(experiment_id, plot_type, feature):
    """Serve a feature statistics plot: histograms as JSON, any other
    plot type as PNG."""
    exp = updateCurrentExperiment(experiment_id)
    extension = '.json' if 'histogram' in plot_type else '.png'
    return send_file(path.join(exp.output_dir(), feature,
                               plot_type + extension))
def getAlertsClusteringExperimentId(experiment_id, fold_id):
    """Serve the alerts clustering description (clusters.json)."""
    exp = updateCurrentExperiment(experiment_id)
    directory = exp.getOutputDirectory()
    if fold_id not in ('None', 'all'):
        directory = path.join(directory, fold_id)
    return send_file(path.join(directory, 'alerts', 'clusters.json'))
def getTopModelFeatures(exp_id, size, train_test, fold_id):
    """Return a bar plot (JSON) of the top `size` model coefficients,
    with the feature names as tooltips."""
    exp = updateCurrentExperiment(exp_id)
    directory = exp.output_dir()
    if fold_id not in ('None', 'all'):
        directory = path.join(directory, fold_id)
    filename = path.join(directory, train_test, 'model_coefficients.csv')
    with open(filename, 'r') as f:
        coef_df = pd.read_csv(f, header=0, index_col=0, nrows=int(size))
    coefficients = list(coef_df['mean'])
    tooltip_data = []
    user_ids = []
    # Resolve each feature id to its name and user-facing id in the DB.
    for feature_id in coef_df.index:
        row = (session.query(FeaturesAlchemy)
                      .filter(FeaturesAlchemy.id == int(feature_id))
                      .one())
        tooltip_data.append(row.name)
        user_ids.append(row.user_id)
    barplot = BarPlot(user_ids)
    dataset = PlotDataset(coefficients, None)
    # 'weight' feature importances are rendered in red.
    if exp.exp_conf.core_conf.classifier_conf.featureImportance() == 'weight':
        dataset.set_color(colors_tools.red)
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json(tooltip_data=tooltip_data))
def getInstancesToAnnotate(experiment_id, iteration, predicted_label):
    """Return the ids of the instances queried for annotation at the
    given iteration for the given predicted label."""
    exp = updateCurrentExperiment(experiment_id)
    csv_file = path.join(exp.getOutputDirectory(), str(iteration),
                         'toannotate_' + predicted_label + '.csv')
    instance_ids = pd.read_csv(csv_file).instance_id
    return jsonify({'instances': [int(i) for i in instance_ids]})
def getFeatures(experiment_id, instance_id):
    """Return the features of an instance as a {name: value} JSON dict.

    `instance_id` may arrive as a string from the URL and is converted
    to int before the lookup.
    """
    instance_id = int(instance_id)
    experiment = updateCurrentExperiment(experiment_id)
    features_names, features_values = experiment.getFeatures(instance_id)
    # Pair names with values directly instead of indexing both lists
    # with range(len(...)).
    features = dict(zip(features_names, features_values))
    return jsonify(features)
def getFeatures(experiment_id, instance_id):
    """Return the features of an instance as a {name: value} JSON dict."""
    exp = updateCurrentExperiment(experiment_id)
    extractor = FeaturesFromExp(exp)
    names, values = extractor.get_instance(int(instance_id))
    return jsonify({names[i]: val for i, val in enumerate(values)})
def getNumComponents(experiment_id):
    """Return, as a string, the number of projection components.

    The count is the number of columns of projection_matrix.csv minus
    the index column.
    """
    exp = updateCurrentExperiment(experiment_id)
    matrix_file = path.join(exp.getOutputDirectory(),
                            'projection_matrix.csv')
    with open(matrix_file, 'r') as f:
        header = f.readline()
    return str(len(header.split(',')) - 1)
def getNumElements(experiment_id, selected_cluster):
    """Return the number of instances in the selected cluster."""
    exp = updateCurrentExperiment(experiment_id)
    clustering = ClusteringExp.from_json(exp.output_dir())
    cluster = clustering.clusters[int(selected_cluster)]
    return jsonify({'num_elements': cluster.numInstances()})
def getTopWeightedFeatures(experiment_id, inst_exp_id, instance_id, size,
                           fold_id):
    """Return a bar plot (JSON) of the instance features with the highest
    absolute weighted values (scaled value * model coefficient).

    Returns None when fold_id is 'all' (no single model to inspect).
    """
    if fold_id == 'all':
        return None
    instance_id = int(instance_id)
    exp = updateCurrentExperiment(experiment_id)
    validation_experiment = updateCurrentExperiment(inst_exp_id)
    # Features of the instance.
    names, values = validation_experiment.getFeatures(instance_id)
    values = [float(v) for v in values]
    # Pipeline containing the scaler and the linear model.
    model_dir = exp.getOutputDirectory()
    if fold_id != 'None':
        model_dir = path.join(model_dir, fold_id)
    pipeline = joblib.load(path.join(model_dir, 'model', 'model.out'))
    scaled = pipeline.named_steps['scaler'].transform(
        np.reshape(values, (1, -1)))
    weighted = np.multiply(scaled, pipeline.named_steps['model'].coef_)
    # Keep the int(size) features with the largest |weighted value|,
    # most important first.
    triples = sorted(zip(names, values, weighted[0]),
                     key=lambda t: abs(t[2]))
    triples = triples[:-int(size) - 1:-1]
    names = [t[0] for t in triples]
    values = [t[1] for t in triples]
    weighted_values = [t[2] for t in triples]
    # Long feature names would overflow the axis: fall back to numeric
    # labels and put the real names in the tooltips.
    if max([len(n) for n in names]) > 30:
        labels = [str(i) for i in range(len(names))]
        tooltips = [names[i] + ' (' + str(values[i]) + ')'
                    for i in range(len(names))]
    else:
        labels = names
        tooltips = values
    barplot = BarPlot(labels)
    dataset = PlotDataset(weighted_values, None)
    dataset.setColor(colors_tools.red)
    barplot.addDataset(dataset)
    return jsonify(barplot.toJson(tooltip_data=tooltips))
def getSortingCriteria(experiment_id):
    """Return the available sorting criteria: the deduplicated columns
    of scores.csv (with any '_pvalues' suffix stripped) plus 'alphabet'
    and 'null_variance', sorted alphabetically."""
    exp = updateCurrentExperiment(experiment_id)
    scores = pd.read_csv(path.join(exp.output_dir(), 'scores.csv'),
                         header=0, index_col=0)
    criteria = {c.split('_pvalues')[0]
                for c in scores.columns.values.tolist()}
    criteria = list(criteria)
    criteria.extend(['alphabet', 'null_variance'])
    criteria.sort()
    return jsonify({'criteria': criteria})
def getClustersLabels(experiment_id):
    """Return the id and label of every cluster of the experiment,
    including empty ones."""
    exp = updateCurrentExperiment(experiment_id)
    clustering = ClusteringExp.from_json(exp.output_dir())
    clusters = [{'id': c, 'label': clustering.clusters[c].label}
                for c in range(clustering.num_clusters)]
    return jsonify({'clusters': clusters})
def getClusterInstancesVisu(exp_id, selected_cluster, c_e_r, num_results):
    """Return instance ids of one cluster for visualization.

    # NOTE(review): c_e_r keys into the dict returned by
    # getClusterInstancesVisu — presumably selects a subset kind; confirm
    # the accepted values against the client code.
    """
    cluster = int(selected_cluster)
    exp = updateCurrentExperiment(exp_id)
    clustering = ClusteringExp.from_json(exp.output_dir())
    visu = clustering.getClusterInstancesVisu(cluster, int(num_results),
                                              random=True)
    return jsonify({cluster: visu[c_e_r]})
def getClusterLabelFamilyIds(exp_id, selected_cluster, label, family,
                             num_results):
    """Return the ids of the cluster instances with the given label and
    family, formatted for the web client."""
    exp = updateCurrentExperiment(exp_id)
    clustering = ClusteringExp.from_json(exp.output_dir())
    ids = clustering.getClusterLabelFamilyIds(exp, int(selected_cluster),
                                              label, family)
    return jsonify(listResultWebFormat(ids, int(num_results)))
def getIterationSupervisedExperiment(experiment_id, iteration):
    """Return the id of the supervised model experiment of an iteration
    (the binary model if configured, the multiclass one otherwise)."""
    exp = updateCurrentExperiment(experiment_id)
    if 'binary' in list(exp.conf.models_conf.keys()):
        kind = 'binary'
    else:
        kind = 'multiclass'
    models_exp_file = path.join(exp.getOutputDirectory(), str(iteration),
                                'models_experiments.json')
    with open(models_exp_file, 'r') as f:
        models_exp = json.load(f)
    return str(models_exp[kind])
def getInstancesToAnnotate(experiment_id, iteration, predicted_label):
    """Return the ids of the instances to annotate at the given
    iteration, optionally restricted to a predicted label."""
    exp = updateCurrentExperiment(experiment_id)
    if predicted_label == 'None':
        basename = 'toannotate.csv'
    else:
        basename = 'toannotate_%s.csv' % predicted_label
    df = pd.read_csv(path.join(exp.getOutputDirectory(), iteration,
                               basename))
    return jsonify({'instances': [int(x) for x in df.instance_id]})
def getClusterLabelFamilyIds(experiment_id, selected_cluster, label,
                             family, num_results):
    """Return the ids of the cluster instances with the given label and
    family, formatted for the web client."""
    exp = updateCurrentExperiment(experiment_id)
    clustering = ClusteringExp.fromJson(exp.getOutputDirectory())
    ids = clustering.getClusterLabelFamilyIds(exp, int(selected_cluster),
                                              label, family)
    return jsonify(listResultWebFormat(ids, int(num_results)))
def getLabelsMonitoring(experiment_id, iteration):
    """Return the number of unlabeled instances and of annotations at
    the given iteration."""
    exp = updateCurrentExperiment(experiment_id)
    monitoring_file = path.join(exp.output_dir(), str(iteration),
                                'labels_monitoring',
                                'labels_monitoring.json')
    with open(monitoring_file, 'r') as f:
        stats = json.load(f)
    return jsonify({'unlabeled': stats['unlabeled'],
                    'annotations': stats['global']['annotations']})
def getClusterInstancesVisu(experiment_id, selected_cluster, c_e_r,
                            num_results):
    """Return instance ids of one cluster for visualization.

    # NOTE(review): c_e_r keys into the dict returned by
    # getClusterInstancesVisu — presumably selects a subset kind; confirm
    # the accepted values against the client code.
    """
    cluster = int(selected_cluster)
    exp = updateCurrentExperiment(experiment_id)
    clustering = ClusteringExp.fromJson(exp.getOutputDirectory())
    visu = clustering.getClusterInstancesVisu(cluster, int(num_results),
                                              random=True)
    return jsonify({cluster: visu[c_e_r]})
def supervisedLearningMonitoring(exp_id, train_test, kind, fold_id):
    """Serve a supervised-learning monitoring file: curve kinds as PNG,
    any other kind as JSON."""
    directory = updateCurrentExperiment(exp_id).output_dir()
    if fold_id not in ('None', 'all'):
        directory = path.join(directory, fold_id)
    if kind in ('ROC', 'false_discovery_recall_curve'):
        basename = kind + '.png'
    else:
        basename = kind + '.json'
    return send_file(path.join(directory, train_test, basename))
def getIterationSupervisedExperiment(exp_id, iteration):
    """Return the id of the supervised model experiment of an iteration
    (the binary model if configured, the multiclass one otherwise)."""
    exp = updateCurrentExperiment(exp_id)
    models_conf = exp.exp_conf.core_conf.models_conf
    kind = 'binary' if 'binary' in models_conf.__dict__ else 'multiclass'
    models_exp_file = path.join(exp.output_dir(), str(iteration),
                                'models_experiments.json')
    with open(models_exp_file, 'r') as f:
        models_exp = json.load(f)
    return str(models_exp[kind])
def getInstance(experiment_id, view_id, instance_id, ident):
    """Delegate the rendering of an instance to the project-specific
    view module (SecuML.web.views.Projects.<project>)."""
    try:
        if view_id == 'None':
            view_id = None
        exp = updateCurrentExperiment(experiment_id)
        module = importlib.import_module(
            'SecuML.web.views.Projects.' + exp.project)
        return module.getInstance(exp, view_id, instance_id, ident)
    except IOError as e:
        app.logger.error(e)
        # NOTE(review): the tuple's second element is `ident`, which Flask
        # interprets as the HTTP status — confirm this is intentional.
        return 'Unable to display the instance', ident
def supervisedLearningMonitoring(experiment_id, train_test, kind, fold_id):
    """Serve a supervised-learning monitoring file: the ROC as PNG, any
    other kind as JSON."""
    exp = updateCurrentExperiment(experiment_id)
    directory = exp.getOutputDirectory()
    if fold_id not in ('None', 'all'):
        directory = path.join(directory, fold_id)
    basename = kind + ('.png' if kind == 'ROC' else '.json')
    return send_file(path.join(directory, train_test, basename))
def getTopWeightedFeatures(exp_id, inst_exp_id, instance_id, size, fold_id):
    """Return a bar plot (JSON) of the instance features with the highest
    absolute weighted values (scaled value * model coefficient).

    Returns None when fold_id is 'all' (no single model to inspect).
    """
    if fold_id == 'all':
        return None
    exp = updateCurrentExperiment(exp_id)
    inst_exp = updateCurrentExperiment(inst_exp_id)
    # Features of the instance.
    extractor = FeaturesFromExp(inst_exp)
    names, values = extractor.get_instance(int(instance_id))
    values = [float(v) for v in values]
    # Pipeline containing the scaler and the linear model.
    model_dir = exp.output_dir()
    if fold_id != 'None':
        model_dir = path.join(model_dir, fold_id)
    pipeline = joblib.load(path.join(model_dir, 'model', 'model.out'))
    scaled = pipeline.named_steps['scaler'].transform(
        np.reshape(values, (1, -1)))
    weighted = np.multiply(scaled, pipeline.named_steps['model'].coef_)
    # Keep the int(size) features with the largest |weighted value|,
    # most important first.
    triples = sorted(zip(names, values, weighted[0]),
                     key=lambda t: abs(t[2]))
    triples = triples[:-int(size) - 1:-1]
    labels = [str(t[0]) for t in triples]
    tooltips = ['%s (%.2f)' % (t[0], t[1]) for t in triples]
    barplot = BarPlot(labels)
    dataset = PlotDataset([t[2] for t in triples], None)
    dataset.set_color(colors_tools.red)
    barplot.add_dataset(dataset)
    return jsonify(barplot.to_json(tooltip_data=tooltips))
def activeLearningModelsMonitoring(experiment_id, iteration,
                                   train_cv_validation):
    """Serve the model performance monitoring plot of one iteration.

    Binary experiments are monitored with AUC, multiclass ones with
    accuracy.
    """
    exp = updateCurrentExperiment(experiment_id)
    if 'binary' in exp.exp_conf.core_conf.models_conf.__dict__:
        kind, estimator = 'binary', 'auc'
    else:
        kind, estimator = 'multiclass', 'accuracy'
    directory = path.join(exp.output_dir(), str(iteration),
                          'models_performance')
    basename = '_'.join([kind, train_cv_validation, estimator,
                         'monitoring.png'])
    return send_file(path.join(directory, basename), mimetype='image/png')
def removeAnnotation(exp_id, inst_exp_id, instance_id):
    """Remove the annotation of an instance and, when user-action
    logging is enabled, append the action to user_actions.log.

    Returns an empty string (the client ignores the response body).
    """
    annotations_db_tools.removeAnnotation(session, inst_exp_id, instance_id)
    session.commit()
    if user_exp:
        exp = updateCurrentExperiment(exp_id)
        filename = path.join(exp.output_dir(), 'user_actions.log')
        to_print = ','.join(
            map(str,
                [datetime.datetime.now(), 'removeAnnotation', instance_id]))
        # 'a' creates the file when missing, so the previous existence
        # check and mode switch were redundant. A trailing newline keeps
        # one action per line; the original concatenated successive
        # entries on a single line.
        with open(filename, 'a') as f:
            f.write(to_print + '\n')
    return ''
def runNextIteration(experiment_id, iteration_number):
    """Trigger the next active-learning iteration asynchronously and,
    when user-action logging is enabled, append the action to
    user_actions.log.

    Returns the celery async result rendered as a string.
    """
    res = str(celeryRunNextIteration.s().apply_async())
    if user_exp:
        experiment = updateCurrentExperiment(experiment_id)
        filename = path.join(experiment.getOutputDirectory(),
                             'user_actions.log')
        to_print = ','.join(
            map(str,
                [datetime.datetime.now(), 'nextIteration',
                 iteration_number]))
        # 'a' creates the file when missing, so the previous existence
        # check and mode switch were redundant. A trailing newline keeps
        # one action per line; the original concatenated successive
        # entries on a single line.
        with open(filename, 'a') as f:
            f.write(to_print + '\n')
    return res