def get_naive_bayes_conf(self):
    """Create the Gaussian naive Bayes configuration for this iteration.

    A default ``GaussianNaiveBayes`` configuration is requested from the
    classifiers factory, reusing ``num_folds`` and ``n_jobs`` from the
    optimisation settings of the experiment's classifier.  The model is
    wrapped in a ``DiademConf`` named ``AL<exp_id>-Iter<iter_num>-all-
    NaiveBayes`` whose parent is the current experiment, and a ``DiademExp``
    is instantiated from it with the current experiment's session.

    Returns:
        The naive Bayes classifier configuration built by the factory.
    """
    exp = self.exp
    parent_conf = exp.exp_conf
    name = '-'.join(('AL%d' % exp.exp_id,
                     'Iter%d' % self.iteration.iter_num,
                     'all',
                     'NaiveBayes'))

    hyperparam_conf = parent_conf.core_conf.classifier_conf.hyperparam_conf
    optim_conf = hyperparam_conf.optim_conf
    naive_bayes_conf = classifiers.get_factory().get_default(
        'GaussianNaiveBayes',
        optim_conf.num_folds,
        optim_conf.n_jobs,
        True,  # multiclass
        exp.logger)
    classification_conf = ClassificationConf(
        naive_bayes_conf, UnlabeledLabeledConf(exp.logger), exp.logger)

    # Rebuild a features configuration from the fields of the parent's one
    # rather than passing the parent's object itself.
    src_features = parent_conf.features_conf
    features_conf = FeaturesConf(src_features.input_features,
                                 src_features.sparse,
                                 src_features.logger,
                                 filter_in_f=src_features.filter_in_f,
                                 filter_out_f=src_features.filter_out_f)

    diadem_conf = DiademConf(parent_conf.secuml_conf,
                             parent_conf.dataset_conf,
                             features_conf,
                             parent_conf.annotations_conf,
                             classification_conf,
                             None,
                             name=name,
                             parent=exp.exp_id)
    # The experiment object is created but not kept by this method.
    DiademExp(diadem_conf, session=exp.session)
    return naive_bayes_conf
def gen_parser():
    """Build the command-line parser used to learn a detection model.

    Experiment-level and classification-level options are registered on the
    main parser.  One sub-command (stored in ``model_class``, mandatory) is
    then added per model class known to the classifiers factory.  Supervised
    and semi-supervised model classes additionally get an optional
    annotations argument.  ``AlreadyTrained`` gets its own sub-command
    without annotations options.

    Returns:
        argparse.ArgumentParser: the fully populated parser.
    """
    parser = argparse.ArgumentParser(
        description='Learn a detection model. '
                    'The ground-truth must be stored in '
                    'annotations/ground_truth.csv.')
    ExpConf.gen_parser(parser)
    ClassificationConf.gen_parser(parser)

    factory = classifiers.get_factory()
    trainable_models = factory.get_methods()
    # 'AlreadyTrained' is handled separately after the loop.
    trainable_models.remove('AlreadyTrained')

    subparsers = parser.add_subparsers(dest='model_class')
    subparsers.required = True
    annotated_types = [ClassifierType.supervised,
                       ClassifierType.semisupervised]
    for model_name in trainable_models:
        sub = subparsers.add_parser(model_name)
        factory.gen_parser(model_name, sub)
        if get_classifier_type(factory.get_class(model_name)) \
                not in annotated_types:
            continue
        AnnotationsConf.gen_parser(
            sub,
            required=False,
            message='CSV file containing the annotations of '
                    'some or all the instances.')

    # Add subparser for already trained model
    factory.gen_parser('AlreadyTrained',
                       subparsers.add_parser('AlreadyTrained'))
    return parser
def gen_parser(parser):
    """Register the alerts-related command-line options on ``parser``.

    Adds an 'Alerts parameters' argument group containing
    ``--detection-threshold``, ``--num-alerts-clusters`` and the mutually
    exclusive pair ``--alerts-classif`` / ``--alerts-clustering``.

    Args:
        parser: parser (or sub-parser) receiving the argument group.
    """
    alerts = parser.add_argument_group('Alerts parameters')
    alerts.add_argument(
        '--detection-threshold',
        type=float,
        default=0.5,
        help='An alert is triggered if the predicted probability of '
             'maliciousness is above this threshold. '
             'Default: 0.5.')

    # At most one alerts-analysis method may be selected.
    analysis = alerts.add_mutually_exclusive_group(required=False)
    supervised_models = classifiers.get_factory().get_methods(
        ClassifierType.supervised)
    analysis.add_argument(
        '--alerts-classif',
        default=None,
        choices=supervised_models,
        help='Supervised model trained to cluster the '
             'alerts according to the malicious families '
             'defined in the training dataset. '
             'Default: None.')
    clustering_algos = cluster_conf.get_factory().get_methods()
    analysis.add_argument(
        '--alerts-clustering',
        default=None,
        choices=clustering_algos,
        help='Clustering algorithm to analyze the alerts. '
             'Default: None.')

    alerts.add_argument(
        '--num-alerts-clusters',
        type=int,
        default=4,
        help='Number of clusters built from the alerts. '
             'Default: 4.')
def _create_naive_bayes_conf(self):
    """Create the Gaussian naive Bayes configuration for this iteration.

    The cross-validation settings (``num_folds``, ``n_jobs``) are taken from
    the multiclass model of the current experiment's core configuration.  A
    ``DiademConf`` named ``AL<exp_id>-Iter<iter_num>-all-NaiveBayes`` is
    instantiated with the current experiment as parent.

    Returns:
        The default ``GaussianNaiveBayes`` configuration built by the
        classifiers factory.
    """
    exp = self.exp
    parent_conf = exp.exp_conf
    logger = exp.logger
    name = '-'.join(('AL%d' % (exp.exp_id),
                     'Iter%d' % (self.iteration.iter_num),
                     'all',
                     'NaiveBayes'))

    main_classifier = parent_conf.core_conf.multiclass_model.classifier_conf
    optim_conf = main_classifier.hyperparam_conf.optim_conf
    naive_bayes_conf = classifiers.get_factory().get_default(
        'GaussianNaiveBayes',
        optim_conf.num_folds,
        optim_conf.n_jobs,
        True,  # multiclass
        logger)
    classif_conf = ClassificationConf(naive_bayes_conf,
                                      UnlabeledLabeledConf(logger),
                                      logger)

    # NOTE(review): the DiademConf instance is not kept; presumably it is
    # built only for the side effects of its constructor -- confirm.
    DiademConf(parent_conf.secuml_conf,
               parent_conf.dataset_conf,
               parent_conf.features_conf,
               parent_conf.annotations_conf,
               classif_conf,
               None,
               name=name,
               parent=exp.exp_id)
    return naive_bayes_conf
def _rcd_conf(args, logger):
    """Build the ``RcdStrategyConf`` described by the parsed arguments.

    The strategy relies on a default multiclass ``LogisticRegression``
    configuration (``None`` is passed for the two settings that follow the
    model name) and is created with the literal ``'uniform'`` setting.

    Args:
        args: parsed arguments; ``cluster_strategy`` and ``num_annotations``
            are read.
        logger: logger forwarded to every configuration object.

    Returns:
        RcdStrategyConf: the strategy configuration.
    """
    lr_conf = classifiers.get_factory().get_default(
        'LogisticRegression', None, None, True, logger)
    test_conf = UnlabeledLabeledConf(logger)
    model_conf = ClassificationConf(lr_conf, test_conf, logger)
    return RcdStrategyConf(model_conf,
                           args.cluster_strategy,
                           args.num_annotations,
                           'uniform',
                           logger)
def _get_lr_conf(self, validation_conf, logger, multiclass=False):
    """Return a classification configuration based on logistic regression.

    Args:
        validation_conf: validation configuration forwarded unchanged to
            ``ClassificationConf``.
        logger: logger forwarded to every configuration object.
        multiclass: flag forwarded to the factory when requesting the
            default ``LogisticRegression`` configuration.

    Returns:
        ClassificationConf: default logistic regression tested with an
        ``UnlabeledLabeledConf``.
    """
    lr_conf = classifiers.get_factory().get_default(
        'LogisticRegression', None, None, multiclass, logger)
    test_conf = UnlabeledLabeledConf(logger)
    return ClassificationConf(lr_conf,
                              test_conf,
                              logger,
                              validation_conf=validation_conf)
def gen_main_model_parser(parser):
    """Register the options describing the main classification model.

    Adds a 'Classification model parameters' argument group holding
    ``--model-class`` (restricted to the supervised methods reported by the
    classifiers factory), then lets ``HyperparamConf`` add its own options
    to the same group.

    Args:
        parser: parser receiving the argument group.
    """
    model_group = parser.add_argument_group(
        'Classification model parameters')
    supervised_models = classifiers.get_factory().get_methods(
        supervised=True)
    model_group.add_argument(
        '--model-class',
        choices=supervised_models,
        default='LogisticRegression',
        help='Model class trained at each iteration. '
             'Default: LogisticRegression.')
    HyperparamConf.gen_parser(model_group, None, True, subgroup=False)
def from_json(obj, logger):
    """Rebuild an ``AlertsConf`` from its JSON representation.

    Args:
        obj: mapping with the keys ``detection_threshold``,
            ``classifier_conf`` and ``clustering_conf``, or ``None``.
        logger: logger forwarded to the rebuilt configuration objects.

    Returns:
        ``None`` when ``obj`` is ``None``, otherwise an ``AlertsConf``.  The
        classifier configuration takes precedence: the clustering entry is
        only examined when the classifier entry is ``None``.
    """
    if obj is None:
        return None

    classifier_conf, clustering_conf = None, None
    classifier_json = obj['classifier_conf']
    if classifier_json is not None:
        classifier_conf = classifiers.get_factory().from_json(
            classifier_json, logger)
    else:
        clustering_json = obj['clustering_conf']
        if clustering_json is not None:
            clustering_conf = cluster_conf.get_factory().from_json(
                clustering_json, logger)

    return AlertsConf(obj['detection_threshold'],
                      classifier_conf,
                      clustering_conf,
                      logger)
def from_args(args):
    """Build a ``DiademConf`` from the parsed command-line arguments.

    Args:
        args: parsed arguments.  ``model_class``, ``exp_name`` and
            ``no_training_detection`` are always read; ``annotations_file``
            is read for supervised and semi-supervised model classes and
            ``model_exp_id`` for the ``AlreadyTrained`` model class.

    Returns:
        DiademConf: the experiment configuration.  Its alerts configuration
        is ``None`` for multiclass models.

    Raises:
        InvalidInputArguments: if supervised alerts classification is
            requested with an unsupervised model class, if alerts analysis
            is requested for a multiclass model or in streaming mode, or if
            sparse features are used with a model that does not accept them.
    """
    secuml_conf = ExpConf.secuml_conf_from_args(args)
    logger = secuml_conf.logger
    classif_conf = ClassificationConf.from_args(args, logger)
    model_class = classifiers.get_factory().get_class(args.model_class)
    classifier_type = get_classifier_type(model_class)

    # Only (semi-)supervised models consume an annotations file.
    if classifier_type in [ClassifierType.supervised,
                           ClassifierType.semisupervised]:
        annotations_conf = AnnotationsConf(args.annotations_file, None,
                                           logger)
    else:
        annotations_conf = AnnotationsConf(None, None, logger)

    already_trained = None
    if args.model_class == 'AlreadyTrained':
        already_trained = args.model_exp_id

    alerts_conf = AlertsConf.from_args(args, logger)
    if (classifier_type == ClassifierType.unsupervised
            and alerts_conf.classifier_conf is not None):
        raise InvalidInputArguments('Supervised classification of the '
                                    'alerts is not supported for '
                                    'unsupervised model classes. ')
    if classif_conf.classifier_conf.multiclass:
        if alerts_conf.with_analysis():
            raise InvalidInputArguments('Alerts analysis is not supported '
                                        'for multiclass models. ')
        else:
            alerts_conf = None
    # alerts_conf may have been reset to None just above: guard against it so
    # that a multiclass model tested on a streaming dataset does not fail
    # with an AttributeError.
    if (classif_conf.test_conf.method == 'dataset'
            and classif_conf.test_conf.streaming
            and alerts_conf is not None
            and alerts_conf.with_analysis()):
        raise InvalidInputArguments('Alerts analysis is not supported '
                                    'in streaming mode. ')

    dataset_conf = DatasetConf.from_args(args, logger)
    features_conf = FeaturesConf.from_args(args, logger)
    if (features_conf.sparse
            and not classif_conf.classifier_conf.accept_sparse):
        raise InvalidInputArguments('%s does not support sparse '
                                    'features. ' % args.model_class)

    return DiademConf(secuml_conf, dataset_conf, features_conf,
                      annotations_conf, classif_conf, alerts_conf,
                      name=args.exp_name,
                      already_trained=already_trained,
                      no_training_detection=args.no_training_detection)
def from_args(self, method, args, logger):
    """Build the configuration of ``method`` from the parsed arguments.

    Args:
        method: name resolved to a configuration class via
            ``self.get_class``.
        args: parsed arguments.  ``validation_datasets`` is always read;
            ``model_class`` is read when the class declares a main model
            type.  ``args.multiclass`` is overwritten in that case.
        logger: logger forwarded to every configuration object.

    Returns:
        The object returned by the class's own ``from_args``, which receives
        the main model configuration (``None`` when the class has no main
        model type) and the validation configuration (``None`` when no
        validation datasets are given).
    """
    validation_conf = (ValidationDatasetsConf.from_args(args, logger)
                       if args.validation_datasets is not None
                       else None)

    class_ = self.get_class(method)
    main_model_type = class_.main_model_type()
    if main_model_type is None:
        main_model_conf = None
    else:
        # The classifier factory reads the flag from args, so set it first.
        args.multiclass = main_model_type == 'multiclass'
        classifier_conf = classifiers.get_factory().from_args(
            args.model_class, args, logger)
        main_model_conf = ClassificationConf(
            classifier_conf,
            UnlabeledLabeledConf(logger),
            logger,
            validation_conf=validation_conf)

    return class_.from_args(args, main_model_conf, validation_conf, logger)
def from_json(conf_json, secuml_conf):
    """Rebuild a ``TestConf`` from its JSON representation.

    Args:
        conf_json: mapping with the keys ``dataset_conf``,
            ``features_conf``, ``annotations_conf``, ``core_conf``,
            ``name``, ``parent``, ``fold_id``, ``kind`` and ``exp_id``.
        secuml_conf: global configuration; its ``logger`` is forwarded to
            the rebuilt configuration objects.

    Returns:
        TestConf: the configuration, with ``exp_id`` restored from the JSON.
    """
    logger = secuml_conf.logger
    sub_confs = [
        DatasetConf.from_json(conf_json['dataset_conf'], logger),
        FeaturesConf.from_json(conf_json['features_conf'], logger),
        AnnotationsConf.from_json(conf_json['annotations_conf'], logger),
        # The core configuration of a test experiment is a classifier one.
        classifiers.get_factory().from_json(conf_json['core_conf'], logger),
    ]
    metadata = {key: conf_json[key]
                for key in ('name', 'parent', 'fold_id', 'kind')}

    test_exp_conf = TestConf(secuml_conf, *sub_confs, **metadata)
    test_exp_conf.exp_id = conf_json['exp_id']
    return test_exp_conf
def from_args(args):
    """Build a ``DiademConf`` from the parsed command-line arguments.

    Args:
        args: parsed arguments.  ``model_class`` and ``exp_name`` are always
            read; ``annotations_file`` is read for supervised and
            semi-supervised model classes and ``model_exp_id`` for the
            ``AlreadyTrained`` model class.

    Returns:
        DiademConf: the experiment configuration.
    """
    secuml_conf = ExpConf.secuml_conf_from_args(args)
    logger = secuml_conf.logger
    classif_conf = ClassificationConf.from_args(args, logger)

    model_class = classifiers.get_factory().get_class(args.model_class)
    uses_annotations = get_classifier_type(model_class) in (
        ClassifierType.supervised, ClassifierType.semisupervised)
    # Models that do not use annotations get an empty annotations conf.
    annotations_file = args.annotations_file if uses_annotations else None
    annotations_conf = AnnotationsConf(annotations_file, None, logger)

    is_already_trained = args.model_class == 'AlreadyTrained'
    already_trained = args.model_exp_id if is_already_trained else None

    dataset_conf = DatasetConf.from_args(args, logger)
    features_conf = FeaturesConf.from_args(args, logger)
    return DiademConf(secuml_conf,
                      dataset_conf,
                      features_conf,
                      annotations_conf,
                      classif_conf,
                      name=args.exp_name,
                      already_trained=already_trained)
def gen_parser():
    """Build the command-line parser used to train and evaluate a model.

    The main parser receives the experiment options (with ``sparse=True``)
    and ``--no-training-detection``.  One sub-command (stored in
    ``model_class``, mandatory) is added per model class known to the
    classifiers factory.  Every sub-command gets the classification and
    alerts options.  Supervised and semi-supervised ones also get an
    annotations argument, which defaults to ``GROUND_TRUTH`` for supervised
    models and is required for semi-supervised ones.

    Returns:
        argparse.ArgumentParser: the fully populated parser.
    """
    parser = argparse.ArgumentParser(
        description='Train and evaluate a detection '
                    'model. ')
    ExpConf.gen_parser(parser, sparse=True)
    parser.add_argument(
        '--no-training-detection',
        action='store_true',
        default=False,
        help='''When specified, the detection model is not applied to the training instances. ''')

    factory = classifiers.get_factory()
    trainable_models = factory.get_methods()
    # 'AlreadyTrained' is handled separately after the loop.
    trainable_models.remove('AlreadyTrained')

    subparsers = parser.add_subparsers(dest='model_class')
    subparsers.required = True
    for model_name in trainable_models:
        sub = subparsers.add_parser(model_name)
        factory.gen_parser(model_name, sub)
        classifier_type = get_classifier_type(factory.get_class(model_name))
        if classifier_type in (ClassifierType.supervised,
                               ClassifierType.semisupervised):
            message = '''CSV file containing the annotations of some instances, or GROUND_TRUTH to use the ground truth annotations stored in idents.csv. '''
            if classifier_type == ClassifierType.supervised:
                # Supervised models fall back to the ground truth.
                default = 'GROUND_TRUTH'
                message = '%s Default: GROUND_TRUTH.' % message
            else:
                default = None
            AnnotationsConf.gen_parser(sub,
                                       required=default is None,
                                       default=default,
                                       message=message)
        ClassificationConf.gen_parser(sub)
        AlertsConf.gen_parser(sub)

    # Add subparser for already trained model
    trained_parser = subparsers.add_parser('AlreadyTrained')
    factory.gen_parser('AlreadyTrained', trained_parser)
    ClassificationConf.gen_parser(trained_parser)
    AlertsConf.gen_parser(trained_parser)
    return parser
def from_args(args, logger):
    """Build an ``AlertsConf`` from the parsed command-line arguments.

    Args:
        args: parsed arguments.  ``detection_threshold`` and
            ``alerts_classif`` are always read; ``alerts_clustering`` and
            ``num_alerts_clusters`` are read only when no alerts classifier
            is requested.  ``num_folds`` and ``n_jobs`` are optional and
            default to ``None`` when absent.
        logger: logger forwarded to every configuration object.

    Returns:
        AlertsConf: holds a classifier configuration, a clustering
        configuration, or neither; the classifier takes precedence.
    """
    classifier_conf, clustering_conf = None, None
    if args.alerts_classif is not None:
        classifier_conf = classifiers.get_factory().get_default(
            args.alerts_classif,
            getattr(args, 'num_folds', None),
            getattr(args, 'n_jobs', None),
            True,  # multiclass
            logger)
    elif args.alerts_clustering is not None:
        clustering_class = cluster_conf.get_factory().get_class(
            args.alerts_clustering)
        clustering_conf = clustering_class(logger, args.num_alerts_clusters)
    return AlertsConf(args.detection_threshold,
                      classifier_conf,
                      clustering_conf,
                      logger)