def set_anomaly_args(args, name=None, fields=None, anomaly_fields=None):
    """Build the creation arguments dict for an anomaly detector.

    Starts from the shared basic model arguments and layers on the
    anomaly-specific options found in ``args``.
    """
    name = args.name if name is None else name
    anomaly_fields = (args.anomaly_fields_ if anomaly_fields is None
                      else anomaly_fields)
    resource_args = set_basic_model_args(args, name)
    # Fall back to the module-level SEED when no explicit seed was given.
    resource_args["seed"] = args.seed if args.seed is not None else SEED
    resource_args["anomaly_seed"] = (args.anomaly_seed
                                     if args.anomaly_seed is not None
                                     else SEED)
    if anomaly_fields and fields is not None:
        resource_args["input_fields"] = configure_input_fields(
            fields, anomaly_fields)
    if args.top_n > 0:
        resource_args["top_n"] = args.top_n
    if args.forest_size > 0:
        resource_args["forest_size"] = args.forest_size
    resource_args = update_sample_parameters_args(resource_args, args)
    # User-provided JSON overrides win last.
    if 'anomaly' in args.json_args:
        update_json_args(resource_args, args.json_args.get('anomaly'), fields)
    return resource_args
def set_association_args(args, name=None, fields=None,
                         association_fields=None):
    """Build the creation arguments dict for an association resource.

    Starts from the shared basic model arguments and layers on the
    association-specific options found in ``args``.
    """
    name = args.name if name is None else name
    association_fields = (args.association_fields_
                          if association_fields is None
                          else association_fields)
    resource_args = set_basic_model_args(args, name)
    if association_fields and fields is not None:
        resource_args["input_fields"] = configure_input_fields(
            fields, association_fields)
    if args.association_k:
        resource_args["max_k"] = args.association_k
    if args.search_strategy:
        resource_args["search_strategy"] = args.search_strategy
    resource_args = update_sample_parameters_args(resource_args, args)
    # User-provided JSON overrides win last.
    if 'association' in args.json_args:
        update_json_args(resource_args,
                         args.json_args.get('association'), fields)
    return resource_args
def set_pca_args(args, name=None, fields=None, pca_fields=None):
    """Return pca arguments dict

    Builds the creation arguments for a PCA resource from the
    command-line options in ``args``.
    """
    if name is None:
        name = args.name
    if pca_fields is None:
        pca_fields = args.pca_fields_
    pca_args = set_basic_args(args, name)
    # NOTE(review): "pca_seed" reuses args.seed -- unlike the cluster and
    # anomaly setters, which use a resource-specific seed option. Confirm
    # this is intentional and not a copy-paste slip.
    pca_args.update({
        "seed": SEED if args.seed is None else args.seed,
        "pca_seed": SEED if args.seed is None else args.seed
    })
    pca_args.update({"sample_rate": args.sample_rate})
    pca_args = update_sample_parameters_args( \
        pca_args, args)
    # Default input fields to every dataset field, then narrow to the
    # explicitly requested pca_fields when given.
    if fields is not None:
        input_fields = fields.fields.keys()
    if pca_fields and fields is not None:
        input_fields = configure_input_fields(fields, pca_fields)
    if args.exclude_objective:
        input_fields = [field for field in input_fields \
            if field not in args.exclude_fields]
    # NOTE(review): if fields is None, input_fields is never bound and the
    # next line raises NameError -- presumably callers always pass fields;
    # verify against call sites.
    pca_args.update(input_fields=input_fields)
    # User-provided JSON overrides win last.
    if 'pca' in args.json_args:
        update_json_args(pca_args, args.json_args.get('pca'), fields)
    return pca_args
def set_model_args(args, name=None, objective_id=None, fields=None,
                   model_fields=None, other_label=None):
    """Return model arguments dict

    Builds the creation arguments for a decision-tree model from the
    command-line options in ``args``.

    :param args: parsed command-line options namespace
    :param name: resource name (defaults to ``args.name``)
    :param objective_id: objective field id (derived from ``args`` when None)
    :param fields: Fields-like object describing the dataset fields
    :param model_fields: explicit input field list (defaults to
        ``args.model_fields_``)
    :param other_label: kept for interface compatibility (unused here)
    """
    if name is None:
        name = args.name
    if objective_id is None and args.max_categories is None:
        objective_id = args.objective_id_
    # Guard against None before the numeric comparison: under Python 3
    # ``None > 0`` raises TypeError (under Python 2 it was silently False).
    if args.max_categories and args.max_categories > 0:
        objective_id = args.objective_field
    if model_fields is None:
        model_fields = args.model_fields_
    model_args = set_basic_model_args(args, name)
    model_args.update({"missing_splits": args.missing_splits})
    if objective_id is not None and fields is not None:
        model_args.update({"objective_field": objective_id})
    # If evaluate flag is on and no test_split flag is provided,
    # we choose a deterministic sampling with
    # args.sample_rate (80% by default) of the data to create the model
    # If cross_validation_rate = n/100, then we choose to run 2 * n
    # evaluations by holding out a n% of randomly sampled data.
    if ((args.evaluate and args.test_split == 0 and
         args.test_datasets is None) or args.cross_validation_rate > 0):
        model_args.update(seed=SEED)
        if args.cross_validation_rate > 0:
            args.sample_rate = 1 - args.cross_validation_rate
            args.replacement = False
        elif (args.sample_rate == 1 and args.test_datasets is None
              and not args.dataset_off):
            args.sample_rate = EVALUATE_SAMPLE_RATE
    if model_fields and fields is not None:
        input_fields = configure_input_fields(
            fields, model_fields, by_name=(args.max_categories > 0))
        model_args.update(input_fields=input_fields)
    if args.pruning and args.pruning != 'smart':
        model_args.update(stat_pruning=(args.pruning == 'statistical'))
    if args.node_threshold > 0:
        model_args.update(node_threshold=args.node_threshold)
    if args.balance:
        model_args.update(balance_objective=True)
    if args.split_field:
        model_args.update(split_field=args.split_field)
    if args.focus_field:
        model_args.update(focus_field=args.focus_field)
    if args.weight_field:
        try:
            weight_field = fields.field_id(args.weight_field)
        # FIX: ``except ValueError, exc`` is Python-2-only syntax; the
        # ``as`` form is valid on Python 2.6+ and Python 3.
        except ValueError as exc:
            sys.exit(exc)
        model_args.update(weight_field=weight_field)
    # FIX: the docstring promises the arguments dict and every sibling
    # setter returns one, but this function fell off the end returning
    # None.
    return model_args
def set_cluster_args(args, name=None, fields=None, cluster_fields=None):
    """Build the creation arguments dict for a cluster resource.

    Starts from the shared basic model arguments and layers on the
    cluster-specific options found in ``args``.
    """
    name = args.name if name is None else name
    cluster_fields = (args.cluster_fields_ if cluster_fields is None
                      else cluster_fields)
    resource_args = set_basic_model_args(args, name)
    # Fall back to the module-level SEED when no explicit seed was given.
    resource_args["seed"] = args.seed if args.seed is not None else SEED
    resource_args["cluster_seed"] = (args.cluster_seed
                                     if args.cluster_seed is not None
                                     else SEED)
    if args.cluster_models is not None:
        resource_args["model_clusters"] = True
    if args.cluster_k:
        resource_args["k"] = args.cluster_k
    if cluster_fields and fields is not None:
        resource_args["input_fields"] = configure_input_fields(
            fields, cluster_fields)
    if args.summary_fields is not None:
        resource_args["summary_fields"] = args.summary_fields_
    resource_args = update_sample_parameters_args(resource_args, args)
    # User-provided JSON overrides win last.
    if 'cluster' in args.json_args:
        update_json_args(resource_args, args.json_args.get('cluster'), fields)
    return resource_args
def set_logistic_regression_args(args, name=None, fields=None,
                                 objective_id=None,
                                 logistic_regression_fields=None):
    """Build the creation arguments dict for a logistic regression.

    Starts from the shared basic model arguments and layers on the
    logistic-regression-specific options found in ``args``. May also
    mutate ``args.sample_rate``/``args.replacement`` when evaluation or
    cross-validation sampling applies.
    """
    name = args.name if name is None else name
    logistic_regression_fields = (
        args.logistic_regression_fields_
        if logistic_regression_fields is None
        else logistic_regression_fields)
    objective_id = args.objective_id_ if objective_id is None else objective_id
    resource_args = set_basic_model_args(args, name)
    resource_args["seed"] = args.seed if args.seed is not None else SEED
    if objective_id is not None and fields is not None:
        resource_args["objective_field"] = objective_id
    if logistic_regression_fields and fields is not None:
        resource_args["input_fields"] = configure_input_fields(
            fields, logistic_regression_fields)
    # Deterministic sampling for evaluations / cross-validation runs.
    evaluating = (args.evaluate and args.test_split == 0
                  and args.test_datasets is None)
    if evaluating or args.cross_validation_rate > 0:
        resource_args["seed"] = SEED
        if args.cross_validation_rate > 0:
            args.sample_rate = 1 - args.cross_validation_rate
            args.replacement = False
        elif (args.sample_rate == 1 and args.test_datasets is None
              and not args.dataset_off):
            args.sample_rate = EVALUATE_SAMPLE_RATE
    resource_args["sample_rate"] = args.sample_rate
    if args.lr_c:
        resource_args["c"] = args.lr_c
    # bias and balance_fields are always forwarded, even when falsy.
    resource_args["bias"] = args.bias
    resource_args["balance_fields"] = args.balance_fields
    if args.eps:
        resource_args["eps"] = args.eps
    if args.normalize is not None:
        resource_args["normalize"] = args.normalize
    if args.missing_numerics is not None:
        resource_args["missing_numerics"] = args.missing_numerics
    if args.field_codings is not None:
        resource_args["field_codings"] = args.field_codings_
    resource_args = update_sample_parameters_args(resource_args, args)
    # User-provided JSON overrides win last.
    if 'logistic_regression' in args.json_args:
        update_json_args(resource_args,
                         args.json_args.get('logistic_regression'), fields)
    return resource_args
def set_ensemble_args(args, name=None, objective_id=None,
                      model_fields=None, fields=None):
    """Return ensemble arguments dict

    Builds the creation arguments for an ensemble (or boosted ensemble)
    from the command-line options in ``args``. May mutate
    ``args.sample_rate`` when a deterministic evaluation sample applies.

    :param args: parsed command-line options namespace
    :param name: resource name (defaults to ``args.name``)
    :param objective_id: objective field id (defaults to
        ``args.objective_id_``)
    :param model_fields: explicit input field list (defaults to
        ``args.model_fields_``)
    :param fields: Fields-like object describing the dataset fields
    """
    if name is None:
        name = args.name
    if objective_id is None:
        objective_id = args.objective_id_
    if model_fields is None:
        model_fields = args.model_fields_
    ensemble_args = set_basic_model_args(args, name)
    ensemble_args.update({
        "missing_splits": args.missing_splits,
        "ensemble_sample": {
            "seed": SEED if args.ensemble_sample_seed is None
            else args.ensemble_sample_seed},
        "seed": SEED if args.seed is None else args.seed
    })
    if objective_id is not None and fields is not None:
        ensemble_args.update({"objective_field": objective_id})
    if args.boosting:
        # Forward every boosting option that was explicitly set.
        boosting_args = {}
        for option in BOOSTING_OPTIONS:
            if hasattr(args, option) and getattr(args, option) is not None:
                boosting_args.update({option: getattr(args, option)})
        ensemble_args.update({"boosting": boosting_args})
    else:
        ensemble_args.update({"number_of_models": args.number_of_models})
    # If evaluate flag is on and no test_split flag is provided,
    # we choose a deterministic sampling with
    # args.sample_rate (80% by default) of the data to create the model
    if (args.evaluate and args.test_split == 0 and
            args.test_datasets is None and not args.dataset_off):
        ensemble_args.update({"seed": SEED})
        if args.sample_rate == 1:
            args.sample_rate = EVALUATE_SAMPLE_RATE
    if model_fields and fields is not None:
        input_fields = configure_input_fields(fields, model_fields)
        ensemble_args.update(input_fields=input_fields)
    if args.pruning and args.pruning != 'smart':
        ensemble_args.update(stat_pruning=(args.pruning == 'statistical'))
    if args.node_threshold > 0:
        ensemble_args.update(node_threshold=args.node_threshold)
    if args.balance:
        ensemble_args.update(balance_objective=True)
    if args.weight_field:
        try:
            weight_field = fields.field_id(args.weight_field)
        # FIX: ``except ValueError, exc`` is Python-2-only syntax; the
        # ``as`` form is valid on Python 2.6+ and Python 3.
        except ValueError as exc:
            sys.exit(exc)
        ensemble_args.update(weight_field=weight_field)
    # FIX: the docstring promises the arguments dict and every sibling
    # setter returns one, but this function fell off the end returning
    # None. NOTE(review): siblings also run update_sample_parameters_args /
    # update_json_args before returning -- confirm whether that step was
    # dropped here intentionally.
    return ensemble_args
def set_topic_model_args(args, name=None, fields=None,
                         topic_model_fields=None):
    """Build the creation arguments dict for a topic model.

    Starts from the shared basic arguments and layers on the
    topic-model-specific options found in ``args``.
    """
    name = args.name if name is None else name
    topic_model_fields = (args.topic_model_fields_
                          if topic_model_fields is None
                          else topic_model_fields)
    resource_args = set_basic_args(args, name)
    # Both seeds come from the same --seed option.
    seed = args.seed if args.seed is not None else SEED
    resource_args["seed"] = seed
    resource_args["topicmodel_seed"] = seed
    if topic_model_fields and fields is not None:
        resource_args["input_fields"] = configure_input_fields(
            fields, topic_model_fields)
    resource_args["sample_rate"] = args.sample_rate
    resource_args["bigrams"] = args.bigrams
    resource_args["case_sensitive"] = args.case_sensitive
    # Optional numeric settings: only sent when explicitly provided.
    for key, value in (
            ("number_of_topics", args.number_of_topics),
            ("term_limit", args.term_limit),
            ("top_n_terms", args.top_n_terms),
            ("minimum_name_terms", args.minimum_name_terms)):
        if value is not None:
            resource_args[key] = value
    if args.excluded_terms:
        resource_args["excluded_terms"] = args.excluded_terms_
    resource_args = update_sample_parameters_args(resource_args, args)
    # User-provided JSON overrides win last.
    if 'topic_model' in args.json_args:
        update_json_args(resource_args,
                         args.json_args.get('topic_model'), fields)
    return resource_args
def set_deepnet_args(args, name=None, fields=None,
                     objective_id=None, deepnet_fields=None):
    """Return deepnet arguments dict

    Builds the creation arguments for a deepnet from the command-line
    options in ``args``. May mutate ``args.sample_rate`` /
    ``args.replacement`` when evaluation or cross-validation sampling
    applies.

    :param args: parsed command-line options namespace
    :param name: resource name (defaults to ``args.name``)
    :param fields: Fields-like object describing the dataset fields
    :param objective_id: objective field id (defaults to
        ``args.objective_id_``)
    :param deepnet_fields: explicit input field list (defaults to
        ``args.deepnet_fields_``)
    """
    if name is None:
        name = args.name
    if deepnet_fields is None:
        deepnet_fields = args.deepnet_fields_
    if objective_id is None:
        objective_id = args.objective_id_
    deepnet_args = set_basic_model_args(args, name)
    deepnet_args.update({"seed": SEED if args.seed is None else args.seed})
    if objective_id is not None and fields is not None:
        deepnet_args.update({"objective_field": objective_id})
    if deepnet_fields and fields is not None:
        input_fields = configure_input_fields(fields, deepnet_fields)
        deepnet_args.update(input_fields=input_fields)
    # Deterministic sampling for evaluations / cross-validation runs.
    if ((args.evaluate and args.test_split == 0 and
         args.test_datasets is None) or args.cross_validation_rate > 0):
        deepnet_args.update(seed=SEED)
        if args.cross_validation_rate > 0:
            args.sample_rate = 1 - args.cross_validation_rate
            args.replacement = False
        elif (args.sample_rate == 1 and args.test_datasets is None
              and not args.dataset_off):
            args.sample_rate = EVALUATE_SAMPLE_RATE
    deepnet_args.update({"sample_rate": args.sample_rate})
    if args.batch_normalization is not None:
        deepnet_args.update(
            {"batch_normalization": args.batch_normalization})
    if args.dropout_rate:
        deepnet_args.update({"dropout_rate": args.dropout_rate})
    if args.hidden_layers is not None:
        deepnet_args.update({"hidden_layers": args.hidden_layers_})
    if args.learn_residuals is not None:
        deepnet_args.update({"learn_residuals": args.learn_residuals})
    # FIX: the original guarded on args.max_iterations but then set
    # "learning_rate", so max_iterations was never forwarded and
    # learning_rate was sent under the wrong condition. Each option now
    # gates its own setting (matching every other option in this
    # function).
    if args.learning_rate is not None:
        deepnet_args.update({"learning_rate": args.learning_rate})
    if args.max_iterations is not None:
        deepnet_args.update({"max_iterations": args.max_iterations})
    if args.max_training_time is not None:
        deepnet_args.update({"max_training_time": args.max_training_time})
    if args.number_of_hidden_layers is not None:
        deepnet_args.update(
            {"number_of_hidden_layers": args.number_of_hidden_layers})
    if args.number_of_model_candidates is not None:
        deepnet_args.update(
            {"number_of_model_candidates": args.number_of_model_candidates})
    if args.search is not None:
        deepnet_args.update({"search": args.search})
    if args.suggest_structure is not None:
        deepnet_args.update({"suggest_structure": args.suggest_structure})
    # Only forwarded when falsy (i.e. to explicitly disable it), as in the
    # original code.
    if not args.missing_numerics:
        deepnet_args.update({"missing_numerics": args.missing_numerics})
    if args.tree_embedding:
        deepnet_args.update({"tree_embedding": args.tree_embedding})
    deepnet_args = update_sample_parameters_args(deepnet_args, args)
    # User-provided JSON overrides win last.
    if 'deepnet' in args.json_args:
        update_json_args(deepnet_args, args.json_args.get('deepnet'), fields)
    return deepnet_args
if hasattr(args, 'sql_query') and args.sql_query: dataset_args.update({"sql_query": args.sql_query}) if hasattr(args, 'sql_output_fields_') and args.sql_output_fields_: dataset_args.update({"sql_output_fields": args.sql_output_fields_}) if hasattr(args, 'json_query_') and args.json_query_: dataset_args.update({"json_query": args.json_query_}) if args.json_filter: dataset_args.update(json_filter=args.json_filter) elif args.lisp_filter: dataset_args.update(lisp_filter=args.lisp_filter) if args.dataset_fields_ and fields is not None: input_fields = configure_input_fields(fields, args.dataset_fields_) dataset_args.update(input_fields=input_fields) if (hasattr(args, 'multi_label') and args.multi_label and multi_label_data is not None): dataset_args.update( user_metadata={'multi_label_data': multi_label_data}) if fields and args.import_fields: fields_struct = fields.new_fields_structure(args.import_fields) check_fields_struct(fields_struct, "dataset") update_attributes(dataset_args, fields_struct) if 'dataset' in args.json_args: update_json_args(dataset_args, args.json_args.get('dataset'), fields) return dataset_args