def set_association_args(args, name=None, fields=None,
                         association_fields=None):
    """Return association arguments dict

    """
    if name is None:
        name = args.name
    if association_fields is None:
        association_fields = args.association_fields_
    association_args = set_basic_model_args(args, name)

    if association_fields and fields is not None:
        input_fields = configure_input_fields(fields, association_fields)
        association_args.update(input_fields=input_fields)
    if args.association_k:
        association_args.update({"max_k": args.association_k})
    if args.search_strategy:
        association_args.update({"search_strategy": args.search_strategy})

    association_args = update_sample_parameters_args(association_args, args)

    if 'association' in args.json_args:
        update_json_args(association_args,
                         args.json_args.get('association'), fields)
    return association_args
def set_anomaly_args(args, name=None, fields=None, anomaly_fields=None):
    """Return anomaly arguments dict

    """
    if name is None:
        name = args.name
    if anomaly_fields is None:
        anomaly_fields = args.anomaly_fields_
    anomaly_args = set_basic_model_args(args, name)
    anomaly_args.update({
        "seed": SEED if args.seed is None else args.seed,
        "anomaly_seed": (SEED if args.anomaly_seed is None
                         else args.anomaly_seed)
    })

    if anomaly_fields and fields is not None:
        input_fields = configure_input_fields(fields, anomaly_fields)
        anomaly_args.update(input_fields=input_fields)
    if args.top_n > 0:
        anomaly_args.update(top_n=args.top_n)
    if args.forest_size > 0:
        anomaly_args.update(forest_size=args.forest_size)

    anomaly_args = update_sample_parameters_args(anomaly_args, args)

    if 'anomaly' in args.json_args:
        update_json_args(anomaly_args, args.json_args.get('anomaly'), fields)
    return anomaly_args
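# Illustrative sketch (not part of the original module): given an
# argparse-style Namespace carrying the attributes read above, the payload
# sent to the API would look roughly like this. Values are hypothetical:
#
#     anomaly_args = set_anomaly_args(args, name="my anomaly")
#     # => basic model args plus, e.g.:
#     #    {"seed": SEED, "anomaly_seed": 42, "top_n": 5, "forest_size": 128}
#
# "top_n" and "forest_size" are only included when their args are positive.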
def set_cluster_args(args, name=None, fields=None, cluster_fields=None):
    """Return cluster arguments dict

    """
    if name is None:
        name = args.name
    if cluster_fields is None:
        cluster_fields = args.cluster_fields_
    cluster_args = set_basic_model_args(args, name)
    cluster_args.update({
        "seed": SEED if args.seed is None else args.seed,
        "cluster_seed": (SEED if args.cluster_seed is None
                         else args.cluster_seed)
    })
    if args.cluster_models is not None:
        cluster_args.update({"model_clusters": True})
    if args.cluster_k:
        cluster_args.update({"k": args.cluster_k})
    if cluster_fields and fields is not None:
        input_fields = configure_input_fields(fields, cluster_fields)
        cluster_args.update(input_fields=input_fields)
    if args.summary_fields is not None:
        cluster_args.update({"summary_fields": args.summary_fields_})

    cluster_args = update_sample_parameters_args(cluster_args, args)

    if 'cluster' in args.json_args:
        update_json_args(cluster_args, args.json_args.get('cluster'), fields)
    return cluster_args
def set_external_connector_args(args, name=None):
    """Return external connector arguments dict

    """
    if name is None:
        name = args.name
    external_connector_args = set_basic_args(args, name)
    source = "postgresql" if args.source is None else args.source
    external_connector_args.update({"source": source})
    # a list copy is needed so that "source" can be removed from the keys
    connection_keys = list(EXTERNAL_CONNECTION_ATTRS.values())
    connection_keys.remove("source")
    connection_info = {}
    for key in connection_keys:
        if hasattr(args, key) and getattr(args, key):
            connection_info.update({key: getattr(args, key)})
    if not connection_info:
        # try to read environment variables
        connection_info = get_env_connection_info()
    args.connection_info = connection_info
    if args.hosts:
        args.connection_info.update({"hosts": args.hosts.split(",")})
    # rare arguments must be provided in a JSON file
    if args.connection_json_:
        args.connection_info.update(args.connection_json_)

    if 'external_connector' in args.json_args:
        update_json_args(external_connector_args,
                         args.json_args.get('external_connector'), None)
    return external_connector_args
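# Hedged sketch of the connection_info built above, assuming
# EXTERNAL_CONNECTION_ATTRS maps to keys such as "host", "port", "database",
# "user" and "password" (hypothetical values). Each key is filled from the
# matching args attribute or, failing that, from environment variables:
#
#     {"host": "db.example.com", "port": 5432, "database": "sales",
#      "user": "reader", "password": "secret",
#      "hosts": ["node1", "node2"]}   # "hosts" only when --hosts is given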
def set_pca_args(args, name=None, fields=None, pca_fields=None):
    """Return pca arguments dict

    """
    if name is None:
        name = args.name
    if pca_fields is None:
        pca_fields = args.pca_fields_
    pca_args = set_basic_args(args, name)
    pca_args.update({
        "seed": SEED if args.seed is None else args.seed,
        "pca_seed": SEED if args.seed is None else args.seed
    })
    pca_args.update({"sample_rate": args.sample_rate})
    pca_args = update_sample_parameters_args(pca_args, args)

    if fields is not None:
        input_fields = list(fields.fields.keys())
        if pca_fields:
            input_fields = configure_input_fields(fields, pca_fields)
        if args.exclude_objective:
            input_fields = [field for field in input_fields
                            if field not in args.exclude_fields]
        pca_args.update(input_fields=input_fields)

    if 'pca' in args.json_args:
        update_json_args(pca_args, args.json_args.get('pca'), fields)
    return pca_args
def set_logistic_regression_args(args, name=None, fields=None,
                                 objective_id=None,
                                 logistic_regression_fields=None):
    """Return logistic regression arguments dict

    """
    if name is None:
        name = args.name
    if logistic_regression_fields is None:
        logistic_regression_fields = args.logistic_regression_fields_
    if objective_id is None:
        objective_id = args.objective_id_
    logistic_regression_args = set_basic_model_args(args, name)
    logistic_regression_args.update(
        {"seed": SEED if args.seed is None else args.seed})

    if objective_id is not None and fields is not None:
        logistic_regression_args.update({"objective_field": objective_id})
    if logistic_regression_fields and fields is not None:
        input_fields = configure_input_fields(fields,
                                              logistic_regression_fields)
        logistic_regression_args.update(input_fields=input_fields)
    if ((args.evaluate and args.test_split == 0 and
         args.test_datasets is None) or args.cross_validation_rate > 0):
        logistic_regression_args.update(seed=SEED)
        if args.cross_validation_rate > 0:
            args.sample_rate = 1 - args.cross_validation_rate
            args.replacement = False
        elif (args.sample_rate == 1 and args.test_datasets is None
              and not args.dataset_off):
            args.sample_rate = EVALUATE_SAMPLE_RATE
    logistic_regression_args.update({"sample_rate": args.sample_rate})
    if args.lr_c:
        logistic_regression_args.update({"c": args.lr_c})
    logistic_regression_args.update({"bias": args.bias})
    logistic_regression_args.update({"balance_fields": args.balance_fields})
    if args.eps:
        logistic_regression_args.update({"eps": args.eps})
    if args.normalize is not None:
        logistic_regression_args.update({"normalize": args.normalize})
    if args.missing_numerics is not None:
        logistic_regression_args.update(
            {"missing_numerics": args.missing_numerics})
    if args.field_codings is not None:
        logistic_regression_args.update(
            {"field_codings": args.field_codings_})

    logistic_regression_args = update_sample_parameters_args(
        logistic_regression_args, args)

    if 'logistic_regression' in args.json_args:
        update_json_args(logistic_regression_args,
                         args.json_args.get('logistic_regression'), fields)
    return logistic_regression_args
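# Worked example of the cross-validation interplay above (hypothetical
# flag value): with --cross-validation-rate 0.2 the model is trained on an
# 80% sample (sample_rate = 1 - 0.2 = 0.8) without replacement, and the
# seed is pinned to SEED so the split is reproducible across runs.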
def set_evaluation_args(args, fields=None, dataset_fields=None, name=None):
    """Return evaluation args dict

    """
    if name is None:
        name = args.name
    evaluation_args = set_basic_args(args, name)

    if hasattr(args, 'method') and (args.number_of_models > 1
                                    or args.ensemble):
        evaluation_args.update(combiner=args.method)
    if hasattr(args, 'method') and args.method:
        evaluation_args.update({"combiner": args.method})
        if args.method == THRESHOLD_CODE:
            threshold = {}
            if hasattr(args, 'threshold') and args.threshold is not None:
                threshold.update(k=args.threshold)
            if hasattr(args, 'threshold_class') \
                    and args.threshold_class is not None:
                threshold.update({"class": args.threshold_class})
            evaluation_args.update(threshold=threshold)
    if args.fields_map_ and fields is not None:
        if dataset_fields is None:
            dataset_fields = fields
        evaluation_args.update({"fields_map": map_fields(args.fields_map_,
                                                         fields,
                                                         dataset_fields)})
    if hasattr(args, 'missing_strategy') and args.missing_strategy:
        evaluation_args.update(missing_strategy=args.missing_strategy)
    if 'evaluation' in args.json_args:
        update_json_args(evaluation_args,
                         args.json_args.get('evaluation'), fields)
    # if evaluating time series we need to use ranges
    if args.subcommand == "time-series" and args.test_split == 0 and \
            not args.has_test_datasets_:
        args.range_ = [int(args.max_rows * EVALUATE_SAMPLE_RATE) + 1,
                       args.max_rows]
        evaluation_args.update({"range": args.range_})
        return evaluation_args

    # Two cases to use out_of_bag and sample_rate: standard evaluations where
    # only the training set is provided, and cross_validation
    # [--dataset|--test] [--model|--models|--model-tag|--ensemble] --evaluate
    if (((hasattr(args, "dataset") and args.dataset) or args.test_set)
            and args.has_supervised_):
        return evaluation_args
    # [--train|--dataset] --test-split --evaluate
    if args.test_split > 0 and (args.training_set or args.dataset):
        return evaluation_args
    # --datasets --test-datasets or equivalents
    if args.has_datasets_ and (args.has_test_datasets_ or args.dataset_off):
        return evaluation_args
    if args.sample_rate == 1:
        args.sample_rate = EVALUATE_SAMPLE_RATE
    evaluation_args.update(out_of_bag=True, seed=SEED,
                           sample_rate=args.sample_rate)
    return evaluation_args
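# Worked example of the sampling logic above, assuming EVALUATE_SAMPLE_RATE
# is 0.8 and max_rows = 1000 (hypothetical run): a time-series evaluation
# gets range = [int(1000 * 0.8) + 1, 1000] = [801, 1000], i.e. the rows left
# out of training, while a plain `--train data.csv --evaluate` run on other
# model types falls through to sample_rate = 0.8 with out_of_bag=True, so
# the evaluation uses the complementary 20% of the dataset.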
def set_project_args(args, name=None):
    """Return project arguments dict

    """
    if name is None:
        name = args.name
    project_args = set_basic_args(args, name)
    if 'project' in args.json_args:
        update_json_args(project_args, args.json_args.get('project'), None)
    return project_args
def set_fusion_args(args, name=None, fields=None):
    """Return fusion arguments dict

    """
    if name is None:
        name = args.name
    fusion_args = set_basic_args(args, name)
    if 'fusion' in args.json_args:
        update_json_args(fusion_args, args.json_args.get('fusion'), fields)
    return fusion_args
def set_sample_args(args, name=None):
    """Return sample arguments dict

    """
    if name is None:
        name = args.name
    sample_args = set_basic_args(args, name)
    if 'sample' in args.json_args:
        update_json_args(sample_args, args.json_args.get('sample'))
    return sample_args
def set_time_series_args(args, name=None, fields=None, objective_id=None):
    """Return time-series arguments dict

    """
    if name is None:
        name = args.name
    if objective_id is None:
        objective_id = args.objective_id_
    time_series_args = set_basic_model_args(args, name)
    time_series_args.update({
        "all_numeric_objectives": args.all_numeric_objectives,
        "period": args.period
    })
    # if we need to evaluate and there's no previous split, use a range
    if args.evaluate and args.test_split == 0 and \
            not args.has_test_datasets_:
        args.range_ = [1, int(args.max_rows * EVALUATE_SAMPLE_RATE)]
    if objective_id is not None:
        time_series_args.update({"objective_field": objective_id})
    if args.objectives:
        time_series_args.update({"objective_fields": args.objective_fields_})
    if args.damped_trend is not None:
        time_series_args.update({"damped_trend": args.damped_trend})
    if args.error is not None:
        time_series_args.update({"error": args.error})
    if args.field_parameters:
        time_series_args.update({"field_parameters": args.field_parameters_})
    if args.range_:
        time_series_args.update({"range": args.range_})
    if args.seasonality is not None:
        time_series_args.update({"seasonality": args.seasonality})
    if args.trend is not None:
        time_series_args.update({"trend": args.trend})
    if args.time_start or args.time_end or args.time_interval or \
            args.time_interval_unit:
        time_range = {}
        if args.time_start:
            time_range.update({"start": args.time_start})
        if args.time_end:
            time_range.update({"end": args.time_end})
        if args.time_interval:
            time_range.update({"interval": args.time_interval})
        if args.time_interval_unit:
            time_range.update({"interval_unit": args.time_interval_unit})
        time_series_args.update({"time_range": time_range})
    if 'time_series' in args.json_args:
        update_json_args(time_series_args,
                         args.json_args.get('time_series'), fields)
    return time_series_args
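# Hedged sketch of the time_range structure built above (hypothetical
# values; flag spellings assume the usual dash-separated CLI form):
#
#     --time-start 10 --time-end 300 --time-interval 10
#     # => time_series_args["time_range"] ==
#     #    {"start": 10, "end": 300, "interval": 10}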
def set_forecast_args(args, fields=None):
    """Return forecast dict

    """
    forecast_args = set_basic_args(args, args.name)
    forecast_args.update({
        "intervals": args.intervals,
    })

    if 'forecast' in args.json_args:
        update_json_args(forecast_args,
                         args.json_args.get('forecast'), fields)
    return forecast_args
def set_topic_model_args(args, name=None, fields=None,
                         topic_model_fields=None):
    """Return topic_model arguments dict

    """
    if name is None:
        name = args.name
    if topic_model_fields is None:
        topic_model_fields = args.topic_model_fields_
    topic_model_args = set_basic_args(args, name)
    topic_model_args.update({
        "seed": SEED if args.seed is None else args.seed,
        "topicmodel_seed": SEED if args.seed is None else args.seed
    })

    if topic_model_fields and fields is not None:
        input_fields = configure_input_fields(fields, topic_model_fields)
        topic_model_args.update(input_fields=input_fields)
    topic_model_args.update({"sample_rate": args.sample_rate})
    topic_model_args.update({"bigrams": args.bigrams})
    topic_model_args.update({"case_sensitive": args.case_sensitive})
    if args.number_of_topics is not None:
        topic_model_args.update({"number_of_topics": args.number_of_topics})
    if args.term_limit is not None:
        topic_model_args.update({"term_limit": args.term_limit})
    if args.top_n_terms is not None:
        topic_model_args.update({"top_n_terms": args.top_n_terms})
    if args.minimum_name_terms is not None:
        topic_model_args.update(
            {"minimum_name_terms": args.minimum_name_terms})
    if args.excluded_terms:
        topic_model_args.update({"excluded_terms": args.excluded_terms_})

    topic_model_args = update_sample_parameters_args(topic_model_args, args)

    if 'topic_model' in args.json_args:
        update_json_args(topic_model_args,
                         args.json_args.get('topic_model'), fields)
    return topic_model_args
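# Hedged note: both "seed" and "topicmodel_seed" above reuse args.seed, so a
# run such as the (illustrative) command below gets deterministic sampling
# and topic initialization from a single flag:
#
#     bigmler topic-model --train data.csv --seed my-seed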
        batch_projection_args.update(all_fields=True)
    projection_fields = []
    if args.projection_fields != "all":
        batch_projection_args.update(all_fields=False)
        for field in args.projection_fields.split(args.args_separator):
            field = field.strip()
            if field not in dataset_fields.fields:
                try:
                    field = dataset_fields.field_id(field)
                except ValueError as exc:
                    sys.exit(exc)
            projection_fields.append(field)
        batch_projection_args.update(output_fields=projection_fields)
    if 'batch_projection' in args.json_args:
        update_json_args(batch_projection_args,
                         args.json_args.get('batch_projection'), fields)
    return batch_projection_args


def create_batch_projection(pca, test_dataset, batch_projection_args, args,
                            api=None, session_file=None,
                            path=None, log=None):
    """Creates remote batch projection
        ensemble_args.update(weight_field=weight_field)

    if args.objective_weights:
        ensemble_args.update(objective_weights=args.objective_weights_json)
    if args.random_candidates:
        ensemble_args.update(random_candidates=args.random_candidates)

    update_attributes(ensemble_args, args.json_args.get('model'))
    ensemble_args = update_sample_parameters_args(ensemble_args, args)
    ensemble_args["ensemble_sample"].update(
        {"rate": args.ensemble_sample_rate,
         "replacement": args.ensemble_sample_replacement})
    if 'ensemble' in args.json_args:
        update_json_args(ensemble_args, args.json_args.get('ensemble'),
                         fields)
    return ensemble_args


def create_ensembles(datasets, ensemble_ids, ensemble_args, args,
                     number_of_ensembles=1,
                     api=None, path=None, session_file=None, log=None):
    """Create ensembles from input data

    """
    if api is None:
        api = bigml.api.BigML()
    ensembles = ensemble_ids[:]
    existing_ensembles = len(ensembles)
        batch_anomaly_score_args.update(all_fields=True)
    if args.prediction_fields:
        batch_anomaly_score_args.update(all_fields=False)
        prediction_fields = []
        for field in args.prediction_fields.split(args.args_separator):
            field = field.strip()
            if field not in dataset_fields.fields:
                try:
                    field = dataset_fields.field_id(field)
                except ValueError as exc:
                    sys.exit(exc)
            prediction_fields.append(field)
        batch_anomaly_score_args.update(output_fields=prediction_fields)
    if 'batch_anomaly_score' in args.json_args:
        update_json_args(batch_anomaly_score_args,
                         args.json_args.get('batch_anomaly_score'), fields)
    return batch_anomaly_score_args


def create_batch_anomaly_score(anomaly, test_dataset,
                               batch_anomaly_score_args, args,
                               api=None, session_file=None,
                               path=None, log=None):
    """Creates remote batch anomaly score

    """
def set_source_args(args, name=None, multi_label_data=None,
                    data_set_header=None, fields=None):
    """Returns a source arguments dict

    """
    if name is None:
        name = args.name
    source_args = set_basic_args(args, name)
    if args.project_id is not None:
        source_args.update({"project": args.project_id})
    # if header is set, use it
    if data_set_header is not None:
        source_args.update({"source_parser": {"header": data_set_header}})
    # If user has given an OS locale, try to add the locale used in bigml.com
    if args.user_locale is not None:
        source_locale = bigml_locale(args.user_locale)
        if source_locale is None:
            log_message("WARNING: %s locale equivalence not found."
                        " Using %s instead.\n" %
                        (args.user_locale, LOCALE_DEFAULT),
                        log_file=None, console=True)
            source_locale = LOCALE_DEFAULT
        source_args.update({'source_parser': {}})
        source_args["source_parser"].update({'locale': source_locale})
    # If user has set a training separator, use it.
    if args.training_separator is not None:
        training_separator = decode2(args.training_separator,
                                     encoding="string_escape")
        source_args["source_parser"].update({'separator':
                                             training_separator})
    # If uploading a multi-label file, add the user_metadata info needed to
    # manage the multi-label fields
    if (hasattr(args, 'multi_label') and args.multi_label
            and multi_label_data is not None):
        source_args.update({
            "user_metadata": {
                "multi_label_data": multi_label_data}})
    # to update fields attributes or types you must have a previous fields
    # structure (at update time)
    if fields:
        if args.field_attributes_:
            update_attributes(source_args,
                              {"fields": args.field_attributes_},
                              by_column=True, fields=fields)
        if args.types_:
            update_attributes(source_args,
                              {"fields": args.types_},
                              by_column=True, fields=fields)
        if args.import_fields:
            fields_struct = fields.new_fields_structure(args.import_fields)
            check_fields_struct(fields_struct, "source")
            update_attributes(source_args, fields_struct)
        if 'source' in args.json_args:
            update_json_args(source_args, args.json_args.get('source'),
                             fields)
    return source_args
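# Hedged sketch of a resulting source_parser section (hypothetical values),
# e.g. with a header flag and a tab separator:
#
#     {"source_parser": {"header": True, "separator": "\t"}}
#
# Caveat on the code above: the user_locale branch resets "source_parser" to
# a fresh dict before adding "locale", so a previously stored "header" entry
# is dropped when a locale is also given.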
    if args.prediction_info == FULL_FORMAT:
        batch_centroid_args.update(all_fields=True)
    if args.prediction_fields:
        batch_centroid_args.update(all_fields=False)
        prediction_fields = []
        for field in args.prediction_fields.split(args.args_separator):
            field = field.strip()
            if field not in dataset_fields.fields:
                try:
                    field = dataset_fields.field_id(field)
                except ValueError as exc:
                    sys.exit(exc)
            prediction_fields.append(field)
        batch_centroid_args.update(output_fields=prediction_fields)
    if 'batch_centroid' in args.json_args:
        update_json_args(batch_centroid_args,
                         args.json_args.get('batch_centroid'), fields)
    return batch_centroid_args


def create_batch_centroid(cluster, test_dataset,
                          batch_centroid_args, args,
                          api=None, session_file=None,
                          path=None, log=None):
    """Creates remote batch_centroid

    """
    if api is None:
        api = bigml.api.BigML()
    message = dated("Creating batch centroid.\n")
    log_message(message, log_file=session_file, console=args.verbosity)
def set_deepnet_args(args, name=None, fields=None,
                     objective_id=None, deepnet_fields=None):
    """Return deepnet arguments dict

    """
    if name is None:
        name = args.name
    if deepnet_fields is None:
        deepnet_fields = args.deepnet_fields_
    if objective_id is None:
        objective_id = args.objective_id_
    deepnet_args = set_basic_model_args(args, name)
    deepnet_args.update({"seed": SEED if args.seed is None else args.seed})

    if objective_id is not None and fields is not None:
        deepnet_args.update({"objective_field": objective_id})
    if deepnet_fields and fields is not None:
        input_fields = configure_input_fields(fields, deepnet_fields)
        deepnet_args.update(input_fields=input_fields)
    if ((args.evaluate and args.test_split == 0 and
         args.test_datasets is None) or args.cross_validation_rate > 0):
        deepnet_args.update(seed=SEED)
        if args.cross_validation_rate > 0:
            args.sample_rate = 1 - args.cross_validation_rate
            args.replacement = False
        elif (args.sample_rate == 1 and args.test_datasets is None
              and not args.dataset_off):
            args.sample_rate = EVALUATE_SAMPLE_RATE
    deepnet_args.update({"sample_rate": args.sample_rate})

    if args.batch_normalization is not None:
        deepnet_args.update(
            {"batch_normalization": args.batch_normalization})
    if args.dropout_rate:
        deepnet_args.update({"dropout_rate": args.dropout_rate})
    if args.hidden_layers is not None:
        deepnet_args.update({"hidden_layers": args.hidden_layers_})
    if args.learn_residuals is not None:
        deepnet_args.update({"learn_residuals": args.learn_residuals})
    if args.max_iterations is not None:
        deepnet_args.update({"max_iterations": args.max_iterations})
    if args.max_training_time is not None:
        deepnet_args.update({"max_training_time": args.max_training_time})
    if args.number_of_hidden_layers is not None:
        deepnet_args.update(
            {"number_of_hidden_layers": args.number_of_hidden_layers})
    if args.number_of_model_candidates is not None:
        deepnet_args.update(
            {"number_of_model_candidates": args.number_of_model_candidates})
    if args.search is not None:
        deepnet_args.update({"search": args.search})
    if args.suggest_structure is not None:
        deepnet_args.update({"suggest_structure": args.suggest_structure})
    if not args.missing_numerics:
        deepnet_args.update({"missing_numerics": args.missing_numerics})
    if args.tree_embedding:
        deepnet_args.update({"tree_embedding": args.tree_embedding})

    deepnet_args = update_sample_parameters_args(deepnet_args, args)

    if 'deepnet' in args.json_args:
        update_json_args(deepnet_args, args.json_args.get('deepnet'), fields)
    return deepnet_args
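# Hedged example (hypothetical flags and values) of the structure-search
# options handled above:
#
#     --search --max-training-time 1800 --number-of-model-candidates 20
#     # => deepnet_args includes {"search": True, "max_training_time": 1800,
#     #    "number_of_model_candidates": 20}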
        batch_topic_distribution_args.update(all_fields=True)
    if args.prediction_fields:
        batch_topic_distribution_args.update(all_fields=False)
        prediction_fields = []
        for field in args.prediction_fields.split(args.args_separator):
            field = field.strip()
            if field not in dataset_fields.fields:
                try:
                    field = dataset_fields.field_id(field)
                except Exception as exc:
                    sys.exit(exc)
            prediction_fields.append(field)
        batch_topic_distribution_args.update(
            output_fields=prediction_fields)
    if 'batch_topic_distribution' in args.json_args:
        update_json_args(batch_topic_distribution_args,
                         args.json_args.get('batch_topic_distribution'),
                         fields)
    return batch_topic_distribution_args


def create_batch_topic_distribution(topic_model, test_dataset,
                                    batch_topic_distribution_args, args,
                                    api=None, session_file=None,
                                    path=None, log=None):
    """Creates remote batch topic distribution
        except ValueError as exc:
            sys.exit(exc)
        model_args.update(weight_field=weight_field)

    if args.objective_weights:
        model_args.update(objective_weights=args.objective_weights_json)
    if args.max_categories > 0:
        model_args.update(
            user_metadata={'other_label': other_label,
                           'max_categories': args.max_categories})

    model_args = update_sample_parameters_args(model_args, args)

    if 'model' in args.json_args:
        update_json_args(model_args, args.json_args.get('model'), fields)
    return model_args


def set_label_model_args(args, fields, labels, multi_label_data):
    """Set of args needed to build a model per label

    """
    objective_field = args.objective_field
    if not args.model_fields_:
        model_fields = []
    else:
        model_fields = relative_input_fields(fields, args.model_fields_)
    if objective_field is None:
                except ValueError as exc:
                    sys.exit(exc)
            prediction_fields.append(field)
        batch_prediction_args.update(output_fields=prediction_fields)
    if hasattr(args, 'missing_strategy') and args.missing_strategy:
        batch_prediction_args.update(missing_strategy=args.missing_strategy)
    if hasattr(args, "operating_point_") and args.operating_point_:
        batch_prediction_args.update(operating_point=args.operating_point_)
        if args.operating_point_.get("kind") == "probability":
            batch_prediction_args.update({"probability": True,
                                          "confidence": False})
    if 'batch_prediction' in args.json_args:
        update_json_args(batch_prediction_args,
                         args.json_args.get('batch_prediction'), fields)
    return batch_prediction_args


def create_batch_prediction(model_or_ensemble, test_dataset,
                            batch_prediction_args, args,
                            api=None, session_file=None, path=None,
                            log=None):
    """Creates remote batch_prediction

    """
    if api is None:
        dataset_args.update(lisp_filter=args.lisp_filter)

    if args.dataset_fields_ and fields is not None:
        input_fields = configure_input_fields(fields, args.dataset_fields_)
        dataset_args.update(input_fields=input_fields)
    if (hasattr(args, 'multi_label') and args.multi_label
            and multi_label_data is not None):
        dataset_args.update(
            user_metadata={'multi_label_data': multi_label_data})
    if fields and args.import_fields:
        fields_struct = fields.new_fields_structure(args.import_fields)
        check_fields_struct(fields_struct, "dataset")
        update_attributes(dataset_args, fields_struct)
    if 'dataset' in args.json_args:
        update_json_args(dataset_args, args.json_args.get('dataset'), fields)
    return dataset_args


def set_dataset_split_args(name, description, args, sample_rate=1,
                           out_of_bag=False, multi_label_data=None):
    """Return dataset arguments dict to split a dataset

    """
    dataset_args = {
        "name": name,