def set_pca_args(args, name=None, fields=None, pca_fields=None):
    """Return the PCA creation arguments dict.

    Starts from the basic arguments, adds the seeds and sampling options,
    then the input fields (when a fields structure is available) and any
    user-provided JSON arguments under the 'pca' key.
    """
    if name is None:
        name = args.name
    if pca_fields is None:
        pca_fields = args.pca_fields_
    pca_args = set_basic_args(args, name)
    pca_args.update({
        "seed": SEED if args.seed is None else args.seed,
        "pca_seed": SEED if args.seed is None else args.seed})
    pca_args.update({"sample_rate": args.sample_rate})
    pca_args = update_sample_parameters_args(pca_args, args)
    # Fixed: the input_fields handling must be guarded by `fields`;
    # otherwise `input_fields` is referenced before assignment (NameError)
    # when no fields structure is provided.
    if fields is not None:
        input_fields = list(fields.fields.keys())
        if pca_fields:
            input_fields = configure_input_fields(fields, pca_fields)
        if args.exclude_objective:
            input_fields = [field for field in input_fields
                            if field not in args.exclude_fields]
        pca_args.update(input_fields=input_fields)
    if 'pca' in args.json_args:
        update_json_args(pca_args, args.json_args.get('pca'), fields)
    return pca_args
def set_external_connector_args(args, name=None):
    """Return the external connector creation arguments dict.

    Collects the connection attributes present in `args`; when none are
    given, falls back to environment variables. Hosts and the extra
    JSON-provided connection attributes are merged in afterwards.
    Side effect: stores the resulting info in `args.connection_info`.
    """
    if name is None:
        name = args.name
    external_connector_args = set_basic_args(args, name)
    source = "postgresql" if args.source is None else args.source
    external_connector_args.update({"source": source})
    # Fixed: in Python 3 dict.values() returns a view with no .remove();
    # materialize it as a list before dropping the "source" entry.
    connection_keys = list(EXTERNAL_CONNECTION_ATTRS.values())
    connection_keys.remove("source")
    connection_info = {}
    for key in connection_keys:
        if hasattr(args, key) and getattr(args, key):
            connection_info.update({key: getattr(args, key)})
    if not connection_info:
        # try to read environment variables
        connection_info = get_env_connection_info()
    args.connection_info = connection_info
    if args.hosts:
        args.connection_info.update({"hosts": args.hosts.split(",")})
    # rare arguments must be provided in a JSON file
    if args.connection_json_:
        args.connection_info.update(args.connection_json_)
    if 'external_connector' in args.json_args:
        update_json_args(external_connector_args,
                         args.json_args.get('external_connector'), None)
    return external_connector_args
def set_evaluation_args(args, fields=None, dataset_fields=None, name=None):
    """Return evaluation args dict

    Builds the evaluation creation payload: combiner/threshold options,
    fields map, missing strategy and user JSON args. The trailing section
    decides whether out-of-bag sampling is needed; each early return means
    "a test dataset/split is available, no out-of-bag sampling required".
    NOTE(review): mutates `args.range_` and `args.sample_rate` in place —
    callers appear to rely on these side effects; verify before refactoring.
    """
    if name is None:
        name = args.name
    evaluation_args = set_basic_args(args, name)
    # Multi-model evaluations get the chosen combination method, even when
    # args.method is unset (falsy).
    if hasattr(args, 'method') and (args.number_of_models > 1
                                    or args.ensemble):
        evaluation_args.update(combiner=args.method)
    if hasattr(args, 'method') and args.method:
        evaluation_args.update({"combiner": args.method})
        # Threshold combiner takes an extra {"k": ..., "class": ...} dict.
        if args.method == THRESHOLD_CODE:
            threshold = {}
            if hasattr(args, 'threshold') and args.threshold is not None:
                threshold.update(k=args.threshold)
            if hasattr(args, 'threshold_class') \
                    and args.threshold_class is not None:
                threshold.update({"class": args.threshold_class})
            evaluation_args.update(threshold=threshold)
    if args.fields_map_ and fields is not None:
        if dataset_fields is None:
            dataset_fields = fields
        evaluation_args.update({"fields_map": map_fields(args.fields_map_,
                                                         fields,
                                                         dataset_fields)})
    if hasattr(args, 'missing_strategy') and args.missing_strategy:
        evaluation_args.update(missing_strategy=args.missing_strategy)
    if 'evaluation' in args.json_args:
        update_json_args(
            evaluation_args, args.json_args.get('evaluation'), fields)
    # if evaluating time series we need to use ranges
    if args.subcommand == "time-series" and args.test_split == 0 and \
            not args.has_test_datasets_:
        args.range_ = [int(args.max_rows * EVALUATE_SAMPLE_RATE) + 1,
                       args.max_rows]
        evaluation_args.update({"range": args.range_})
        return evaluation_args
    # Two cases to use out_of_bag and sample_rate: standard evaluations where
    # only the training set is provided, and cross_validation
    # [--dataset|--test] [--model|--models|--model-tag|--ensemble] --evaluate
    if (((hasattr(args, "dataset") and args.dataset) or args.test_set)
            and args.has_supervised_):
        return evaluation_args
    # [--train|--dataset] --test-split --evaluate
    if args.test_split > 0 and (args.training_set or args.dataset):
        return evaluation_args
    # --datasets --test-datasets or equivalents
    #if args.datasets and (args.test_datasets or args.dataset_off):
    if args.has_datasets_ and (args.has_test_datasets_ or args.dataset_off):
        return evaluation_args
    # No test data anywhere: evaluate on the out-of-bag rows of a sampled
    # training set instead.
    if args.sample_rate == 1:
        args.sample_rate = EVALUATE_SAMPLE_RATE
    evaluation_args.update(out_of_bag=True, seed=SEED,
                           sample_rate=args.sample_rate)
    return evaluation_args
def set_project_args(args, name=None):
    """Return the project creation arguments dict."""
    chosen_name = args.name if name is None else name
    project_args = set_basic_args(args, chosen_name)
    if 'project' in args.json_args:
        update_json_args(project_args, args.json_args.get('project'), None)
    return project_args
def set_fusion_args(args, name=None, fields=None):
    """Return the fusion creation arguments dict."""
    chosen_name = args.name if name is None else name
    fusion_args = set_basic_args(args, chosen_name)
    if 'fusion' in args.json_args:
        update_json_args(fusion_args, args.json_args.get('fusion'), fields)
    return fusion_args
def set_sample_args(args, name=None):
    """Return the sample creation arguments dict."""
    chosen_name = args.name if name is None else name
    sample_args = set_basic_args(args, chosen_name)
    if 'sample' in args.json_args:
        update_json_args(sample_args, args.json_args.get('sample'))
    return sample_args
def set_forecast_args(args, fields=None):
    """Return the forecast creation arguments dict."""
    forecast_args = set_basic_args(args, args.name)
    forecast_args["intervals"] = args.intervals
    if 'forecast' in args.json_args:
        update_json_args(forecast_args, args.json_args.get('forecast'),
                         fields)
    return forecast_args
def set_library_args(args, name=None):
    """Return the library creation arguments dict."""
    chosen_name = args.name if name is None else name
    library_args = set_basic_args(args, chosen_name)
    if args.project_id is not None:
        library_args["project"] = args.project_id
    # args.imports is the raw user option; args.imports_ is its parsed form
    if args.imports is not None:
        library_args["imports"] = args.imports_
    update_attributes(library_args, args.json_args.get('library'))
    return library_args
def set_basic_dataset_args(args, name=None):
    """Return the basic dataset creation arguments dict.

    Adds sampling options when a partial sample is requested without
    model creation, and the row range when one is given.
    """
    if name is None:
        name = args.name
    dataset_args = set_basic_args(args, name)
    if args.sample_rate != 1 and args.no_model:
        dataset_args.update({
            "seed": SEED if args.seed is None else args.seed,
            "sample_rate": args.sample_rate})
    # Fixed: the guard checked hasattr(args, "range") while reading
    # args.range_, so it never protected the attribute actually accessed.
    if hasattr(args, "range_") and args.range_:
        dataset_args.update({"range": args.range_})
    return dataset_args
def set_topic_model_args(args, name=None, fields=None,
                         topic_model_fields=None):
    """Return the topic model creation arguments dict."""
    if name is None:
        name = args.name
    if topic_model_fields is None:
        topic_model_fields = args.topic_model_fields_
    seed = SEED if args.seed is None else args.seed
    topic_model_args = set_basic_args(args, name)
    topic_model_args.update({"seed": seed, "topicmodel_seed": seed})
    if topic_model_fields and fields is not None:
        topic_model_args.update(
            input_fields=configure_input_fields(fields, topic_model_fields))
    topic_model_args.update({"sample_rate": args.sample_rate,
                             "bigrams": args.bigrams,
                             "case_sensitive": args.case_sensitive})
    # Optional numeric settings: only sent when explicitly provided.
    for option in ("number_of_topics", "term_limit", "top_n_terms",
                   "minimum_name_terms"):
        value = getattr(args, option)
        if value is not None:
            topic_model_args.update({option: value})
    if args.excluded_terms:
        topic_model_args.update({"excluded_terms": args.excluded_terms_})
    topic_model_args = update_sample_parameters_args(topic_model_args, args)
    if 'topic_model' in args.json_args:
        update_json_args(topic_model_args,
                         args.json_args.get('topic_model'), fields)
    return topic_model_args
def set_script_args(args, name=None):
    """Return the script creation arguments dict."""
    chosen_name = args.name if name is None else name
    script_args = set_basic_args(args, chosen_name)
    if args.project_id is not None:
        script_args["project"] = args.project_id
    if args.imports is not None:
        script_args["imports"] = args.imports_
    if args.parameters_ is not None:
        script_args["inputs"] = args.parameters_
    if args.declare_outputs_:
        script_args["outputs"] = args.declare_outputs_
    update_attributes(script_args, args.json_args.get('script'))
    return script_args
def set_execution_args(args, name=None):
    """Return the execution creation arguments dict.

    Adds project, inputs, creation defaults, outputs and input maps when
    provided, then merges the user JSON arguments for 'execution'.
    """
    if name is None:
        name = args.name
    execution_args = set_basic_args(args, name)
    if args.project_id is not None:
        execution_args.update({"project": args.project_id})
    if args.arguments_:
        execution_args.update({"inputs": args.arguments_})
    if args.creation_defaults is not None:
        execution_args.update({"creation_defaults": args.creation_defaults_})
    if args.outputs_:
        execution_args.update({"outputs": args.outputs_})
    if args.input_maps_:
        # Fixed: the payload key carried a stray trailing underscore
        # ("input_maps_"), inconsistent with the other API keys built here
        # ("inputs", "outputs", "creation_defaults").
        execution_args.update({"input_maps": args.input_maps_})
    update_attributes(execution_args, args.json_args.get('execution'))
    return execution_args
def set_source_args(args, name=None, multi_label_data=None,
                    data_set_header=None, fields=None):
    """Return the source creation arguments dict.

    Adds project, parser options (header, locale, separator), multi-label
    metadata and, when a previous fields structure is available, the field
    attribute/type updates and the user JSON arguments for 'source'.
    """
    if name is None:
        name = args.name
    source_args = set_basic_args(args, name)
    if args.project_id is not None:
        source_args.update({"project": args.project_id})
    # if header is set, use it
    if data_set_header is not None:
        source_args.update({"source_parser": {"header": data_set_header}})
    # If user has given an OS locale, try to add the locale used in bigml.com
    if args.user_locale is not None:
        source_locale = bigml_locale(args.user_locale)
        if source_locale is None:
            log_message("WARNING: %s locale equivalence not found."
                        " Using %s instead.\n" % (args.user_locale,
                                                  LOCALE_DEFAULT),
                        log_file=None, console=True)
            source_locale = LOCALE_DEFAULT
        # Fixed: was `source_args.update({'source_parser': {}})`, which
        # clobbered a "source_parser" already carrying the header setting.
        source_args.setdefault("source_parser", {})
        source_args["source_parser"].update({'locale': source_locale})
    # If user has set a training separator, use it.
    if args.training_separator is not None:
        training_separator = decode2(args.training_separator,
                                     encoding="string_escape")
        # Fixed: ensure "source_parser" exists; the original raised KeyError
        # when neither header nor locale had created it.
        source_args.setdefault("source_parser", {})
        source_args["source_parser"].update({'separator': training_separator})
    # If uploading a multi-label file, add the user_metadata info needed to
    # manage the multi-label fields
    if (hasattr(args, 'multi_label') and args.multi_label
            and multi_label_data is not None):
        source_args.update(
            {"user_metadata": {"multi_label_data": multi_label_data}})
    # to update fields attributes or types you must have a previous fields
    # structure (at update time)
    if fields:
        if args.field_attributes_:
            update_attributes(source_args,
                              {"fields": args.field_attributes_},
                              by_column=True, fields=fields)
        if args.types_:
            update_attributes(source_args,
                              {"fields": args.types_},
                              by_column=True, fields=fields)
        if args.import_fields:
            fields_struct = fields.new_fields_structure(args.import_fields)
            check_fields_struct(fields_struct, "source")
            update_attributes(source_args, fields_struct)
        if 'source' in args.json_args:
            update_json_args(source_args, args.json_args.get('source'),
                             fields)
    return source_args