Example #1
0
def set_anomaly_args(args, name=None, fields=None, anomaly_fields=None):
    """Return anomaly arguments dict

    """
    if name is None:
        name = args.name
    if anomaly_fields is None:
        anomaly_fields = args.anomaly_fields_

    anomaly_args = set_basic_model_args(args, name)
    anomaly_args.update({
        "seed":
        SEED if args.seed is None else args.seed,
        "anomaly_seed":
        (SEED if args.anomaly_seed is None else args.anomaly_seed)
    })

    if anomaly_fields and fields is not None:
        input_fields = configure_input_fields(fields, anomaly_fields)
        anomaly_args.update(input_fields=input_fields)

    if args.top_n > 0:
        anomaly_args.update(top_n=args.top_n)
    if args.forest_size > 0:
        anomaly_args.update(forest_size=args.forest_size)

    anomaly_args = update_sample_parameters_args(anomaly_args, args)

    if 'anomaly' in args.json_args:
        update_json_args(anomaly_args, args.json_args.get('anomaly'), fields)

    return anomaly_args
Example #2
0
def set_association_args(args, name=None, fields=None,
                         association_fields=None):
    """Return association arguments dict

    """
    if name is None:
        name = args.name
    if association_fields is None:
        association_fields = args.association_fields_

    association_args = set_basic_model_args(args, name)

    if association_fields and fields is not None:
        input_fields = configure_input_fields(fields, association_fields)
        association_args.update(input_fields=input_fields)
    if args.association_k:
        association_args.update({"max_k": args.association_k})
    if args.search_strategy:
        association_args.update({"search_strategy": args.search_strategy})

    association_args = update_sample_parameters_args(association_args, args)

    if 'association' in args.json_args:
        update_json_args(association_args,
                         args.json_args.get('association'), fields)

    return association_args
Example #3
0
def set_pca_args(args, name=None, fields=None, pca_fields=None):
    """Return pca arguments dict

    """
    if name is None:
        name = args.name
    if pca_fields is None:
        pca_fields = args.pca_fields_

    pca_args = set_basic_args(args, name)
    pca_args.update({
        "seed": SEED if args.seed is None else args.seed,
        "pca_seed": SEED if args.seed is None else args.seed
    })

    pca_args.update({"sample_rate": args.sample_rate})
    pca_args = update_sample_parameters_args( \
        pca_args, args)
    if fields is not None:
        input_fields = fields.fields.keys()
    if pca_fields and fields is not None:
        input_fields = configure_input_fields(fields, pca_fields)
    if args.exclude_objective:
        input_fields = [field for field in input_fields \
            if field not in args.exclude_fields]
    pca_args.update(input_fields=input_fields)

    if 'pca' in args.json_args:
        update_json_args(pca_args, args.json_args.get('pca'), fields)
    return pca_args
Example #4
0
def set_model_args(args, name=None, objective_id=None, fields=None,
                   model_fields=None, other_label=None):
    """Return model arguments dict

    """
    if name is None:
        name = args.name
    if objective_id is None and args.max_categories is None:
        objective_id = args.objective_id_
    if args.max_categories > 0:
        objective_id = args.objective_field
    if model_fields is None:
        model_fields = args.model_fields_

    model_args = set_basic_model_args(args, name)
    model_args.update({"missing_splits": args.missing_splits})
    if objective_id is not None and fields is not None:
        model_args.update({"objective_field": objective_id})

    # If evaluate flag is on and no test_split flag is provided,
    # we choose a deterministic sampling with
    # args.sample_rate (80% by default) of the data to create the model
    # If cross_validation_rate = n/100, then we choose to run 2 * n evaluations
    # by holding out a n% of randomly sampled data.

    if ((args.evaluate and args.test_split == 0 and args.test_datasets is None)
            or args.cross_validation_rate > 0):
        model_args.update(seed=SEED)
        if args.cross_validation_rate > 0:
            args.sample_rate = 1 - args.cross_validation_rate
            args.replacement = False
        elif (args.sample_rate == 1 and args.test_datasets is None
              and not args.dataset_off):
            args.sample_rate = EVALUATE_SAMPLE_RATE
    if model_fields and fields is not None:
        input_fields = configure_input_fields(
            fields, model_fields, by_name=(args.max_categories > 0))
        model_args.update(input_fields=input_fields)

    if args.pruning and args.pruning != 'smart':
        model_args.update(stat_pruning=(args.pruning == 'statistical'))

    if args.node_threshold > 0:
        model_args.update(node_threshold=args.node_threshold)

    if args.balance:
        model_args.update(balance_objective=True)

    if args.split_field:
        model_args.update(split_field=args.split_field)

    if args.focus_field:
        model_args.update(focus_field=args.focus_field)

    if args.weight_field:
        try:
            weight_field = fields.field_id(args.weight_field)
        except ValueError, exc:
            sys.exit(exc)
        model_args.update(weight_field=weight_field)
Example #5
0
def set_cluster_args(args, name=None, fields=None,
                     cluster_fields=None):
    """Return cluster arguments dict

    """
    if name is None:
        name = args.name
    if cluster_fields is None:
        cluster_fields = args.cluster_fields_

    cluster_args = set_basic_model_args(args, name)
    cluster_args.update({
        "seed": SEED if args.seed is None else args.seed,
        "cluster_seed": (SEED if args.cluster_seed is None
                         else args.cluster_seed)
    })

    if args.cluster_models is not None:
        cluster_args.update({"model_clusters": True})
    if args.cluster_k:
        cluster_args.update({"k": args.cluster_k})
    if cluster_fields and fields is not None:
        input_fields = configure_input_fields(fields, cluster_fields)
        cluster_args.update(input_fields=input_fields)
    if args.summary_fields is not None:
        cluster_args.update({"summary_fields": args.summary_fields_})

    cluster_args = update_sample_parameters_args(cluster_args, args)

    if 'cluster' in args.json_args:
        update_json_args(cluster_args, args.json_args.get('cluster'), fields)

    return cluster_args
Example #6
0
def set_logistic_regression_args(args,
                                 name=None,
                                 fields=None,
                                 objective_id=None,
                                 logistic_regression_fields=None):
    """Return logistic regression arguments dict

    """
    if name is None:
        name = args.name
    if logistic_regression_fields is None:
        logistic_regression_fields = args.logistic_regression_fields_
    if objective_id is None:
        objective_id = args.objective_id_

    logistic_regression_args = set_basic_model_args(args, name)
    logistic_regression_args.update(
        {"seed": SEED if args.seed is None else args.seed})

    if objective_id is not None and fields is not None:
        logistic_regression_args.update({"objective_field": objective_id})
    if logistic_regression_fields and fields is not None:
        input_fields = configure_input_fields(fields,
                                              logistic_regression_fields)
        logistic_regression_args.update(input_fields=input_fields)
    if ((args.evaluate and args.test_split == 0 and args.test_datasets is None)
            or args.cross_validation_rate > 0):
        logistic_regression_args.update(seed=SEED)
        if args.cross_validation_rate > 0:
            args.sample_rate = 1 - args.cross_validation_rate
            args.replacement = False
        elif (args.sample_rate == 1 and args.test_datasets is None
              and not args.dataset_off):
            args.sample_rate = EVALUATE_SAMPLE_RATE
    logistic_regression_args.update({"sample_rate": args.sample_rate})
    if args.lr_c:
        logistic_regression_args.update({"c": args.lr_c})
    logistic_regression_args.update({"bias": args.bias})
    logistic_regression_args.update( \
        {"balance_fields": args.balance_fields})
    if args.eps:
        logistic_regression_args.update({"eps": args.eps})
    if args.normalize is not None:
        logistic_regression_args.update({"normalize": args.normalize})
    if args.missing_numerics is not None:
        logistic_regression_args.update( \
            {"missing_numerics": args.missing_numerics})
    if args.field_codings is not None:
        logistic_regression_args.update(\
            {"field_codings": args.field_codings_})

    logistic_regression_args = update_sample_parameters_args( \
        logistic_regression_args, args)

    if 'logistic_regression' in args.json_args:
        update_json_args(logistic_regression_args,
                         args.json_args.get('logistic_regression'), fields)
    return logistic_regression_args
Example #7
0
def set_ensemble_args(args, name=None,
                      objective_id=None, model_fields=None, fields=None):
    """Return ensemble arguments dict

    """
    if name is None:
        name = args.name
    if objective_id is None:
        objective_id = args.objective_id_
    if model_fields is None:
        model_fields = args.model_fields_

    ensemble_args = set_basic_model_args(args, name)
    ensemble_args.update({
        "missing_splits": args.missing_splits,
        "ensemble_sample": {"seed": SEED if args.ensemble_sample_seed is None \
            else args.ensemble_sample_seed},
        "seed": SEED if args.seed is None else args.seed
    })
    if objective_id is not None and fields is not None:
        ensemble_args.update({"objective_field": objective_id})

    if args.boosting:
        boosting_args = {}
        for option in BOOSTING_OPTIONS:
            if hasattr(args, option) and getattr(args, option) is not None:
                boosting_args.update({option: getattr(args, option)})
        ensemble_args.update({"boosting": boosting_args})
    else:
        ensemble_args.update({"number_of_models": args.number_of_models})

    # If evaluate flag is on and no test_split flag is provided,
    # we choose a deterministic sampling with
    # args.sample_rate (80% by default) of the data to create the model

    if (args.evaluate and args.test_split == 0 and
            args.test_datasets is None and not args.dataset_off):
        ensemble_args.update({"seed": SEED})
        if args.sample_rate == 1:
            args.sample_rate = EVALUATE_SAMPLE_RATE

    if model_fields and fields is not None:
        input_fields = configure_input_fields(fields, model_fields)
        ensemble_args.update(input_fields=input_fields)

    if args.pruning and args.pruning != 'smart':
        ensemble_args.update(stat_pruning=(args.pruning == 'statistical'))
    if args.node_threshold > 0:
        ensemble_args.update(node_threshold=args.node_threshold)
    if args.balance:
        ensemble_args.update(balance_objective=True)
    if args.weight_field:
        try:
            weight_field = fields.field_id(args.weight_field)
        except ValueError, exc:
            sys.exit(exc)
        ensemble_args.update(weight_field=weight_field)
Example #8
0
def set_topic_model_args(args,
                         name=None,
                         fields=None,
                         topic_model_fields=None):
    """Return topic_model arguments dict

    """
    if name is None:
        name = args.name
    if topic_model_fields is None:
        topic_model_fields = args.topic_model_fields_

    topic_model_args = set_basic_args(args, name)
    topic_model_args.update({
        "seed":
        SEED if args.seed is None else args.seed,
        "topicmodel_seed":
        SEED if args.seed is None else args.seed
    })

    if topic_model_fields and fields is not None:
        input_fields = configure_input_fields(fields, topic_model_fields)
        topic_model_args.update(input_fields=input_fields)
    topic_model_args.update({"sample_rate": args.sample_rate})
    topic_model_args.update({"bigrams": args.bigrams})
    topic_model_args.update({"case_sensitive": args.case_sensitive})
    if args.number_of_topics is not None:
        topic_model_args.update({"number_of_topics": args.number_of_topics})
    if args.term_limit is not None:
        topic_model_args.update({"term_limit": args.term_limit})
    if args.top_n_terms is not None:
        topic_model_args.update({"top_n_terms": args.top_n_terms})
    if args.minimum_name_terms is not None:
        topic_model_args.update(
            {"minimum_name_terms": args.minimum_name_terms})

    if args.excluded_terms:
        topic_model_args.update({"excluded_terms": args.excluded_terms_})

    topic_model_args = update_sample_parameters_args( \
        topic_model_args, args)

    if 'topic_model' in args.json_args:
        update_json_args(topic_model_args, args.json_args.get('topic_model'),
                         fields)
    return topic_model_args
Example #9
0
def set_deepnet_args(args,
                     name=None,
                     fields=None,
                     objective_id=None,
                     deepnet_fields=None):
    """Return deepnet arguments dict

    """
    if name is None:
        name = args.name
    if deepnet_fields is None:
        deepnet_fields = args.deepnet_fields_
    if objective_id is None:
        objective_id = args.objective_id_

    deepnet_args = set_basic_model_args(args, name)
    deepnet_args.update({"seed": SEED if args.seed is None else args.seed})

    if objective_id is not None and fields is not None:
        deepnet_args.update({"objective_field": objective_id})
    if deepnet_fields and fields is not None:
        input_fields = configure_input_fields(fields, deepnet_fields)
        deepnet_args.update(input_fields=input_fields)
    if ((args.evaluate and args.test_split == 0 and args.test_datasets is None)
            or args.cross_validation_rate > 0):
        deepnet_args.update(seed=SEED)
        if args.cross_validation_rate > 0:
            args.sample_rate = 1 - args.cross_validation_rate
            args.replacement = False
        elif (args.sample_rate == 1 and args.test_datasets is None
              and not args.dataset_off):
            args.sample_rate = EVALUATE_SAMPLE_RATE
    deepnet_args.update({"sample_rate": args.sample_rate})

    if args.batch_normalization is not None:
        deepnet_args.update({"batch_normalization": args.batch_normalization})
    if args.dropout_rate:
        deepnet_args.update({"dropout_rate": args.dropout_rate})

    if args.hidden_layers is not None:
        deepnet_args.update({"hidden_layers": args.hidden_layers_})

    if args.learn_residuals is not None:
        deepnet_args.update( \
            {"learn_residuals": args.learn_residuals})

    if args.max_iterations is not None:
        deepnet_args.update(\
            {"learning_rate": args.learning_rate})

    if args.max_training_time is not None:
        deepnet_args.update(\
            {"max_training_time": args.max_training_time})

    if args.number_of_hidden_layers is not None:
        deepnet_args.update(\
            {"number_of_hidden_layers": args.number_of_hidden_layers})

    if args.number_of_model_candidates is not None:
        deepnet_args.update(\
            {"number_of_model_candidates": args.number_of_model_candidates})

    if args.search is not None:
        deepnet_args.update(\
            {"search": args.search})

    if args.suggest_structure is not None:
        deepnet_args.update(\
            {"suggest_structure": args.suggest_structure})

    if not args.missing_numerics:
        deepnet_args.update(\
            {"missing_numerics": args.missing_numerics})

    if args.tree_embedding:
        deepnet_args.update(\
            {"tree_embedding": args.tree_embedding})

    deepnet_args = update_sample_parameters_args( \
        deepnet_args, args)

    if 'deepnet' in args.json_args:
        update_json_args(deepnet_args, args.json_args.get('deepnet'), fields)
    return deepnet_args
Example #10
0
    if hasattr(args, 'sql_query') and args.sql_query:
        dataset_args.update({"sql_query": args.sql_query})

    if hasattr(args, 'sql_output_fields_') and args.sql_output_fields_:
        dataset_args.update({"sql_output_fields": args.sql_output_fields_})

    if hasattr(args, 'json_query_') and args.json_query_:
        dataset_args.update({"json_query": args.json_query_})

    if args.json_filter:
        dataset_args.update(json_filter=args.json_filter)
    elif args.lisp_filter:
        dataset_args.update(lisp_filter=args.lisp_filter)

    if args.dataset_fields_ and fields is not None:
        input_fields = configure_input_fields(fields, args.dataset_fields_)
        dataset_args.update(input_fields=input_fields)
    if (hasattr(args, 'multi_label') and args.multi_label
            and multi_label_data is not None):
        dataset_args.update(
            user_metadata={'multi_label_data': multi_label_data})

    if fields and args.import_fields:
        fields_struct = fields.new_fields_structure(args.import_fields)
        check_fields_struct(fields_struct, "dataset")
        update_attributes(dataset_args, fields_struct)
    if 'dataset' in args.json_args:
        update_json_args(dataset_args, args.json_args.get('dataset'), fields)

    return dataset_args