예제 #1
0
def topic_distribution(topic_models, fields, args, session_file=None):
    """Computes a local topic distribution for every row in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # headers to print and columns to drop when the input data is
        # echoed next to the prediction field
        exclude, headers = use_prediction_headers(test_reader, fields, args)

        # Topic distributions are always produced locally through the
        # topic models' own method
        message = u.dated("Creating local topic distributions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_topic_distribution(topic_models, test_reader, writer, args,
                                 exclude=exclude, headers=headers)
    test_reader.close()
예제 #2
0
def prediction(models, fields, args, session_file=None):
    """Computes a supervised model prediction for every row in the
    `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns to drop when the input data is echoed next to the
        # prediction field
        exclude = use_prediction_headers(
            args.prediction_header, writer, test_reader, fields, args,
            args.objective_field, quality="probability")

        # Predictions are always computed locally
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_prediction(models, test_reader, writer, args, exclude=exclude)
    test_reader.close()
예제 #3
0
def anomaly_score(anomalies, fields, args, session_file=None):
    """Computes a local anomaly score for every row in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns to drop when the input data is echoed next to the
        # score field
        exclude = use_prediction_headers(args.prediction_header, writer,
                                         test_reader, fields, args)

        # Scores are always produced locally by the anomaly detectors'
        # own method
        message = u.dated("Creating local anomaly scores.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_anomaly_score(anomalies, test_reader, writer, args,
                            exclude=exclude)
    test_reader.close()
예제 #4
0
def centroid(clusters, fields, args, session_file=None):
    """Computes a local centroid for every row in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns to drop when the input data is echoed next to the
        # centroid field
        exclude = use_prediction_headers(args.prediction_header, writer,
                                         test_reader, fields, args)

        # Centroids are always computed locally from the clusters'
        # centroid distances
        message = u.dated("Creating local centroids.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_centroid(clusters, test_reader, writer, args, exclude=exclude)
    test_reader.close()
예제 #5
0
def anomaly_score(anomalies, fields, args, session_file=None):
    """Scores every row in the `test_set` with its local anomaly score.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns left out when the input data is echoed next to the score
        exclude = use_prediction_headers(args.prediction_header, writer,
                                         test_reader, fields, args)

        # Anomaly scores are computed locally using the detectors' method
        message = u.dated("Creating local anomaly scores.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_anomaly_score(anomalies, test_reader, writer, args,
                            exclude=exclude)
    test_reader.close()
예제 #6
0
def centroid(clusters, fields, args, session_file=None):
    """Assigns a local centroid to every row in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns left out when the input data is echoed next to the
        # centroid
        exclude = use_prediction_headers(args.prediction_header, writer,
                                         test_reader, fields, args)

        # Centroids are computed locally from the clusters' centroid
        # distances
        message = u.dated("Creating local centroids.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_centroid(clusters, test_reader, writer, args, exclude=exclude)
    test_reader.close()
예제 #7
0
def topic_distribution(topic_models, fields, args, session_file=None):
    """Produces a local topic distribution for every row in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # headers to print and columns to drop when the input data is
        # echoed next to the prediction field
        exclude, headers = use_prediction_headers(test_reader, fields, args)

        # Topic distributions are computed locally via the topic models'
        # own method
        message = u.dated("Creating local topic distributions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_topic_distribution(topic_models, test_reader, writer, args,
                                 exclude=exclude, headers=headers)
    test_reader.close()
예제 #8
0
def lr_prediction(linear_regressions, fields, args, session_file=None):
    """Generates a local linear regression prediction for every row in
    the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.predictions, lineterminator="\n") as writer:
        # columns to drop when the input data is echoed next to the
        # prediction field
        exclude = use_prediction_headers(
            args.prediction_header, writer, test_reader, fields, args,
            args.objective_field, quality="probability")

        # Predictions are always computed locally
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_prediction(linear_regressions, test_reader, writer, args,
                         exclude=exclude)
    test_reader.close()
예제 #9
0
def projection(pca, fields, args, session_file=None):
    """Computes a local PCA projection for every row in the `test_set`.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.projections, lineterminator="\n") as writer:
        local_pca, kwargs = _local_pca(pca, args)
        # one "PCn" header per component of the local PCA's projection
        components = len(local_pca.projection({}))
        pca_headers = ["PC%s" % index for index in range(1, components + 1)]
        # columns to drop when the input data is echoed next to the
        # projection fields
        exclude = use_projection_headers(
            args.projection_header, writer, test_reader, fields, args,
            pca_headers)
        # Projections are always computed locally
        message = u.dated("Creating local projections.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_projection(local_pca, kwargs, test_reader, writer, args,
                         exclude=exclude)
    test_reader.close()
예제 #10
0
def projection(pca, fields, args, session_file=None):
    """Projects every row in the `test_set` through the local PCA.

    """
    test_reader = TestReader(args.test_set, args.test_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(args.projections, lineterminator="\n") as writer:
        local_pca, kwargs = _local_pca(pca, args)
        # build one "PCn" header per projection component
        component_count = len(local_pca.projection({}))
        pca_headers = ["PC%s" % (position + 1)
                       for position in range(component_count)]
        # columns to drop when the input data is echoed next to the
        # projection fields
        exclude = use_projection_headers(
            args.projection_header, writer, test_reader, fields, args,
            pca_headers)
        # Projections are always computed locally
        message = u.dated("Creating local projections.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_projection(local_pca, kwargs, test_reader, writer, args,
                         exclude=exclude)
    test_reader.close()
예제 #11
0
def predict(models,
            fields,
            args,
            api=None,
            log=None,
            resume=False,
            session_file=None,
            labels=None,
            models_per_label=1,
            other_label=OTHER,
            multi_label_data=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions computed locally using MultiModels on subgroups of models.
       Choosing a max_batch_models value not bigger than the number_of_models
       flag will lead to the last case, where memory usage is bounded and each
       model predictions are saved for further use.

       :param models: models (or model ids) used to predict
       :param fields: Fields object describing the models' fields
       :param args: parsed command-line arguments namespace
       :param api: BigML API connection, used only for remote predictions
       :param log: log file used for resuming
       :param resume: whether a previous run is being resumed
       :param session_file: file where session messages are logged
       :param labels: list of labels for multi-label predictions
       :param models_per_label: number of models associated to each label
       :param other_label: label used for the grouped "other" category
       :param multi_label_data: multi-label metadata structure
    """
    test_set = args.test_set
    test_set_header = args.test_header
    objective_field = args.objective_field
    output = args.predictions
    test_reader = TestReader(test_set,
                             test_set_header,
                             fields,
                             objective_field,
                             test_separator=args.test_separator)

    prediction_file = output
    output_path = u.check_dir(output)
    with UnicodeWriter(output) as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(args.prediction_header, output,
                                         test_reader, fields, args,
                                         objective_field)

        # Remote predictions: predictions are computed in bigml.com and stored
        # in a file named after the model in the following syntax:
        #     model_[id of the model]__predictions.csv
        # For instance,
        #     model_50c0de043b563519830001c2_predictions.csv
        # Predictions are computed individually only if no_batch flag is set
        if args.remote and args.no_batch and not args.multi_label:
            if args.ensemble is not None:
                remote_predict_ensemble(args.ensemble, test_reader,
                                        prediction_file, api, args, resume,
                                        output_path, session_file, log,
                                        exclude)
            else:
                remote_predict_models(models, test_reader, prediction_file,
                                      api, args, resume, output_path,
                                      session_file, log, exclude)
            # close the reader before the early return so the open test
            # set file handle is not leaked
            test_reader.close()
            return
        # Local predictions: Predictions are computed locally using models'
        # rules with MultiModel's predict method
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        options = {}
        if args.method == THRESHOLD_CODE:
            options.update(threshold=args.threshold)
            if args.threshold_class is None:
                local_model = Model(models[0])
                # default class is the first class that appears in the dataset
                # objective field summary, which might be different from the
                # objective summary of each model because models are built with
                # sampling
                objective_field = local_model.objective_id
                distribution = local_model.tree.fields[objective_field][ \
                    "summary"]["categories"]
                args.threshold_class = distribution[0][0]
            options.update(category=args.threshold_class)
        # For a model we build a Model and for a small number of models,
        # we build a MultiModel using all of
        # the given models and issue a combined prediction
        if (len(models) <= args.max_batch_models \
                and args.fast and \
                not args.multi_label and args.max_categories == 0 \
                and args.method != COMBINATION):
            local_predict(models, test_reader, output, args, options, exclude)
        elif args.boosting:
            local_predict(args.ensemble, test_reader, output, args, options,
                          exclude)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            # Local predictions: predictions are computed locally using
            # models' rules with MultiModel's predict method and combined using
            # aggregation if the objective field is a multi-labelled field
            # or one of the available combination methods: plurality,
            # confidence weighted and probability weighted
            if args.multi_label:
                method = AGGREGATION
            elif args.max_categories > 0:
                method = COMBINATION
            else:
                method = args.method

            # For multi-labelled models, the --models flag keeps the order
            # of the labels and the models but the --model-tag flag
            # retrieves the models with no order, so the correspondence with
            # each label must be restored.
            ordered = True
            if args.multi_label and (args.model_tag is not None
                                     or models_per_label > 1):
                ordered = False
            local_batch_predict(models,
                                test_reader,
                                prediction_file,
                                api,
                                args,
                                resume=resume,
                                output_path=output_path,
                                output=output,
                                method=method,
                                options=options,
                                session_file=session_file,
                                labels=labels,
                                ordered=ordered,
                                exclude=exclude,
                                models_per_label=models_per_label,
                                other_label=other_label,
                                multi_label_data=multi_label_data)
    test_reader.close()
예제 #12
0
def predict(models, fields, args, api=None, log=None,
            resume=False, session_file=None,
            labels=None, models_per_label=1, other_label=OTHER,
            multi_label_data=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions computed locally using MultiModels on subgroups of models.
       Choosing a max_batch_models value not bigger than the number_of_models
       flag will lead to the last case, where memory usage is bounded and each
       model predictions are saved for further use.

       :param models: models (or model ids) used to predict
       :param fields: Fields object describing the models' fields
       :param args: parsed command-line arguments namespace
       :param api: BigML API connection, used only for remote predictions
       :param log: log file used for resuming
       :param resume: whether a previous run is being resumed
       :param session_file: file where session messages are logged
       :param labels: list of labels for multi-label predictions
       :param models_per_label: number of models associated to each label
       :param other_label: label used for the grouped "other" category
       :param multi_label_data: multi-label metadata structure
    """
    test_set = args.test_set
    test_set_header = args.test_header
    objective_field = args.objective_field
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             objective_field,
                             test_separator=args.test_separator)

    prediction_file = output
    output_path = u.check_dir(output)
    with UnicodeWriter(output) as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, output, test_reader, fields, args,
            objective_field)

        # Remote predictions: predictions are computed in bigml.com and stored
        # in a file named after the model in the following syntax:
        #     model_[id of the model]__predictions.csv
        # For instance,
        #     model_50c0de043b563519830001c2_predictions.csv
        # Predictions are computed individually only if no_batch flag is set
        if (args.remote and args.no_batch and not args.multi_label
                and args.method != THRESHOLD_CODE):
            if args.ensemble is not None:
                remote_predict_ensemble(args.ensemble, test_reader,
                                        prediction_file, api, args, resume,
                                        output_path, session_file, log,
                                        exclude)
            else:
                remote_predict_models(models, test_reader, prediction_file,
                                      api, args, resume, output_path,
                                      session_file, log, exclude)
            # close the reader before the early return so the open test
            # set file handle is not leaked
            test_reader.close()
            return
        # Local predictions: Predictions are computed locally using models'
        # rules with MultiModel's predict method
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        options = {}
        if args.method == THRESHOLD_CODE:
            options.update(threshold=args.threshold)
            if args.threshold_class is None:
                local_model = Model(models[0])
                args.threshold_class = local_model.tree.distribution[0][0]
            options.update(category=args.threshold_class)
        # For a model we build a Model and for a small number of models,
        # we build a MultiModel using all of
        # the given models and issue a combined prediction
        if (len(models) <= args.max_batch_models and args.fast and
                not args.multi_label and args.max_categories == 0
                and args.method != COMBINATION):
            local_predict(models, test_reader, output, args, options, exclude)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            # Local predictions: predictions are computed locally using
            # models' rules with MultiModel's predict method and combined using
            # aggregation if the objective field is a multi-labelled field
            # or one of the available combination methods: plurality,
            # confidence weighted and probability weighted
            if args.multi_label:
                method = AGGREGATION
            elif args.max_categories > 0:
                method = COMBINATION
            else:
                method = args.method

            # For multi-labelled models, the --models flag keeps the order
            # of the labels and the models but the --model-tag flag
            # retrieves the models with no order, so the correspondence with
            # each label must be restored.
            ordered = True
            if args.multi_label and (args.model_tag is not None
                                     or models_per_label > 1):
                ordered = False
            local_batch_predict(models, test_reader, prediction_file, api,
                                args, resume=resume,
                                output_path=output_path,
                                output=output, method=method, options=options,
                                session_file=session_file, labels=labels,
                                ordered=ordered, exclude=exclude,
                                models_per_label=models_per_label,
                                other_label=other_label,
                                multi_label_data=multi_label_data)
    test_reader.close()