Example #1
def remote_predict_models(models, test_reader, prediction_file, api, args,
                          resume=False, output_path=None,
                          session_file=None, log=None, exclude=None):
    """Retrieve predictions remotely, combine them and save predictions to file

    """
    predictions_files = []
    prediction_args = {
        "tags": args.tag
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False

    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    single_model = len(models) == 1
    if single_model:
        prediction_file = UnicodeWriter(prediction_file).open_writer()
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model,
                                                     output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(),
                                 debug=args.debug)[0]):
            if not message_logged:
                message = u.dated("Creating remote predictions.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            message_logged = True
            with UnicodeWriter(predictions_file) as predictions_file:
                for input_data in raw_input_data_list:
                    input_data_dict = test_reader.dict(input_data)
                    prediction = api.create_prediction(model, input_data_dict,
                                                       by_name=test_set_header,
                                                       wait_time=0,
                                                       args=prediction_args)
                    u.check_resource_error(prediction,
                                           "Failed to create prediction: ")
                    u.log_message("%s\n" % prediction['resource'],
                                  log_file=log)
                    prediction_row = prediction_to_row(prediction)
                    predictions_file.writerow(prediction_row)
                    if single_model:
                        write_prediction(prediction_row[0:2], prediction_file,
                                         args.prediction_info,
                                         input_data, exclude)
    if single_model:
        prediction_file.close_writer()
    else:
        combine_votes(predictions_files,
                      Model(models[0]).to_prediction,
                      prediction_file, args.method,
                      args.prediction_info, raw_input_data_list, exclude)
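Example #1 drives UnicodeWriter through both of its protocols: an explicit open_writer()/close_writer() pair for the long-lived combined-predictions file, and a `with` block for each per-model file. Below is a minimal sketch of a writer exposing that dual protocol; it is a hypothetical stand-in, not the real UnicodeWriter, which also handles encoding details omitted here.

import csv

class SketchWriter(object):
    """Hypothetical stand-in showing UnicodeWriter's dual protocol."""

    def __init__(self, filename, **csv_kwargs):
        self.filename = filename
        self.csv_kwargs = csv_kwargs
        self.file_handler = None
        self.writer = None

    def open_writer(self):
        # explicit protocol: caller pairs this with close_writer()
        self.file_handler = open(self.filename, "w", newline="")
        self.writer = csv.writer(self.file_handler, **self.csv_kwargs)
        return self

    def close_writer(self):
        self.file_handler.close()

    def __enter__(self):
        # context-manager protocol: `with SketchWriter(...) as writer:`
        return self.open_writer()

    def __exit__(self, exc_type, exc_value, traceback):
        self.close_writer()

    def writerow(self, row):
        self.writer.writerow(row)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)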
Example #2
def combine_votes(votes_files,
                  to_prediction,
                  to_file,
                  method=0,
                  prediction_info=NORMAL_FORMAT,
                  input_data_list=None,
                  exclude=None):
    """Combines the votes found in the votes' files and stores predictions.

       votes_files: should contain the list of file names
       to_prediction: is the Model method that casts prediction to numeric
                      type if needed
       to_file: is the name of the final output file.
    """
    votes = read_votes(votes_files, to_prediction)

    u.check_dir(to_file)
    with UnicodeWriter(to_file) as output:
        number_of_tests = len(votes)
        if input_data_list is None or len(input_data_list) != number_of_tests:
            input_data_list = None
        for index in range(0, number_of_tests):
            multivote = votes[index]
            input_data = (None if input_data_list is None else
                          input_data_list[index])
            write_prediction(multivote.combine(method, True), output,
                             prediction_info, input_data, exclude)
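combine_votes delegates the actual combination to read_votes and MultiVote.combine from the bigml bindings. As a rough standalone illustration of what the default plurality method (method=0) amounts to over per-model vote files (simplified: real MultiVote objects also carry confidences and support other combination methods):

import csv
from collections import Counter

def plurality_combine(votes_files, to_file):
    # read the prediction column of each per-model votes file
    columns = []
    for votes_file in votes_files:
        with open(votes_file, newline="") as handler:
            columns.append([row[0] for row in csv.reader(handler)])
    with open(to_file, "w", newline="") as handler:
        writer = csv.writer(handler)
        # one output row per test instance: the most common prediction wins
        for votes in zip(*columns):
            writer.writerow([Counter(votes).most_common(1)[0][0]])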
Example #3
def topic_distribution(topic_models, fields, args, session_file=None):
    """Computes a topic distribution for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.predictions
    test_reader = TestReader(test_set,
                             test_set_header,
                             fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude, headers = use_prediction_headers(test_reader, fields, args)

        # Local topic distributions: topic distributions are computed
        # locally using the topic models' method
        message = u.dated("Creating local topic distributions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_topic_distribution(topic_models,
                                 test_reader,
                                 output,
                                 args,
                                 exclude=exclude,
                                 headers=headers)
    test_reader.close()
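topic_distribution shares its shape with anomaly_score (Example #6), prediction (Example #7) and centroid (Example #14): build a TestReader over the test set, open a UnicodeWriter over the output with lineterminator="\n", derive header/exclusion info, then hand off to a local_* scorer. Stripped of the bigmler plumbing, the pattern is roughly as follows (score_row is a placeholder for the local scorer):

import csv

def score_test_set(test_set, output_path, score_row):
    # simplified analogue of the TestReader/UnicodeWriter pattern above
    with open(test_set, newline="") as test_handler, \
            open(output_path, "w", newline="") as out_handler:
        reader = csv.reader(test_handler)
        writer = csv.writer(out_handler, lineterminator="\n")
        headers = next(reader)              # use_*_headers analogue
        writer.writerow(headers + ["score"])
        for row in reader:                  # local_* scorer analogue
            writer.writerow(row + [score_row(dict(zip(headers, row)))])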
Example #4
def best_first_search(datasets_file,
                      api,
                      args,
                      command_obj,
                      staleness=None,
                      penalty=None,
                      objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(
        os.path.join(args.output_dir, FEATURES_LOG))
    features_writer = UnicodeWriter(features_file).open_writer()
    features_header = FEATURES_HEADER
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_PENALTY
    # retrieving the first dataset in the file
    try:
        with open(datasets_file, u.open_mode("r")) as datasets_handler:
            dataset_id = datasets_handler.readline().strip()
    except IOError as exc:
        sys.exit("Could not read the generated datasets file: %s" % str(exc))
Example #5
    def tree_CSV(self, file_name=None, leaves_only=False):
        """Outputs the node structure to a CSV file or array

        """
        headers_names = []
        if self.tree.regression:
            headers_names.append(self.fields[self.tree.objective_id]['name'])
            headers_names.append("error")
            for index in range(0, self._max_bins):
                headers_names.append("bin%s_value" % index)
                headers_names.append("bin%s_instances" % index)
        else:
            headers_names.append(self.fields[self.tree.objective_id]['name'])
            headers_names.append("confidence")
            headers_names.append("impurity")
            for category, _ in self.tree.distribution:
                headers_names.append(category)

        nodes_generator = self.get_nodes_info(headers_names,
                                              leaves_only=leaves_only)
        if file_name is not None:
            with UnicodeWriter(file_name) as writer:
                writer.writerow(
                    [header.encode("utf-8") for header in headers_names])
                for row in nodes_generator:
                    writer.writerow([
                        item if not isinstance(item, basestring) else
                        item.encode("utf-8") for item in row
                    ])
        else:
            rows = []
            rows.append(headers_names)
            for row in nodes_generator:
                rows.append(row)
            return rows
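Both branches of tree_CSV are useful: passing a file_name writes the node structure to disk, while omitting it returns the rows. A hypothetical call, with local_model standing in for an already-built local bigml Model:

rows = local_model.tree_CSV(leaves_only=True)   # header row plus one row per leaf
local_model.tree_CSV(file_name="nodes.csv")     # same data written to a CSV file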
Example #6
def anomaly_score(anomalies, fields, args, session_file=None):
    """Computes an anomaly score for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.predictions
    test_reader = TestReader(test_set,
                             test_set_header,
                             fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(args.prediction_header, output,
                                         test_reader, fields, args)

        # Local anomaly scores: Anomaly scores are computed locally using
        # the local anomaly detector method
        message = u.dated("Creating local anomaly scores.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_anomaly_score(anomalies,
                            test_reader,
                            output,
                            args,
                            exclude=exclude)
    test_reader.close()
Example #7
def prediction(models, fields, args, session_file=None):
    """Computes a supervised model prediction
    for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.predictions
    test_reader = TestReader(test_set,
                             test_set_header,
                             fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(args.prediction_header,
                                         output,
                                         test_reader,
                                         fields,
                                         args,
                                         args.objective_field,
                                         quality="probability")

        # Local predictions: Predictions are computed locally
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_prediction(models, test_reader, output, args, exclude=exclude)
    test_reader.close()
Example #8
def sample_file(sample, fields, args, api, path=None, session_file=None):
    """Creates a file for each sample with the sample rows.

    """
    query_string = sample_query_string(args, fields)
    sample = r.get_samples([sample],
                           args,
                           api,
                           session_file=session_file,
                           query_string=query_string)[0][0]
    output = args.predictions
    with UnicodeWriter(output, lineterminator="\n") as output:
        headers = [
            field['name'] for field in sample['object']['sample']['fields']
        ]
        if args.sample_header:
            if args.row_index or args.occurrence:
                new_headers = []
                if args.row_index:
                    new_headers.append("index")
                if args.occurrence:
                    new_headers.append("occurrences")
                new_headers.extend(headers)
                headers = new_headers
            output.writerow(headers)
        for row in sample['object']['sample']['rows']:
            output.writerow(row)
        if args.stat_field or args.stat_fields:
            stat_info = {}
            sample_obj = sample['object']['sample']
            for key in STAT_KEYS:
                if key in sample_obj:
                    stat_info[key] = sample_obj[key]
            with open(os.path.join(path, "stat_info.json"), "w") as stat_file:
                json.dump(stat_info, stat_file)
Example #9
    def statistics_csv(self, file_name=None):
        """Clusters statistic information in CSV format

        """
        rows = []
        writer = None
        field_ids = self.centroids[0].center.keys()
        headers = [u"Centroid_name"]
        headers.extend(
            [u"%s" % self.fields[field_id]["name"] for field_id in field_ids])
        headers.extend([u"Instances"])
        intercentroids = False
        header_complete = False

        centroids_list = sorted(self.centroids, key=lambda x: x.name)
        for centroid in centroids_list:
            row = [centroid.name]
            row.extend(
                self.centroid_features(centroid, field_ids, encode=False))
            row.append(centroid.count)
            if len(self.centroids) > 1:
                for measure, result in self.centroids_distance(centroid):
                    if not intercentroids:
                        headers.append(u"%s intercentroid distance" % \
                            measure.title())
                    row.append(result)
                intercentroids = True
            for measure, result in centroid.distance.items():
                if measure in CSV_STATISTICS:
                    if not header_complete:
                        headers.append(u"Distance %s" %
                                       measure.lower().replace("_", " "))
                    row.append(result)
            if not header_complete:
                rows.append(headers)
                header_complete = True
            rows.append(row)

        if self.cluster_global:
            row = [u"%s" % self.cluster_global.name]
            row.extend(
                self.centroid_features(self.cluster_global,
                                       field_ids,
                                       encode=False))
            row.append(self.cluster_global.count)
            if len(self.centroids) > 1:
                for measure, result in self.cluster_global_distance():
                    row.append(result)
            for measure, result in self.cluster_global.distance.items():
                if measure in CSV_STATISTICS:
                    row.append(result)
            # header is already in rows then insert cluster_global after it
            rows.insert(1, row)

        if file_name is None:
            return rows
        with UnicodeWriter(file_name) as writer:
            writer.writerows(rows)
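A hypothetical use of statistics_csv through a local Cluster object (the cluster id is a placeholder):

from bigml.cluster import Cluster

local_cluster = Cluster("cluster/56c42ea47e0a8d6cca01519b")
rows = local_cluster.statistics_csv()            # header row plus one row per centroid
local_cluster.statistics_csv("statistics.csv")   # or write straight to a CSV file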
Example #10
    def rules_csv(self, file_name, **kwargs):
        """Stores the rules in CSV format in the user-given file. The rules
           can be previously selected using the arguments in get_rules

        """
        rules = self.get_rules(**kwargs)
        rules = [self.describe(rule.to_csv()) for rule in rules]
        if file_name is None:
            raise ValueError("A valid file name is required to store the "
                             "rules.")
        with UnicodeWriter(file_name, quoting=csv.QUOTE_NONNUMERIC) as writer:
            writer.writerow(RULE_HEADERS)
            for rule in rules:
                writer.writerow([item if not isinstance(item, str)
                                 else item.encode("utf-8")
                                 for item in rule])
Example #11
    def statistics_csv(self, file_name=None):
        """Clusters statistic information in CSV format

        """
        rows = []
        writer = None
        field_ids = self.centroids[0].center.keys()
        headers = [u"centroid_name"]
        headers.extend(
            [u"%s" % self.fields[field_id]["name"] for field_id in field_ids])
        headers.extend([u"Instances"])
        intercentroids = False
        header_complete = False
        for centroid in self.centroids:
            row = [centroid.name]
            row.extend(self.centroid_features(centroid, field_ids))
            row.append(centroid.count)
            if len(self.centroids) > 1:
                for measure, result in self.centroids_distance(centroid):
                    if not intercentroids:
                        headers.append(u"Intercentroids %s" % measure.lower())
                    row.append(result)
                intercentroids = True
            for measure, result in centroid.distance.items():
                if measure in CSV_STATISTICS:
                    if not header_complete:
                        headers.append(u"Data %s" %
                                       measure.lower().replace("_", " "))
                    row.append(result)
            if not header_complete:
                rows.append(headers)
                header_complete = True
            rows.append(row)

        if file_name is None:
            return rows
        with UnicodeWriter(file_name) as writer:
            for row in rows:
                writer.writerow([
                    item if not isinstance(item, basestring) else
                    item.encode("utf-8") for item in row
                ])
Example #12
def write_forecasts(forecast, output):
    """Writes the final forecast to the required output

    The function creates a new file per field used in the forecast input data.
    The id of the field will be appended to the name provided in the `output`
    parameter.
    """

    for objective_id, forecast_value in forecast.items():
        headers = [f["model"] for f in forecast_value]
        points = []
        if not forecast_value:
            sys.exit("No forecasts available")
        for index in range(len(forecast_value[0]["point_forecast"])):
            points.append([f["point_forecast"][index] for f in forecast_value])
        output_file = "%s_%s.csv" % (output, objective_id)
        with UnicodeWriter(output_file, lineterminator="\n") as out_handler:
            out_handler.writerow(headers)
            for row in points:
                out_handler.writerow(row)
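The loop in write_forecasts transposes per-model point forecasts (one list per model) into CSV rows (one per horizon step). A standalone check of that shape, with plain csv standing in for UnicodeWriter and an assumed forecast_value layout:

import csv

forecast_value = [   # assumed shape of forecast[objective_id]
    {"model": "tsmodel_A", "point_forecast": [1.0, 1.1, 1.2]},
    {"model": "tsmodel_B", "point_forecast": [0.9, 1.0, 1.1]},
]
headers = [f["model"] for f in forecast_value]
with open("forecast_demo.csv", "w", newline="") as handler:
    writer = csv.writer(handler, lineterminator="\n")
    writer.writerow(headers)   # one column per model
    for index in range(len(forecast_value[0]["point_forecast"])):
        # one row per forecast horizon step
        writer.writerow([f["point_forecast"][index] for f in forecast_value])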
Example #13
def remote_predict_ensemble(ensemble_id,
                            test_reader,
                            prediction_file,
                            api,
                            args,
                            resume=False,
                            output_path=None,
                            session_file=None,
                            log=None,
                            exclude=None):
    """Retrieve predictions remotely and save predictions to file

    """
    prediction_args = {"tags": args.tag, "combiner": args.method}
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)

    if (not resume or not c.checkpoint(c.are_predictions_created,
                                       prediction_file,
                                       test_reader.number_of_tests(),
                                       debug=args.debug)[0]):
        message = u.dated("Creating remote predictions.")
        u.log_message(message, log_file=session_file, console=args.verbosity)

        with UnicodeWriter(prediction_file) as predictions_file:
            for input_data in test_reader:
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(ensemble_id,
                                                   input_data_dict,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                prediction = u.check_resource(prediction, api.get_prediction)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = prediction_to_row(prediction,
                                                   args.prediction_info)
                write_prediction(prediction_row, predictions_file,
                                 args.prediction_info, input_data, exclude)
Example #14
def centroid(clusters, fields, args, session_file=None):
    """Computes a centroid for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.predictions
    test_reader = TestReader(test_set, test_set_header, fields,
                             None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(
            args.prediction_header, output, test_reader, fields, args)

        # Local centroids: Centroids are computed locally using clusters'
        # centroids distances
        message = u.dated("Creating local centroids.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_centroid(clusters, test_reader, output, args, exclude=exclude)
    test_reader.close()
Example #15
def tree_csv(model, file_name=None, leaves_only=False):
    """Outputs the node structure to a CSV file or array

    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    headers_names = []
    if model.regression:
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("error")
        max_bins = get_node(model.tree)[model.offsets["max_bins"]]
        for index in range(0, max_bins):
            headers_names.append("bin%s_value" % index)
            headers_names.append("bin%s_instances" % index)
    else:
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("confidence")
        headers_names.append("impurity")
        node = get_node(model.tree)
        for category, _ in node[model.offsets["distribution"]]:
            headers_names.append(category)

    nodes_generator = get_nodes_info(model,
                                     headers_names,
                                     leaves_only=leaves_only)
    if file_name is not None:
        with UnicodeWriter(file_name) as writer:
            writer.writerow([utf8(header) for header in headers_names])
            for row in nodes_generator:
                writer.writerow([
                    item if not isinstance(item, str) else utf8(item)
                    for item in row
                ])
        return file_name
    rows = []
    rows.append(headers_names)
    for row in nodes_generator:
        rows.append(row)
    return rows
Example #16
def projection(pca, fields, args, session_file=None):
    """Computes the projection
    for each entry in the `test_set`.

    """
    test_set = args.test_set
    test_set_header = args.test_header
    output = args.projections
    test_reader = TestReader(test_set, test_set_header, fields, None,
                             test_separator=args.test_separator)
    with UnicodeWriter(output, lineterminator="\n") as output:
        local_pca, kwargs = _local_pca(pca, args)
        pca_headers = ["PC%s" % (i + 1) for i in \
            range(0, len(local_pca.projection({})))]
        # columns to exclude if input_data is added to the projections field
        exclude = use_projection_headers(
            args.projection_header, output, test_reader, fields, args,
            pca_headers)
        # Local projection: Projections are computed locally
        message = u.dated("Creating local projections.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        local_projection(local_pca, kwargs, test_reader,
                         output, args, exclude=exclude)
    test_reader.close()
Example #17
    def summary_csv(self, filename=None):
        """Summary of the contents of the fields

        """

        summary = []
        writer = None
        if filename is not None:
            writer = UnicodeWriter(filename).open_writer()
            writer.writerow(SUMMARY_HEADERS)
        else:
            summary.append(SUMMARY_HEADERS)

        for field_column in self.fields_columns:
            field_id = self.field_id(field_column)
            field = self.fields.get(field_id)
            field_summary = []
            field_summary.append(field.get('column_number'))
            field_summary.append(field_id)
            field_summary.append(field.get('name'))
            field_summary.append(field.get('label'))
            field_summary.append(field.get('description'))
            field_summary.append(field.get('optype'))
            field_summary_value = field.get('summary', {})

            if not field_summary_value:
                field_summary.append("")  # no preferred info
                field_summary.append("")  # no missing info
                field_summary.append("")  # no error info
                field_summary.append("")  # no content summary
                field_summary.append("")  # no error summary
            else:
                field_summary.append(json.dumps(field.get('preferred')))
                field_summary.append(field_summary_value.get("missing_count"))
                if self.field_errors and field_id in self.field_errors.keys():
                    errors = self.field_errors.get(field_id)
                    field_summary.append(errors.get("total"))
                else:
                    field_summary.append("0")
                if field['optype'] == 'numeric':
                    field_summary.append("[%s, %s], mean: %s" % \
                        (field_summary_value.get("minimum"),
                         field_summary_value.get("maximum"),
                         field_summary_value.get("mean")))
                elif field['optype'] == 'categorical':
                    categories = field_summary_value.get("categories")
                    field_summary.append( \
                        attribute_summary(categories, u"categories",
                                          limit=LIST_LIMIT))
                elif field['optype'] == "text":
                    terms = field_summary_value.get("tag_cloud")
                    field_summary.append( \
                        attribute_summary(terms, u"terms",
                                          limit=LIST_LIMIT))
                elif field['optype'] == "items":
                    items = field_summary_value.get("items")
                    field_summary.append( \
                        attribute_summary(items, u"items", limit=LIST_LIMIT))
                else:
                    field_summary.append("")
                if self.field_errors and field_id in self.field_errors.keys():
                    field_summary.append( \
                        attribute_summary(errors.get("sample"), u"errors",
                                          limit=None))
                else:
                    field_summary.append("")
            if writer:
                writer.writerow(field_summary)
            else:
                summary.append(field_summary)
        if writer is None:
            return summary
        else:
            writer.close_writer()
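A typical call, assuming the bindings' Fields wrapper and a placeholder dataset id:

from bigml.api import BigML
from bigml.fields import Fields

api = BigML()
dataset = api.get_dataset("dataset/5143a51a37203f2cf7000972")  # placeholder id
fields = Fields(dataset)
fields.summary_csv("fields_summary.csv")   # SUMMARY_HEADERS plus one row per field
rows = fields.summary_csv()                # or get the rows back as a list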
Example #18
def best_candidates_number(datasets_file, args, common_options,
                           penalty=None,
                           resume=False):
    """Selecting the best number of random candidates
       to be used in the ensemble construction

    """
    loop_counter = 0
    candidates_file = os.path.normpath(os.path.join(args.output_dir,
                                                    CANDIDATES_LOG))
    candidates_writer = UnicodeWriter(candidates_file).open_writer()
    candidates_writer.writerow(CANDIDATES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "random"))
    max_candidates = args.max_candidates + 1

    if args.nodes_step is None:
        args.nodes_step = DEFAULT_CANDIDATES_STEP
    random_candidates = args.min_candidates

    if penalty is None:
        penalty = DEFAULT_CANDIDATES_PENALTY
    best_score = - float('inf')
    metric = args.optimize
    score = best_score
    best_counter = 0
    while random_candidates < max_candidates:
        loop_counter += 1
        (score,
         metric_value,
         metric,
         resume) = candidates_evaluate(datasets_file, args,
                                       random_candidates, common_options,
                                       penalty=penalty, resume=resume,
                                       metric=metric)
        candidates_writer.writerow([
            loop_counter, random_candidates, score, metric_value,
            best_score])
        if (score - EPSILON) > best_score:
            best_candidates = random_candidates
            best_score = score
            best_counter = loop_counter
            message = 'New best random candidates number is: %s\n' % \
                best_candidates
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value,
                                                      score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        random_candidates += DEFAULT_CANDIDATES_STEP
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "random%s" % best_counter,
                                           args, resume)
    message = ('The best random candidates number is: %s \n'
               % best_candidates)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    candidates_writer.close_writer()
    return best_candidates
Example #19
def best_node_threshold(datasets_file, args, common_options,
                        staleness=None, penalty=None,
                        resume=False):
    """Selecting the node_limit to be used in the model construction

    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir,
                                               NODES_LOG))
    nodes_writer = UnicodeWriter(nodes_file).open_writer()
    nodes_writer.writerow(NODES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "node_th"))
    max_nodes = args.max_nodes + 1

    if args.min_nodes is None:
        args.min_nodes = DEFAULT_MIN_NODES
    if args.nodes_step is None:
        args.nodes_step = DEFAULT_NODES_STEP
    node_threshold = args.min_nodes
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_NODES_PENALTY
    best_score = - float('inf')
    best_unchanged_count = 0
    metric = args.optimize
    score = best_score
    best_counter = 0
    while best_unchanged_count < staleness and node_threshold < max_nodes:
        loop_counter += 1
        (score,
         metric_value,
         metric,
         resume) = node_threshold_evaluate(datasets_file, args,
                                           node_threshold, common_options,
                                           penalty=penalty, resume=resume,
                                           metric=metric)
        nodes_writer.writerow([
            loop_counter - 1, node_threshold, score, metric_value, best_score])
        if (score - EPSILON) > best_score:
            best_threshold = node_threshold
            best_score = score
            best_unchanged_count = 0
            best_counter = loop_counter
            message = 'New best node threshold: %s\n' % (best_threshold)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value,
                                                      score)
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)
        else:
            best_unchanged_count += 1
        node_threshold += args.nodes_step
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "node_th%s" % best_counter,
                                           args, resume)
    message = ('The best node threshold is: %s \n'
               % best_threshold)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    nodes_writer.close_writer()
    return best_threshold
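best_candidates_number and best_node_threshold (and their command_obj variants in Examples #21 and #22) share one search skeleton: sweep a parameter, evaluate each value, accept improvements only beyond an EPSILON tolerance, and, for the node search, stop after `staleness` non-improving rounds. Reduced to its core (evaluate and the constant are stand-ins for bigmler's evaluation helpers):

EPSILON = 0.001   # stand-in tolerance; bigmler defines its own

def sweep(evaluate, start, stop, step, staleness):
    # staleness-bounded hill climb over a single numeric parameter
    best_value, best_score, unchanged = start, -float("inf"), 0
    value = start
    while unchanged < staleness and value < stop:
        score = evaluate(value)
        if (score - EPSILON) > best_score:   # improve only beyond tolerance
            best_value, best_score, unchanged = value, score, 0
        else:
            unchanged += 1
        value += step
    return best_value, best_score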
Example #20
    def batch_predict(self, input_data_list, output_file_path=None,
                      by_name=True, reuse=False,
                      missing_strategy=LAST_PREDICTION, headers=None,
                      to_file=True, use_median=False):
        """Makes predictions for a list of input data.

           When the to_file argument is set to True, the predictions
           generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
            Otherwise, if it is False, the function returns a list
            of MultiVote objects with the models' predictions.
        """
        add_headers = (isinstance(input_data_list[0], list) and
                       headers is not None and
                       len(headers) == len(input_data_list[0]))
        if not add_headers and not isinstance(input_data_list[0], dict):
            raise ValueError("Input data list is not a dictionary or the"
                             " headers and input data information are not"
                             " consistent.")
        order = 0
        if not to_file:
            votes = []

        for model in self.models:
            order += 1
            out = None
            if to_file:
                output_file = get_predictions_file_name(model.resource_id,
                                                        output_file_path)
                if reuse:
                    try:
                        predictions_file = open(output_file)
                        predictions_file.close()
                        continue
                    except IOError:
                        pass
                try:
                    out = UnicodeWriter(output_file)
                except IOError:
                    raise Exception("Cannot find %s directory." %
                                    output_file_path)

            if out:
                out.open_writer()
            for index, input_data in enumerate(input_data_list):
                if add_headers:
                    input_data = dict(zip(headers, input_data))
                prediction = model.predict(input_data,
                                           by_name=by_name,
                                           with_confidence=True,
                                           missing_strategy=missing_strategy)
                if use_median and model.tree.regression:
                    # if median is to be used, we just place it as prediction
                    # starting the list
                    prediction[0] = prediction[-1]
                prediction = prediction[:-1]
                if to_file:
                    out.writerow(prediction)
                else:
                    # prediction is a row that contains prediction, confidence,
                    # distribution, instances
                    prediction_row = prediction[0: 2]
                    prediction_row.append(order)
                    prediction_row.extend(prediction[2:])

                    if len(votes) <= index:
                        votes.append(MultiVote([]))
                    votes[index].append_row(prediction_row)
            if out:
                out.close_writer()
        if not to_file:
            return votes
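A hypothetical batch_predict call through a MultiModel built over two models (the ids are placeholders; the first comes from the docstring above). With to_file=False the per-model CSV files are skipped and a list of MultiVote objects comes back instead:

from bigml.multimodel import MultiModel

multimodel = MultiModel(["model/50c0de043b563519830001c2",
                         "model/50c0de043b563519830001c3"])
votes = multimodel.batch_predict([{"petal length": 4.2}],
                                 to_file=False)   # one MultiVote per input row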
Example #21
def best_candidates_number(datasets_file,
                           args,
                           command_obj,
                           penalty=None,
                           resume=False):
    """Selecting the best number of random candidates
       to be used in the ensemble construction

    """
    loop_counter = 0
    candidates_file = os.path.normpath(
        os.path.join(args.output_dir, CANDIDATES_LOG))
    candidates_writer = UnicodeWriter(candidates_file).open_writer()
    candidates_writer.writerow(CANDIDATES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir, "random"))
    max_candidates = args.max_candidates + 1

    if args.nodes_step is None:
        args.nodes_step = DEFAULT_CANDIDATES_STEP
    random_candidates = args.min_candidates

    if penalty is None:
        penalty = DEFAULT_CANDIDATES_PENALTY
    best_score = -float('inf')
    metric = args.optimize
    score = best_score
    best_counter = 0
    while random_candidates < max_candidates:
        loop_counter += 1
        (score, metric_value, metric,
         resume) = candidates_evaluate(datasets_file,
                                       args,
                                       random_candidates,
                                       command_obj,
                                       penalty=penalty,
                                       resume=resume,
                                       metric=metric)
        candidates_writer.writerow(
            [loop_counter, random_candidates, score, metric_value, best_score])
        if (score - EPSILON) > best_score:
            best_candidates = random_candidates
            best_score = score
            best_counter = loop_counter
            message = 'New best random candidates number is: %s\n' % \
                best_candidates
            u.log_message(message,
                          log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message,
                          log_file=session_file,
                          console=args.verbosity)
        random_candidates += DEFAULT_CANDIDATES_STEP
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "random%s" % best_counter, args,
                                           resume)
    message = ('The best random candidates number is: %s \n' % best_candidates)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    candidates_writer.close_writer()
    return best_candidates
Example #22
def best_node_threshold(datasets_file,
                        args,
                        command_obj,
                        staleness=None,
                        penalty=None,
                        resume=False):
    """Selecting the node_limit to be used in the model construction

    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir, NODES_LOG))
    nodes_writer = UnicodeWriter(nodes_file).open_writer()
    nodes_writer.writerow(NODES_HEADER)
    args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                    "node_th"))
    max_nodes = args.max_nodes + 1

    if args.min_nodes is None:
        args.min_nodes = DEFAULT_MIN_NODES
    if args.nodes_step is None:
        args.nodes_step = DEFAULT_NODES_STEP
    node_threshold = args.min_nodes
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_NODES_PENALTY
    best_score = -float('inf')
    best_unchanged_count = 0
    metric = args.optimize
    score = best_score
    best_counter = 0
    while best_unchanged_count < staleness and node_threshold < max_nodes:
        loop_counter += 1
        (score, metric_value, metric,
         resume) = node_threshold_evaluate(datasets_file,
                                           args,
                                           node_threshold,
                                           command_obj,
                                           penalty=penalty,
                                           resume=resume,
                                           metric=metric)
        nodes_writer.writerow([
            loop_counter - 1, node_threshold, score, metric_value, best_score
        ])
        if (score - EPSILON) > best_score:
            best_threshold = node_threshold
            best_score = score
            best_unchanged_count = 0
            best_counter = loop_counter
            message = 'New best node threshold: %s\n' % (best_threshold)
            u.log_message(message,
                          log_file=session_file,
                          console=args.verbosity)
            if metric in PERCENT_EVAL_METRICS:
                message = '%s = %0.2f%% (score = %s)\n' % (
                    metric.capitalize(), metric_value * 100, score)
            else:
                message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                      metric_value, score)
            u.log_message(message,
                          log_file=session_file,
                          console=args.verbosity)
        else:
            best_unchanged_count += 1
        node_threshold += args.nodes_step
    if args.predictions_csv:
        resume = create_prediction_dataset(args.output_dir,
                                           "node_th%s" % best_counter, args,
                                           resume)
    message = ('The best node threshold is: %s \n' % best_threshold)
    u.log_message(message, log_file=session_file, console=1)
    if metric in PERCENT_EVAL_METRICS:
        message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                       (best_score * 100)))
    else:
        message = ('%s = %f\n' % (metric.capitalize(), best_score))
    u.log_message(message, log_file=session_file, console=1)
    nodes_writer.close_writer()
    return best_threshold
Example #23
    def summary_csv(self, filename=None):
        """Summary of the contents of the fields

        """

        summary = []
        writer = None
        if filename is not None:
            writer = UnicodeWriter(filename).open_writer()
            writer.writerow(SUMMARY_HEADERS)
        else:
            summary.append(SUMMARY_HEADERS)

        for field_column in self.fields_columns:
            field_id = self.field_id(field_column)
            field = self.fields.get(field_id)
            field_summary = []
            field_summary.append(field.get('column_number'))
            field_summary.append(field_id)
            field_summary.append(field.get('name'))
            field_summary.append(field.get('label'))
            field_summary.append(field.get('description'))
            field_summary.append(field.get('optype'))
            field_summary_value = field.get('summary', {})

            if not field_summary_value:
                field_summary.append("") # no preferred info
                field_summary.append("") # no missing info
                field_summary.append("") # no error info
                field_summary.append("") # no content summary
                field_summary.append("") # no error summary
            else:
                field_summary.append(json.dumps(field.get('preferred')))
                field_summary.append(field_summary_value.get("missing_count"))
                if self.field_errors and field_id in self.field_errors.keys():
                    errors = self.field_errors.get(field_id)
                    field_summary.append(errors.get("total"))
                else:
                    field_summary.append("0")
                if field['optype'] == 'numeric':
                    field_summary.append("[%s, %s], mean: %s" % \
                        (field_summary_value.get("minimum"),
                         field_summary_value.get("maximum"),
                         field_summary_value.get("mean")))
                elif field['optype'] == 'categorical':
                    categories = field_summary_value.get("categories")
                    field_summary.append( \
                        attribute_summary(categories, u"categories",
                                          limit=LIST_LIMIT))
                elif field['optype'] == "text":
                    terms = field_summary_value.get("tag_cloud")
                    field_summary.append( \
                        attribute_summary(terms, u"terms",
                                          limit=LIST_LIMIT))
                elif field['optype'] == "items":
                    items = field_summary_value.get("items")
                    field_summary.append( \
                        attribute_summary(items, u"items", limit=LIST_LIMIT))
                else:
                    field_summary.append("")
                if self.field_errors and field_id in self.field_errors.keys():
                    field_summary.append( \
                        attribute_summary(errors.get("sample"), u"errors",
                                          limit=None))
                else:
                    field_summary.append("")
            if writer:
                writer.writerow(field_summary)
            else:
                summary.append(field_summary)
        if writer is None:
            return summary
        else:
            writer.close_writer()
Example #24
    def batch_predict(self,
                      input_data_list,
                      output_file_path=None,
                      by_name=True,
                      reuse=False,
                      missing_strategy=LAST_PREDICTION,
                      headers=None,
                      to_file=True,
                      use_median=False):
        """Makes predictions for a list of input data.

           When the to_file argument is set to True, the predictions
           generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
            Otherwise, if it is False, the function returns a list
            of MultiVote objects with the models' predictions.
        """
        add_headers = (isinstance(input_data_list[0], list)
                       and headers is not None
                       and len(headers) == len(input_data_list[0]))
        if not add_headers and not isinstance(input_data_list[0], dict):
            raise ValueError("Input data list is not a dictionary or the"
                             " headers and input data information are not"
                             " consistent.")
        order = 0
        if not to_file:
            votes = []

        for model in self.models:
            order += 1
            out = None
            if to_file:
                output_file = get_predictions_file_name(
                    model.resource_id, output_file_path)
                if reuse:
                    try:
                        predictions_file = open(output_file)
                        predictions_file.close()
                        continue
                    except IOError:
                        pass
                try:
                    out = UnicodeWriter(output_file)
                except IOError:
                    raise Exception("Cannot find %s directory." %
                                    output_file_path)

            if out:
                out.open_writer()
            for index, input_data in enumerate(input_data_list):
                if add_headers:
                    input_data = dict(zip(headers, input_data))
                prediction = model.predict(input_data,
                                           by_name=by_name,
                                           with_confidence=True,
                                           missing_strategy=missing_strategy)
                if use_median and model.tree.regression:
                    # if median is to be used, we just place it as prediction
                    # starting the list
                    prediction[0] = prediction[-1]
                prediction = prediction[:-1]
                if to_file:
                    out.writerow(prediction)
                else:
                    # prediction is a row that contains prediction, confidence,
                    # distribution, instances
                    prediction_row = prediction[0:2]
                    prediction_row.append(order)
                    prediction_row.extend(prediction[2:])

                    if len(votes) <= index:
                        votes.append(MultiVote([]))
                    votes[index].append_row(prediction_row)
            if out:
                out.close_writer()
        if not to_file:
            return votes
Example #25
def predict(models,
            fields,
            args,
            api=None,
            log=None,
            resume=False,
            session_file=None,
            labels=None,
            models_per_label=1,
            other_label=OTHER,
            multi_label_data=None):
    """Computes a prediction for each entry in the `test_set`.

       Predictions computed locally using MultiModels on subgroups of models.
       Choosing a max_batch_models value no bigger than the number_of_models
       flag leads to the last case, where memory usage is bounded and each
       model's predictions are saved for further use.
    """
    test_set = args.test_set
    test_set_header = args.test_header
    objective_field = args.objective_field
    output = args.predictions
    test_reader = TestReader(test_set,
                             test_set_header,
                             fields,
                             objective_field,
                             test_separator=args.test_separator)

    prediction_file = output
    output_path = u.check_dir(output)
    with UnicodeWriter(output) as output:
        # columns to exclude if input_data is added to the prediction field
        exclude = use_prediction_headers(args.prediction_header, output,
                                         test_reader, fields, args,
                                         objective_field)

        # Remote predictions: predictions are computed in bigml.com and stored
        # in a file named after the model in the following syntax:
        #     model_[id of the model]__predictions.csv
        # For instance,
        #     model_50c0de043b563519830001c2__predictions.csv
        # Predictions are computed individually only if no_batch flag is set
        if args.remote and args.no_batch and not args.multi_label:
            if args.ensemble is not None:
                remote_predict_ensemble(args.ensemble, test_reader,
                                        prediction_file, api, args, resume,
                                        output_path, session_file, log,
                                        exclude)
            else:
                remote_predict_models(models, test_reader, prediction_file,
                                      api, args, resume, output_path,
                                      session_file, log, exclude)
            return
        # Local predictions: Predictions are computed locally using models'
        # rules with MultiModel's predict method
        message = u.dated("Creating local predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)
        options = {}
        if args.method == THRESHOLD_CODE:
            options.update(threshold=args.threshold)
            if args.threshold_class is None:
                local_model = Model(models[0])
                # default class is the first class that appears in the dataset
                # objective field summary, which might be different from the
                # objective summary of each model because models are built with
                # sampling
                objective_field = local_model.objective_id
                distribution = local_model.tree.fields[objective_field][ \
                    "summary"]["categories"]
                args.threshold_class = distribution[0][0]
            options.update(category=args.threshold_class)
        # For a model we build a Model and for a small number of models,
        # we build a MultiModel using all of
        # the given models and issue a combined prediction
        if (len(models) <= args.max_batch_models
                and args.fast
                and not args.multi_label and args.max_categories == 0
                and args.method != COMBINATION):
            local_predict(models, test_reader, output, args, options, exclude)
        elif args.boosting:
            local_predict(args.ensemble, test_reader, output, args, options,
                          exclude)
        # For large numbers of models, we split the list of models in chunks
        # and build a MultiModel for each chunk, issue and store predictions
        # for each model and combine all of them eventually.
        else:
            # Local predictions: predictions are computed locally using
            # models' rules with MultiModel's predict method and combined using
            # aggregation if the objective field is a multi-labelled field
            # or one of the available combination methods: plurality,
            # confidence weighted and probability weighted
            if args.multi_label:
                method = AGGREGATION
            elif args.max_categories > 0:
                method = COMBINATION
            else:
                method = args.method

            # For multi-labelled models, the --models flag keeps the order
            # of the labels and the models but the --model-tag flag
            # retrieves the models with no order, so the correspondence with
            # each label must be restored.
            ordered = True
            if args.multi_label and (args.model_tag is not None
                                     or models_per_label > 1):
                ordered = False
            local_batch_predict(models,
                                test_reader,
                                prediction_file,
                                api,
                                args,
                                resume=resume,
                                output_path=output_path,
                                output=output,
                                method=method,
                                options=options,
                                session_file=session_file,
                                labels=labels,
                                ordered=ordered,
                                exclude=exclude,
                                models_per_label=models_per_label,
                                other_label=other_label,
                                multi_label_data=multi_label_data)
    test_reader.close()
Example #26
def remote_predict_models(models,
                          test_reader,
                          prediction_file,
                          api,
                          args,
                          resume=False,
                          output_path=None,
                          session_file=None,
                          log=None,
                          exclude=None):
    """Retrieve predictions remotely, combine them and save predictions to file

    """
    predictions_files = []
    prediction_args = {"tags": args.tag}
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False

    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    single_model = len(models) == 1
    if single_model:
        prediction_file = UnicodeWriter(prediction_file).open_writer()
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or not c.checkpoint(c.are_predictions_created,
                                           predictions_file,
                                           test_reader.number_of_tests(),
                                           debug=args.debug)[0]):
            if not message_logged:
                message = u.dated("Creating remote predictions.\n")
                u.log_message(message,
                              log_file=session_file,
                              console=args.verbosity)
            message_logged = True
            with UnicodeWriter(predictions_file) as predictions_file:
                for input_data in raw_input_data_list:
                    input_data_dict = test_reader.dict(input_data)
                    prediction = api.create_prediction(model,
                                                       input_data_dict,
                                                       wait_time=0,
                                                       args=prediction_args)
                    u.check_resource_error(prediction,
                                           "Failed to create prediction: ")
                    u.log_message("%s\n" % prediction['resource'],
                                  log_file=log)
                    prediction_row = prediction_to_row(prediction)
                    predictions_file.writerow(prediction_row)
                    if single_model:
                        write_prediction(prediction_row[0:2], prediction_file,
                                         args.prediction_info, input_data,
                                         exclude)
    if single_model:
        prediction_file.close_writer()
    else:
        combine_votes(predictions_files,
                      Model(models[0]).to_prediction, prediction_file,
                      args.method, args.prediction_info, raw_input_data_list,
                      exclude)