Example #1
    def batch_votes(self, predictions_file_path, data_locale=None):
        """Adds the votes for predictions generated by the models.

           Returns a list of prediction groups. A prediction group is a dict
           whose key is the prediction and whose value is a list of the
           confidences with which the prediction has been issued.
        """

        predictions_files = []
        for model in self.models:
            predictions_files.append((model, csv.reader(open(
                get_predictions_file_name(model.resource_id,
                                          predictions_file_path), "U"),
                lineterminator="\n")))
        votes = []
        predictions = {}
        prediction = True
        while prediction:
            if predictions:
                votes.append(predictions)
                predictions = {}
            for (model, handler) in predictions_files:
                try:
                    row = handler.next()
                    prediction = row[0]
                    confidence = float(row[1])
                except StopIteration:
                    prediction = False
                    break
                prediction = model.to_prediction(prediction, data_locale)
                if prediction not in predictions:
                    predictions[prediction] = []
                predictions[prediction].append(confidence)

        return votes
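
A minimal usage sketch for batch_votes follows (not part of the library
source: the credentials, model ids, input data and paths are placeholders,
and the predictions directory is assumed to exist):

from bigml.api import BigML
from bigml.multimodel import MultiModel

api = BigML()  # assumes BIGML_USERNAME / BIGML_API_KEY in the environment
model_ids = ["model/50c0de043b563519830001c2",
             "model/50c0de043b563519830001c3"]
multi_model = MultiModel([api.get_model(model_id)
                          for model_id in model_ids])
# batch_predict must run first so that the per-model prediction files
# exist under the target directory
multi_model.batch_predict([{"petal length": 3, "petal width": 1}],
                          "./predictions")
votes = multi_model.batch_votes("./predictions")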
Example #2
    def batch_predict(self, input_data_list, output_file_path,
                      by_name=True, reuse=False):
        """Makes predictions for a list of input data.

           The predictions generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
        """
        for model in self.models:
            output_file = get_predictions_file_name(model.resource_id,
                                                    output_file_path)
            if reuse:
                try:
                    predictions_file = open(output_file)
                    predictions_file.close()
                    continue
                except IOError:
                    pass
            try:
                predictions_file = csv.writer(open(output_file, 'w', 0),
                                              lineterminator="\n")
            except IOError:
                raise Exception("Cannot find %s directory." % output_file_path)
            for input_data in input_data_list:
                prediction = model.predict(input_data,
                                           by_name=by_name,
                                           with_confidence=True)
                if isinstance(prediction[0], basestring):
                    prediction[0] = prediction[0].encode("utf-8")
                predictions_file.writerow(prediction)
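
The file naming convention described in the docstring can be reproduced
with a quick sketch (assuming, as the docstring itself suggests, that
get_predictions_file_name swaps the slash in the resource id for an
underscore):

resource_id = "model/50c0de043b563519830001c2"
file_name = "%s__predictions.csv" % resource_id.replace("/", "_")
print(file_name)  # model_50c0de043b563519830001c2__predictions.csv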
Example #3
def remote_predict_models(models, test_reader, prediction_file, api, args,
                          resume=False, output_path=None,
                          session_file=None, log=None, exclude=None):
    """Retrieve predictions remotely, combine them and save predictions to file

    """
    predictions_files = []
    prediction_args = {
        "tags": args.tag
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False

    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    single_model = len(models) == 1
    if single_model:
        prediction_file = UnicodeWriter(prediction_file).open_writer()
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model,
                                                     output_path)
        predictions_files.append(predictions_file)
        if (not resume or
                not c.checkpoint(c.are_predictions_created, predictions_file,
                                 test_reader.number_of_tests(),
                                 debug=args.debug)[0]):
            if not message_logged:
                message = u.dated("Creating remote predictions.\n")
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            message_logged = True
            with UnicodeWriter(predictions_file) as predictions_file:
                for input_data in raw_input_data_list:
                    input_data_dict = test_reader.dict(input_data)
                    prediction = api.create_prediction(model, input_data_dict,
                                                       by_name=test_set_header,
                                                       wait_time=0,
                                                       args=prediction_args)
                    u.check_resource_error(prediction,
                                           "Failed to create prediction: ")
                    u.log_message("%s\n" % prediction['resource'],
                                  log_file=log)
                    prediction_row = prediction_to_row(prediction)
                    predictions_file.writerow(prediction_row)
                    if single_model:
                        write_prediction(prediction_row[0:2], prediction_file,
                                         args.prediction_info,
                                         input_data, exclude)
    if single_model:
        prediction_file.close_writer()
    else:
        combine_votes(predictions_files,
                      Model(models[0]).to_prediction,
                      prediction_file, args.method,
                      args.prediction_info, raw_input_data_list, exclude)
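
The remote call at the core of the loop above, shown in isolation as a
sketch (placeholder credentials, model id, input data and tags):

from bigml.api import BigML

api = BigML()  # assumes credentials in the environment
prediction = api.create_prediction("model/50c0de043b563519830001c2",
                                   {"petal length": 3},
                                   args={"tags": ["bigmler"]})
print(prediction['resource'])  # the new prediction's resource id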
Example #4
def local_batch_predict(models, headers, test_reader, exclude, fields, resume,
                        output_path, max_models, number_of_tests, api, output,
                        verbosity, method, objective_field, session_file,
                        debug):
    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    input_data_list = []
    for row in test_reader:
        for index in exclude:
            del row[index]
        input_data_list.append(fields.pair(row, headers,
                                           objective_field))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                u.checkpoint(u.are_predictions_created,
                             pred_file,
                             number_of_tests, debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            complete_models.append(api.check_resource(
                models_split[index], api.get_model))

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list,
                                  output_path, reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes
    message = u.dated("Combining predictions.\n")
    u.log_message(message, log_file=session_file, console=verbosity)
    for multivote in total_votes:
        u.write_prediction(multivote.combine(method), output)
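
Note that draw_progress_bar computes its percentage with Python 2 integer
division; a quick arithmetic check of the formula (plain Python, no
library assumptions):

# 100 - ((total - current) * 100) / total truncates under Python 2;
# // reproduces that behaviour on either Python version
current, total = 3, 7
pct = 100 - ((total - current) * 100) // total
print(pct)  # 43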
Example #5
    def batch_votes(self, predictions_file_path, data_locale=None):
        """Adds the votes for predictions generated by the models.

           Returns a list of MultiVote objects each of which contains a list
           of predictions.
        """

        votes_files = []
        for model in self.models:
            votes_files.append(get_predictions_file_name(
                model.resource_id, predictions_file_path))
        return read_votes(votes_files, self.models[0].to_prediction,
                          data_locale=data_locale)
Example #6
def remote_predict(models, test_reader, prediction_file, api,
                   resume=False,
                   verbosity=True, output_path=None,
                   method=PLURALITY_CODE, tags="",
                   session_file=None, log=None, debug=False,
                   prediction_info=None):
    """Retrieve predictions remotely, combine them and save predictions to file

    """

    predictions_files = []
    prediction_args = {
        "tags": tags
    }
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False
    raw_input_data_list = []
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model,
                                                     output_path)
        predictions_files.append(predictions_file)
        if (not resume or
            not c.checkpoint(c.are_predictions_created, predictions_file,
                             test_reader.number_of_tests(), debug=debug)):
            if not message_logged:
                message = u.dated("Creating remote predictions.")
                u.log_message(message, log_file=session_file,
                              console=verbosity)
            message_logged = True

            predictions_file = csv.writer(open(predictions_file, 'w', 0),
                                          lineterminator="\n")

            for input_data in test_reader:
                raw_input_data_list.append(input_data)
                input_data_dict = test_reader.dict(input_data)
                prediction = api.create_prediction(model, input_data_dict,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                u.check_resource_error(prediction,
                                       "Failed to create prediction: ")
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = prediction_to_row(prediction)
                predictions_file.writerow(prediction_row)
    combine_votes(predictions_files,
                  Model(models[0]).to_prediction,
                  prediction_file, method,
                  prediction_info, raw_input_data_list)
Example #7
def remote_predict(models, headers, output_path, number_of_tests, resume,
                   verbosity, test_reader, exclude, fields, api,
                   prediction_file, method, tags, objective_field,
                   session_file, test_set_header, log, debug):
    """Retrieve predictions remotely, combine them and save predictions to file

    """

    predictions_files = []
    prediction_args = {
        "tags": tags
    }
    for model in models:
        if not isinstance(model, basestring) and 'resource' in model:
            model = model['resource']
        predictions_file = get_predictions_file_name(model,
                                                     output_path)
        predictions_files.append(predictions_file)
        if (not resume or
            not u.checkpoint(u.are_predictions_created, predictions_file,
                             number_of_tests, debug=debug)):
            message = u.dated("Creating remote predictions.\n")
            u.log_message(message, log_file=session_file,
                          console=verbosity)

            predictions_file = csv.writer(open(predictions_file, 'w', 0))
            for row in test_reader:
                for index in exclude:
                    del row[index]
                input_data = fields.pair(row, headers, objective_field)
                prediction = api.create_prediction(model, input_data,
                                                   by_name=test_set_header,
                                                   wait_time=0,
                                                   args=prediction_args)
                u.log_message("%s\n" % prediction['resource'], log_file=log)
                prediction_row = u.prediction_to_row(prediction)
                predictions_file.writerow(prediction_row)
    u.combine_votes(predictions_files,
                    Model(models[0]).to_prediction,
                    prediction_file, method)
Example #8
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS,
                        resume=False, output_path=None, output=None,
                        verbosity=True, method=PLURALITY_CODE, options=None,
                        session_file=None, debug=False,
                        prediction_info=NORMAL_FORMAT,
                        labels=None, label_separator=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):

    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []

        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s. %s" % (model,
                                                              str(exception)))
            # When user selects the labels in multi-label predictions, we must
            # filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data[
                    'generated_fields'][objective_column]
                labels_columns = [label_info[1] for label_info in labels_info
                                  if label_info[0] in labels]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if (model_column in labels_columns):
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)

        if complete_models:
            local_model = MultiModel(complete_models)
            local_model.batch_predict(input_data_list,
                                      output_path,
                                      by_name=test_set_header,
                                      reuse=True)
            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
Example #9
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):

    """Get local predictions form partial Multimodel, combine and save to file

    """

    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct), reset=True)

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    # Input data is stored as a list and predictions are made for all rows
    # with each model
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    total_votes = []
    models_order = []
    models_count = 0
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    # processing the models in slots
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(), debug=args.debug)
        # retrieving the full models allowed by --max-batch-models to be used
        # in a multimodel slot
        complete_models, models_order = retrieve_models_split(
            models_split, api, query_string=query_string, labels=labels,
            multi_label_data=multi_label_data, ordered=ordered,
            models_order=models_order)

        # predicting with the multimodel slot
        if complete_models:
            local_model = MultiModel(complete_models, api=api)
            # added to ensure garbage collection at each step of the loop
            gc.collect()
            try:
                votes = local_model.batch_predict(
                    raw_input_data_list, output_path, by_name=test_set_header,
                    reuse=True, missing_strategy=args.missing_strategy,
                    headers=test_reader.raw_headers, to_file=(not args.fast),
                    use_median=args.median)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")

            # extending the votes for each input data with the new model-slot
            # predictions
            if not args.fast:
                votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)

            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes

    if not single_model:
        message = u.dated("Combining predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)

    # combining the votes to issue the final prediction for each input data
    for index in range(0, len(total_votes)):
        multivote = total_votes[index]
        input_data = raw_input_data_list[index]

        if single_model:
            # single model predictions need no combination
            prediction = [multivote.predictions[0]['prediction'],
                          multivote.predictions[0]['confidence']]
        elif method == AGGREGATION:
            # multi-labeled fields: predictions are concatenated
            prediction = aggregate_multivote(
                multivote, options, labels, models_per_label, ordered,
                models_order, label_separator=args.label_separator)
        elif method == COMBINATION:
            # used in --max-categories flag: each model slot contains a
            # subset of categories and the predictions for all of them
            # are combined in a global distribution to obtain the final
            # prediction
            prediction = combine_multivote(multivote, other_label=other_label)
        else:
            prediction = multivote.combine(method=method, with_confidence=True,
                                           options=options)

        write_prediction(prediction, output, args.prediction_info, input_data,
                         exclude)
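
The default branch above delegates to MultiVote.combine; a standalone
sketch of that combiner (the vote rows are made up, and PLURALITY_CODE is
assumed importable from bigml.multivote, as in bigmler itself):

from bigml.multivote import MultiVote, PLURALITY_CODE

predictions = [{"prediction": "Iris-setosa", "confidence": 0.91},
               {"prediction": "Iris-setosa", "confidence": 0.85},
               {"prediction": "Iris-versicolor", "confidence": 0.33}]
multivote = MultiVote(predictions)
# plurality voting picks the most frequent class, here "Iris-setosa"
print(multivote.combine(method=PLURALITY_CODE))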
Example #10
    def batch_predict(self,
                      input_data_list,
                      output_file_path=None,
                      by_name=True,
                      reuse=False,
                      missing_strategy=LAST_PREDICTION,
                      headers=None,
                      to_file=True):
        """Makes predictions for a list of input data.

           When the to_file argument is set to True, the predictions
           generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
           If it is False instead, the function returns a list of
           MultiVote objects with the models' predictions.
        """
        add_headers = (isinstance(input_data_list[0], list)
                       and headers is not None
                       and len(headers) == len(input_data_list[0]))
        if not add_headers and not isinstance(input_data_list[0], dict):
            raise ValueError("Input data list is not a dictionary or the"
                             " headers and input data information are not"
                             " consistent.")
        order = 0
        if not to_file:
            votes = []

        for model in self.models:
            order += 1
            if to_file:
                output_file = get_predictions_file_name(
                    model.resource_id, output_file_path)
                if reuse:
                    try:
                        predictions_file = open(output_file)
                        predictions_file.close()
                        continue
                    except IOError:
                        pass
                try:
                    predictions_file = csv.writer(open(output_file, 'w', 0),
                                                  lineterminator="\n")
                except IOError:
                    raise Exception("Cannot find %s directory." %
                                    output_file_path)

            for index, input_data in enumerate(input_data_list):
                if add_headers:
                    input_data = dict(zip(headers, input_data))
                prediction = model.predict(input_data,
                                           by_name=by_name,
                                           with_confidence=True,
                                           missing_strategy=missing_strategy)
                if to_file:
                    if isinstance(prediction[0], basestring):
                        prediction[0] = prediction[0].encode("utf-8")
                    predictions_file.writerow(prediction)
                else:
                    prediction, confidence, distribution, instances = prediction
                    prediction_row = [
                        prediction, confidence, order, distribution, instances
                    ]
                    if len(votes) <= index:
                        votes.append(MultiVote([]))
                    votes[index].append_row(prediction_row)

        if not to_file:
            return votes
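
With to_file=False the votes stay in memory; a minimal sketch of consuming
the return value (placeholder credentials, model id and input data):

from bigml.api import BigML
from bigml.multimodel import MultiModel

api = BigML()
multi_model = MultiModel([api.get_model("model/50c0de043b563519830001c2")])
votes = multi_model.batch_predict([{"petal length": 3}], to_file=False)
for multivote in votes:
    # each appended row holds prediction, confidence, model order,
    # distribution and instances, as assembled in the loop above
    print(multivote.predictions)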
Example #11
def remote_predict_models(models,
                          test_reader,
                          prediction_file,
                          api,
                          args,
                          resume=False,
                          output_path=None,
                          session_file=None,
                          log=None,
                          exclude=None):
    """Retrieve predictions remotely, combine them and save predictions to file

    """
    predictions_files = []
    prediction_args = {"tags": args.tag}
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    message_logged = False

    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    single_model = len(models) == 1
    if single_model:
        prediction_file = UnicodeWriter(prediction_file).open_writer()
    for model in models:
        model = bigml.api.get_model_id(model)
        predictions_file = get_predictions_file_name(model, output_path)
        predictions_files.append(predictions_file)
        if (not resume or not c.checkpoint(c.are_predictions_created,
                                           predictions_file,
                                           test_reader.number_of_tests(),
                                           debug=args.debug)[0]):
            if not message_logged:
                message = u.dated("Creating remote predictions.\n")
                u.log_message(message,
                              log_file=session_file,
                              console=args.verbosity)
            message_logged = True
            with UnicodeWriter(predictions_file) as predictions_file:
                for input_data in raw_input_data_list:
                    input_data_dict = test_reader.dict(input_data)
                    prediction = api.create_prediction(model,
                                                       input_data_dict,
                                                       by_name=test_set_header,
                                                       wait_time=0,
                                                       args=prediction_args)
                    u.check_resource_error(prediction,
                                           "Failed to create prediction: ")
                    u.log_message("%s\n" % prediction['resource'],
                                  log_file=log)
                    prediction_row = prediction_to_row(prediction)
                    predictions_file.writerow(prediction_row)
                    if single_model:
                        write_prediction(prediction_row[0:2], prediction_file,
                                         args.prediction_info, input_data,
                                         exclude)
    if single_model:
        prediction_file.close_writer()
    else:
        combine_votes(predictions_files,
                      Model(models[0]).to_prediction, prediction_file,
                      args.method, args.prediction_info, raw_input_data_list,
                      exclude)
Example #12
    def batch_predict(self, input_data_list, output_file_path=None,
                      by_name=True, reuse=False,
                      missing_strategy=LAST_PREDICTION, headers=None,
                      to_file=True, use_median=False):
        """Makes predictions for a list of input data.

           When the to_file argument is set to True, the predictions
           generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
           If it is False instead, the function returns a list of
           MultiVote objects with the models' predictions.
        """
        add_headers = (isinstance(input_data_list[0], list) and
                       headers is not None and
                       len(headers) == len(input_data_list[0]))
        if not add_headers and not isinstance(input_data_list[0], dict):
            raise ValueError("Input data list is not a dictionary or the"
                             " headers and input data information are not"
                             " consistent.")
        order = 0
        if not to_file:
            votes = []

        for model in self.models:
            order += 1
            out = None
            if to_file:
                output_file = get_predictions_file_name(model.resource_id,
                                                        output_file_path)
                if reuse:
                    try:
                        predictions_file = open(output_file)
                        predictions_file.close()
                        continue
                    except IOError:
                        pass
                try:
                    out = UnicodeWriter(output_file)
                except IOError:
                    raise Exception("Cannot find %s directory." %
                                    output_file_path)

            if out:
                out.open_writer()
            for index, input_data in enumerate(input_data_list):
                if add_headers:
                    input_data = dict(zip(headers, input_data))
                prediction = model.predict(input_data,
                                           by_name=by_name,
                                           with_confidence=True,
                                           missing_strategy=missing_strategy)
                if use_median and model.tree.regression:
                    # if median is to be used, we just place it as prediction
                    # starting the list
                    prediction[0] = prediction[-1]
                prediction = prediction[:-1]
                if to_file:
                    out.writerow(prediction)
                else:
                    # prediction is a row that contains prediction, confidence,
                    # distribution, instances
                    prediction_row = prediction[0: 2]
                    prediction_row.append(order)
                    prediction_row.extend(prediction[2:])

                    if len(votes) <= index:
                        votes.append(MultiVote([]))
                    votes[index].append_row(prediction_row)
            if out:
                out.close_writer()
        if not to_file:
            return votes
Example #13
def local_batch_predict(models,
                        test_reader,
                        prediction_file,
                        api,
                        args,
                        resume=False,
                        output_path=None,
                        output=None,
                        method=PLURALITY_CODE,
                        options=None,
                        session_file=None,
                        labels=None,
                        ordered=True,
                        exclude=None,
                        models_per_label=1,
                        other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct))

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [
        models[index:(index + max_models)]
        for index in range(0, models_total, max_models)
    ]
    # Input data is stored as a list and predictions are made for all rows
    # with each model
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    total_votes = []
    models_order = []
    models_count = 0
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    # processing the models in slots
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)
        # retrieving the full models allowed by --max-batch-models to be used
        # in a multimodel slot
        complete_models, models_order = retrieve_models_split(
            models_split,
            api,
            query_string=query_string,
            labels=labels,
            multi_label_data=multi_label_data,
            ordered=ordered,
            models_order=models_order)

        # predicting with the multimodel slot
        if complete_models:
            local_model = MultiModel(complete_models, api=api)
            # added to ensure garbage collection at each step of the loop
            gc.collect()
            try:
                votes = local_model.batch_predict(
                    raw_input_data_list,
                    output_path,
                    by_name=test_set_header,
                    reuse=True,
                    missing_strategy=args.missing_strategy,
                    headers=test_reader.raw_headers,
                    to_file=(not args.fast),
                    use_median=args.median)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")

            # extending the votes for each input data with the new model-slot
            # predictions
            if not args.fast:
                votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)

            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes

    if not single_model:
        message = u.dated("Combining predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)

    # combining the votes to issue the final prediction for each input data
    for index in range(0, len(total_votes)):
        multivote = total_votes[index]
        input_data = raw_input_data_list[index]

        if single_model:
            # single model predictions need no combination
            prediction = [
                multivote.predictions[0]['prediction'],
                multivote.predictions[0]['confidence']
            ]
        elif method == AGGREGATION:
            # multi-labeled fields: predictions are concatenated
            prediction = aggregate_multivote(
                multivote,
                options,
                labels,
                models_per_label,
                ordered,
                models_order,
                label_separator=args.label_separator)
        elif method == COMBINATION:
            # used in --max-categories flag: each model slot contains a
            # subset of categories and the predictions for all of them
            # are combined in a global distribution to obtain the final
            # prediction
            prediction = combine_multivote(multivote, other_label=other_label)
        else:
            prediction = multivote.combine(method=method,
                                           with_confidence=True,
                                           options=options)

        write_prediction(prediction, output, args.prediction_info, input_data,
                         exclude)
Example #14
    def batch_predict(self,
                      input_data_list,
                      output_file_path=None,
                      by_name=True,
                      reuse=False,
                      missing_strategy=LAST_PREDICTION,
                      headers=None,
                      to_file=True,
                      use_median=False):
        """Makes predictions for a list of input data.

           When the to_file argument is set to True, the predictions
           generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
           If it is False instead, the function returns a list of
           MultiVote objects with the models' predictions.
        """
        add_headers = (isinstance(input_data_list[0], list)
                       and headers is not None
                       and len(headers) == len(input_data_list[0]))
        if not add_headers and not isinstance(input_data_list[0], dict):
            raise ValueError("Input data list is not a dictionary or the"
                             " headers and input data information are not"
                             " consistent.")
        order = 0
        if not to_file:
            votes = []

        for model in self.models:
            order += 1
            out = None
            if to_file:
                output_file = get_predictions_file_name(
                    model.resource_id, output_file_path)
                if reuse:
                    try:
                        predictions_file = open(output_file)
                        predictions_file.close()
                        continue
                    except IOError:
                        pass
                try:
                    out = UnicodeWriter(output_file)
                except IOError:
                    raise Exception("Cannot find %s directory." %
                                    output_file_path)

            if out:
                out.open_writer()
            for index, input_data in enumerate(input_data_list):
                if add_headers:
                    input_data = dict(zip(headers, input_data))
                prediction = model.predict(input_data,
                                           by_name=by_name,
                                           with_confidence=True,
                                           missing_strategy=missing_strategy)
                if use_median and model.tree.regression:
                    # if median is to be used, we just place it as prediction
                    # starting the list
                    prediction[0] = prediction[-1]
                prediction = prediction[:-1]
                if to_file:
                    out.writerow(prediction)
                else:
                    # prediction is a row that contains prediction, confidence,
                    # distribution, instances
                    prediction_row = prediction[0:2]
                    prediction_row.append(order)
                    prediction_row.extend(prediction[2:])

                    if len(votes) <= index:
                        votes.append(MultiVote([]))
                    votes[index].append_row(prediction_row)
            if out:
                out.close_writer()
        if not to_file:
            return votes
Example #15
def local_batch_predict(models,
                        test_reader,
                        prediction_file,
                        api,
                        args,
                        resume=False,
                        output_path=None,
                        output=None,
                        method=PLURALITY_CODE,
                        options=None,
                        session_file=None,
                        labels=None,
                        ordered=True,
                        exclude=None,
                        models_per_label=1,
                        other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct))

    max_models = args.max_batch_models
    label_separator = args.label_separator
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [
        models[index:(index + max_models)]
        for index in range(0, models_total, max_models)
    ]
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)
        complete_models = []

        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            # When user selects the labels in multi-label predictions, we must
            # filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data['generated_fields'][
                    objective_column]
                labels_columns = [
                    label_info[1] for label_info in labels_info
                    if label_info[0] in labels
                ]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if (model_column in labels_columns):
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)

        if complete_models:
            local_model = MultiModel(complete_models)
            try:
                local_model.batch_predict(
                    input_data_list,
                    output_path,
                    by_name=test_set_header,
                    reuse=True,
                    missing_strategy=args.missing_strategy)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")

            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
Example #16
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS,
                        resume=False, output_path=None, output=None,
                        verbosity=True, method=PLURALITY_CODE,
                        session_file=None, debug=False, prediction_info=None):
    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]

    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model, FIELDS_QS)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s" % (model,
                                                          str(exception)))
            complete_models.append(model)

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list,
                                  output_path,
                                  by_name=test_set_header,
                                  reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes
Example #17
    def batch_predict(self, input_data_list, output_file_path=None,
                      by_name=True, reuse=False,
                      missing_strategy=LAST_PREDICTION, headers=None,
                      to_file=True):
        """Makes predictions for a list of input data.

           When the to_file argument is set to True, the predictions
           generated for each model are stored in an output
           file. The name of the file will use the following syntax:
                model_[id of the model]__predictions.csv
           For instance, when using model/50c0de043b563519830001c2 to predict,
           the output file name will be
                model_50c0de043b563519830001c2__predictions.csv
           If it is False instead, the function returns a list of
           MultiVote objects with the models' predictions.
        """
        add_headers = (isinstance(input_data_list[0], list) and
                       headers is not None and
                       len(headers) == len(input_data_list[0]))
        if not add_headers and not isinstance(input_data_list[0], dict):
            raise ValueError("Input data list is not a dictionary or the"
                             " headers and input data information are not"
                             " consistent.")
        order = 0
        if not to_file:
            votes = []

        for model in self.models:
            order += 1
            if to_file:
                output_file = get_predictions_file_name(model.resource_id,
                                                        output_file_path)
                if reuse:
                    try:
                        predictions_file = open(output_file)
                        predictions_file.close()
                        continue
                    except IOError:
                        pass
                try:
                    predictions_file = csv.writer(open(output_file, 'w', 0),
                                                  lineterminator="\n")
                except IOError:
                    raise Exception("Cannot find %s directory." %
                                    output_file_path)

            for index, input_data in enumerate(input_data_list):
                if add_headers:
                    input_data = dict(zip(headers, input_data))
                prediction = model.predict(input_data,
                                           by_name=by_name,
                                           with_confidence=True,
                                           missing_strategy=missing_strategy)
                if to_file:
                    if isinstance(prediction[0], basestring):
                        prediction[0] = prediction[0].encode("utf-8")
                    predictions_file.writerow(prediction)
                else:
                    prediction, confidence, distribution, instances = prediction
                    prediction_row = [prediction, confidence, order,
                                      distribution, instances]
                    if len(votes) <= index:
                        votes.append(MultiVote([]))
                    votes[index].append_row(prediction_row)

        if not to_file:
            return votes