Esempio n. 1
0
def local_batch_predict(models, headers, test_reader, exclude, fields, resume,
                        output_path, max_models, number_of_tests, api, output,
                        verbosity, method, objective_field, session_file,
                        debug):
    """Predicts locally with slots of models and writes the combined votes

    The list of models is cut in slots of at most `max_models` elements.
    Each slot is loaded in a MultiModel that predicts for every row read
    from `test_reader`; the votes of all the slots are accumulated per row,
    combined using `method` and written to `output`.

    """
    def report_progress(done, overall):
        """Logs in console the number of models already used to predict.

        """
        percentage = 100 - ((overall - done) * 100) / (overall)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(done), localize(overall), percentage))

    models_total = len(models)
    models_splits = [models[start:(start + max_models)] for start
                     in range(0, models_total, max_models)]

    # the test rows are read only once and reused for every slot of models
    input_data_list = []
    for row in test_reader:
        for position in exclude:
            del row[position]
        input_data_list.append(fields.pair(row, headers, objective_field))

    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            # when resuming, the checkpoint is run on the predictions file
            # of each model in the slot
            for model in models_split:
                u.checkpoint(u.are_predictions_created,
                             get_predictions_file_name(model, output_path),
                             number_of_tests, debug=debug)

        complete_models = [api.check_resource(model, api.get_model)
                           for model in models_split]

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list, output_path, reuse=True)
        votes = local_model.batch_votes(output_path)

        # the last slot can hold less than max_models models
        models_count = min(models_count + max_models, models_total)
        if verbosity:
            report_progress(models_count, models_total)

        if not total_votes:
            total_votes = votes
        else:
            # adding the predictions of the new slot to the ones stored
            # for the same row
            for position, vote in enumerate(votes):
                total_votes[position].predictions.extend(vote.predictions)

    u.log_message(u.dated("Combining predictions.\n"),
                  log_file=session_file, console=verbosity)
    for multivote in total_votes:
        u.write_prediction(multivote.combine(method), output)
Esempio n. 2
0
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS,
                        resume=False, output_path=None, output=None,
                        verbosity=True, method=PLURALITY_CODE,
                        session_file=None, debug=False, prediction_info=None):
    """Get local predictions from partial Multimodel and gather their votes

    The models are processed in slots of at most `max_models` elements. Each
    slot is loaded in a MultiModel that predicts for all the rows read from
    `test_reader`, and the votes issued by every slot are accumulated per
    input row.

    models: list of models. Items that are strings or whose status is not
            FINISHED are retrieved through `api.get_model`
    test_reader: iterable over the test rows that also provides
                 `has_headers()`, `dict(row)` and `number_of_tests()`
    prediction_file: file opened for writing when `output` is None, and
                     used to set `output_path` when it is None
    max_models: maximum number of models per slot
    resume: when set, the predictions checkpoint is run for every model
    verbosity: when set, the progress is logged to console
    debug: passed on to the checkpoint

    NOTE(review): in this version `method`, `session_file`,
    `prediction_info`, `output` and the raw rows are not used after being
    set and the accumulated votes are neither combined nor written; the
    final step seems to be missing here - confirm.

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]

    # the test rows are read once: as they come and as dicts to predict
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []
        for model in models_split:
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model, FIELDS_QS)
                except ValueError as exception:
                    # two placeholders are needed for the two values; with
                    # only one the formatting itself raised a TypeError
                    sys.exit("Failed to get model: %s. %s" % (
                        model, str(exception)))
            complete_models.append(model)

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list,
                                  output_path,
                                  by_name=test_set_header,
                                  reuse=True)
        votes = local_model.batch_votes(output_path)
        # the last slot can hold less than max_models models
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            # adding the new predictions to the ones stored for each row
            for index, vote in enumerate(votes):
                predictions = total_votes[index].predictions
                predictions.extend(vote.predictions)
        else:
            total_votes = votes
Esempio n. 3
0
def local_batch_predict(models,
                        test_reader,
                        prediction_file,
                        api,
                        args,
                        resume=False,
                        output_path=None,
                        output=None,
                        method=PLURALITY_CODE,
                        options=None,
                        session_file=None,
                        labels=None,
                        ordered=True,
                        exclude=None,
                        models_per_label=1,
                        other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions from partial Multimodel, combine and save to file

    The models are processed in slots of `args.max_batch_models` elements.
    Every slot is loaded in a MultiModel that predicts for all the rows in
    `test_reader`; the votes of all the slots are accumulated per row and
    finally turned into one prediction per row that is written to `output`.

    The final prediction depends on the number of models and on `method`:
    a single model needs no combination, AGGREGATION uses
    `aggregate_multivote`, COMBINATION uses `combine_multivote` and any
    other method is handed to the `combine` method of the votes.

    When `output` is None, `prediction_file` is opened for writing and
    closed before leaving the function, even if the process fails. An
    `output` provided by the caller is never closed here.

    The attributes read from `args` are: max_batch_models, debug,
    missing_strategy, fast, median, verbosity, label_separator and
    prediction_info.

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct),
                    reset=True)

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    # only the file opened in this function is closed at the end
    close_output = output is None
    if close_output:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    try:
        models_total = len(models)
        models_splits = [
            models[index:(index + max_models)]
            for index in range(0, models_total, max_models)
        ]
        # Input data is stored as a list and predictions are made for all
        # rows with each model
        raw_input_data_list = []
        for input_data in test_reader:
            raw_input_data_list.append(input_data)
        total_votes = []
        models_order = []
        models_count = 0
        single_model = models_total == 1
        query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
        # processing the models in slots
        for models_split in models_splits:
            if resume:
                for model in models_split:
                    pred_file = get_predictions_file_name(model, output_path)
                    c.checkpoint(c.are_predictions_created,
                                 pred_file,
                                 test_reader.number_of_tests(),
                                 debug=args.debug)
            # retrieving the full models allowed by --max-batch-models to be
            # used in a multimodel slot
            complete_models, models_order = retrieve_models_split(
                models_split,
                api,
                query_string=query_string,
                labels=labels,
                multi_label_data=multi_label_data,
                ordered=ordered,
                models_order=models_order)

            # predicting with the multimodel slot
            if complete_models:
                local_model = MultiModel(complete_models, api=api)
                # added to ensure garbage collection at each step of the
                # loop
                gc.collect()
                try:
                    votes = local_model.batch_predict(
                        raw_input_data_list,
                        output_path,
                        by_name=test_set_header,
                        reuse=True,
                        missing_strategy=args.missing_strategy,
                        headers=test_reader.raw_headers,
                        to_file=(not args.fast),
                        use_median=args.median)
                except ImportError:
                    sys.exit("Failed to find the numpy and scipy libraries"
                             " needed to use proportional missing strategy"
                             " for regressions. Please, install them"
                             " manually")

                # extending the votes for each input data with the new
                # model-slot predictions
                if not args.fast:
                    votes = local_model.batch_votes(output_path)
                # the last slot can hold less than max_models models
                models_count += max_models
                if models_count > models_total:
                    models_count = models_total
                if args.verbosity:
                    draw_progress_bar(models_count, models_total)

                if total_votes:
                    for index, vote in enumerate(votes):
                        total_votes[index].extend(vote.predictions)
                else:
                    total_votes = votes

        if not single_model:
            message = u.dated("Combining predictions.\n")
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)

        # combining the votes to issue the final prediction for each input
        # data
        for index, multivote in enumerate(total_votes):
            input_data = raw_input_data_list[index]

            if single_model:
                # single model predictions need no combination
                prediction = [
                    multivote.predictions[0]['prediction'],
                    multivote.predictions[0]['confidence']
                ]
            elif method == AGGREGATION:
                # multi-labeled fields: predictions are concatenated
                prediction = aggregate_multivote(
                    multivote,
                    options,
                    labels,
                    models_per_label,
                    ordered,
                    models_order,
                    label_separator=args.label_separator)
            elif method == COMBINATION:
                # used in --max-categories flag: each model slot contains a
                # subset of categories and the predictions for all of them
                # are combined in a global distribution to obtain the final
                # prediction
                prediction = combine_multivote(multivote,
                                               other_label=other_label)
            else:
                prediction = multivote.combine(method=method,
                                               with_confidence=True,
                                               options=options)

            write_prediction(prediction, output, args.prediction_info,
                             input_data, exclude)
    finally:
        if close_output:
            output.close()
Esempio n. 4
0
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS,
                        resume=False, output_path=None, output=None,
                        verbosity=True, method=PLURALITY_CODE, options=None,
                        session_file=None, debug=False,
                        prediction_info=NORMAL_FORMAT,
                        labels=None, label_separator=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):

    """Predicts locally with slots of models and gathers the votes per row

    The models are handled in slots of at most `max_models` elements. When
    `labels` are given, only the models whose objective field column is
    associated to one of them are kept in the slot. Each slot with models
    is loaded in a MultiModel that predicts for all the test rows, and the
    votes of every slot are added to the ones stored for each row.

    """
    def report_progress(done, overall):
        """Logs in console the number of models already used to predict.

        """
        percentage = 100 - ((overall - done) * 100) / (overall)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(done), localize(overall), percentage))

    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)

    models_total = len(models)
    models_splits = [models[start:(start + max_models)] for start
                     in range(0, models_total, max_models)]

    # the test rows are stored as they are read and as dicts to predict
    input_data_list = []
    raw_input_data_list = []
    for test_row in test_reader:
        raw_input_data_list.append(test_row)
        input_data_list.append(test_reader.dict(test_row))

    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    if single_model:
        query_string = FIELDS_QS
    else:
        query_string = ALL_FIELDS_QS

    for models_split in models_splits:
        if resume:
            for model in models_split:
                c.checkpoint(c.are_predictions_created,
                             get_predictions_file_name(model, output_path),
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []

        for model in models_split:
            must_retrieve = (
                isinstance(model, basestring) or
                bigml.api.get_status(model)['code'] != bigml.api.FINISHED)
            if must_retrieve:
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError as exception:
                    sys.exit("Failed to get model: %s. %s" % (
                        model, str(exception)))

            if not labels:
                complete_models.append(model)
                continue

            # The user selected some labels in a multi-label prediction, so
            # only the models associated to them are used to predict
            objective_column = str(multi_label_data['objective_column'])
            labels_info = multi_label_data[
                'generated_fields'][objective_column]
            labels_columns = [info[1] for info in labels_info
                              if info[0] in labels]
            model_info = model['object']
            model_objective_id = model_info['objective_fields'][0]
            model_fields = model_info['model']['fields']
            model_column = model_fields[model_objective_id]['column_number']
            if model_column not in labels_columns:
                continue
            # Models selected by --model-tag are not retrieved in the order
            # they were created, so the label column of each one is stored
            # to label its predictions properly
            if not ordered:
                models_order.append(model_column)
            complete_models.append(model)

        if not complete_models:
            continue

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list,
                                  output_path,
                                  by_name=test_set_header,
                                  reuse=True)
        votes = local_model.batch_votes(output_path)

        # the last slot can hold less than max_models models
        models_count = min(models_count + max_models, models_total)
        if verbosity:
            report_progress(models_count, models_total)

        if not total_votes:
            total_votes = votes
        else:
            for position, vote in enumerate(votes):
                total_votes[position].extend(vote.predictions)
Esempio n. 5
0
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):

    """Get local predictions from partial Multimodel, combine and save to file

    The models are processed in slots of `args.max_batch_models` elements.
    Every slot is loaded in a MultiModel that predicts for all the rows in
    `test_reader`; the votes of all the slots are accumulated per row and
    finally turned into one prediction per row that is written to `output`.

    The final prediction depends on the number of models and on `method`:
    a single model needs no combination, AGGREGATION uses
    `aggregate_multivote`, COMBINATION uses `combine_multivote` and any
    other method is handed to the `combine` method of the votes.

    When `output` is None, `prediction_file` is opened for writing and
    closed before leaving the function, even if the process fails. An
    `output` provided by the caller is never closed here.

    The attributes read from `args` are: max_batch_models, debug,
    missing_strategy, fast, median, verbosity, label_separator and
    prediction_info.

    """

    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct), reset=True)

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    # only the file opened in this function is closed at the end
    close_output = output is None
    if close_output:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    try:
        models_total = len(models)
        models_splits = [models[index:(index + max_models)] for index
                         in range(0, models_total, max_models)]
        # Input data is stored as a list and predictions are made for all
        # rows with each model
        raw_input_data_list = []
        for input_data in test_reader:
            raw_input_data_list.append(input_data)
        total_votes = []
        models_order = []
        models_count = 0
        single_model = models_total == 1
        query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
        # processing the models in slots
        for models_split in models_splits:
            if resume:
                for model in models_split:
                    pred_file = get_predictions_file_name(model,
                                                          output_path)
                    c.checkpoint(c.are_predictions_created,
                                 pred_file,
                                 test_reader.number_of_tests(),
                                 debug=args.debug)
            # retrieving the full models allowed by --max-batch-models to be
            # used in a multimodel slot
            complete_models, models_order = retrieve_models_split(
                models_split, api, query_string=query_string, labels=labels,
                multi_label_data=multi_label_data, ordered=ordered,
                models_order=models_order)

            # predicting with the multimodel slot
            if complete_models:
                local_model = MultiModel(complete_models, api=api)
                # added to ensure garbage collection at each step of the
                # loop
                gc.collect()
                try:
                    votes = local_model.batch_predict(
                        raw_input_data_list, output_path,
                        by_name=test_set_header, reuse=True,
                        missing_strategy=args.missing_strategy,
                        headers=test_reader.raw_headers,
                        to_file=(not args.fast), use_median=args.median)
                except ImportError:
                    sys.exit("Failed to find the numpy and scipy libraries"
                             " needed to use proportional missing strategy"
                             " for regressions. Please, install them"
                             " manually")

                # extending the votes for each input data with the new
                # model-slot predictions
                if not args.fast:
                    votes = local_model.batch_votes(output_path)
                # the last slot can hold less than max_models models
                models_count += max_models
                if models_count > models_total:
                    models_count = models_total
                if args.verbosity:
                    draw_progress_bar(models_count, models_total)

                if total_votes:
                    for index, vote in enumerate(votes):
                        total_votes[index].extend(vote.predictions)
                else:
                    total_votes = votes

        if not single_model:
            message = u.dated("Combining predictions.\n")
            u.log_message(message, log_file=session_file,
                          console=args.verbosity)

        # combining the votes to issue the final prediction for each input
        # data
        for index, multivote in enumerate(total_votes):
            input_data = raw_input_data_list[index]

            if single_model:
                # single model predictions need no combination
                prediction = [multivote.predictions[0]['prediction'],
                              multivote.predictions[0]['confidence']]
            elif method == AGGREGATION:
                # multi-labeled fields: predictions are concatenated
                prediction = aggregate_multivote(
                    multivote, options, labels, models_per_label, ordered,
                    models_order, label_separator=args.label_separator)
            elif method == COMBINATION:
                # used in --max-categories flag: each model slot contains a
                # subset of categories and the predictions for all of them
                # are combined in a global distribution to obtain the final
                # prediction
                prediction = combine_multivote(multivote,
                                               other_label=other_label)
            else:
                prediction = multivote.combine(method=method,
                                               with_confidence=True,
                                               options=options)

            write_prediction(prediction, output, args.prediction_info,
                             input_data, exclude)
    finally:
        if close_output:
            output.close()
Esempio n. 6
0
def local_batch_predict(models,
                        test_reader,
                        prediction_file,
                        api,
                        max_models=MAX_MODELS,
                        resume=False,
                        output_path=None,
                        output=None,
                        verbosity=True,
                        method=PLURALITY_CODE,
                        session_file=None,
                        debug=False,
                        prediction_info=None):
    """Get local predictions from partial Multimodel and gather their votes

    The models are processed in slots of at most `max_models` elements. Each
    slot is loaded in a MultiModel that predicts for all the rows read from
    `test_reader`, and the votes issued by every slot are accumulated per
    input row.

    models: list of models. Items that are strings or whose status is not
            FINISHED are retrieved through `api.get_model`
    test_reader: iterable over the test rows that also provides
                 `has_headers()`, `dict(row)` and `number_of_tests()`
    prediction_file: file opened for writing when `output` is None, and
                     used to set `output_path` when it is None
    max_models: maximum number of models per slot
    resume: when set, the predictions checkpoint is run for every model
    verbosity: when set, the progress is logged to console
    debug: passed on to the checkpoint

    NOTE(review): in this version `method`, `session_file`,
    `prediction_info`, `output` and the raw rows are not used after being
    set and the accumulated votes are neither combined nor written; the
    final step seems to be missing here - confirm.

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct))

    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [
        models[index:(index + max_models)]
        for index in range(0, models_total, max_models)
    ]

    # the test rows are read once: as they come and as dicts to predict
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(),
                             debug=debug)
        complete_models = []
        for model in models_split:
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model, FIELDS_QS)
                except ValueError as exception:
                    # two placeholders are needed for the two values; with
                    # only one the formatting itself raised a TypeError
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            complete_models.append(model)

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list,
                                  output_path,
                                  by_name=test_set_header,
                                  reuse=True)
        votes = local_model.batch_votes(output_path)
        # the last slot can hold less than max_models models
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            # adding the new predictions to the ones stored for each row
            for index, vote in enumerate(votes):
                predictions = total_votes[index].predictions
                predictions.extend(vote.predictions)
        else:
            total_votes = votes
0
def local_batch_predict(models,
                        test_reader,
                        prediction_file,
                        api,
                        args,
                        resume=False,
                        output_path=None,
                        output=None,
                        method=PLURALITY_CODE,
                        options=None,
                        session_file=None,
                        labels=None,
                        ordered=True,
                        exclude=None,
                        models_per_label=1,
                        other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions from partial Multimodel and gather their votes

    The models are processed in slots of `args.max_batch_models` elements.
    When `labels` are given, only the models whose objective field column
    is associated to one of them are kept in the slot. Each slot with
    models is loaded in a MultiModel that predicts for all the rows read
    from `test_reader`, and the votes of every slot are added to the ones
    stored for each row.

    The attributes read from `args` are: max_batch_models, label_separator,
    debug, missing_strategy and verbosity.

    NOTE(review): the visible code ends after accumulating the votes; the
    step that combines and writes them is not shown here - confirm.

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct))

    max_models = args.max_batch_models
    # NOTE(review): not used in the visible part of the function
    label_separator = args.label_separator
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    # slots of at most max_models models
    models_splits = [
        models[index:(index + max_models)]
        for index in range(0, models_total, max_models)
    ]
    # the test rows are stored as they are read and as dicts to predict
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    # models_order only exists when the models are not ordered
    if not ordered:
        models_order = []
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            # when resuming, the checkpoint is run on the predictions file
            # of each model in the slot
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)
        complete_models = []

        for index in range(len(models_split)):
            model = models_split[index]
            # strings and models that are not in FINISHED status are
            # retrieved through the api
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            # When user selects the labels in multi-label predictions, we must
            # filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data['generated_fields'][
                    objective_column]
                # second element of the info of each selected label
                labels_columns = [
                    label_info[1] for label_info in labels_info
                    if label_info[0] in labels
                ]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if (model_column in labels_columns):
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)

        # slots left with no models after filtering are skipped
        if complete_models:
            local_model = MultiModel(complete_models)
            try:
                local_model.batch_predict(
                    input_data_list,
                    output_path,
                    by_name=test_set_header,
                    reuse=True,
                    missing_strategy=args.missing_strategy)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")

            votes = local_model.batch_votes(output_path)
            # the last slot can hold less than max_models models
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                # adding the new predictions to the ones stored for each row
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes