Example #1
    def __init__(self, ensemble, api=None, max_models=None):

        if api is None:
            self.api = BigML(storage=STORAGE)
        else:
            self.api = api
        self.ensemble_id = None
        if isinstance(ensemble, list):
            try:
                models = [get_model_id(model) for model in ensemble]
            except ValueError:
                raise ValueError('Failed to verify the list of models. Check '
                                 'your model id values.')
            self.distributions = None
        else:
            self.ensemble_id = get_ensemble_id(ensemble)
            ensemble = check_resource(ensemble, self.api.get_ensemble)
            models = ensemble['object']['models']
            self.distributions = ensemble['object'].get('distributions', None)
        self.model_ids = models
        self.fields = self.all_model_fields()

        number_of_models = len(models)
        if max_models is None:
            self.models_splits = [models]
        else:
            self.models_splits = [models[index:(index + max_models)] for index
                                  in range(0, number_of_models, max_models)]
        if len(self.models_splits) == 1:
            models = [retrieve_resource(self.api, model_id,
                                        query_string=ONLY_MODEL)
                      for model_id in self.models_splits[0]]
            self.multi_model = MultiModel(models, self.api)
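A minimal usage sketch for this constructor, assuming BIGML_USERNAME and BIGML_API_KEY are set in the environment; the ensemble id is a placeholder:

from bigml.api import BigML
from bigml.ensemble import Ensemble

api = BigML()  # credentials are read from the environment
# max_models caps how many models are held in memory at once;
# the ensemble id below is hypothetical
local_ensemble = Ensemble('ensemble/5143a51a37203f2cf7000972',
                          api=api, max_models=10)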
Example #2
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
                with_confidence=False, options=None):
        """Makes a prediction based on the prediction made by every model.

           The method parameter is a numeric key to the following combination
           methods in classifications/regressions:
              0 - majority vote (plurality) / average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE
        """

        if len(self.models_splits) > 1:
            # If there's more than one chunk of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVote([])
            for models_split in self.models_splits:
                models = [retrieve_resource(self.api, model_id,
                                            query_string=ONLY_MODEL)
                          for model_id in models_split]
                multi_model = MultiModel(models, api=self.api)
                votes_split = multi_model.generate_votes(input_data,
                                                         by_name=by_name)
                votes.extend(votes_split.predictions)
        else:
            # When only one group of models is found, use the corresponding
            # multimodel to predict
            votes_split = self.multi_model.generate_votes(input_data,
                                                          by_name=by_name)
            votes = MultiVote(votes_split.predictions)
        return votes.combine(method=method, with_confidence=with_confidence,
                             options=options)
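A sketch of calling this method: the combination codes can be imported from bigml.multivote instead of being written as literals. local_ensemble is the hypothetical object built in the sketch under Example #1:

from bigml.multivote import PLURALITY_CODE, CONFIDENCE_CODE

input_data = {"petal length": 3, "petal width": 1}
# plain majority vote / average
prediction = local_ensemble.predict(input_data, method=PLURALITY_CODE)
# confidence weighted vote; with_confidence=True also returns the
# combined confidence alongside the prediction
result = local_ensemble.predict(input_data, method=CONFIDENCE_CODE,
                                with_confidence=True)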
Example #3
    def _combine_distributions(self,
                               input_data,
                               missing_strategy,
                               method=PROBABILITY_CODE):
        """Computes the predicted distributions and combines them to give the
        final predicted distribution. Depending on the method parameter,
        probability, votes or confidence are used to weight the models.

        """

        if len(self.models_splits) > 1:
            # If there's more than one chunk of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVoteList([])

            for models_split in self.models_splits:
                models = self._get_models(models_split)
                multi_model = MultiModel(models,
                                         api=self.api,
                                         fields=self.fields,
                                         class_names=self.class_names)

                votes_split = multi_model.generate_votes_distribution( \
                    input_data,
                    missing_strategy=missing_strategy,
                    method=method)
                votes.extend(votes_split)
        else:
            # When only one group of models is found, use the corresponding
            # multimodel to predict
            votes = self.multi_model.generate_votes_distribution( \
                input_data,
                missing_strategy=missing_strategy, method=method)

        return votes.combine_to_distribution(normalize=False)
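_combine_distributions is internal: it is reached through the public predict_probability, predict_confidence and predict_votes methods shown in Example #13 below. A sketch, reusing the hypothetical local_ensemble from above:

# compact=True returns the bare per-class values, ordered by the
# sorted class names stored in local_ensemble.class_names
probabilities = local_ensemble.predict_probability(
    {"petal length": 3, "petal width": 1}, compact=True)
print(list(zip(local_ensemble.class_names, probabilities)))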
Example #4
def local_batch_predict(models, headers, test_reader, exclude, fields, resume,
                        output_path, max_models, number_of_tests, api, output,
                        verbosity, method, objective_field, session_file,
                        debug):
    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / total
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    input_data_list = []
    for row in test_reader:
        for index in exclude:
            del row[index]
        input_data_list.append(fields.pair(row, headers,
                                           objective_field))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                u.checkpoint(u.are_predictions_created,
                             pred_file,
                             number_of_tests, debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            complete_models.append(api.check_resource(
                models_split[index], api.get_model))

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list,
                                  output_path, reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes
    message = u.dated("Combining predictions.\n")
    u.log_message(message, log_file=session_file, console=verbosity)
    for multivote in total_votes:
        u.write_prediction(multivote.combine(method), output)
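The chunking idiom used here (and in the Ensemble constructor of Example #1) is plain list slicing; a self-contained sketch:

# split a model list into chunks of at most max_models entries
models = ['model/%s' % index for index in range(7)]  # placeholder ids
max_models = 3
models_splits = [models[index:(index + max_models)]
                 for index in range(0, len(models), max_models)]
# -> [['model/0', 'model/1', 'model/2'],
#     ['model/3', 'model/4', 'model/5'],
#     ['model/6']]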
Example #5
def local_predict(models, test_reader, output, method, prediction_info=None):
    """Get local predictions and combine them to get a final prediction

    """
    local_model = MultiModel(models)
    test_set_header = test_reader.has_headers()
    for input_data in test_reader:
        prediction = local_model.predict(input_data,
                                         by_name=test_set_header,
                                         method=method,
                                         with_confidence=True)
        u.write_prediction(prediction, output, prediction_info)
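A sketch of the MultiModel call this helper wraps, with placeholder model ids and a plain list standing in for the test reader:

from bigml.api import BigML
from bigml.multimodel import MultiModel

api = BigML()
# hypothetical ids; the models must be finished to be retrieved
local_model = MultiModel(['model/5143a51a37203f2cf7000970',
                          'model/5143a51a37203f2cf7000971'], api)
for input_data in [{"petal length": 3}, {"petal length": 5}]:
    print(local_model.predict(input_data, method=0))  # plurality vote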
Example #6
def local_predict(models, test_reader, output, method):
    """Get local predictions and combine them to get a final prediction

    """
    local_model = MultiModel(models)
    test_set_header = test_reader.has_headers()
    for input_data in test_reader:
        prediction = local_model.predict(input_data,
                                         by_name=test_set_header,
                                         method=method,
                                         with_confidence=True)
        u.write_prediction(prediction, output)
Example #7
def local_predict(models, headers, test_reader, exclude, fields, method,
                  objective_field, output, test_set_header):
    """Get local predictions, combine them and save predictions to file

    """
    local_model = MultiModel(models)
    for row in test_reader:
        for index in exclude:
            del row[index]
        input_data = fields.pair(row, headers, objective_field)
        prediction = local_model.predict(input_data,
                                         by_name=test_set_header,
                                         method=method)
        u.write_prediction(prediction, output)
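The exclude/pair preprocessing above turns a raw CSV row into the input dictionary a model expects. A sketch of that step, assuming a bigml.fields.Fields object built from a model resource (the id is hypothetical); the pair call mirrors the snippet above:

from bigml.api import BigML
from bigml.fields import Fields

api = BigML()
fields = Fields(api.get_model('model/5143a51a37203f2cf7000970'))
row = ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
headers = ['sepal length', 'sepal width', 'petal length', 'petal width']
exclude = [4]             # position of the objective column in the row
for index in exclude:
    del row[index]        # headers above already omit that column
input_data = fields.pair(row, headers)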
Example #8
def local_predict(models, test_reader, output, args, options=None,
                  exclude=None):
    """Get local predictions and combine them to get a final prediction

    """
    local_model = MultiModel(models)
    test_set_header = test_reader.has_headers()
    for input_data in test_reader:
        input_data_dict = test_reader.dict(input_data)
        prediction = local_model.predict(
            input_data_dict, by_name=test_set_header, method=args.method,
            with_confidence=True, options=options,
            missing_strategy=args.missing_strategy)
        write_prediction(prediction, output,
                         args.prediction_info, input_data, exclude)
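args.method and args.missing_strategy come from the bigmler command line. A sketch of the equivalent direct call; the strategy codes are written out locally because their import path varies across versions, and local_model is the hypothetical MultiModel built earlier:

LAST_PREDICTION = 0   # stop at the last populated node (the default)
PROPORTIONAL = 1      # weight both branches when a field is missing

prediction = local_model.predict({"petal length": 3},
                                 method=0,          # plurality
                                 with_confidence=True,
                                 missing_strategy=PROPORTIONAL)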
Example #9
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
                with_confidence=False):
        """Makes a prediction based on the prediction made by every model.

           The method parameter is a numeric key to the following combination
           methods in classifications/regressions:
              0 - majority vote (plurality) / average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
        """
        votes = MultiVote([])
        for models_split in self.models_splits:
            models = [retrieve_model(self.api, model_id)
                      for model_id in models_split]
            multi_model = MultiModel(models)
            votes_split = multi_model.generate_votes(input_data,
                                                     by_name=by_name)
            votes.extend(votes_split.predictions)
        return votes.combine(method=method, with_confidence=with_confidence)
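MultiVote can also be driven directly: it takes a list of per-model prediction dictionaries and combines them. A small self-contained sketch with hand-written placeholder votes:

from bigml.multivote import MultiVote

votes = MultiVote([
    {"prediction": "Iris-setosa", "confidence": 0.91},
    {"prediction": "Iris-setosa", "confidence": 0.78},
    {"prediction": "Iris-versicolor", "confidence": 0.65}])
# method=1 weights the majority vote by each model's confidence
print(votes.combine(method=1))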
Example #10
def local_predict(models,
                  test_reader,
                  output,
                  args,
                  options=None,
                  exclude=None):
    """Get local predictions and combine them to get a final prediction

    """
    local_model = MultiModel(models)
    test_set_header = test_reader.has_headers()
    for input_data in test_reader:
        input_data_dict = test_reader.dict(input_data)
        prediction = local_model.predict(
            input_data_dict,
            by_name=test_set_header,
            method=args.method,
            with_confidence=True,
            options=options,
            missing_strategy=args.missing_strategy)
        write_prediction(prediction, output, args.prediction_info, input_data,
                         exclude)
Example #11
class Ensemble(object):
    """A local predictive Ensemble.

       Uses a number of BigML remote models to build a local version of the
       ensemble that can be used to generate predictions locally.

    """
    def __init__(self, ensemble, api=None, max_models=None):

        if api is None:
            self.api = BigML(storage=STORAGE)
        else:
            self.api = api
        self.ensemble_id = None
        if isinstance(ensemble, list):
            try:
                models = [get_model_id(model) for model in ensemble]
            except ValueError:
                raise ValueError('Failed to verify the list of models. Check '
                                 'your model id values.')
        else:
            self.ensemble_id = get_ensemble_id(ensemble)
            ensemble = check_resource(ensemble, self.api.get_ensemble)
            models = ensemble['object']['models']
        self.model_ids = models
        number_of_models = len(models)
        if max_models is None:
            self.models_splits = [models]
        else:
            self.models_splits = [
                models[index:(index + max_models)]
                for index in range(0, number_of_models, max_models)
            ]
        if len(self.models_splits) == 1:
            models = [
                retrieve_model(self.api, model_id)
                for model_id in self.models_splits[0]
            ]
            self.multi_model = MultiModel(models)

    def list_models(self):
        """Lists all the model/ids that compound the ensemble.

        """
        return self.model_ids

    def predict(self,
                input_data,
                by_name=True,
                method=PLURALITY_CODE,
                with_confidence=False):
        """Makes a prediction based on the prediction made by every model.

           The method parameter is a numeric key to the following combination
           methods in classifications/regressions:
              0 - majority vote (plurality) / average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
        """

        if len(self.models_splits) > 1:
            # If there's more than one chunk of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVote([])
            for models_split in self.models_splits:
                models = [
                    retrieve_model(self.api, model_id)
                    for model_id in models_split
                ]
                multi_model = MultiModel(models)
                votes_split = multi_model.generate_votes(input_data,
                                                         by_name=by_name)
                votes.extend(votes_split.predictions)
        else:
            # When only one group of models is found, use the corresponding
            # multimodel to predict
            votes_split = self.multi_model.generate_votes(input_data,
                                                          by_name=by_name)
            votes = MultiVote(votes_split.predictions)
        return votes.combine(method=method, with_confidence=with_confidence)
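A usage sketch for the class above; the ensemble id is a placeholder:

from bigml.api import BigML
from bigml.ensemble import Ensemble

api = BigML()
ensemble = Ensemble('ensemble/5143a51a37203f2cf7000972', api=api)
print(ensemble.list_models())
print(ensemble.predict({"petal length": 3, "petal width": 1},
                       method=2))    # probability weighted vote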
Example #12
def local_batch_predict(models,
                        test_reader,
                        prediction_file,
                        api,
                        max_models=MAX_MODELS,
                        resume=False,
                        output_path=None,
                        output=None,
                        verbosity=True,
                        method=PLURALITY_CODE,
                        session_file=None,
                        debug=False,
                        prediction_info=None):
    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / total
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct))

    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [
        models[index:(index + max_models)]
        for index in range(0, models_total, max_models)
    ]

    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(),
                             debug=debug)
        complete_models = []
        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model, FIELDS_QS)
                except ValueError as exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            complete_models.append(model)

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list,
                                  output_path,
                                  by_name=test_set_header,
                                  reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes
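The final accumulation step merges the vote lists row by row: total_votes[index] collects the votes for test row index, and each new chunk of models appends its predictions there. A dependency-free sketch of that merge:

class FakeVote(object):
    """Stand-in for a MultiVote: it only carries a predictions list."""
    def __init__(self, predictions):
        self.predictions = predictions

total_votes = [FakeVote(["a"]), FakeVote(["b"])]  # chunk 1, rows 0-1
votes = [FakeVote(["a"]), FakeVote(["c"])]        # chunk 2, rows 0-1
for index in range(0, len(votes)):
    total_votes[index].predictions.extend(votes[index].predictions)
# total_votes rows now hold ["a", "a"] and ["b", "c"]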
Example #13
class Ensemble(ModelFields):
    """A local predictive Ensemble.

       Uses a number of BigML remote models to build a local version of the
       ensemble that can be used to generate predictions locally.
       The expected arguments are:

       ensemble: ensemble object or id, list of model objects or
                 ids or list of local model objects (see Model)
       api: connection object. If None, a new connection object is
            instantiated.
       max_models: integer that limits the number of models instantiated and
                   held in memory at the same time while predicting. If None,
                   no limit is set and all the ensemble models are
                   instantiated and held in memory permanently.
       cache_get: user-provided function that should return the JSON
                  information describing the model or the corresponding
                  Model object. Can be used to read these objects from a
                  cache storage.
    """
    def __init__(self, ensemble, api=None, max_models=None, cache_get=None):

        self.models_splits = []
        self.multi_model = None
        self.api = get_api_connection(api)
        self.fields = None
        self.class_names = None
        if use_cache(cache_get):
            # using a cache to store the model attributes
            self.__dict__ = load(get_ensemble_id(ensemble), cache_get)
            self.api = get_api_connection(api)
            if len(self.models_splits) == 1:
                # retrieve the models from a cache get function
                try:
                    models = [
                        Model(model_id, cache_get=cache_get)
                        for model_id in self.models_splits[0]
                    ]
                except Exception as exc:
                    raise Exception('Error while calling the user-given'
                                    ' function %s: %s' %
                                    (cache_get.__name__, str(exc)))
                self.multi_model = MultiModel(models,
                                              self.api,
                                              fields=self.fields,
                                              class_names=self.class_names,
                                              cache_get=cache_get)
            return

        self.resource_id = None
        self.objective_id = None
        self.distributions = None
        self.distribution = None
        self.boosting = None
        self.boosting_offsets = None
        self.cache_get = None
        self.regression = False
        self.importance = {}
        query_string = ONLY_MODEL
        no_check_fields = False
        self.input_fields = []
        if isinstance(ensemble, list):
            if all([isinstance(model, Model) for model in ensemble]):
                models = ensemble
                self.model_ids = [
                    local_model.resource_id for local_model in models
                ]
            else:
                try:
                    models = [get_model_id(model) for model in ensemble]
                    self.model_ids = models
                except ValueError as exc:
                    raise ValueError('Failed to verify the list of models.'
                                     ' Check your model id values: %s' %
                                     str(exc))

        else:
            ensemble = self.get_ensemble_resource(ensemble)
            self.resource_id = get_ensemble_id(ensemble)
            if not check_local_but_fields(ensemble):
                # avoid checking fields because of old ensembles
                ensemble = retrieve_resource(self.api,
                                             self.resource_id,
                                             no_check_fields=True)

            if ensemble['object'].get('type') == BOOSTING:
                self.boosting = ensemble['object'].get('boosting')
            models = ensemble['object']['models']
            self.distributions = ensemble['object'].get('distributions', [])
            self.importance = ensemble['object'].get('importance', [])
            self.model_ids = models
            # new ensembles have the fields structure
            if ensemble['object'].get('ensemble'):
                self.fields = ensemble['object'].get( \
                    'ensemble', {}).get("fields")
                self.objective_id = ensemble['object'].get("objective_field")
                query_string = EXCLUDE_FIELDS
                no_check_fields = True
            self.input_fields = ensemble['object'].get('input_fields')

        number_of_models = len(models)
        if max_models is None:
            self.models_splits = [models]
        else:
            self.models_splits = [
                models[index:(index + max_models)]
                for index in range(0, number_of_models, max_models)
            ]
        if len(self.models_splits) == 1:
            if not isinstance(models[0], Model):
                if use_cache(cache_get):
                    # retrieve the models from a cache get function
                    try:
                        models = [
                            Model(model_id, cache_get=cache_get)
                            for model_id in self.models_splits[0]
                        ]
                        self.cache_get = cache_get
                    except Exception as exc:
                        raise Exception('Error while calling the user-given'
                                        ' function %s: %s' %
                                        (cache_get.__name__, str(exc)))
                else:
                    models = [retrieve_resource( \
                        self.api,
                        model_id,
                        query_string=query_string,
                        no_check_fields=no_check_fields)
                              for model_id in self.models_splits[0]]
            model = models[0]

        else:
            # only retrieving first model
            self.cache_get = cache_get
            if not isinstance(models[0], Model):
                if use_cache(cache_get):
                    # retrieve the models from a cache get function
                    try:
                        model = Model(self.models_splits[0][0],
                                      cache_get=cache_get)
                        self.cache_get = cache_get
                    except Exception as exc:
                        raise Exception('Error while calling the user-given'
                                        ' function %s: %s' %
                                        (cache_get.__name__, str(exc)))
                else:
                    model = retrieve_resource( \
                        self.api,
                        self.models_splits[0][0],
                        query_string=query_string,
                        no_check_fields=no_check_fields)

                models = [model]

        if self.distributions is None:
            try:
                self.distributions = []
                for model in models:
                    self.distributions.append(
                        {'training': model.root_distribution})
            except AttributeError:
                self.distributions = [
                    model['object']['model']['distribution']
                    for model in models
                ]

        if self.boosting is None:
            self._add_models_attrs(model, max_models)

        if self.fields is None:
            self.fields, self.objective_id = self.all_model_fields(
                max_models=max_models)

        if self.fields:
            add_distribution(self)
        self.regression = \
            self.fields[self.objective_id].get('optype') == NUMERIC
        if self.boosting:
            self.boosting_offsets = ensemble['object'].get('initial_offset',
                                                           0) \
                if self.regression else dict(ensemble['object'].get( \
                    'initial_offsets', []))
        if not self.regression:
            try:
                objective_field = self.fields[self.objective_id]
                categories = objective_field['summary']['categories']
                classes = [category[0] for category in categories]
            except (AttributeError, KeyError):
                classes = set()
                for distribution in self.distributions:
                    for category in distribution['training']['categories']:
                        classes.add(category[0])

            self.class_names = sorted(classes)
            self.objective_categories = [category for \
                category, _ in self.fields[self.objective_id][ \
               "summary"]["categories"]]

        ModelFields.__init__( \
            self, self.fields,
            objective_id=self.objective_id)

        if len(self.models_splits) == 1:
            self.multi_model = MultiModel(models,
                                          self.api,
                                          fields=self.fields,
                                          class_names=self.class_names)

    def _add_models_attrs(self, model, max_models=None):
        """ Adds the boosting and fields info when the ensemble is built from
            a list of models. They can be either Model objects
            or the model dictionary info structure.

        """
        if isinstance(model, Model):
            self.boosting = model.boosting
            boosted_list_error(self.boosting)
            self.objective_id = model.objective_id
        else:
            if model['object'].get('boosted_ensemble'):
                self.boosting = model['object']['boosting']
            boosted_list_error(self.boosting)
            if self.fields is None:
                self.fields, _ = self.all_model_fields( \
                    max_models=max_models)
                self.objective_id = model['object']['objective_field']

    def get_ensemble_resource(self, ensemble):
        """Extracts the ensemble resource info. The ensemble argument can be
           - a path to a local file
           - an ensemble id
        """
        # the string can be a path to a JSON file
        if isinstance(ensemble, str):
            try:
                path = os.path.dirname(os.path.abspath(ensemble))
                with open(ensemble) as ensemble_file:
                    ensemble = json.load(ensemble_file)
                    self.resource_id = get_ensemble_id(ensemble)
                    if self.resource_id is None:
                        raise ValueError("The JSON file does not seem"
                                         " to contain a valid BigML ensemble"
                                         " representation.")
                    self.api.storage = path
            except IOError:
                # if it is not a path, it can be an ensemble id
                self.resource_id = get_ensemble_id(ensemble)
                if self.resource_id is None:
                    if ensemble.find('ensemble/') > -1:
                        raise Exception(
                            self.api.error_message(ensemble,
                                                   resource_type='ensemble',
                                                   method='get'))
                    raise IOError("Failed to open the expected JSON file"
                                  " at %s" % ensemble)
            except ValueError:
                raise ValueError("Failed to interpret %s."
                                 " JSON file expected.")
        return ensemble

    def list_models(self):
        """Lists all the model/ids that compound the ensemble.

        """
        return self.model_ids

    def predict_probability(self,
                            input_data,
                            missing_strategy=LAST_PREDICTION,
                            compact=False):
        """For classification models, Predicts a probability for
        each possible output class, based on input values.  The input
        fields must be a dictionary keyed by field name or field ID.

        For regressions, the output is a single element list
        containing the prediction.

        :param input_data: Input data to be predicted
        :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                                 for missing fields
        :param compact: If False, the prediction is returned as a list of
                        maps, one per class, with the keys "category" and
                        "probability" mapped to the name of the class and its
                        probability, respectively. If True, returns a list of
                        probabilities ordered by the sorted order of the class
                        names.
        """
        if self.regression:
            prediction = self.predict(input_data,
                                      method=PROBABILITY_CODE,
                                      missing_strategy=missing_strategy,
                                      full=not compact)

            if compact:
                output = [prediction]
            else:
                output = prediction
        elif self.boosting is not None:
            probabilities = self.predict(input_data,
                                         method=PLURALITY_CODE,
                                         missing_strategy=missing_strategy,
                                         full=True)['probabilities']

            probabilities.sort(key=lambda x: x['category'])

            if compact:
                output = [
                    probability['probability'] for probability in probabilities
                ]
            else:
                output = probabilities
        else:
            output = self._combine_distributions( \
                input_data,
                missing_strategy)

            if not compact:
                names_probabilities = list(zip(self.class_names, output))
                output = [{
                    'category': class_name,
                    'probability': probability
                } for class_name, probability in names_probabilities]

        return output

    def predict_confidence(self,
                           input_data,
                           missing_strategy=LAST_PREDICTION,
                           compact=False):
        """For classification models, Predicts a confidence for
        each possible output class, based on input values.  The input
        fields must be a dictionary keyed by field name or field ID.

        For regressions, the output is a single element list
        containing the prediction.

        :param input_data: Input data to be predicted
        :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                                 for missing fields
        :param compact: If False, the prediction is returned as a list of
                        maps, one per class, with the keys "category" and
                        "confidence" mapped to the name of the class and its
                        confidence, respectively. If True, returns a list of
                        confidences ordered by the sorted order of the class
                        names.
        """

        if self.boosting:
            # we use boosting probabilities as confidences also
            return self.predict_probability( \
                input_data,
                missing_strategy=missing_strategy,
                compact=compact)
        if self.regression:
            prediction = self.predict(input_data,
                                      method=CONFIDENCE_CODE,
                                      missing_strategy=missing_strategy,
                                      full=not compact)
            if compact:
                output = [prediction]
            else:
                output = prediction
        else:
            output = self._combine_distributions( \
                input_data,
                missing_strategy,
                method=CONFIDENCE_CODE)
            if not compact:
                names_confidences = list(zip(self.class_names, output))
                output = [{
                    'category': class_name,
                    'confidence': confidence
                } for class_name, confidence in names_confidences]

        return output

    def predict_votes(self,
                      input_data,
                      missing_strategy=LAST_PREDICTION,
                      compact=False):
        """For classification models, Predicts the votes for
        each possible output class, based on input values.  The input
        fields must be a dictionary keyed by field name or field ID.

        For regressions, the output is a single element list
        containing the prediction.

        :param input_data: Input data to be predicted
        :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
                                 for missing fields
        :param compact: If False, the prediction is returned as a list of
                        maps, one per class, with the keys "category" and
                        "votes" mapped to the name of the class and its votes,
                        respectively. If True, returns a list of votes ordered
                        by the sorted order of the class names.
        """
        if self.regression:
            prediction = self.predict(input_data,
                                      method=PLURALITY_CODE,
                                      missing_strategy=missing_strategy,
                                      full=not compact)

            if compact:
                output = [prediction]
            else:
                output = prediction
        elif self.boosting is not None:
            raise ValueError("Votes cannot be computed for boosted"
                             " ensembles.")
        else:
            output = self._combine_distributions( \
                input_data,
                missing_strategy,
                method=PLURALITY_CODE)
            if not compact:
                names_votes = list(zip(self.class_names, output))
                output = [{
                    'category': class_name,
                    'votes': k
                } for class_name, k in names_votes]

        return output

    def _combine_distributions(self,
                               input_data,
                               missing_strategy,
                               method=PROBABILITY_CODE):
        """Computes the predicted distributions and combines them to give the
        final predicted distribution. Depending on the method parameter,
        probability, votes or confidence are used to weight the models.

        """

        if len(self.models_splits) > 1:
            # If there's more than one chunk of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVoteList([])

            for models_split in self.models_splits:
                models = self._get_models(models_split)
                multi_model = MultiModel(models,
                                         api=self.api,
                                         fields=self.fields,
                                         class_names=self.class_names)

                votes_split = multi_model.generate_votes_distribution( \
                    input_data,
                    missing_strategy=missing_strategy,
                    method=method)
                votes.extend(votes_split)
        else:
            # When only one group of models is found, use the corresponding
            # multimodel to predict
            votes = self.multi_model.generate_votes_distribution( \
                input_data,
                missing_strategy=missing_strategy, method=method)

        return votes.combine_to_distribution(normalize=False)

    def _get_models(self, models_split):
        if not isinstance(models_split[0], Model):
            if self.cache_get is not None and \
                    hasattr(self.cache_get, '__call__'):
                # retrieve the models from a cache get function
                try:
                    models = [
                        self.cache_get(model_id) for model_id in models_split
                    ]
                except Exception as exc:
                    raise Exception('Error while calling the '
                                    'user-given'
                                    ' function %s: %s' %
                                    (self.cache_get.__name__, str(exc)))
            else:
                models = [
                    retrieve_resource(self.api,
                                      model_id,
                                      query_string=ONLY_MODEL)
                    for model_id in models_split
                ]
        else:
            # the split already contains local Model objects
            models = models_split

        return models

    def _sort_predictions(self, a, b, criteria):
        """Sorts the categories in the predicted node according to the
        given criteria

        """
        if a[criteria] == b[criteria]:
            return sort_categories(a, b, self.objective_categories)
        return 1 if b[criteria] > a[criteria] else -1

    def predict_operating(self,
                          input_data,
                          missing_strategy=LAST_PREDICTION,
                          operating_point=None):
        """Computes the prediction based on a user-given operating point.

        """
        kind, threshold, positive_class = parse_operating_point( \
            operating_point, OPERATING_POINT_KINDS, self.class_names)

        try:
            predict_method = None
            predict_method = getattr(self, "predict_%s" % kind)

            predictions = predict_method(input_data, missing_strategy, False)
            position = self.class_names.index(positive_class)
        except (AttributeError, ValueError):
            raise ValueError("The operating point needs to contain a valid"
                             " positive class, kind and a threshold.")

        if self.regression:
            prediction = predictions
        else:
            position = self.class_names.index(positive_class)
            if predictions[position][kind] > threshold:
                prediction = predictions[position]
            else:
                # if the threshold is not met, the alternative class with
                # highest probability or confidence is returned
                predictions.sort( \
                    key=cmp_to_key( \
                    lambda a, b: self._sort_predictions(a, b, kind)))
                prediction = predictions[0:2]
                if prediction[0]["category"] == positive_class:
                    prediction = prediction[1]
                else:
                    prediction = prediction[0]
            prediction["prediction"] = prediction["category"]
            del prediction["category"]
        return prediction

    def predict_operating_kind(self,
                               input_data,
                               missing_strategy=LAST_PREDICTION,
                               operating_kind=None):
        """Computes the prediction based on a user-given operating kind,
        i.e., confidence, probability or votes.

        """

        kind = operating_kind.lower()
        if self.boosting and kind != "probability":
            raise ValueError("Only probability is allowed as operating kind"
                             " for boosted ensembles.")
        if kind not in OPERATING_POINT_KINDS:
            raise ValueError("Allowed operating kinds are %s. %s found." %
                             (", ".join(OPERATING_POINT_KINDS), kind))

        try:
            predict_method = None
            predict_method = getattr(self, "predict_%s" % kind)

            predictions = predict_method(input_data, missing_strategy, False)
        except AttributeError:
            raise ValueError("The operating kind needs to contain a valid"
                             " property.")

        if self.regression:
            prediction = predictions
        else:
            predictions.sort( \
                key=cmp_to_key( \
                lambda a, b: self._sort_predictions(a, b, kind)))
            prediction = predictions[0]
            prediction["prediction"] = prediction["category"]
            del prediction["category"]
        return prediction

    def predict(self,
                input_data,
                method=None,
                options=None,
                missing_strategy=LAST_PREDICTION,
                operating_point=None,
                operating_kind=None,
                median=False,
                full=False):
        """Makes a prediction based on the prediction made by every model.

        :param input_data: Test data to be used as input
        :param method: **deprecated**. Please check the `operating_kind`
                       attribute. Numeric key code for the following
                       combination methods in classifications/regressions:
              0 - majority vote (plurality) / average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE
        :param options: Options to be used in threshold filtered votes.
        :param missing_strategy: numeric key for the individual model's
                                 prediction method. See the model predict
                                 method.
        :param operating_point: In classification models, this is the point of
                                the ROC curve where the model will be used.
                                The operating point can be defined in terms of:
                                  - the positive_class, the class that is
                                    important to predict accurately
                                  - its kind: probability, confidence or voting
                                  - its threshold: the minimum established
                                    for the positive_class to be predicted.
                                    The operating_point is then defined as a
                                    map with three attributes, e.g.:
                                       {"positive_class": "Iris-setosa",
                                        "kind": "probability",
                                        "threshold": 0.5}
        :param operating_kind: "probability", "confidence" or "votes". Sets the
                               property that decides the prediction.
                               Used only if no operating_point is used
        :param median: Uses the median of each individual model's predicted
                       node as individual prediction for the specified
                       combination method.
        :param full: Boolean that controls whether to include the prediction's
                     attributes. By default, only the prediction is produced.
                     If set to True, the rest of available information is
                     added in a dictionary format. The dictionary keys can be:
                      - prediction: the prediction value
                      - confidence: prediction's confidence
                      - probability: prediction's probability
                      - path: rules that lead to the prediction
                      - count: number of training instances supporting the
                               prediction
                      - next: field to check in the next split
                      - min: minimum value of the training instances in the
                             predicted node
                      - max: maximum value of the training instances in the
                             predicted node
                      - median: median of the values of the training instances
                                in the predicted node
                      - unused_fields: list of fields in the input data that
                                       are not being used in the model
        """

        # Checks and cleans input_data leaving the fields used in the model
        new_data = self.filter_input_data( \
            input_data,
            add_unused_fields=full)
        unused_fields = None
        if full:
            input_data, unused_fields = new_data
        else:
            input_data = new_data

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        if median and method is None:
            # predictions with median are only available with old combiners
            method = PLURALITY_CODE

        if method is None and operating_point is None and \
            operating_kind is None and not median:
            # operating_point has precedence over operating_kind. If no
            # combiner is set, default operating kind is "probability"
            operating_kind = "probability"

        if operating_point:
            if self.regression:
                raise ValueError("The operating_point argument can only be"
                                 " used in classifications.")
            prediction = self.predict_operating( \
                input_data,
                missing_strategy=missing_strategy,
                operating_point=operating_point)
            if full:
                return prediction
            return prediction["prediction"]

        if operating_kind:
            if self.regression:
                # for regressions, operating_kind defaults to the old
                # combiners
                method = 1 if operating_kind == "confidence" else 0
                return self.predict( \
                    input_data, method=method,
                    options=options, missing_strategy=missing_strategy,
                    operating_point=None, operating_kind=None, full=full)
            prediction = self.predict_operating_kind( \
                input_data,
                missing_strategy=missing_strategy,
                operating_kind=operating_kind)
            return prediction

        if len(self.models_splits) > 1:
            # If there's more than one chunk of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVote([], boosting_offsets=self.boosting_offsets)

            for models_split in self.models_splits:
                models = self._get_models(models_split)
                multi_model = MultiModel(models,
                                         api=self.api,
                                         fields=self.fields)

                votes_split = multi_model._generate_votes(
                    input_data,
                    missing_strategy=missing_strategy,
                    unused_fields=unused_fields)
                if median:
                    for prediction in votes_split.predictions:
                        prediction['prediction'] = prediction['median']
                votes.extend(votes_split.predictions)
        else:
            # When only one group of models is found, the corresponding
            # multimodel is used to predict
            votes_split = self.multi_model._generate_votes(
                input_data,
                missing_strategy=missing_strategy,
                unused_fields=unused_fields)

            votes = MultiVote(votes_split.predictions,
                              boosting_offsets=self.boosting_offsets)
            if median:
                for prediction in votes.predictions:
                    prediction['prediction'] = prediction['median']

        if self.boosting is not None and not self.regression:
            categories = [ \
                d[0] for d in
                self.fields[self.objective_id]["summary"]["categories"]]
            options = {"categories": categories}
        result = votes.combine(method=method, options=options, full=full)
        if full:
            unused_fields = set(input_data.keys())
            for prediction in votes.predictions:
                unused_fields = unused_fields.intersection( \
                    set(prediction.get("unused_fields", [])))
            if not isinstance(result, dict):
                result = {"prediction": result}
            result['unused_fields'] = list(unused_fields)

        return result
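
As a usage sketch (hypothetical input values; `local_ensemble` as in the sketch above), the operating point map documented in the signature is passed straight through, and `full=True` returns the combined vote attributes:

    operating_point = {"positive_class": "Iris-setosa",
                       "kind": "probability",
                       "threshold": 0.5}
    result = local_ensemble.predict(
        {"petal length": 1.4, "petal width": 0.2},
        operating_point=operating_point,
        full=True)
    # e.g. result["prediction"], result["probability"], result["unused_fields"]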

    def field_importance_data(self):
        """Computes field importance based on the field importance information
           of the individual models in the ensemble.

        """
        field_importance = {}
        field_names = {}
        if self.importance:
            field_importance = self.importance
            field_names = {field_id: {'name': self.fields[field_id]["name"]} \
                           for field_id in list(field_importance.keys())}
            return [list(importance) for importance in \
                sorted(list(field_importance.items()), key=lambda x: x[1],
                       reverse=True)], field_names

        if (self.distributions is not None
                and isinstance(self.distributions, list)
                and all('importance' in item for item in self.distributions)):
            # Extracts importance from ensemble information
            importances = [
                model_info['importance'] for model_info in self.distributions
            ]
            for index in range(0, len(importances)):
                model_info = importances[index]
                for field_info in model_info:
                    field_id = field_info[0]
                    if field_id not in field_importance:
                        field_importance[field_id] = 0.0
                        name = self.fields[field_id]['name']
                        field_names[field_id] = {'name': name}
                    field_importance[field_id] += field_info[1]
        else:
            # Old ensembles, extracts importance from model information
            for model_id in self.model_ids:
                local_model = BaseModel(model_id, api=self.api)
                for field_info in local_model.field_importance:
                    field_id = field_info[0]
                    if field_id not in field_importance:
                        field_importance[field_id] = 0.0
                        name = self.fields[field_id]['name']
                        field_names[field_id] = {'name': name}
                    field_importance[field_id] += field_info[1]

        number_of_models = len(self.model_ids)
        for field_id in field_importance:
            field_importance[field_id] /= number_of_models
        return [list(importance) for importance in \
            sorted(list(field_importance.items()), key=lambda x: x[1],
                   reverse=True)], field_names
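
A short sketch of consuming the returned pair (field names and figures are illustrative; `local_ensemble` as above):

    importances, field_names = local_ensemble.field_importance_data()
    for field_id, importance in importances:
        print("%s: %.4f" % (field_names[field_id]["name"], importance))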

    def print_importance(self, out=sys.stdout):
        """Prints ensemble field importance

        """
        print_importance(self, out=out)

    def get_data_distribution(self, distribution_type="training"):
        """Returns the required data distribution by adding the distributions
           in the models

        """
        ensemble_distribution = []
        categories = []
        distribution = []
        # ensembles now have the field information
        if self.distribution and self.boosting:
            return sorted(self.distribution, key=lambda x: x[0])

        for model_distribution in self.distributions:
            summary = model_distribution[distribution_type]
            if 'bins' in summary:
                distribution = summary['bins']
            elif 'counts' in summary:
                distribution = summary['counts']
            elif 'categories' in summary:
                distribution = summary['categories']
            else:
                distribution = []
            for point, instances in distribution:
                if point in categories:
                    ensemble_distribution[categories.index(
                        point)][1] += instances
                else:
                    categories.append(point)
                    ensemble_distribution.append([point, instances])

        return sorted(ensemble_distribution, key=lambda x: x[0])
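
The merging loop above can be shown in isolation; a minimal stand-in with made-up [point, instances] pairs, independent of any BigML resource:

    def merge_distributions(distributions):
        """Adds up [point, instances] pairs coming from several models."""
        totals = {}
        for distribution in distributions:
            for point, instances in distribution:
                totals[point] = totals.get(point, 0) + instances
        return sorted(totals.items())

    merged = merge_distributions([[["a", 3], ["b", 1]], [["b", 2], ["c", 5]]])
    # [('a', 3), ('b', 3), ('c', 5)]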

    def summarize(self, out=sys.stdout):
        """Prints ensemble summary. Only field importance at present.

        """
        distribution = self.get_data_distribution("training")

        if distribution:
            out.write("Data distribution:\n")
            print_distribution(distribution, out=out)
            out.write("\n\n")

        if not self.boosting:
            predictions = self.get_data_distribution("predictions")

            if predictions:
                out.write("Predicted distribution:\n")
                print_distribution(predictions, out=out)
                out.write("\n\n")

        out.write("Field importance:\n")
        self.print_importance(out=out)
        out.flush()

    def all_model_fields(self, max_models=None):
        """Retrieves the fields used as predictors in all the ensemble
           models

        """

        fields = {}
        models = []
        objective_id = None
        no_objective_id = False
        if isinstance(self.models_splits[0][0], Model):
            for split in self.models_splits:
                models.extend(split)
        else:
            models = self.model_ids
        for index, model_id in enumerate(models):
            if isinstance(model_id, Model):
                local_model = model_id
            elif self.cache_get is not None:
                local_model = self.cache_get(model_id)
            else:
                local_model = Model(model_id, self.api)
            if (max_models is not None and index > 0
                    and index % max_models == 0):
                gc.collect()
            fields.update(local_model.fields)
            if (objective_id is not None
                    and objective_id != local_model.objective_id):
                # the models' objective fields have different ids, no global id
                no_objective_id = True
            else:
                objective_id = local_model.objective_id
        if no_objective_id:
            objective_id = None
        gc.collect()
        return fields, objective_id

    def dump(self, output=None, cache_set=None):
        """Uses msgpack to serialize the resource object
        If cache_set is filled with a cache set method, the method is called

        """
        self_vars = vars(self)
        del self_vars["api"]
        if "multi_model" in self_vars:
            for model in self_vars["multi_model"].models:
                model.dump(output=output, cache_set=cache_set)
            del self_vars["multi_model"]
        dump(self_vars, output=output, cache_set=cache_set)

    def dumps(self):
        """Uses msgpack to serialize the resource object to a string

        """
        # copy so that the live object keeps its api and multi_model attributes
        self_vars = vars(self).copy()
        del self_vars["api"]
        if "multi_model" in self_vars:
            del self_vars["multi_model"]
        return dumps(self_vars)
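
A sketch of plugging a simple in-memory store into `dump`; the exact signature with which `cache_set` is invoked is an assumption here (a key plus the serialized payload, in the style of a Redis SET):

    cache = {}

    def cache_set(key, value):
        # hypothetical setter: keep the msgpack payload under the resource key
        cache[key] = value

    local_ensemble.dump(cache_set=cache_set)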
Example #18
0
def local_batch_predict(models,
                        test_reader,
                        prediction_file,
                        api,
                        args,
                        resume=False,
                        output_path=None,
                        output=None,
                        method=PLURALITY_CODE,
                        options=None,
                        session_file=None,
                        labels=None,
                        ordered=True,
                        exclude=None,
                        models_per_label=1,
                        other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct))

    max_models = args.max_batch_models
    label_separator = args.label_separator
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [
        models[index:(index + max_models)]
        for index in range(0, models_total, max_models)
    ]
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)
        complete_models = []

        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError as exception:
                    sys.exit("Failed to get model: %s. %s" %
                             (model, str(exception)))
            # When user selects the labels in multi-label predictions, we must
            # filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data['generated_fields'][
                    objective_column]
                labels_columns = [
                    label_info[1] for label_info in labels_info
                    if label_info[0] in labels
                ]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if model_column in labels_columns:
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)

        if complete_models:
            local_model = MultiModel(complete_models)
            try:
                local_model.batch_predict(
                    input_data_list,
                    output_path,
                    by_name=test_set_header,
                    reuse=True,
                    missing_strategy=args.missing_strategy)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")

            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
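
The slicing pattern that produces the model slots above can be seen standalone; a tiny sketch with integer stand-ins for model ids:

    models = list(range(7))
    max_models = 3
    models_splits = [models[index:(index + max_models)]
                     for index in range(0, len(models), max_models)]
    # [[0, 1, 2], [3, 4, 5], [6]]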
Example #19
0
class Ensemble(object):
    """A local predictive Ensemble.

       Uses a number of BigML remote models to build a local version of the
       ensemble that can be used to generate predictions locally.

    """
    def __init__(self, ensemble, api=None, max_models=None):

        if api is None:
            self.api = BigML(storage=STORAGE)
        else:
            self.api = api
        self.ensemble_id = None
        if isinstance(ensemble, list):
            try:
                models = [get_model_id(model) for model in ensemble]
            except ValueError:
                raise ValueError('Failed to verify the list of models. Check '
                                 'your model id values.')
            self.distributions = None
        else:
            self.ensemble_id = get_ensemble_id(ensemble)
            ensemble = check_resource(ensemble, self.api.get_ensemble)
            models = ensemble['object']['models']
            self.distributions = ensemble['object'].get('distributions', None)
        self.model_ids = models
        self.fields = self.all_model_fields()

        number_of_models = len(models)
        if max_models is None:
            self.models_splits = [models]
        else:
            self.models_splits = [
                models[index:(index + max_models)]
                for index in range(0, number_of_models, max_models)
            ]
        if len(self.models_splits) == 1:
            models = [
                retrieve_resource(self.api, model_id, query_string=ONLY_MODEL)
                for model_id in self.models_splits[0]
            ]
            self.multi_model = MultiModel(models, self.api)

    def list_models(self):
        """Lists all the model/ids that compound the ensemble.

        """
        return self.model_ids

    def predict(self,
                input_data,
                by_name=True,
                method=PLURALITY_CODE,
                with_confidence=False,
                options=None):
        """Makes a prediction based on the prediction made by every model.

           The method parameter is a numeric key to the following combination
           methods in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE
        """

        if len(self.models_splits) > 1:
            # If there's more than one chunk of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVote([])
            for models_split in self.models_splits:
                models = [
                    retrieve_resource(self.api,
                                      model_id,
                                      query_string=ONLY_MODEL)
                    for model_id in models_split
                ]
                multi_model = MultiModel(models, api=self.api)
                votes_split = multi_model.generate_votes(input_data,
                                                         by_name=by_name)
                votes.extend(votes_split.predictions)
        else:
            # When only one group of models is found, the corresponding
            # multimodel is used to predict
            votes_split = self.multi_model.generate_votes(input_data,
                                                          by_name=by_name)
            votes = MultiVote(votes_split.predictions)
        return votes.combine(method=method,
                             with_confidence=with_confidence,
                             options=options)

    def field_importance_data(self):
        """Computes field importance based on the field importance information
           of the individual models in the ensemble.

        """
        field_importance = {}
        field_names = {}
        if (self.distributions is not None
                and isinstance(self.distributions, list)
                and all('importance' in item for item in self.distributions)):
            # Extracts importance from ensemble information
            importances = [
                model_info['importance'] for model_info in self.distributions
            ]
            for index in range(0, len(importances)):
                model_info = importances[index]
                for field_info in model_info:
                    field_id = field_info[0]
                    if field_id not in field_importance:
                        field_importance[field_id] = 0.0
                        name = self.fields[field_id]['name']
                        field_names[field_id] = {'name': name}
                    field_importance[field_id] += field_info[1]
        else:
            # Old ensembles, extracts importance from model information
            for model_id in self.model_ids:
                local_model = BaseModel(model_id, api=self.api)
                for field_info in local_model.field_importance:
                    field_id = field_info[0]
                    if field_id not in field_importance:
                        field_importance[field_id] = 0.0
                        name = self.fields[field_id]['name']
                        field_names[field_id] = {'name': name}
                    field_importance[field_id] += field_info[1]

        number_of_models = len(self.model_ids)
        for field_id in field_importance.keys():
            field_importance[field_id] /= number_of_models
        return map(
            list,
            sorted(field_importance.items(), key=lambda x: x[1],
                   reverse=True)), field_names

    def print_importance(self, out=sys.stdout):
        """Prints ensemble field importance

        """
        print_importance(self, out=out)

    def get_data_distribution(self, distribution_type="training"):
        """Returns the required data distribution by adding the distributions
           in the models

        """
        ensemble_distribution = []
        categories = []
        for model_distribution in self.distributions:
            summary = model_distribution[distribution_type]
            if 'bins' in summary:
                distribution = summary['bins']
            elif 'counts' in summary:
                distribution = summary['counts']
            elif 'categories' in summary:
                distribution = summary['categories']
            else:
                distribution = []
            for point, instances in distribution:
                if point in categories:
                    ensemble_distribution[categories.index(
                        point)][1] += instances
                else:
                    categories.append(point)
                    ensemble_distribution.append([point, instances])

        return sorted(ensemble_distribution, key=lambda x: x[0])

    def summarize(self, out=sys.stdout):
        """Prints ensemble summary. Only field importance at present.

        """
        distribution = self.get_data_distribution("training")

        out.write(u"Data distribution:\n")
        print_distribution(distribution, out=out)
        out.write(u"\n\n")

        predictions = self.get_data_distribution("predictions")

        out.write(u"Predicted distribution:\n")
        print_distribution(predictions, out=out)
        out.write(u"\n\n")

        out.write(u"Field importance:\n")
        self.print_importance(out=out)
        out.flush()

    def all_model_fields(self):
        """Retrieves the fields used as predictors in all the ensemble
           models

        """

        fields = {}
        for model_id in self.model_ids:
            local_model = Model(model_id, self.api)
            gc.collect()
            fields.update(local_model.fields)
        return fields
Example #20
0
def local_batch_predict(models, test_reader, prediction_file, api, args,
                        resume=False, output_path=None, output=None,
                        method=PLURALITY_CODE, options=None,
                        session_file=None, labels=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):

    """Get local predictions form partial Multimodel, combine and save to file

    """

    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct), reset=True)

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    # Input data is stored as a list and predictions are made for all rows
    # with each model
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    total_votes = []
    models_order = []
    models_count = 0
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    # processing the models in slots
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(), debug=args.debug)
        # retrieving the full models allowed by --max-batch-models to be used
        # in a multimodel slot
        complete_models, models_order = retrieve_models_split(
            models_split, api, query_string=query_string, labels=labels,
            multi_label_data=multi_label_data, ordered=ordered,
            models_order=models_order)

        # predicting with the multimodel slot
        if complete_models:
            local_model = MultiModel(complete_models, api=api)
            # added to ensure garbage collection at each step of the loop
            gc.collect()
            try:
                votes = local_model.batch_predict(
                    raw_input_data_list, output_path, by_name=test_set_header,
                    reuse=True, missing_strategy=args.missing_strategy,
                    headers=test_reader.raw_headers, to_file=(not args.fast),
                    use_median=args.median)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")

            # extending the votes for each input data with the new model-slot
            # predictions
            if not args.fast:
                votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)

            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes

    if not single_model:
        message = u.dated("Combining predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)

    # combining the votes to issue the final prediction for each input data
    for index in range(0, len(total_votes)):
        multivote = total_votes[index]
        input_data = raw_input_data_list[index]

        if single_model:
            # single model predictions need no combination
            prediction = [multivote.predictions[0]['prediction'],
                          multivote.predictions[0]['confidence']]
        elif method == AGGREGATION:
            # multi-labeled fields: predictions are concatenated
            prediction = aggregate_multivote(
                multivote, options, labels, models_per_label, ordered,
                models_order, label_separator=args.label_separator)
        elif method == COMBINATION:
            # used in --max-categories flag: each model slot contains a
            # subset of categories and the predictions for all of them
            # are combined in a global distribution to obtain the final
            # prediction
            prediction = combine_multivote(multivote, other_label=other_label)
        else:
            prediction = multivote.combine(method=method, with_confidence=True,
                                           options=options)

        write_prediction(prediction, output, args.prediction_info, input_data,
                         exclude)
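
Across model slots, the running `total_votes` is extended row by row; a minimal stand-in with plain lists instead of MultiVote objects:

    total_votes = []
    for votes in ([[1, 2], [3]], [[4], [5, 6]]):  # two slots, two test rows
        if total_votes:
            for index in range(len(votes)):
                total_votes[index].extend(votes[index])
        else:
            total_votes = [list(row) for row in votes]
    # total_votes == [[1, 2, 4], [3, 5, 6]]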
Example #21
0
def i_create_a_local_multi_model(step):
    world.local_model = MultiModel(world.list_of_models)
Example #22
0
def local_batch_predict(models,
                        test_reader,
                        prediction_file,
                        api,
                        args,
                        resume=False,
                        output_path=None,
                        output=None,
                        method=PLURALITY_CODE,
                        options=None,
                        session_file=None,
                        labels=None,
                        ordered=True,
                        exclude=None,
                        models_per_label=1,
                        other_label=OTHER,
                        multi_label_data=None):
    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" %
                    (localize(current), localize(total), pct),
                    reset=True)

    max_models = args.max_batch_models
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [
        models[index:(index + max_models)]
        for index in range(0, models_total, max_models)
    ]
    # Input data is stored as a list and predictions are made for all rows
    # with each model
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
    total_votes = []
    models_order = []
    models_count = 0
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    # processing the models in slots
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model, output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(),
                             debug=args.debug)
        # retrieving the full models allowed by --max-batch-models to be used
        # in a multimodel slot
        complete_models, models_order = retrieve_models_split(
            models_split,
            api,
            query_string=query_string,
            labels=labels,
            multi_label_data=multi_label_data,
            ordered=ordered,
            models_order=models_order)

        # predicting with the multimodel slot
        if complete_models:
            local_model = MultiModel(complete_models, api=api)
            # added to ensure garbage collection at each step of the loop
            gc.collect()
            try:
                votes = local_model.batch_predict(
                    raw_input_data_list,
                    output_path,
                    by_name=test_set_header,
                    reuse=True,
                    missing_strategy=args.missing_strategy,
                    headers=test_reader.raw_headers,
                    to_file=(not args.fast),
                    use_median=args.median)
            except ImportError:
                sys.exit("Failed to find the numpy and scipy libraries needed"
                         " to use proportional missing strategy for"
                         " regressions. Please, install them manually")

            # extending the votes for each input data with the new model-slot
            # predictions
            if not args.fast:
                votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if args.verbosity:
                draw_progress_bar(models_count, models_total)

            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes

    if not single_model:
        message = u.dated("Combining predictions.\n")
        u.log_message(message, log_file=session_file, console=args.verbosity)

    # combining the votes to issue the final prediction for each input data
    for index in range(0, len(total_votes)):
        multivote = total_votes[index]
        input_data = raw_input_data_list[index]

        if single_model:
            # single model predictions need no combination
            prediction = [
                multivote.predictions[0]['prediction'],
                multivote.predictions[0]['confidence']
            ]
        elif method == AGGREGATION:
            # multi-labeled fields: predictions are concatenated
            prediction = aggregate_multivote(
                multivote,
                options,
                labels,
                models_per_label,
                ordered,
                models_order,
                label_separator=args.label_separator)
        elif method == COMBINATION:
            # used in --max-categories flag: each model slot contains a
            # subset of categories and the predictions for all of them
            # are combined in a global distribution to obtain the final
            # prediction
            prediction = combine_multivote(multivote, other_label=other_label)
        else:
            prediction = multivote.combine(method=method,
                                           with_confidence=True,
                                           options=options)

        write_prediction(prediction, output, args.prediction_info, input_data,
                         exclude)
Example #23
0
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS,
                        resume=False, output_path=None, output=None,
                        verbosity=True, method=PLURALITY_CODE, options=None,
                        session_file=None, debug=False,
                        prediction_info=NORMAL_FORMAT,
                        labels=None, label_separator=None, ordered=True,
                        exclude=None, models_per_label=1, other_label=OTHER,
                        multi_label_data=None):

    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / (total)
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))
    if labels is None:
        labels = []
    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]
    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    if not ordered:
        models_order = []
    single_model = models_total == 1
    query_string = FIELDS_QS if single_model else ALL_FIELDS_QS
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []

        for index in range(len(models_split)):
            model = models_split[index]
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model,
                                             query_string)
                except ValueError as exception:
                    sys.exit("Failed to get model: %s. %s" % (model,
                                                              str(exception)))
            # When user selects the labels in multi-label predictions, we must
            # filter the models that will be used to predict
            if labels:
                objective_column = str(multi_label_data['objective_column'])
                labels_info = multi_label_data[
                    'generated_fields'][objective_column]
                labels_columns = [label_info[1] for label_info in labels_info
                                  if label_info[0] in labels]
                model_objective_id = model['object']['objective_fields'][0]
                model_fields = model['object']['model']['fields']
                model_objective = model_fields[model_objective_id]
                model_column = model_objective['column_number']
                if model_column in labels_columns:
                    # When the list of models comes from a --model-tag
                    # selection, the models are not retrieved in the same
                    # order they were created. We must keep track of the
                    # label they are associated with to label their
                    # predictions properly
                    if not ordered:
                        models_order.append(model_column)
                    complete_models.append(model)
            else:
                complete_models.append(model)

        if complete_models:
            local_model = MultiModel(complete_models)
            local_model.batch_predict(input_data_list,
                                      output_path,
                                      by_name=test_set_header,
                                      reuse=True)
            votes = local_model.batch_votes(output_path)
            models_count += max_models
            if models_count > models_total:
                models_count = models_total
            if verbosity:
                draw_progress_bar(models_count, models_total)
            if total_votes:
                for index in range(0, len(votes)):
                    predictions = total_votes[index]
                    predictions.extend(votes[index].predictions)
            else:
                total_votes = votes
Example #24
0
class Ensemble(object):
    """A local predictive Ensemble.

       Uses a number of BigML remote models to build a local version of the
       ensemble that can be used to generate predictions locally.

    """

    def __init__(self, ensemble, api=None, max_models=None):

        if api is None:
            self.api = BigML(storage=STORAGE)
        else:
            self.api = api
        self.ensemble_id = None
        if isinstance(ensemble, list):
            try:
                models = [get_model_id(model) for model in ensemble]
            except ValueError:
                raise ValueError('Failed to verify the list of models. Check '
                                 'your model id values.')
            self.distributions = None
        else:
            self.ensemble_id = get_ensemble_id(ensemble)
            ensemble = check_resource(ensemble, self.api.get_ensemble)
            models = ensemble['object']['models']
            self.distributions = ensemble['object'].get('distributions', None)
        self.model_ids = models
        self.fields = self.all_model_fields()

        number_of_models = len(models)
        if max_models is None:
            self.models_splits = [models]
        else:
            self.models_splits = [models[index:(index + max_models)] for index
                                  in range(0, number_of_models, max_models)]
        if len(self.models_splits) == 1:
            models = [retrieve_resource(self.api, model_id,
                                        query_string=ONLY_MODEL)
                      for model_id in self.models_splits[0]]
            self.multi_model = MultiModel(models, self.api)

    def list_models(self):
        """Lists all the model/ids that compound the ensemble.

        """
        return self.model_ids

    def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
                with_confidence=False, options=None):
        """Makes a prediction based on the prediction made by every model.

           The method parameter is a numeric key to the following combination
           methods in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE
        """

        if len(self.models_splits) > 1:
            # If there's more than one chunk of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVote([])
            for models_split in self.models_splits:
                models = [retrieve_resource(self.api, model_id,
                                            query_string=ONLY_MODEL)
                          for model_id in models_split]
                multi_model = MultiModel(models, api=self.api)
                votes_split = multi_model.generate_votes(input_data,
                                                         by_name=by_name)
                votes.extend(votes_split.predictions)
        else:
            # When only one group of models is found, the corresponding
            # multimodel is used to predict
            votes_split = self.multi_model.generate_votes(input_data,
                                                          by_name=by_name)
            votes = MultiVote(votes_split.predictions)
        return votes.combine(method=method, with_confidence=with_confidence,
                             options=options)

    def field_importance_data(self):
        """Computes field importance based on the field importance information
           of the individual models in the ensemble.

        """
        field_importance = {}
        field_names = {}
        if (self.distributions is not None and
                isinstance(self.distributions, list) and
                all('importance' in item for item in self.distributions)):
            # Extracts importance from ensemble information
            importances = [model_info['importance'] for model_info in
                           self.distributions]
            for index in range(0, len(importances)):
                model_info = importances[index]
                for field_info in model_info:
                    field_id = field_info[0]
                    if field_id not in field_importance:
                        field_importance[field_id] = 0.0
                        name = self.fields[field_id]['name']
                        field_names[field_id] = {'name': name}
                    field_importance[field_id] += field_info[1]
        else:
            # Old ensembles, extracts importance from model information
            for model_id in self.model_ids:
                local_model = BaseModel(model_id, api=self.api)
                for field_info in local_model.field_importance:
                    field_id = field_info[0]
                    if field_id not in field_importance:
                        field_importance[field_id] = 0.0
                        name = self.fields[field_id]['name']
                        field_names[field_id] = {'name': name}
                    field_importance[field_id] += field_info[1]

        number_of_models = len(self.model_ids)
        for field_id in field_importance.keys():
            field_importance[field_id] /= number_of_models
        return map(list, sorted(field_importance.items(), key=lambda x: x[1],
                                reverse=True)), field_names

    def print_importance(self, out=sys.stdout):
        """Prints ensemble field importance

        """
        print_importance(self, out=out)

    def get_data_distribution(self, distribution_type="training"):
        """Returns the required data distribution by adding the distributions
           in the models

        """
        ensemble_distribution = []
        categories = []
        for model_distribution in self.distributions:
            summary = model_distribution[distribution_type]
            if 'bins' in summary:
                distribution = summary['bins']
            elif 'counts' in summary:
                distribution = summary['counts']
            elif 'categories' in summary:
                distribution = summary['categories']
            else:
                # Avoids an undefined or stale `distribution` when the
                # summary carries none of the expected keys.
                distribution = []
            for point, instances in distribution:
                if point in categories:
                    ensemble_distribution[
                        categories.index(point)][1] += instances
                else:
                    categories.append(point)
                    ensemble_distribution.append([point, instances])

        return sorted(ensemble_distribution, key=lambda x: x[0])
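
    # Merging sketch on assumed toy data: if two models expose 'counts'
    # summaries [["a", 3], ["b", 1]] and [["a", 2], ["c", 5]], the loop above
    # sums instances per point and returns [["a", 5], ["b", 1], ["c", 5]].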

    def summarize(self, out=sys.stdout):
        """Prints ensemble summary. Only field importance at present.

        """
        distribution = self.get_data_distribution("training")

        out.write(u"Data distribution:\n")
        print_distribution(distribution, out=out)
        out.write(u"\n\n")

        predictions = self.get_data_distribution("predictions")

        out.write(u"Predicted distribution:\n")
        print_distribution(predictions, out=out)
        out.write(u"\n\n")

        out.write(u"Field importance:\n")
        self.print_importance(out=out)
        out.flush()

    def all_model_fields(self):
        """Retrieves the fields used as predictors in all the ensemble
           models

        """

        fields = {}
        for model_id in self.model_ids:
            local_model = Model(model_id, self.api)
            gc.collect()
            fields.update(local_model.fields)
        return fields
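
A minimal usage sketch for the snippet above, assuming it is bigml's local
Ensemble class and that valid BigML credentials are available; the ensemble
id and input field values below are hypothetical placeholders:

from bigml.ensemble import Ensemble

# Hypothetical resource id; replace with a real ensemble id.
local_ensemble = Ensemble("ensemble/5143a51a37203f2cf7000972")

# Plurality vote over the individual models (method=0, PLURALITY_CODE).
prediction = local_ensemble.predict({"petal length": 3, "petal width": 1},
                                    method=0)

# Averaged field importances paired with their field names, as computed
# by field_importance_data above.
importances, field_names = local_ensemble.field_importance_data()
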
Example #25
0
def local_batch_predict(models, test_reader, prediction_file, api,
                        max_models=MAX_MODELS,
                        resume=False, output_path=None, output=None,
                        verbosity=True, method=PLURALITY_CODE,
                        session_file=None, debug=False, prediction_info=None):
    """Get local predictions form partial Multimodel, combine and save to file

    """
    def draw_progress_bar(current, total):
        """Draws a text based progress report.

        """
        pct = 100 - ((total - current) * 100) / total
        console_log("Predicted on %s out of %s models [%s%%]" % (
            localize(current), localize(total), pct))

    test_set_header = test_reader.has_headers()
    if output_path is None:
        output_path = u.check_dir(prediction_file)
    if output is None:
        try:
            output = open(prediction_file, 'w', 0)
        except IOError:
            raise IOError("Failed to write in %s" % prediction_file)
    models_total = len(models)
    models_splits = [models[index:(index + max_models)] for index
                     in range(0, models_total, max_models)]

    input_data_list = []
    raw_input_data_list = []
    for input_data in test_reader:
        raw_input_data_list.append(input_data)
        input_data_list.append(test_reader.dict(input_data))
    total_votes = []
    models_count = 0
    for models_split in models_splits:
        if resume:
            for model in models_split:
                pred_file = get_predictions_file_name(model,
                                                      output_path)
                c.checkpoint(c.are_predictions_created,
                             pred_file,
                             test_reader.number_of_tests(), debug=debug)
        complete_models = []
        for model in models_split:
            if (isinstance(model, basestring) or
                    bigml.api.get_status(model)['code'] != bigml.api.FINISHED):
                try:
                    model = u.check_resource(model, api.get_model, FIELDS_QS)
                except ValueError, exception:
                    sys.exit("Failed to get model: %s" % (model,
                                                          str(exception)))
            complete_models.append(model)

        local_model = MultiModel(complete_models)
        local_model.batch_predict(input_data_list,
                                  output_path,
                                  by_name=test_set_header,
                                  reuse=True)
        votes = local_model.batch_votes(output_path)
        models_count += max_models
        if models_count > models_total:
            models_count = models_total
        if verbosity:
            draw_progress_bar(models_count, models_total)
        if total_votes:
            for index in range(0, len(votes)):
                predictions = total_votes[index].predictions
                predictions.extend(votes[index].predictions)
        else:
            total_votes = votes
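
The chunking pattern above (split the model list into groups of max_models,
predict with each group, then extend the accumulated votes) can be shown
without the BigML objects. A standalone sketch, where chunked_votes and the
predict callable are hypothetical stand-ins for MultiModel.batch_predict and
batch_votes:

def chunked_votes(models, input_rows, predict, max_models=10):
    """Accumulates per-row predictions over successive model chunks."""
    splits = [models[index:index + max_models]
              for index in range(0, len(models), max_models)]
    # One growing vote list per input row, mirroring total_votes above.
    total_votes = [[] for _ in input_rows]
    for split in splits:
        for row_index, row in enumerate(input_rows):
            # Extend each row's votes with this chunk's predictions, as
            # total_votes[index].predictions.extend(...) does above.
            total_votes[row_index].extend(
                predict(model, row) for model in split)
    return total_votes

For instance, with 25 models and max_models=10 the sketch runs three passes
of at most ten models each, just as models_splits does above.
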
Example #26
0
class Ensemble(object):
    """A local predictive Ensemble.

       Uses a number of BigML remote models to build a local version of the
       ensemble that can be used to generate predictions locally.

    """

    def __init__(self, ensemble, api=None, max_models=None):

        if api is None:
            self.api = BigML(storage=STORAGE)
        else:
            self.api = api
        self.ensemble_id = None
        if isinstance(ensemble, list):
            try:
                models = [get_model_id(model) for model in ensemble]
            except ValueError:
                raise ValueError('Failed to verify the list of models. Check '
                                 'your model id values.')
        else:
            self.ensemble_id = get_ensemble_id(ensemble)
            ensemble = check_resource(ensemble, self.api.get_ensemble)
            models = ensemble['object']['models']
        self.model_ids = models
        number_of_models = len(models)
        if max_models is None:
            self.models_splits = [models]
        else:
            self.models_splits = [models[index:(index + max_models)] for index
                                  in range(0, number_of_models, max_models)]
        if len(self.models_splits) == 1:
            models = [retrieve_model(self.api, model_id)
                      for model_id in self.models_splits[0]]
            self.multi_model = MultiModel(models)
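
    # Design note: when max_models is set, `predict` rebuilds a MultiModel
    # for one chunk of models at a time, trading speed for a smaller memory
    # footprint; when it is None, a single MultiModel over every model is
    # cached in self.multi_model at construction time.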

    def list_models(self):
        """Lists all the model/ids that compound the ensemble.

        """
        return self.model_ids

    def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
                with_confidence=False, options=None):
        """Makes a prediction based on the prediction made by every model.

           The method parameter is a numeric key to the following combination
           methods in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE
        """

        if len(self.models_splits) > 1:
            # If there is more than one chunk of models, they must be
            # used sequentially to generate the votes for the prediction
            votes = MultiVote([])
            for models_split in self.models_splits:
                models = [retrieve_model(self.api, model_id)
                          for model_id in models_split]
                multi_model = MultiModel(models)
                votes_split = multi_model.generate_votes(input_data,
                                                         by_name=by_name)
                votes.extend(votes_split.predictions)
        else:
            # When there is only one group of models, use the
            # corresponding multimodel to predict
            votes_split = self.multi_model.generate_votes(input_data,
                                                          by_name=by_name)
            votes = MultiVote(votes_split.predictions)
        return votes.combine(method=method, with_confidence=with_confidence,
                             options=options)
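
    # Usage sketch with hypothetical input values: a confidence-weighted
    # combination (method=1, CONFIDENCE_CODE) could be requested as
    #   ensemble.predict({"petal length": 3}, method=1, with_confidence=True)
    # where with_confidence additionally reports the combined confidence of
    # the winning prediction.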

    def field_importance_data(self):
        """Computes field importance based on the field importance information
           of the individual models in the ensemble.

        """
        field_importance = {}
        field_names = {}
        for model_id in self.model_ids:
            local_model = BaseModel(model_id)
            for field_info in local_model.field_importance:
                field_id = field_info[0]
                if field_id not in field_importance:
                    field_importance[field_id] = 0.0
                    name = local_model.fields[field_id]['name']
                    field_names[field_id] = {'name': name}
                field_importance[field_id] += field_info[1]
        number_of_models = len(self.model_ids)
        for field_id in field_importance.keys():
            field_importance[field_id] /= number_of_models
        return map(list, sorted(field_importance.items(), key=lambda x: x[1],
                                reverse=True)), field_names

    def print_importance(self, out=sys.stdout):
        """Prints ensemble field importance

        """
        print_importance(self, out=out)