Example #1
0
    def centroid(self, input_data, by_name=True):
        """Returns the id of the nearest centroid

        """
        # Keep only the fields that the model actually uses
        filtered = self.filter_input_data(input_data, by_name=by_name)

        # Every numeric field must be present; missing ones are filled
        # with the default average when one has been configured
        try:
            self.fill_numeric_defaults(filtered, self.default_numeric_value)
        except ValueError:
            raise Exception("Failed to predict a centroid. Input"
                            " data must contain values for all "
                            "numeric fields to find a centroid.")
        # Remove affixes from numeric values and cast to the field types
        cast(filtered, self.fields)

        unique_terms = self.get_unique_terms(filtered)
        best_id = None
        best_name = None
        best_distance2 = float('inf')
        for candidate in self.centroids:
            # distance2 returns None when the partial distance already
            # exceeds the best one found so far (pruning)
            distance2 = candidate.distance2(filtered, unique_terms,
                                            self.scales,
                                            stop_distance2=best_distance2)
            if distance2 is not None:
                best_id = candidate.centroid_id
                best_name = candidate.name
                best_distance2 = distance2
        return {'centroid_id': best_id, 'centroid_name': best_name,
                'distance': math.sqrt(best_distance2)}
Example #2
0
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False,
                missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        """
        # Restrict the input to the fields known to the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Normalize numeric values to their final field types
        cast(input_data, self.fields)

        (prediction, path, confidence,
         distribution, instances) = self.tree.predict(
             input_data, missing_strategy=missing_strategy)

        if print_path:
            # Write the rule path that led to this prediction
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
Example #3
0
    def anomaly_score(self, input_data, by_name=True):
        """Returns the anomaly score given by the iforest

            To produce an anomaly score, we evaluate each tree in the iforest
            for its depth result (see the depth method in the AnomalyTree
            object for details). We find the average of these depths
            to produce an `observed_mean_depth`. We calculate an
            `expected_mean_depth` using the `sample_size` and `mean_depth`
            parameters which come as part of the forest message.
            We combine those values as seen below, which should result in a
            value between 0 and 1.

        """

        # Fail early when the forest information is missing or empty.
        # The original `is None` check let an empty iforest through,
        # which caused a ZeroDivisionError in the mean-depth division.
        if not self.iforest:
            raise Exception("We could not find the iforest information to "
                            "compute the anomaly score. Please, rebuild your "
                            "Anomaly object from a complete anomaly detector "
                            "resource.")

        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        # Average depth over all trees in the forest
        depth_sum = sum(tree.depth(input_data)[0] for tree in self.iforest)
        observed_mean_depth = float(depth_sum) / len(self.iforest)
        return math.pow(2, - observed_mean_depth / self.expected_mean_depth)
Example #4
0
    def centroid(self, input_data, by_name=True):
        """Returns the id of the nearest centroid

        """
        # Keep only the fields used by the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Every non-optional (numeric) field must come in the input data
        if any(field['optype'] not in OPTIONAL_FIELDS and
               field_id not in input_data
               for field_id, field in self.fields.items()):
            raise Exception("Failed to predict a centroid. Input"
                            " data must contain values for all "
                            "numeric fields to find a centroid.")
        # Remove affixes from numeric values and cast to the field types
        cast(input_data, self.fields)

        unique_terms = self.get_unique_terms(input_data)
        closest = {'centroid_id': None, 'centroid_name': None,
                   'distance': float('inf')}
        for candidate in self.centroids:
            # distance2 returns None when the search can be pruned early
            dist2 = candidate.distance2(input_data, unique_terms,
                                        self.scales,
                                        stop_distance2=closest['distance'])
            if dist2 is not None:
                closest = {'centroid_id': candidate.centroid_id,
                           'centroid_name': candidate.name,
                           'distance': dist2}
        closest['distance'] = math.sqrt(closest['distance'])
        return closest
Example #5
0
    def predict(self,
                input_data,
                missing_strategy=LAST_PREDICTION,
                operating_point=None,
                full=False):
        """Makes a prediction based on a number of field values.

        input_data: Input data to be predicted
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        operating_point: In classification models, this is the point of the
                         ROC curve where the model will be used at. The
                         operating point can be defined in terms of:
                         - the positive_class, the class that is important to
                           predict accurately
                         - the threshold,
                           the value that is stablished
                           as minimum for the positive_class to be predicted.
                         - the kind of measure used to set a threshold:
                           probability or confidence (if available)
                         The operating_point is then defined as a map with
                         two attributes, e.g.:
                           {"positive_class": "Iris-setosa",
                            "threshold": 0.5,
                            "kind": "probability"}
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - probability: prediction's probability
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model
        """

        # Keep only the fields used in the model, optionally tracking
        # the ones that were discarded
        unused_fields = []
        filtered = self.filter_input_data(input_data,
                                          add_unused_fields=full)
        if full:
            input_data, unused_fields = filtered
        else:
            input_data = filtered

        # When the model cannot handle missing numerics, every numeric
        # field must be present in the input
        if not self.missing_numerics:
            check_no_missing_numerics(input_data, self.model_fields)

        # Strip affixes from numeric values and cast to the field types
        cast(input_data, self.fields)

        full_prediction = self._predict(input_data,
                                        missing_strategy=missing_strategy,
                                        operating_point=operating_point,
                                        unused_fields=unused_fields)
        if not full:
            return full_prediction['prediction']
        # Drop the attributes that carry no value
        return {key: value for key, value in full_prediction.items()
                if value is not None}
Example #6
0
    def anomaly_score(self, input_data):
        """Returns the anomaly score given by the iforest

            To produce an anomaly score, we evaluate each tree in the iforest
            for its depth result (see the depth method in the AnomalyTree
            object for details). We find the average of these depths
            to produce an `observed_mean_depth`. We calculate an
            `expected_mean_depth` using the `sample_size` and `mean_depth`
            parameters which come as part of the forest message.
            We combine those values as seen below, which should result in a
            value between 0 and 1.

        """

        # Fail early when the forest information is missing or empty.
        # The original `is None` check let an empty iforest through,
        # which caused a ZeroDivisionError in the mean-depth division.
        if not self.iforest:
            raise Exception("We could not find the iforest information to "
                            "compute the anomaly score. Please, rebuild your "
                            "Anomaly object from a complete anomaly detector "
                            "resource.")

        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        # Average depth over all trees in the forest
        depth_sum = sum(tree.depth(input_data)[0] for tree in self.iforest)
        observed_mean_depth = float(depth_sum) / len(self.iforest)
        return math.pow(2, - observed_mean_depth / self.expected_mean_depth)
Example #7
0
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False,
                missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        """
        # Proportional missing strategy on regressions needs numpy/scipy
        if (self.tree.regression and missing_strategy == PROPORTIONAL and
                not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Keep only the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Cast numeric values to their final field types
        cast(input_data, self.fields)

        (prediction, path, confidence,
         distribution, instances) = self.tree.predict(
             input_data, missing_strategy=missing_strategy)

        if print_path:
            # Emit the rule path that produced the prediction
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        return ([prediction, confidence, distribution, instances]
                if with_confidence else prediction)
Example #8
0
    def predict(self, input_data, by_name=True, add_unused_fields=False):
        """Makes a prediction based on a number of field values.


        """

        # Keep only the model fields, optionally collecting unused ones
        filtered = self.filter_input_data(input_data, by_name=by_name,
                                          add_unused_fields=add_unused_fields)
        if add_unused_fields:
            input_data, unused_fields = filtered
        else:
            input_data = filtered

        # Cast numeric values to their final field types
        cast(input_data, self.fields)

        # Expand text and categorical fields into their terms
        unique_terms = self.get_unique_terms(input_data)

        input_array = self.fill_array(input_data, unique_terms)

        # A list of networks means an ensemble-style prediction
        predictor = self.predict_list if self.networks else self.predict_single
        return predictor(input_array)
Example #9
0
    def centroid(self, input_data, by_name=True):
        """Returns the id of the nearest centroid

        """
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Checks that all numeric (non-optional) fields are present in
        # input data
        for field_id, field in self.fields.items():
            if (field['optype'] not in OPTIONAL_FIELDS and
                    field_id not in input_data):
                raise Exception("Failed to predict a centroid. Input"
                                " data must contain values for all "
                                "numeric fields to find a centroid.")
        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        unique_terms = self.get_unique_terms(input_data)
        nearest = {'centroid_id': None, 'centroid_name': None,
                   'distance': float('inf')}
        for centroid in self.centroids:
            # distance2 returns None when the distance already exceeds
            # the current nearest (stop_distance2 pruning)
            distance2 = centroid.distance2(input_data, unique_terms,
                                           self.scales,
                                           stop_distance2=nearest['distance'])
            if distance2 is not None:
                nearest = {'centroid_id': centroid.centroid_id,
                           'centroid_name': centroid.name,
                           'distance': distance2}
        # The loop tracks squared distances; report the real distance
        nearest['distance'] = math.sqrt(nearest['distance'])
        return nearest
Example #10
0
    def predict(self, input_data, by_name=True, add_unused_fields=False,
                operating_point=None):
        """Makes a prediction based on a number of field values.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        add_unused_fields: Boolean, if True adds the information about the
                           fields in the input_data that are not being used
                           in the model as predictors.
        operating_point: In classification models, this is the point of the
                         ROC curve where the model will be used at. The
                         operating point can be defined in terms of:
                         - the positive_class, the class that is important to
                           predict accurately
                         - the probability_threshold,
                           the probability that is stablished
                           as minimum for the positive_class to be predicted.
                         The operating_point is then defined as a map with
                         two attributes, e.g.:
                           {"positive_class": "Iris-setosa",
                            "probability_threshold": 0.5}
        """

        # Keep only the fields used in the model, optionally collecting
        # the discarded ones
        filtered = self.filter_input_data(input_data, by_name=by_name,
                                          add_unused_fields=add_unused_fields)
        if add_unused_fields:
            input_data, unused_fields = filtered
        else:
            input_data = filtered

        # Cast numeric values to their final field types
        cast(input_data, self.fields)

        # An operating point needs the probabilities of every class to
        # decide, so the work is delegated to predict_operating
        if operating_point:
            if self.regression:
                raise ValueError("The operating_point argument can only be"
                                 " used in classifications.")
            return self.predict_operating(input_data, by_name=False,
                                          operating_point=operating_point)

        # Expand text and categorical fields into their terms
        unique_terms = self.get_unique_terms(input_data)

        input_array = self.fill_array(input_data, unique_terms)

        if self.networks:
            prediction = self.predict_list(input_array)
        else:
            prediction = self.predict_single(input_array)
        if add_unused_fields:
            prediction["unused_fields"] = unused_fields

        return prediction
Example #11
0
    def predict(self, input_data, by_name=True):
        """Returns the class prediction and the probability distribution

        """
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # In case that missing_numerics is False, checks that all numeric
        # fields are present in input data.
        if not self.missing_numerics:
            for field_id, field in self.fields.items():
                if (field['optype'] not in OPTIONAL_FIELDS and
                        field_id not in input_data):
                    raise Exception("Failed to predict. Input"
                                    " data must contain values for all numeric"
                                    " fields to get a logistic regression"
                                    " prediction.")
        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        if self.balance_fields:
            # Standardize numeric inputs using the training mean/stddev
            for field in input_data:
                if self.fields[field]['optype'] == 'numeric':
                    mean = self.fields[field]['summary']['mean']
                    stddev = self.fields[field]['summary'][
                        'standard_deviation']
                    input_data[field] = (input_data[field] - mean) / stddev

        # Compute text and categorical field expansion
        unique_terms = self.get_unique_terms(input_data)

        probabilities = {}
        total = 0
        for category in self.coefficients.keys():
            probability = self.category_probability(
                input_data, unique_terms, category)
            # The category's position is used to break probability ties
            order = self.categories[self.objective_id].index(category)
            probabilities[category] = {"category": category,
                                       "probability": probability,
                                       "order": order}
            total += probability
        # Normalize so the probabilities add up to one
        for info in probabilities.values():
            info["probability"] /= total
        predictions = sorted(probabilities.items(),
                             key=lambda x: (x[1]["probability"],
                                            - x[1]["order"]), reverse=True)
        # The order key is internal and must not leak into the output
        for _, probability in predictions:
            del probability['order']
        prediction, probability = predictions[0]
        return {
            "prediction": prediction,
            "probability": probability["probability"],
            "distribution": [{"category": category,
                              "probability": probability["probability"]}
                             for category, probability in predictions]}
Example #12
0
    def predict(self,
                input_data,
                by_name=True,
                print_path=False,
                out=sys.stdout,
                with_confidence=False,
                missing_strategy=LAST_PREDICTION,
                add_confidence=False,
                add_path=False,
                add_distribution=False,
                add_count=False):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        """
        # Proportional missing strategy for regressions requires the
        # numpy and scipy libraries to be available
        if (self.tree.regression and missing_strategy == PROPORTIONAL
                and not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Keep only the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Cast numeric values to their final field types
        cast(input_data, self.fields)

        (prediction, path, confidence,
         distribution, instances) = self.tree.predict(
             input_data, missing_strategy=missing_strategy)

        if print_path:
            # Emit the rule path that produced the prediction
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        output = prediction
        if with_confidence:
            output = [prediction, confidence, distribution, instances]
        if add_confidence or add_path or add_distribution or add_count:
            # Any add_* flag switches the output to dictionary format,
            # overriding the bare/list formats above
            output = {'prediction': prediction}
            if add_confidence:
                output['confidence'] = confidence
            if add_path:
                output['path'] = path
            if add_distribution:
                output['distribution'] = distribution
            if add_count:
                output['count'] = instances

        return output
Example #13
0
    def predict(self, input_data, missing_strategy=LAST_PREDICTION,
                operating_point=None, full=False):
        """Makes a prediction based on a number of field values.

        input_data: Input data to be predicted
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        operating_point: In classification models, this is the point of the
                         ROC curve where the model will be used at. The
                         operating point can be defined in terms of:
                         - the positive_class, the class that is important to
                           predict accurately
                         - the probability_threshold,
                           the probability that is stablished
                           as minimum for the positive_class to be predicted.
                         The operating_point is then defined as a map with
                         two attributes, e.g.:
                           {"positive_class": "Iris-setosa",
                            "probability_threshold": 0.5}
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - probability: prediction's probability
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model
        """

        # Checks and cleans input_data leaving the fields used in the model
        unused_fields = []
        new_data = self.filter_input_data(input_data,
                                          add_unused_fields=full)
        if full:
            input_data, unused_fields = new_data
        else:
            input_data = new_data

        # When missing numerics are not supported, all numeric fields
        # must come in the input data
        if not self.missing_numerics:
            check_no_missing_numerics(input_data, self.fields)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        full_prediction = self._predict(input_data,
                                        missing_strategy=missing_strategy,
                                        operating_point=operating_point,
                                        unused_fields=unused_fields)
        if full:
            # `iteritems()` was Python-2 only and raised AttributeError
            # on Python 3; use `items()` as the sibling methods do
            return {key: value for key, value in full_prediction.items()
                    if value is not None}

        return full_prediction['prediction']
Example #14
0
    def predict(self, input_data, by_name=True):
        """Returns the class prediction and the probability distribution

        """
        # Keep only the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Without missing_numerics support, every numeric field must be
        # present in the input data
        if not self.missing_numerics:
            for field_id, field in self.fields.items():
                if (field['optype'] not in OPTIONAL_FIELDS
                        and field_id not in input_data):
                    raise Exception("Failed to predict. Input"
                                    " data must contain values for all numeric"
                                    " fields to get a logistic regression"
                                    " prediction.")
        # Cast numeric values to their final field types
        cast(input_data, self.fields)

        # Expand text and categorical fields into their terms
        unique_terms = self.get_unique_terms(input_data)

        probabilities = {}
        total = 0
        for category in self.coefficients.keys():
            prob = self.category_probability(
                input_data, unique_terms, category)
            # The category's position breaks probability ties
            rank = self.categories[self.objective_id].index(category)
            probabilities[category] = {
                "category": category,
                "probability": prob,
                "order": rank
            }
            total += prob
        # Normalize so that the probabilities add up to one
        for entry in probabilities.values():
            entry["probability"] /= total
        predictions = sorted(probabilities.items(),
                             key=lambda item:
                             (item[1]["probability"], -item[1]["order"]),
                             reverse=True)
        # The order key is internal and not part of the output
        for _, info in predictions:
            del info['order']
        top_category, top_info = predictions[0]
        return {
            "prediction": top_category,
            "probability": top_info["probability"],
            "distribution": [{
                "category": category,
                "probability": info["probability"]
            } for category, info in predictions]
        }
Example #15
0
    def predict(self, input_data, full=False):
        """Returns the prediction and the confidence intervals

        input_data: Input data to be predicted
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model

        """

        # Keep only the model fields, tracking unused ones when `full`
        unused_fields = []
        norm_input_data = self.filter_input_data(input_data,
                                                 add_unused_fields=full)
        if full:
            norm_input_data, unused_fields = norm_input_data

        # Cast numeric values to their final field types
        cast(norm_input_data, self.fields)

        # If training data had no missings, the input can't have them either
        check_no_training_missings(norm_input_data, self.model_fields,
                                   self.weight_field,
                                   self.objective_id)

        # Expand text and categorical fields into their terms
        unique_terms = self.get_unique_terms(norm_input_data)

        # Input vectors over all expanded fields (full and compact forms)
        input_array = self.expand_input(norm_input_data, unique_terms)
        compact_input_array = self.expand_input(norm_input_data, unique_terms,
                                                True)

        prediction = dot([flatten(self.coefficients)], [input_array])[0][0]

        result = {"prediction": prediction}
        if self.xtx_inverse:
            result["confidence_bounds"] = self.confidence_bounds(
                compact_input_array)

        if not full:
            return result["prediction"]
        result["unused_fields"] = unused_fields
        return result
Example #16
0
    def _prepare_for_distance(self, input_data):
        """Prepares the fields to be able to compute the distance2

        """
        # Restrict the input to the model fields (default numeric values,
        # when configured, are added by filter_input_data)
        normalized = self.filter_input_data(input_data)
        # Cast numeric values to their final field types
        cast(normalized, self.fields)
        # Expand text/categorical fields into their unique terms
        return normalized, self.get_unique_terms(normalized)
Example #17
0
    def predict(self, input_data, full=False):
        """Returns the prediction and the confidence intervals

        input_data: Input data to be predicted
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model

        """

        # Keep only the model fields, tracking unused ones when `full`
        unused_fields = []
        filtered = self.filter_input_data(input_data, add_unused_fields=full)
        if full:
            filtered, unused_fields = filtered

        # Cast numeric values to their final field types
        cast(filtered, self.fields)

        # When training data had no missings, input data can't have them
        check_no_training_missings(filtered, self.fields, self.weight_field,
                                   self.objective_id)

        # Expand text and categorical fields into their terms
        unique_terms = self.get_unique_terms(filtered)

        # Input vectors over all expanded fields (full and compact forms)
        input_array = self.expand_input(filtered, unique_terms)
        compact_input_array = self.expand_input(filtered, unique_terms, True)

        prediction = dot([flatten(self.coefficients)], [input_array])[0][0]

        result = {"prediction": prediction}
        if self.xtx_inverse is not None:
            result["confidence_bounds"] = self.confidence_bounds(
                compact_input_array)

        if not full:
            return result["prediction"]
        result["unused_fields"] = unused_fields
        return result
Example #18
0
    def centroid(self, input_data, by_name=True):
        """Returns the nearest centroid to the given input data.

        input_data: map of field values, keyed by field name (or by field
                    id when `by_name` is False)
        by_name: Boolean, True when input_data is keyed by field names

        Returns a dict with the winning centroid's id, name and the
        euclidean distance from the input to it.
        """
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Every field that is neither categorical nor text must come with
        # a value, otherwise no distance can be computed.
        if any(field['optype'] not in ['categorical', 'text']
               and field_id not in input_data
               for field_id, field in self.fields.items()):
            raise Exception("Failed to predict a centroid. Input"
                            " data must contain values for all "
                            "numeric fields to find a centroid.")
        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        # Expands text fields into their unique terms, consuming them from
        # input_data so only non-text fields reach the distance computation
        unique_terms = {}
        for field_id in self.term_forms:
            if field_id not in input_data:
                continue
            options = self.term_analysis[field_id]
            case_sensitive = options.get('case_sensitive', True)
            token_mode = options.get('token_mode', 'all')
            field_value = input_data.get(field_id, '')
            if token_mode == TM_FULL_TERM:
                terms = []
            else:
                terms = parse_terms(field_value,
                                    case_sensitive=case_sensitive)
            if token_mode != TM_TOKENS:
                terms.append(
                    field_value if case_sensitive else field_value.lower())
            unique_terms[field_id] = get_unique_terms(
                terms, self.term_forms[field_id],
                self.tag_clouds.get(field_id, []))
            del input_data[field_id]

        nearest = {'centroid_id': None, 'centroid_name': None,
                   'distance': float('inf')}
        for centroid in self.centroids:
            # distance2 returns None when the candidate cannot beat the
            # current best (early stop via stop_distance2)
            distance2 = centroid.distance2(
                input_data, unique_terms, self.scales,
                stop_distance2=nearest['distance'])
            if distance2 is not None:
                nearest = {'centroid_id': centroid.centroid_id,
                           'centroid_name': centroid.name,
                           'distance': distance2}
        # distance2 values are squared; report the euclidean distance
        nearest['distance'] = math.sqrt(nearest['distance'])
        return nearest
Example #19
0
    def centroid(self, input_data, by_name=True):
        """Returns the nearest centroid to the given input data.

        input_data: map of field values, keyed by field name (or by field
                    id when `by_name` is False)
        by_name: Boolean, True when input_data is keyed by field names

        Returns a dict with the winning centroid's id, name and the
        euclidean distance from the input to it.
        """
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Every field that is neither categorical nor text must come with
        # a value, otherwise no distance can be computed.
        if any(field['optype'] not in ['categorical', 'text']
               and field_id not in input_data
               for field_id, field in self.fields.items()):
            raise Exception("Failed to predict a centroid. Input"
                            " data must contain values for all "
                            "numeric fields to find a centroid.")
        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        # Expands text fields into their unique terms, consuming them from
        # input_data so only non-text fields reach the distance computation
        unique_terms = {}
        for field_id in self.term_forms:
            if field_id not in input_data:
                continue
            options = self.term_analysis[field_id]
            case_sensitive = options.get('case_sensitive', True)
            token_mode = options.get('token_mode', 'all')
            field_value = input_data.get(field_id, '')
            if token_mode == TM_FULL_TERM:
                terms = []
            else:
                terms = parse_terms(field_value,
                                    case_sensitive=case_sensitive)
            if token_mode != TM_TOKENS:
                terms.append(
                    field_value if case_sensitive else field_value.lower())
            unique_terms[field_id] = get_unique_terms(
                terms, self.term_forms[field_id],
                self.tag_clouds.get(field_id, []))
            del input_data[field_id]

        nearest = {'centroid_id': None, 'centroid_name': None,
                   'distance': float('inf')}
        for centroid in self.centroids:
            # distance2 returns None when the candidate cannot beat the
            # current best (early stop via stop_distance2)
            distance2 = centroid.distance2(
                input_data, unique_terms, self.scales,
                stop_distance2=nearest['distance'])
            if distance2 is not None:
                nearest = {'centroid_id': centroid.centroid_id,
                           'centroid_name': centroid.name,
                           'distance': distance2}
        # distance2 values are squared; report the euclidean distance
        nearest['distance'] = math.sqrt(nearest['distance'])
        return nearest
Example #20
0
    def projection(self,
                   input_data,
                   max_components=None,
                   variance_threshold=None,
                   full=False):
        """Returns the projection of input data in the new components.

        input_data: Input data to be projected
        max_components: keep at most this many leading components
        variance_threshold: keep the smallest prefix of components whose
                            cumulative variance exceeds this value
        full: when True, return a dict keyed "PC1", "PC2", ... instead of
              a plain list
        """
        new_data = self.filter_input_data(input_data,
                                          add_unused_fields=False)

        # Strips affixes for numeric values and casts to the final field type
        cast(new_data, self.fields)

        # Text and categorical fields are expanded into an input array of
        # terms and frequencies
        unique_terms = self.get_unique_terms(new_data)

        # input_array holds the values for all expanded fields;
        # input_mask marks the non-missing or categorical fields;
        # missings tells whether any non-categorical field is missing
        input_array, missings, input_mask = self.expand_input(
            new_data, unique_terms)

        components = list(self.eigenvectors)
        if max_components is not None:
            components = components[:max_components]
        if variance_threshold is not None:
            for index, cumulative in enumerate(self.cumulative_variance):
                if cumulative > variance_threshold:
                    components = components[:index + 1]

        result = [row[0] for row in dot(components, [input_array])]

        # When non-categorical field values are missing in the input data
        # an additional normalization is applied
        if missings:
            missing_sums = self.missing_factors(input_mask)
            result = [value / missing_sums[index]
                      if missing_sums[index] > 0 else value
                      for index, value in enumerate(result)]
        if full:
            labels = ["PC%s" % index
                      for index in range(1, len(components) + 1)]
            result = dict(zip(labels, result))
        return result
Example #21
0
    def projection(self, input_data, max_components=None,
                   variance_threshold=None, full=False):
        """Returns the projection of input data in the new components.

        input_data: Input data to be projected
        max_components: keep at most this many leading components
        variance_threshold: keep the smallest prefix of components whose
                            cumulative variance exceeds this value
        full: when True, return a dict keyed "PC1", "PC2", ... instead of
              a plain list
        """
        new_data = self.filter_input_data(input_data,
                                          add_unused_fields=False)

        # Strips affixes for numeric values and casts to the final field type
        cast(new_data, self.fields)

        # Text and categorical fields are expanded into an input array of
        # terms and frequencies
        unique_terms = self.get_unique_terms(new_data)

        # input_array holds the values for all expanded fields;
        # input_mask marks the non-missing or categorical fields;
        # missings tells whether any non-categorical field is missing
        input_array, missings, input_mask = self.expand_input(
            new_data, unique_terms)

        components = list(self.eigenvectors)
        if max_components is not None:
            components = components[:max_components]
        if variance_threshold is not None:
            for index, cumulative in enumerate(self.cumulative_variance):
                if cumulative > variance_threshold:
                    components = components[:index + 1]

        result = [row[0] for row in dot(components, [input_array])]

        # When non-categorical field values are missing in the input data
        # an additional normalization is applied
        if missings:
            missing_sums = self.missing_factors(input_mask)
            result = [value / missing_sums[index]
                      if missing_sums[index] > 0 else value
                      for index, value in enumerate(result)]
        if full:
            labels = ["PC%s" % index
                      for index in range(1, len(components) + 1)]
            result = dict(zip(labels, result))
        return result
Example #22
0
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: map of field values
        by_name: Boolean, True when input_data is keyed by field names
        print_path: Boolean, when True the prediction path is written to `out`
        out: output handler for the prediction path
        with_confidence: Boolean, when True the method returns
                         [prediction, confidence, distribution, instances]
        """
        # Strips None values
        empty_fields = [(key, value) for (key, value) in input_data.items()
                        if value is None]
        for (key, value) in empty_fields:
            del input_data[key]

        # Checks input_data keys against field names and filters the ones
        # used in the model
        if by_name:
            wrong_keys = [key for key in input_data
                          if key not in self.all_inverted_fields]
            if wrong_keys:
                # lazy %-args: the message is only built if actually logged
                LOGGER.error("Wrong field names in input data: %s",
                             ", ".join(wrong_keys))
            input_data = {self.inverted_fields[key]: value
                          for key, value in input_data.items()
                          if key in self.inverted_fields}
        else:
            input_data = {key: value
                          for key, value in input_data.items()
                          if key in self.tree.fields}

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.tree.fields)

        prediction_info = self.tree.predict(input_data)
        prediction, path, confidence, distribution, instances = prediction_info

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
Example #23
0
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: map of field values
        by_name: Boolean, True when input_data is keyed by field names
        print_path: Boolean, when True the prediction path is written to `out`
        out: output handler for the prediction path
        with_confidence: Boolean, when True the method returns
                         [prediction, confidence, distribution, instances]
        """
        # Strips None values
        empty_fields = [(key, value) for (key, value) in input_data.items()
                        if value is None]
        for (key, value) in empty_fields:
            del input_data[key]

        # Checks input_data keys against field names and filters the ones
        # used in the model
        if by_name:
            wrong_keys = [key for key in input_data
                          if key not in self.all_inverted_fields]
            if wrong_keys:
                # lazy %-args: the message is only built if actually logged
                LOGGER.error("Wrong field names in input data: %s",
                             ", ".join(wrong_keys))
            input_data = {self.inverted_fields[key]: value
                          for key, value in input_data.items()
                          if key in self.inverted_fields}
        else:
            input_data = {key: value
                          for key, value in input_data.items()
                          if key in self.tree.fields}

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.tree.fields)

        prediction_info = self.tree.predict(input_data)
        prediction, path, confidence, distribution, instances = prediction_info

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
Example #24
0
    def predict(self, input_data, by_name=True):
        """Returns the class prediction and the probability distribution.

        input_data: map of field values
        by_name: Boolean, True when input_data is keyed by field names
        """
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # In case that missing_numerics is False, checks that all numeric
        # fields are present in input data.
        if not self.missing_numerics:
            for field_id, field in self.fields.items():
                if (field['optype'] not in OPTIONAL_FIELDS and
                        field_id not in input_data):
                    raise Exception("Failed to predict. Input"
                                    " data must contain values for all numeric"
                                    " fields to get a logistic regression"
                                    " prediction.")
        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        # Compute text and categorical field expansion
        unique_terms = self.get_unique_terms(input_data)

        probabilities = {}
        total = 0
        # Raw (unnormalized) contribution of each category
        for category, coefficients in self.coefficients.items():
            probabilities[category] = self.category_probability(
                input_data, unique_terms, coefficients)
            total += probabilities[category]
        # Normalizes contributions so they sum up to 1
        for category in probabilities:
            probabilities[category] /= total
        predictions = sorted(probabilities.items(),
                             key=lambda x: x[1], reverse=True)
        prediction, probability = predictions[0]
        return {
            "prediction": prediction,
            "probability": probability,
            "distribution": [{"category": category, "probability": probability}
                             for category, probability in predictions]}
Example #25
0
    def _prepare_for_distance(self, input_data, by_name=True):
        """Prepares the fields to be able to compute the distance2.

        input_data: map of field values
        by_name: Boolean, True when input_data is keyed by field names

        Returns the cleaned input data and the unique terms extracted
        from it.
        """
        # Checks and cleans input_data leaving the fields used in the model
        prepared = self.filter_input_data(input_data, by_name=by_name)

        # Missing numeric fields are filled in with the default average
        # (when one is given); otherwise the distance cannot be computed.
        try:
            self.fill_numeric_defaults(prepared, self.default_numeric_value)
        except ValueError:
            raise Exception("Missing values in input data. Input"
                            " data must contain values for all "
                            "numeric fields to compute a distance.")
        # Strips affixes for numeric values and casts to the final field type
        cast(prepared, self.fields)

        return prepared, self.get_unique_terms(prepared)
Example #26
0
    def _prepare_for_distance(self, input_data):
        """Prepares the fields to be able to compute the distance2.

        input_data: map of field values

        Returns the cleaned input data and the unique terms extracted
        from it.
        """
        # Checks and cleans input_data leaving the fields used in the model
        prepared = self.filter_input_data(input_data)

        # Missing numeric fields are filled in with the default average
        # (when one is given); otherwise the distance cannot be computed.
        try:
            self.fill_numeric_defaults(prepared, self.default_numeric_value)
        except ValueError:
            raise Exception("Missing values in input data. Input"
                            " data must contain values for all "
                            "numeric fields to compute a distance.")
        # Strips affixes for numeric values and casts to the final field type
        cast(prepared, self.fields)

        return prepared, self.get_unique_terms(prepared)
Example #27
0
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: map of field values
        by_name: Boolean, True when input_data is keyed by field names
        print_path: Boolean, when True the prediction path is written to `out`
        out: output handler for the prediction path
        with_confidence: Boolean, when True the method returns
                         [prediction, confidence, distribution, instances]
        """
        # Keeps only the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.tree.fields)

        prediction, path, confidence, distribution, instances = \
            self.tree.predict(input_data)

        if print_path:
            # Writes the rule path that leads to the prediction
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
Example #28
0
    def predict(self, input_data, by_name=True, add_unused_fields=False):
        """Returns the class prediction and the probability distribution.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        add_unused_fields: Boolean, if True adds the information about the
                           fields in the input_data that are not being used
                           in the model as predictors.

        """

        # Checks and cleans input_data leaving the fields used in the model
        new_data = self.filter_input_data( \
            input_data, by_name=by_name,
            add_unused_fields=add_unused_fields)
        if add_unused_fields:
            input_data, unused_fields = new_data
        else:
            input_data = new_data

        # In case that missing_numerics is False, checks that all numeric
        # fields are present in input data.
        if not self.missing_numerics:
            for field_id, field in self.fields.items():
                if (field['optype'] not in OPTIONAL_FIELDS and
                        field_id not in input_data):
                    raise Exception("Failed to predict. Input"
                                    " data must contain values for all numeric"
                                    " fields to get a logistic regression"
                                    " prediction.")
        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        if self.balance_fields:
            # Standardizes numeric inputs using the training mean and
            # standard deviation
            for field in input_data:
                if self.fields[field]['optype'] == 'numeric':
                    summary = self.fields[field]['summary']
                    mean = summary['mean']
                    stddev = summary['standard_deviation']
                    input_data[field] = (input_data[field] - mean) / stddev

        # Compute text and categorical field expansion
        unique_terms = self.get_unique_terms(input_data)

        probabilities = {}
        total = 0
        for category in self.coefficients:
            probability = self.category_probability( \
                input_data, unique_terms, category)
            # `order` makes probability ties deterministic
            order = self.categories[self.objective_id].index(category)
            probabilities[category] = {"category": category,
                                       "probability": probability,
                                       "order": order}
            total += probabilities[category]["probability"]
        # Normalizes contributions so they sum up to 1
        for category in probabilities:
            probabilities[category]["probability"] /= total
        predictions = sorted(probabilities.items(),
                             key=lambda x: (x[1]["probability"],
                                            - x[1]["order"]), reverse=True)
        # `order` is internal bookkeeping; strip it before returning
        # (indentation normalized to 4 spaces; was inconsistent)
        for prediction, probability in predictions:
            del probability['order']
        prediction, probability = predictions[0]

        result = {
            "prediction": prediction,
            "probability": probability["probability"],
            "distribution": [{"category": category,
                              "probability": probability["probability"]}
                             for category, probability in predictions]}

        if add_unused_fields:
            result.update({'unused_fields': unused_fields})
        return result
Example #29
0
    def predict(self, input_data, by_name=True, add_unused_fields=False,
                operating_point=None, operating_kind=None):
        """Returns the class prediction and the probability distribution
        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        add_unused_fields: Boolean, if True adds the information about the
                           fields in the input_data that are not being used
                           in the model as predictors.
        operating_point: In classification models, this is the point of the
                         ROC curve where the model will be used at. The
                         operating point can be defined in terms of:
                         - the positive_class, the class that is important to
                           predict accurately
                         - the probability_threshold,
                           the probability that is stablished
                           as minimum for the positive_class to be predicted.
                         The operating_point is then defined as a map with
                         two attributes, e.g.:
                           {"positive_class": "Iris-setosa",
                            "probability_threshold": 0.5}
        operating_kind: "probability". Sets the
                        property that decides the prediction. Used only if
                        no operating_point is used

        """

        # Checks and cleans input_data leaving the fields used in the model
        new_data = self.filter_input_data( \
            input_data, by_name=by_name,
            add_unused_fields=add_unused_fields)
        if add_unused_fields:
            input_data, unused_fields = new_data
        else:
            input_data = new_data

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        # When operating_point is used, we need the probabilities
        # of all possible classes to decide, so we delegate to the
        # operating-point/kind variants (input_data is already filtered,
        # hence by_name=False)
        if operating_point:
            return self.predict_operating( \
                input_data, by_name=False, operating_point=operating_point)
        if operating_kind:
            return self.predict_operating_kind( \
                input_data, by_name=False, operating_kind=operating_kind)

        # In case that missing_numerics is False, checks that all numeric
        # fields are present in input data.
        if not self.missing_numerics:
            for field_id, field in self.fields.items():
                if (not field['optype'] in OPTIONAL_FIELDS and
                        not field_id in input_data):
                    raise Exception("Failed to predict. Input"
                                    " data must contain values for all numeric"
                                    " fields to get a logistic regression"
                                    " prediction.")

        if self.balance_fields:
            # Standardizes numeric inputs (helper defined elsewhere)
            balance_input(input_data, self.fields)

        # Computes text and categorical field expansion
        unique_terms = self.get_unique_terms(input_data)

        probabilities = {}
        total = 0
        # Computes the contributions for each category
        for category in self.coefficients:
            probability = self.category_probability( \
                input_data, unique_terms, category)
            try:
                # `order` makes probability ties deterministic
                order = self.categories[self.objective_id].index(category)
            except ValueError:
                # NOTE(review): if `category` is neither listed nor the
                # empty string, `order` is left unbound and the assignment
                # below raises NameError — confirm upstream guarantees
                # that only u'' can be missing from the categories list.
                if category == u'':
                    order = len(self.categories[self.objective_id])
            probabilities[category] = {"category": category,
                                       "probability": probability,
                                       "order": order}
            total += probabilities[category]["probability"]
        # Normalizes the contributions to get a probability
        for category in probabilities:
            probabilities[category]["probability"] /= total
            probabilities[category]["probability"] = round( \
                probabilities[category]["probability"], PRECISION)

        # Chooses the most probable category as prediction
        predictions = sorted(probabilities.items(),
                             key=lambda x: (x[1]["probability"],
                                            - x[1]["order"]), reverse=True)
        for prediction, probability in predictions:
            del probability['order']
        prediction, probability = predictions[0]

        result = {
            "prediction": prediction,
            "probability": probability["probability"],
            "distribution": [{"category": category,
                              "probability": probability["probability"]}
                             for category, probability in predictions]}

        if add_unused_fields:
            result.update({'unused_fields': unused_fields})
        return result
Example #30
0
    def predict(self,
                input_data,
                missing_strategy=LAST_PREDICTION,
                operating_point=None,
                operating_kind=None,
                full=False):
        """Makes a prediction based on a number of field values.

        input_data: Input data to be predicted
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        operating_point: In classification models, the point of the ROC
                         curve where the model will be used at, defined as
                         a map with two attributes:
                         - positive_class: the class that is important to
                           predict accurately
                         - probability_threshold (or confidence_threshold):
                           the minimum probability (or confidence) for the
                           positive_class to be predicted
                         e.g.:
                           {"positive_class": "Iris-setosa",
                            "probability_threshold": 0.5}
                         or
                           {"positive_class": "Iris-setosa",
                            "confidence_threshold": 0.5}
        operating_kind: "probability" or "confidence". Sets the
                        property that decides the prediction. Used only if
                        no operating_point is used
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - confidence: prediction's confidence
                  - probability: prediction's probability
                  - path: rules that lead to the prediction
                  - count: number of training instances supporting the
                           prediction
                  - next: field to check in the next split
                  - min: minim value of the training instances in the
                         predicted node
                  - max: maximum value of the training instances in the
                         predicted node
                  - median: median of the values of the training instances
                            in the predicted node
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model
        """

        # Checks and cleans input_data leaving the fields used in the model
        unused_fields = []
        filtered = self.filter_input_data(input_data, add_unused_fields=full)
        if full:
            input_data, unused_fields = filtered
        else:
            input_data = filtered

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        full_prediction = self._predict(
            input_data, missing_strategy=missing_strategy,
            operating_point=operating_point, operating_kind=operating_kind,
            unused_fields=unused_fields)
        if not full:
            return full_prediction['prediction']
        # Drops attributes that were not computed for this prediction
        return dict((key, value) for key, value in full_prediction.items()
                    if value is not None)
Example #31
0
    def predict(self, input_data, by_name=True, add_unused_fields=False):
        """Returns the class prediction and the probability distribution.

        By default the input fields must be keyed by field name but you can
        use `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        add_unused_fields: Boolean, if True the result also lists the
                           input_data fields that the model does not use
                           as predictors.
        """

        # Checks and cleans input_data leaving the fields used in the model
        filtered = self.filter_input_data(
            input_data, by_name=by_name,
            add_unused_fields=add_unused_fields)
        unused_fields = None
        if add_unused_fields:
            input_data, unused_fields = filtered
        else:
            input_data = filtered

        # When missing_numerics is False every numeric field must come
        # with a value in the input data.
        if not self.missing_numerics:
            for field_id, field in self.fields.items():
                if (field['optype'] not in OPTIONAL_FIELDS and
                        field_id not in input_data):
                    raise Exception("Failed to predict. Input"
                                    " data must contain values for all numeric"
                                    " fields to get a logistic regression"
                                    " prediction.")

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        if self.balance_fields:
            balance_input(input_data, self.fields)

        # Computes text and categorical field expansion
        unique_terms = self.get_unique_terms(input_data)

        # Raw contribution of each category; `order` makes ties deterministic
        category_order = self.categories[self.objective_id]
        probabilities = {}
        total = 0
        for category in self.coefficients:
            probabilities[category] = {
                "category": category,
                "probability": self.category_probability(
                    input_data, unique_terms, category),
                "order": category_order.index(category)}
            total += probabilities[category]["probability"]
        # Normalizes the contributions to get a probability
        for info in probabilities.values():
            info["probability"] /= total

        # Sorts by probability, breaking ties by category order
        predictions = sorted(probabilities.items(),
                             key=lambda item: (item[1]["probability"],
                                               - item[1]["order"]),
                             reverse=True)
        # `order` is internal bookkeeping; strip it before returning
        for _, info in predictions:
            del info['order']
        prediction, probability = predictions[0]

        result = {
            "prediction": prediction,
            "probability": probability["probability"],
            "distribution": [{"category": category,
                              "probability": info["probability"]}
                             for category, info in predictions]}

        if add_unused_fields:
            result.update({'unused_fields': unused_fields})
        return result
Example #32
0
    def predict(self, input_data, operating_point=None, operating_kind=None,
                full=False):
        """Makes a prediction based on a number of field values.

        input_data: Input data to be predicted
        operating_point: In classification models, this is the point of the
                         ROC curve where the model will be used at. The
                         operating point can be defined in terms of:
                         - the positive_class, the class that is important to
                           predict accurately
                         - the probability_threshold,
                           the probability that is stablished
                           as minimum for the positive_class to be predicted.
                         The operating_point is then defined as a map with
                         two attributes, e.g.:
                           {"positive_class": "Iris-setosa",
                            "probability_threshold": 0.5}
        operating_kind: "probability". Sets the
                        property that decides the prediction. Used only if
                        no operating_point is used
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - probability: prediction's probability
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model
        """
        # Checks and cleans input_data leaving the fields used in the model
        unused_fields = []
        filtered = self.filter_input_data( \
            input_data, add_unused_fields=full)
        if full:
            input_data, unused_fields = filtered
        else:
            input_data = filtered

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        # Operating point/kind predictions need the probabilities of all
        # possible classes to decide, so they are delegated to the
        # specialized methods. Neither applies to regressions.
        if operating_point:
            if self.regression:
                raise ValueError("The operating_point argument can only be"
                                 " used in classifications.")
            return self.predict_operating( \
                input_data, operating_point=operating_point)
        if operating_kind:
            if self.regression:
                raise ValueError("The operating_point argument can only be"
                                 " used in classifications.")
            return self.predict_operating_kind( \
                input_data, operating_kind=operating_kind)

        # Computes text and categorical field expansion
        unique_terms = self.get_unique_terms(input_data)
        input_array = self.fill_array(input_data, unique_terms)

        # Several networks average their outputs; a single network
        # predicts directly.
        if self.networks:
            prediction = self.predict_list(input_array)
        else:
            prediction = self.predict_single(input_array)

        if not full:
            if isinstance(prediction, dict):
                return prediction["prediction"]
            return prediction

        # Full output: make sure the result is a dict and report the
        # input fields that the model did not use.
        if not isinstance(prediction, dict):
            prediction = {"prediction": prediction}
        prediction["unused_fields"] = unused_fields
        return prediction
Example #33
0
    def predict(self,
                input_data,
                by_name=True,
                print_path=False,
                out=sys.stdout,
                with_confidence=False,
                missing_strategy=LAST_PREDICTION,
                add_confidence=False,
                add_path=False,
                add_distribution=False,
                add_count=False,
                add_median=False,
                add_next=False,
                multiple=None):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        print_path: Boolean, if True the rules that lead to the prediction
                    are printed
        out: output handler
        with_confidence: Boolean, if True, all the information in the node
                         (prediction, confidence, distribution and count)
                         is returned in a list format
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        add_confidence: Boolean, if True adds confidence to the dict output
        add_path: Boolean, if True adds path to the dict output
        add_distribution: Boolean, if True adds distribution info to the
                          dict output
        add_count: Boolean, if True adds the number of instances in the
                       node to the dict output
        add_median: Boolean, if True adds the median of the values in
                    the distribution
        add_next: Boolean, if True adds the field that determines next
                  split in the tree
        multiple: For categorical fields, it will return the categories
                  in the distribution of the predicted node as a
                  list of dicts:
                    [{'prediction': 'Iris-setosa',
                      'confidence': 0.9154
                      'probability': 0.97
                      'count': 97},
                     {'prediction': 'Iris-virginica',
                      'confidence': 0.0103
                      'probability': 0.03,
                      'count': 3}]
                  The value of this argument can either be an integer
                  (maximum number of categories to be returned), or the
                  literal 'all', that will cause the entire distribution
                  in the node to be returned.

        """
        # Checks if this is a regression model, using PROPORTIONAL
        # missing_strategy
        if (self.tree.regression and missing_strategy == PROPORTIONAL
                and not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction = self.tree.predict(input_data,
                                       missing_strategy=missing_strategy)

        # Prediction path
        if print_path:
            out.write(
                utf8(u' AND '.join(prediction.path) +
                     u' => %s \n' % prediction.output))
            out.flush()
        # Default output: the bare predicted value. The flags below may
        # replace it with a list or a dict carrying extra node information.
        output = prediction.output
        if with_confidence:
            # Legacy list format: [output, confidence, distribution, count,
            # median]
            output = [
                prediction.output, prediction.confidence,
                prediction.distribution, prediction.count, prediction.median
            ]
        # `multiple` (classifications only) expands the predicted node's
        # distribution into a list of per-category dicts and overrides any
        # previously chosen output format.
        if multiple is not None and not self.tree.regression:
            output = []
            total_instances = float(prediction.count)
            distribution = enumerate(prediction.distribution)
            for index, [category, instances] in distribution:
                # 'all' returns every category; an integer caps how many
                # (distribution order is assumed most-frequent first —
                # TODO confirm against Tree.predict)
                if ((isinstance(multiple, basestring) and multiple == 'all')
                        or (isinstance(multiple, int) and index < multiple)):
                    prediction_dict = {
                        'prediction':
                        category,
                        'confidence':
                        ws_confidence(category, prediction.distribution),
                        'probability':
                        instances / total_instances,
                        'count':
                        instances
                    }
                    output.append(prediction_dict)
        else:
            # Any add_* flag switches the output to a dict keyed by the
            # requested node attributes (this discards the with_confidence
            # list, if it was built).
            if (add_confidence or add_path or add_distribution or add_count
                    or add_median or add_next):
                output = {'prediction': prediction.output}
                if add_confidence:
                    output.update({'confidence': prediction.confidence})
                if add_path:
                    output.update({'path': prediction.path})
                if add_distribution:
                    output.update({
                        'distribution':
                        prediction.distribution,
                        'distribution_unit':
                        prediction.distribution_unit
                    })
                if add_count:
                    output.update({'count': prediction.count})
                if self.tree.regression and add_median:
                    output.update({'median': prediction.median})
                if add_next:
                    # The next split field, translated to its name when known
                    field = (None if len(prediction.children) == 0 else
                             prediction.children[0].predicate.field)
                    if field is not None and field in self.fields:
                        field = self.fields[field]['name']
                    output.update({'next': field})

        return output
Example #34
0
    def predict(self,
                input_data,
                method=None,
                options=None,
                missing_strategy=LAST_PREDICTION,
                operating_point=None,
                operating_kind=None,
                median=False,
                full=False):
        """Makes a prediction based on the prediction made by every model.

        :param input_data: Test data to be used as input
        :param method: **deprecated**. Please check the `operating_kind`
                       attribute. Numeric key code for the following
                       combination methods in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE
        :param options: Options to be used in threshold filtered votes.
        :param missing_strategy: numeric key for the individual model's
                                 prediction method. See the model predict
                                 method.
        :param operating_point: In classification models, this is the point of
                                the ROC curve where the model will be used at.
                                The operating point can be defined in terms of:
                                  - the positive_class, the class that is
                                    important to predict accurately
                                  - its kind: probability, confidence or voting
                                  - its threshold: the minimum established
                                    for the positive_class to be predicted.
                                    The operating_point is then defined as a
                                    map with three attributes, e.g.:
                                       {"positive_class": "Iris-setosa",
                                        "kind": "probability",
                                        "threshold": 0.5}
        :param operating_kind: "probability", "confidence" or "votes". Sets the
                               property that decides the prediction.
                               Used only if no operating_point is used
        :param median: Uses the median of each individual model's predicted
                       node as individual prediction for the specified
                       combination method.
        :param full: Boolean that controls whether to include the prediction's
                     attributes. By default, only the prediction is produced.
                     If set to True, the rest of available information is
                     added in a dictionary format. The dictionary keys can be:
                      - prediction: the prediction value
                      - confidence: prediction's confidence
                      - probability: prediction's probability
                      - path: rules that lead to the prediction
                      - count: number of training instances supporting the
                               prediction
                      - next: field to check in the next split
                      - min: minim value of the training instances in the
                             predicted node
                      - max: maximum value of the training instances in the
                             predicted node
                      - median: median of the values of the training instances
                                in the predicted node
                      - unused_fields: list of fields in the input data that
                                       are not being used in the model
        """

        # Checks and cleans input_data leaving the fields used in the model
        new_data = self.filter_input_data( \
            input_data,
            add_unused_fields=full)
        unused_fields = None
        if full:
            input_data, unused_fields = new_data
        else:
            input_data = new_data

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        if median and method is None:
            # predictions with median are only available with old combiners
            method = PLURALITY_CODE

        if method is None and operating_point is None and \
            operating_kind is None and not median:
            # operating_point has precedence over operating_kind. If no
            # combiner is set, default operating kind is "probability"
            operating_kind = "probability"

        # Operating-point predictions are delegated and returned early;
        # they are not available for regressions.
        if operating_point:
            if self.regression:
                raise ValueError("The operating_point argument can only be"
                                 " used in classifications.")
            prediction = self.predict_operating( \
                input_data,
                missing_strategy=missing_strategy,
                operating_point=operating_point)
            if full:
                return prediction
            return prediction["prediction"]

        if operating_kind:
            if self.regression:
                # for regressions, operating_kind defaults to the old
                # combiners
                method = 1 if operating_kind == "confidence" else 0
                return self.predict( \
                    input_data, method=method,
                    options=options, missing_strategy=missing_strategy,
                    operating_point=None, operating_kind=None, full=full)
            prediction = self.predict_operating_kind( \
                input_data,
                missing_strategy=missing_strategy,
                operating_kind=operating_kind)
            return prediction

        if len(self.models_splits) > 1:
            # If there's more than one chunk of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVote([], boosting_offsets=self.boosting_offsets)

            for models_split in self.models_splits:
                models = self._get_models(models_split)
                multi_model = MultiModel(models,
                                         api=self.api,
                                         fields=self.fields)

                votes_split = multi_model._generate_votes(
                    input_data,
                    missing_strategy=missing_strategy,
                    unused_fields=unused_fields)
                # `median` swaps each model's prediction for its node median
                if median:
                    for prediction in votes_split.predictions:
                        prediction['prediction'] = prediction['median']
                votes.extend(votes_split.predictions)
        else:
            # When only one group of models is found you use the
            # corresponding multimodel to predict
            votes_split = self.multi_model._generate_votes(
                input_data,
                missing_strategy=missing_strategy,
                unused_fields=unused_fields)

            votes = MultiVote(votes_split.predictions,
                              boosting_offsets=self.boosting_offsets)
            if median:
                for prediction in votes.predictions:
                    prediction['prediction'] = prediction['median']

        # Boosted classifications need the full objective category list to
        # combine the votes, so `options` is overridden here
        if self.boosting is not None and not self.regression:
            categories = [ \
                d[0] for d in
                self.fields[self.objective_id]["summary"]["categories"]]
            options = {"categories": categories}
        result = votes.combine(method=method, options=options, full=full)
        if full:
            # A field is reported as unused only when every individual
            # model ignored it
            unused_fields = set(input_data.keys())
            for prediction in votes.predictions:
                unused_fields = unused_fields.intersection( \
                    set(prediction.get("unused_fields", [])))
            if not isinstance(result, dict):
                result = {"prediction": result}
            result['unused_fields'] = list(unused_fields)

        return result
Example #35
0
    def predict(self,
                input_data,
                operating_point=None,
                operating_kind=None,
                full=False):
        """Returns the class prediction and the probability distribution

        input_data: Input data to be predicted
        operating_point: In classification models, this is the point of the
                         ROC curve where the model will be used at. The
                         operating point can be defined in terms of:
                         - the positive_class, the class that is important to
                           predict accurately
                         - the probability_threshold,
                           the probability that is established
                           as minimum for the positive_class to be predicted.
                         The operating_point is then defined as a map with
                         two attributes, e.g.:
                           {"positive_class": "Iris-setosa",
                            "probability_threshold": 0.5}
        operating_kind: "probability". Sets the
                        property that decides the prediction. Used only if
                        no operating_point is used
        full: Boolean that controls whether to include the prediction's
              attributes. By default, only the prediction is produced. If set
              to True, the rest of available information is added in a
              dictionary format. The dictionary keys can be:
                  - prediction: the prediction value
                  - probability: prediction's probability
                  - distribution: distribution of probabilities for each
                                  of the objective field classes
                  - unused_fields: list of fields in the input data that
                                   are not being used in the model

        """

        # Checks and cleans input_data leaving the fields used in the model
        unused_fields = []
        new_data = self.filter_input_data( \
            input_data,
            add_unused_fields=full)
        if full:
            input_data, unused_fields = new_data
        else:
            input_data = new_data

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        # When operating_point is used, we need the probabilities
        # of all possible classes to decide, so we use
        # the `predict_probability` method
        if operating_point:
            return self.predict_operating( \
                input_data, operating_point=operating_point)
        if operating_kind:
            return self.predict_operating_kind( \
                input_data, operating_kind=operating_kind)

        # In case that missing_numerics is False, checks that all numeric
        # fields are present in input data.
        if not self.missing_numerics:
            check_no_missing_numerics(input_data, self.model_fields,
                                      self.weight_field)

        if self.balance_fields:
            balance_input(input_data, self.fields)

        # Computes text and categorical field expansion
        unique_terms = self.get_unique_terms(input_data)

        probabilities = {}
        total = 0
        # Computes the contributions for each category
        for category in self.coefficients:
            probability = self.category_probability( \
                input_data, unique_terms, category)
            try:
                order = self.categories[self.objective_id].index(category)
            except ValueError:
                # Only the empty pseudo-category (missing objective value)
                # is expected to be absent from the categories list. The
                # previous code left `order` unbound for any other missing
                # category, which later surfaced as a misleading NameError;
                # re-raising the ValueError fails at the real cause instead.
                if category != '':
                    raise
                order = len(self.categories[self.objective_id])
            probabilities[category] = {
                "category": category,
                "probability": probability,
                "order": order
            }
            total += probabilities[category]["probability"]
        # Normalizes the contributions to get a probability
        for category in probabilities:
            probabilities[category]["probability"] /= total
            probabilities[category]["probability"] = round( \
                probabilities[category]["probability"], PRECISION)

        # Chooses the most probable category as prediction; ties are
        # broken by the original category order (lower order wins)
        predictions = sorted(list(probabilities.items()),
                             key=lambda x:
                             (x[1]["probability"], -x[1]["order"]),
                             reverse=True)
        for prediction, probability in predictions:
            del probability['order']
        prediction, probability = predictions[0]

        result = {
            "prediction":
            prediction,
            "probability":
            probability["probability"],
            "distribution": [{
                "category": category,
                "probability": probability["probability"]
            } for category, probability in predictions]
        }

        if full:
            result.update({'unused_fields': unused_fields})
        else:
            result = result["prediction"]

        return result
Example #36
0
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False,
                missing_strategy=LAST_PREDICTION,
                add_confidence=False,
                add_path=False,
                add_distribution=False,
                add_count=False,
                add_median=False,
                add_next=False,
                add_min=False,
                add_max=False,
                multiple=None):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        print_path: Boolean, if True the rules that lead to the prediction
                    are printed
        out: output handler
        with_confidence: Boolean, if True, all the information in the node
                         (prediction, confidence, distribution and count)
                         is returned in a list format
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        add_confidence: Boolean, if True adds confidence to the dict output
        add_path: Boolean, if True adds path to the dict output
        add_distribution: Boolean, if True adds distribution info to the
                          dict output
        add_count: Boolean, if True adds the number of instances in the
                       node to the dict output
        add_median: Boolean, if True adds the median of the values in
                    the distribution
        add_next: Boolean, if True adds the field that determines next
                  split in the tree
        add_min: Boolean, if True adds the minimum value in the prediction's
                 distribution (for regressions only)
        add_max: Boolean, if True adds the maximum value in the prediction's
                 distribution (for regressions only)
        multiple: For categorical fields, it will return the categories
                  in the distribution of the predicted node as a
                  list of dicts:
                    [{'prediction': 'Iris-setosa',
                      'confidence': 0.9154
                      'probability': 0.97
                      'count': 97},
                     {'prediction': 'Iris-virginica',
                      'confidence': 0.0103
                      'probability': 0.03,
                      'count': 3}]
                  The value of this argument can either be an integer
                  (maximum number of categories to be returned), or the
                  literal 'all', that will cause the entire distribution
                  in the node to be returned.

        """
        # Checks if this is a regression model, using PROPORTIONAL
        # missing_strategy
        if (self.tree.regression and missing_strategy == PROPORTIONAL and
                not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction = self.tree.predict(input_data,
                                       missing_strategy=missing_strategy)

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(prediction.path) + u' => %s \n' %
                           prediction.output))
            out.flush()
        # Default output: the bare predicted value. The flags below may
        # replace it with a list or a dict carrying extra node information.
        output = prediction.output
        if with_confidence:
            # Legacy list format: [output, confidence, distribution, count,
            # median]
            output = [prediction.output,
                      prediction.confidence,
                      prediction.distribution,
                      prediction.count,
                      prediction.median]
        # `multiple` (classifications only) expands the predicted node's
        # distribution into a list of per-category dicts and overrides any
        # previously chosen output format.
        if multiple is not None and not self.tree.regression:
            output = []
            total_instances = float(prediction.count)
            distribution = enumerate(prediction.distribution)
            for index, [category, instances] in distribution:
                # 'all' returns every category; an integer caps how many
                # (distribution order is assumed most-frequent first —
                # TODO confirm against Tree.predict)
                if ((isinstance(multiple, basestring) and multiple == 'all') or
                        (isinstance(multiple, int) and index < multiple)):
                    prediction_dict = {
                        'prediction': category,
                        'confidence': ws_confidence(category,
                                                    prediction.distribution),
                        'probability': instances / total_instances,
                        'count': instances}
                    output.append(prediction_dict)
        else:
            # Any add_* flag switches the output to a dict keyed by the
            # requested node attributes (this discards the with_confidence
            # list, if it was built).
            if (add_confidence or add_path or add_distribution or add_count or
                    add_median or add_next or add_min or add_max):
                output = {'prediction': prediction.output}
                if add_confidence:
                    output.update({'confidence': prediction.confidence})
                if add_path:
                    output.update({'path': prediction.path})
                if add_distribution:
                    output.update(
                        {'distribution': prediction.distribution,
                         'distribution_unit': prediction.distribution_unit})
                if add_count:
                    output.update({'count': prediction.count})
                if self.tree.regression and add_median:
                    output.update({'median': prediction.median})
                if add_next:
                    # The next split field, translated to its name when known
                    field = (None if len(prediction.children) == 0 else
                             prediction.children[0].predicate.field)
                    if field is not None and field in self.fields:
                        field = self.fields[field]['name']
                    output.update({'next': field})
                if self.tree.regression and add_min:
                    output.update({'min': prediction.min})
                if self.tree.regression and add_max:
                    output.update({'max': prediction.max})

        return output