Example #1
    def summarize(self, out=sys.stdout):
        """Prints a summary of the cluster info

        """
        out.write(u"Cluster of %s centroids\n\n" % len(self.centroids))

        out.write(u"Data distribution:\n")
        print_distribution(self.get_data_distribution(), out=out)
        out.write(u"\n\n")
        centroids_list = sorted(self.centroids, key=lambda x: x.name)

        out.write(u"Centroids features:\n")
        for centroid in centroids_list:
            out.write(utf8(u"\n%s: " % centroid.name))
            connector = ""
            for field_id, value in centroid.center.items():
                if isinstance(value, basestring):
                    value = u"\"%s\"" % value
                out.write(
                    utf8(u"%s%s: %s" %
                         (connector, self.fields[field_id]['name'], value)))
                connector = ", "
        out.write(u"\n\n")

        out.write(u"Data distance statistics:\n\n")
        for centroid in centroids_list:
            centroid.print_statistics(out=out)

        out.write(u"Intercentroids distance:\n\n")
        for centroid in centroids_list:
            out.write(utf8(u"To centroid: %s\n" % centroid.name))
            for measure, result in self.centroids_distance(centroid):
                out.write(u"%s%s: %s\n" % (INDENT, measure, result))
            out.write(u"\n")
Example #2
    def summarize(self, out=sys.stdout):
        """Prints a summary of the cluster info

        """
        out.write(u"Cluster of %s centroids\n\n" % len(self.centroids))

        out.write(u"Data distribution:\n")
        print_distribution(self.get_data_distribution(), out=out)
        out.write(u"\n\n")
        centroids_list = sorted(self.centroids, key=lambda x: x.name)

        out.write(u"Centroids features:\n")
        for centroid in centroids_list:
            out.write(utf8(u"\n%s: " % centroid.name))
            connector = ""
            for field_id, value in centroid.center.items():
                if isinstance(value, basestring):
                    value = u"\"%s\"" % value
                out.write(utf8(u"%s%s: %s" % (connector,
                                              self.fields[field_id]['name'],
                                              value)))
                connector = ", "
        out.write(u"\n\n")

        out.write(u"Data distance statistics:\n\n")
        for centroid in centroids_list:
            centroid.print_statistics(out=out)

        if len(self.centroids) > 1:
            out.write(u"Intercentroids distance:\n\n")
            for centroid in centroids_list:
                out.write(utf8(u"To centroid: %s\n" % centroid.name))
                for measure, result in self.centroids_distance(centroid):
                    out.write(u"%s%s: %s\n" % (INDENT, measure, result))
                out.write(u"\n")
Example #3
    def summarize(self, out=sys.stdout):
        """Prints a summary of the cluster info

        """
        report_header = ''
        if self.is_g_means:
            report_header = \
                u'G-means Cluster (critical_value=%d)' % self.critical_value
        else:
            report_header = u'K-means Cluster (k=%d)' % self.k

        out.write(report_header + ' with %d centroids\n\n' %
                  len(self.centroids))

        out.write(u"Data distribution:\n")
        # "Global" is set as first entry
        self.print_global_distribution(out=out)
        print_distribution(self.get_data_distribution(), out=out)
        out.write(u"\n")
        centroids_list = [self.cluster_global] if self.cluster_global else []
        centroids_list.extend(sorted(self.centroids, key=lambda x: x.name))

        out.write(u"Cluster metrics:\n")
        self.print_ss_metrics(out=out)
        out.write(u"\n")


        out.write(u"Centroids:\n")
        for centroid in centroids_list:
            out.write(utf8(u"\n%s%s: " % (INDENT, centroid.name)))
            connector = ""
            for field_id, value in centroid.center.items():
                if isinstance(value, basestring):
                    value = u"\"%s\"" % value
                out.write(utf8(u"%s%s: %s" % (connector,
                                              self.fields[field_id]['name'],
                                              value)))
                connector = ", "
        out.write(u"\n\n")

        out.write(u"Distance distribution:\n\n")
        for centroid in centroids_list:
            centroid.print_statistics(out=out)
        out.write(u"\n")

        if len(self.centroids) > 1:
            out.write(u"Intercentroid distance:\n\n")
            centroids_list = (centroids_list[1:] if self.cluster_global else
                              centroids_list)
            for centroid in centroids_list:
                out.write(utf8(u"%sTo centroid: %s\n" % (INDENT,
                                                         centroid.name)))
                for measure, result in self.centroids_distance(centroid):
                    out.write(u"%s%s: %s\n" % (INDENT * 2, measure, result))
                out.write(u"\n")
Example #4
    def summarize(self, out=sys.stdout):
        """Prints a summary of the cluster info

        """
        report_header = ''
        if self.is_g_means:
            report_header = \
                'G-means Cluster (critical_value=%d)' % self.critical_value
        else:
            report_header = 'K-means Cluster (k=%d)' % self.k

        out.write(report_header + ' with %d centroids\n\n' %
                  len(self.centroids))

        out.write("Data distribution:\n")
        # "Global" is set as first entry
        self.print_global_distribution(out=out)
        print_distribution(self.get_data_distribution(), out=out)
        out.write("\n")
        centroids_list = [self.cluster_global] if self.cluster_global else []
        centroids_list.extend(sorted(self.centroids, key=lambda x: x.name))

        out.write("Cluster metrics:\n")
        self.print_ss_metrics(out=out)
        out.write("\n")


        out.write("Centroids:\n")
        for centroid in centroids_list:
            out.write(utf8("\n%s%s: " % (INDENT, centroid.name)))
            connector = ""
            for field_id, value in list(centroid.center.items()):
                if isinstance(value, str):
                    value = "\"%s\"" % value
                out.write(utf8("%s%s: %s" % (connector,
                                             self.fields[field_id]['name'],
                                             value)))
                connector = ", "
        out.write("\n\n")

        out.write("Distance distribution:\n\n")
        for centroid in centroids_list:
            centroid.print_statistics(out=out)
        out.write("\n")

        if len(self.centroids) > 1:
            out.write("Intercentroid distance:\n\n")
            centroids_list = (centroids_list[1:] if self.cluster_global else
                              centroids_list)
            for centroid in centroids_list:
                out.write(utf8("%sTo centroid: %s\n" % (INDENT,
                                                        centroid.name)))
                for measure, result in self.centroids_distance(centroid):
                    out.write("%s%s: %s\n" % (INDENT * 2, measure, result))
                out.write("\n")
Example #5
    def list_fields(self, out):
        """Lists a description of the model's fields.

        """
        out.write(utf8(u'<%-32s : %s>\n' % (
            self.fields[self.objective_field]['name'],
            self.fields[self.objective_field]['optype'])))
        out.flush()

        for field in [(val['name'], val['optype']) for key, val in
                      sort_fields(self.fields)
                      if key != self.objective_field]:
            out.write(utf8(u'[%-32s : %s]\n' % (field[0], field[1])))
            out.flush()
        return self.fields
Example #6
    def list_fields(self, out):
        """Lists a description of the model's fields.

        """
        out.write(utf8(u'<%-32s : %s>\n' % (
            self.fields[self.objective_id]['name'],
            self.fields[self.objective_id]['optype'])))
        out.flush()

        for field in [(val['name'], val['optype']) for key, val in
                      sort_fields(self.fields)
                      if key != self.objective_id]:
            out.write(utf8(u'[%-32s : %s]\n' % (field[0], field[1])))
            out.flush()
        return self.fields
Example #7
def list_fields(model, out=sys.stdout):
    """Prints descriptions of the fields for this model.

    """
    out.write(
        utf8('<%-32s : %s>\n' % (model.fields[model.objective_id]['name'],
                                 model.fields[model.objective_id]['optype'])))
    out.flush()

    for field in [(val['name'], val['optype'])
                  for key, val in sort_fields(model.fields)
                  if key != model.objective_id]:
        out.write(utf8('[%-32s : %s]\n' % (field[0], field[1])))
        out.flush()
    return model.fields
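A sketch of calling the module-level variant above (Example #7); the Model import and id are assumptions and the field names hypothetical. The objective field prints in angle brackets and each input field in square brackets, with names padded to 32 characters:

from bigml.model import Model  # assumed local model class

local_model = Model("model/<id>")
list_fields(local_model)  # defaults to sys.stdout
# <species                          : categorical>
# [petal length                     : numeric]
# [sepal width                      : numeric]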
Example #8
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False,
                missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name, but you can
        set `by_name` to False to input them keyed by field id.

        """
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction_info = self.tree.predict(input_data,
                                            missing_strategy=missing_strategy)
        prediction, path, confidence, distribution, instances = prediction_info

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
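Usage sketch for this predict signature; the model id is a placeholder and the input keys are hypothetical field names:

from bigml.model import Model  # assumed local model class

local_model = Model("model/<id>")

# Keyed by field name (the default), printing the decision path
prediction = local_model.predict({"petal length": 2.45, "sepal width": 3.2},
                                 print_path=True)

# Richer output: [prediction, confidence, distribution, instances]
info = local_model.predict({"petal length": 2.45, "sepal width": 3.2},
                           with_confidence=True)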
Example #9
    def python(self, out, docstring, input_map=False):
        """Writes a python function that implements the model.

        """
        args = []
        parameters = sort_fields(self.fields)
        if not input_map:
            input_map = len(parameters) > MAX_ARGS_LENGTH
        reserved_keywords = keyword.kwlist if not input_map else None
        prefix = "_" if not input_map else ""
        for field in parameters:
            slug = slugify(self.fields[field[0]]['name'],
                           reserved_keywords=reserved_keywords, prefix=prefix)
            self.fields[field[0]].update(slug=slug)
            if not input_map:
                if field[0] != self.objective_field:
                    args.append("%s=None" % (slug))
        if input_map:
            args.append("data={}")
        predictor_definition = (u"def predict_%s" %
                                self.fields[self.objective_field]['slug'])
        depth = len(predictor_definition) + 1
        predictor = u"%s(%s):\n" % (predictor_definition,
                                   (",\n" + " " * depth).join(args))
        predictor_doc = (INDENT + u"\"\"\" " + docstring +
                         u"\n" + INDENT + u"\"\"\"\n")
        body, term_analysis_predicates = self.python_body(input_map=input_map)
        terms_body = ""
        if term_analysis_predicates:
            terms_body = self.term_analysis_body(term_analysis_predicates)
        predictor += predictor_doc + terms_body + body
        out.write(utf8(predictor))
        out.flush()
Example #10
    def python(self, out, docstring, input_map=False):
        """Writes a python function that implements the model.

        """
        args = []
        parameters = sort_fields(self.fields)
        if not input_map:
            input_map = len(parameters) > MAX_ARGS_LENGTH
        reserved_keywords = keyword.kwlist if not input_map else None
        prefix = "_" if not input_map else ""
        for field in parameters:
            slug = slugify(self.fields[field[0]]['name'],
                           reserved_keywords=reserved_keywords,
                           prefix=prefix)
            self.fields[field[0]].update(slug=slug)
            if not input_map:
                if field[0] != self.objective_field:
                    args.append("%s=None" % (slug))
        if input_map:
            args.append("data={}")
        predictor_definition = (u"def predict_%s" %
                                self.fields[self.objective_field]['slug'])
        depth = len(predictor_definition) + 1
        predictor = u"%s(%s):\n" % (predictor_definition,
                                    (",\n" + " " * depth).join(args))
        predictor_doc = (INDENT + u"\"\"\" " + docstring + u"\n" + INDENT +
                         u"\"\"\"\n")
        body, term_analysis_predicates = self.python_body(input_map=input_map)
        terms_body = ""
        if term_analysis_predicates:
            terms_body = self.term_analysis_body(term_analysis_predicates)
        predictor += predictor_doc + terms_body + body
        out.write(utf8(predictor))
        out.flush()
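These python() variants emit a standalone predict_<objective> function. A sketch of driving them, assuming the Model wrapper of that era exposed a python(out) method that delegated to the tree with the model's docstring (treat that wiring as an assumption; the id is a placeholder):

from bigml.model import Model  # assumed local model wrapper

local_model = Model("model/<id>")
with open("predictor.py", "w") as handler:
    local_model.python(out=handler)  # writes def predict_<objective>(...)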
Example #11
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False,
                missing_strategy=LAST_PREDICTION):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name, but you can
        set `by_name` to False to input them keyed by field id.

        """
        # Checks if this is a regression model, using PROPORTIONAL
        # missing_strategy
        if (self.tree.regression and missing_strategy == PROPORTIONAL and
                not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction_info = self.tree.predict(input_data,
                                            missing_strategy=missing_strategy)
        prediction, path, confidence, distribution, instances = prediction_info

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
Example #12
File: model.py Project: rmmx/python
    def hadoop_python_reducer(self, out=sys.stdout):
        """Returns a hadoop reducer to make predictions in python

        """

        output = \
u"""#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys

count = 0
previous = None

def print_result(values, prediction, count):
    \"\"\"Prints input data and predicted value as an ordered list.

    \"\"\"
    result = \"[%s, %s]\" % (values, prediction)
    print u\"%s\\t%s\" % (result, count)

for line in sys.stdin:
    values, prediction = line.strip().split('\\t')
    if previous is None:
        previous = (values, prediction)
    if values != previous[0]:
        print_result(previous[0], previous[1], count)
        previous = (values, prediction)
        count = 0
    count += 1
if count > 0:
    print_result(previous[0], previous[1], count)
"""
        out.write(utf8(output))
        out.flush()
Example #13
    def python(self, out, docstring, input_map=False):
        """Writes a python function that implements the model.

        """
        args = []
        parameters = sort_fields(self.fields)
        if not input_map:
            input_map = len(parameters) > MAX_ARGS_LENGTH
        for field in parameters:
            slug = slugify(self.fields[field[0]]['name'])
            self.fields[field[0]].update(slug=slug)
            if not input_map:
                if field[0] != self.objective_field:
                    args.append("%s=None" % (slug))
        if input_map:
            args.append("data={}")
        predictor_definition = (u"def predict_%s" %
                                self.fields[self.objective_field]['slug'])
        depth = len(predictor_definition) + 1
        predictor = u"%s(%s):\n" % (predictor_definition,
                                   (",\n" + " " * depth).join(args))
        predictor_doc = (INDENT + u"\"\"\" " + docstring +
                         u"\n" + INDENT + u"\"\"\"\n")
        predictor += predictor_doc + self.python_body(input_map=input_map)
        out.write(utf8(predictor))
        out.flush()
Example #14
def i_check_the_data_distribution(step, file):
    distribution = g.get_data_distribution(world.local_model)

    distribution_str = ''
    for bin_value, bin_instances in distribution:
        distribution_str += "[%s,%s]\n" % (bin_value, bin_instances)
    world.output = utf8(distribution_str)
    i_check_if_the_output_is_like_expected_file(step, file)
Example #15
def print_importance(instance, out=sys.stdout):
    """Print a field importance structure

    """
    count = 1
    field_importance, fields = instance.field_importance_data()
    for [field, importance] in field_importance:
        out.write(utf8(u"    %s. %s: %.2f%%\n" % (count, fields[field]["name"], round(importance, 4) * 100)))
        count += 1
Example #16
def hadoop_python_reducer(out=sys.stdout):
    """Generates a hadoop reducer to make predictions in python

    """

    with open(HADOOP_REDUCER_TEMPLATE) as template_handler:
        output = template_handler.read()
    out.write(utf8(output))
    out.flush()
Example #17
        def print_importance(out=sys.stdout):
            """Prints field importance

            """
            count = 1
            for [field, importance] in self.field_importance:
                out.write(utf8(u"    %s. %s: %.2f%%\n" % (count,
                               self.tree.fields[field]['name'],
                               round(importance, 4) * 100)))
                count += 1
Example #18
    def rules(self, out):
        """Prints out an IF-THEN rule version of the tree.

        """
        for field in sort_fields(self.fields):
            slug = slugify(self.fields[field[0]]['name'])
            self.fields[field[0]].update(slug=slug)
        out.write(utf8(self.generate_rules()))
        out.flush()
Example #19
    def list_fields(self, out):
        """Lists a description of the model's fields.

        """

        for field in [(val['name'], val['optype']) for _, val in
                      sort_fields(self.fields)]:
            out.write(utf8(u'[%-32s : %s]\n' % (field[0], field[1])))
            out.flush()
        return self.fields
Example #20
def print_distribution(distribution, out=sys.stdout):
    """Prints distribution data

    """
    total = sum(group[1] for group in distribution)
    for group in distribution:
        out.write(
            utf8(u"    %s: %.2f%% (%d instance%s)\n" %
                 (group[0], round(group[1] * 1.0 / total, 4) * 100, group[1],
                  "" if group[1] == 1 else "s")))
Example #21
    def tableau(self, out, ids_path=None, subtree=True):
        """Writes a Tableau function that implements the model.

        """
        body = self.tableau_body(ids_path=ids_path, subtree=subtree)
        if not body:
            return False
        out.write(utf8(body))
        out.flush()
        return True
Example #22
def i_check_the_predictions_distribution(step, file):
    predictions = g.get_prediction_distribution(world.local_model)

    distribution_str = ''
    for group, instances in predictions:
        distribution_str += "[%s,%s]\n" % (group, instances)

    world.output = utf8(distribution_str)

    i_check_if_the_output_is_like_expected_file(step, file)
Example #23
    def predict(self,
                input_data,
                by_name=True,
                print_path=False,
                out=sys.stdout,
                with_confidence=False,
                missing_strategy=LAST_PREDICTION,
                add_confidence=False,
                add_path=False,
                add_distribution=False,
                add_count=False):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name, but you can
        set `by_name` to False to input them keyed by field id.

        """
        # Checks if this is a regression model, using PROPORTIONAL
        # missing_strategy
        if (self.tree.regression and missing_strategy == PROPORTIONAL
                and not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction_info = self.tree.predict(input_data,
                                            missing_strategy=missing_strategy)
        prediction, path, confidence, distribution, instances = prediction_info

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        output = prediction
        if with_confidence:
            output = [prediction, confidence, distribution, instances]
        if add_confidence or add_path or add_distribution or add_count:
            output = {'prediction': prediction}
            if add_confidence:
                output.update({'confidence': confidence})
            if add_path:
                output.update({'path': path})
            if add_distribution:
                output.update({'distribution': distribution})
            if add_count:
                output.update({'count': instances})

        return output
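Depending on the flags, this version returns a bare value, a list, or a dict. A usage sketch with a placeholder model id and a hypothetical field name; the dict values shown are illustrative:

from bigml.model import Model  # assumed local model class

local_model = Model("model/<id>")
result = local_model.predict({"petal length": 2.45},
                             add_confidence=True,
                             add_path=True)
# e.g. {'prediction': 'Iris-setosa',
#       'confidence': 0.92,
#       'path': ['petal length <= 2.45']}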
Example #24
    def python(self, out=sys.stdout):
        """Generates the code in python that creates the forecasts

        """
        attributes = [u"l", u"b", u"s", u"phi", u"value", u"slope"]
        components = {}
        model_components = {}
        model_names = []
        out.write(
            utf8(USAGE_DOC %
                 (self.resource_id, self.fields[self.objective_id]["name"])))
        output = [u"COMPONENTS = \\"]
        for field_id, models in self.ets_models.items():
            for model in models:
                final_state = model.get("final_state", {})
                attrs = {}
                for attribute in attributes:
                    if attribute in model:
                        attrs.update({attribute: model[attribute]})
                    elif attribute in final_state:
                        attrs.update({attribute: final_state[attribute]})
                model_names.append(model["name"])
                model_components[model["name"]] = attrs
            field_name = self.fields[field_id]["name"]
            if field_name not in components:
                components[field_name] = model_components
        partial_output = StringIO.StringIO()
        pprint.pprint(components, stream=partial_output)
        for line in partial_output.getvalue().split("\n"):
            output.append(u"%s%s" % (INDENT, line))

        out.write(utf8(u"\n".join(output)))

        model_names = list(set(model_names))
        if any(name in model_names for name in ["naive", "mean"]):
            out.write(utf8(TRIVIAL_MODEL))
        if any("," in name and name.split(",")[2] in ["A", "M"] for \
               name in model_names):
            out.write(utf8(SEASONAL_CODE))
        trends = [name.split(",")[1] for name in model_names if "," in name]
        trends.extend([name for name in model_names if "," not in name])
        trends = set(trends)
        models_function = []
        for trend in trends:
            models_function.append("\"%s\": _%s_forecast" % (trend, trend))
            out.write(utf8(SUBMODELS_CODE[trend]))
        out.write(utf8(u"\n\nMODELS = \\\n"))
        out.write(utf8("%s%s%s" % \
            (u"    {", u",\n     ".join(models_function), u"}")))

        out.write(utf8(FORECAST_FUNCTION))
Example #25
        def print_distribution(distribution, out=sys.stdout):
            """Prints distribution data

            """
            total = sum(group[1] for group in distribution)
            for group in distribution:
                out.write(utf8(u"    %s: %.2f%% (%d instance%s)\n" % (group[0],
                               round(group[1] * 1.0 / total, 4) * 100,
                               group[1],
                               "" if group[1] == 1 else "s")))
Example #26
def print_importance(instance, out=sys.stdout):
    """Print a field importance structure

    """
    count = 1
    field_importance, fields = instance.field_importance_data()
    for [field, importance] in field_importance:
        out.write(
            utf8(u"    %s. %s: %.2f%%\n" %
                 (count, fields[field]['name'], round(importance, 4) * 100)))
        count += 1
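To see the shape of the output, a self-contained toy run assuming the function above is defined: FakeModel is a hypothetical stand-in whose field_importance_data() returns the two structures the function unpacks, utf8 is stubbed as identity, and all names and numbers are invented.

def utf8(text):  # stand-in for the library helper
    return text

class FakeModel:  # hypothetical: anything with field_importance_data()
    def field_importance_data(self):
        importance = [["000001", 0.6214], ["000002", 0.3786]]
        fields = {"000001": {"name": "petal length"},
                  "000002": {"name": "sepal width"}}
        return importance, fields

print_importance(FakeModel())
#     1. petal length: 62.14%
#     2. sepal width: 37.86%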
Example #27
    def python(self, out=sys.stdout):
        """Generates the code in python that creates the forecasts

        """
        attributes = [u"l", u"b", u"s", u"phi", u"value", u"slope"]
        components = {}
        model_components = {}
        model_names = []
        out.write(utf8(USAGE_DOC % (self.resource_id,
                                    self.fields[self.objective_id]["name"])))
        output = [u"COMPONENTS = \\"]
        for field_id, models in self.ets_models.items():
            for model in models:
                final_state = model.get("final_state", {})
                attrs = {}
                for attribute in attributes:
                    if attribute in model:
                        attrs.update({attribute: model[attribute]})
                    elif attribute in final_state:
                        attrs.update({attribute: final_state[attribute]})
                model_names.append(model["name"])
                model_components[model["name"]] = attrs
            field_name = self.fields[field_id]["name"]
            if field_name not in components:
                components[field_name] = model_components
        partial_output = StringIO.StringIO()
        pprint.pprint(components, stream=partial_output)
        for line in partial_output.getvalue().split("\n"):
            output.append(u"%s%s" % (INDENT, line))

        out.write(utf8(u"\n".join(output)))

        model_names = set(model_names)
        if any(name in model_names for name in ["naive", "mean"]):
            out.write(utf8(TRIVIAL_MODEL))
        if (any("," in name and name.split(",")[2] in ["A", "M"] for \
                name in model_names)):
            out.write(utf8(SEASONAL_CODE))
        trends = [name.split(",")[1] for name in model_names if "," in name]
        trends.extend([name for name in model_names if "," not in name])
        trends = set(trends)
        models_function = []
        for trend in trends:
            models_function.append("\"%s\": _%s_forecast" % (trend, trend))
            out.write(utf8(SUBMODELS_CODE[trend]))
        out.write(utf8(u"\n\nMODELS = \\\n"))
        out.write(utf8("%s%s%s" % \
            (u"    {", u",\n     ".join(models_function), u"}")))

        out.write(utf8(FORECAST_FUNCTION))
Example #28
def centroid_features(centroid, field_ids, encode=True):
    """Returns features defining the centroid according to the list
       of common field ids that define the centroids.

    """
    features = []
    for field_id in field_ids:
        value = centroid.center[field_id]
        if isinstance(value, str) and encode:
            value = utf8(value)
        features.append(value)
    return features
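centroid_features only reads centroid.center, so a minimal stand-in object suffices to see the behavior; the field ids and values are invented, and encode=False avoids the utf8 dependency:

from collections import namedtuple

# Hypothetical centroid stand-in; only .center is required
Centroid = namedtuple("Centroid", ["center"])
centroid = Centroid(center={"000000": 2.5, "000001": "Iris-setosa"})

print(centroid_features(centroid, ["000000", "000001"], encode=False))
# [2.5, 'Iris-setosa']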
Example #29
def tree_csv(model, file_name=None, leaves_only=False):
    """Outputs the node structure to a CSV file or array

    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    headers_names = []
    if model.regression:
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("error")
        max_bins = get_node(model.tree)[model.offsets["max_bins"]]
        for index in range(0, max_bins):
            headers_names.append("bin%s_value" % index)
            headers_names.append("bin%s_instances" % index)
    else:
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("confidence")
        headers_names.append("impurity")
        node = get_node(model.tree)
        for category, _ in node[model.offsets["distribution"]]:
            headers_names.append(category)

    nodes_generator = get_nodes_info(model,
                                     headers_names,
                                     leaves_only=leaves_only)
    if file_name is not None:
        with UnicodeWriter(file_name) as writer:
            writer.writerow([utf8(header) for header in headers_names])
            for row in nodes_generator:
                writer.writerow([
                    item if not isinstance(item, str) else utf8(item)
                    for item in row
                ])
        return file_name
    rows = []
    rows.append(headers_names)
    for row in nodes_generator:
        rows.append(row)
    return rows
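Hedged usage sketch for tree_csv; the import path for the helper has moved across bindings versions, so treat the wiring as an assumption and the model id as a placeholder:

from bigml.model import Model  # assumed local model wrapper

local_model = Model("model/<id>")
rows = tree_csv(local_model)                   # headers + one row per node
leaves = tree_csv(local_model, leaves_only=True)
tree_csv(local_model, file_name="nodes.csv")   # or stream rows to a CSV file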
Example #30
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name, but you can
        set `by_name` to False to input them keyed by field id.

        """
        empty_fields = [(key, value) for (key, value) in input_data.items()
                        if value is None]
        for (key, value) in empty_fields:
            del input_data[key]

        if by_name:
            wrong_keys = [key for key in input_data.keys()
                          if key not in self.all_inverted_fields]
            if wrong_keys:
                LOGGER.error("Wrong field names in input data: %s" %
                             ", ".join(wrong_keys))
            input_data = dict(
                [[self.inverted_fields[key], value]
                    for key, value in input_data.items()
                    if key in self.inverted_fields])

        for (key, value) in input_data.items():
            if ((self.tree.fields[key]['optype'] == 'numeric' and
                    isinstance(value, basestring)) or (
                    self.tree.fields[key]['optype'] != 'numeric' and
                    not isinstance(value, basestring))):
                try:
                    input_data.update({key:
                                       map_type(self.tree.fields[key]
                                                ['optype'])(value)})
                except Exception:
                    raise Exception(u"Mismatch input data type in field "
                                    u"\"%s\" for value %s." %
                                    (self.tree.fields[key]['name'],
                                     value))

        prediction_info = self.tree.predict(input_data)
        prediction, path, confidence, distribution, instances = prediction_info

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
Example #31
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name, but you can
        set `by_name` to False to input them keyed by field id.

        """
        # Strips None values
        empty_fields = [(key, value) for (key, value) in input_data.items()
                        if value is None]
        for (key, value) in empty_fields:
            del input_data[key]

        # Checks input_data keys against field names and filters the ones
        # used in the model
        if by_name:
            wrong_keys = [key for key in input_data.keys()
                          if key not in self.all_inverted_fields]
            if wrong_keys:
                LOGGER.error("Wrong field names in input data: %s" %
                             ", ".join(wrong_keys))
            input_data = dict(
                [[self.inverted_fields[key], value]
                    for key, value in input_data.items()
                    if key in self.inverted_fields])
        else:
            input_data = dict(
                [[key, value]
                    for key, value in input_data.items()
                    if key in self.tree.fields])

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.tree.fields)

        prediction_info = self.tree.predict(input_data)
        prediction, path, confidence, distribution, instances = prediction_info

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
Example #32
    def tree_rules(tree,
                   offsets,
                   objective_id,
                   fields,
                   out,
                   ids_path=None,
                   subtree=True):
        """Prints out an IF-THEN rule version of the tree.

        """
        for field in sort_fields(fields):
            slug = slugify(fields[field[0]]['name'])
            fields[field[0]].update(slug=slug)
        out.write(
            utf8(
                generate_rules(tree,
                               offsets,
                               objective_id,
                               fields,
                               ids_path=ids_path,
                               subtree=subtree)))
        out.flush()
Example #33
def tree_tableau(tree,
                 offsets,
                 fields,
                 objective_id,
                 out,
                 ids_path=None,
                 subtree=True,
                 attr=DFT_ATTR):
    """Writes a Tableau function that implements the model.

    """
    body = tableau_body(tree,
                        offsets,
                        fields,
                        objective_id,
                        ids_path=ids_path,
                        subtree=subtree,
                        attr=attr)
    if not body:
        return False
    out.write(utf8(body))
    out.flush()
    return True
Example #34
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name, but you can
        set `by_name` to False to input them keyed by field id.

        """
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.tree.fields)

        prediction_info = self.tree.predict(input_data)
        prediction, path, confidence, distribution, instances = prediction_info

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
            out.flush()
        if with_confidence:
            return [prediction, confidence, distribution, instances]
        return prediction
Example #35
File: model.py Project: rmmx/python
    def hadoop_python_mapper(self, out=sys.stdout, ids_path=None,
                             subtree=True):
        """Returns a hadoop mapper header to make predictions in python

        """
        input_fields = [(value, key) for (key, value) in
                        sorted(self.inverted_fields.items(),
                               key=lambda x: x[1])]
        parameters = [value for (key, value) in
                      input_fields if key != self.tree.objective_id]
        args = []
        for field in input_fields:
            slug = slugify(self.fields[field[0]]['name'])
            self.fields[field[0]].update(slug=slug)
            if field[0] != self.tree.objective_id:
                args.append("\"" + self.fields[field[0]]['slug'] + "\"")
        output = \
u"""#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import csv
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')


class CSVInput(object):
    \"\"\"Reads and parses csv input from stdin

       Expects a data section (without headers) with the following fields:
       %s

       Data is processed to fall into the corresponding input type by applying
       INPUT_TYPES, and per field PREFIXES and SUFFIXES are removed. You can
       also provide strings to be considered as no content markers in
       MISSING_TOKENS.
    \"\"\"
    def __init__(self, input=sys.stdin):
        \"\"\" Opens stdin and defines parsing constants

        \"\"\"
        try:
            self.reader = csv.reader(input, delimiter=',', quotechar='\"')
""" % ",".join(parameters)

        output += (
            u"\n%sself.INPUT_FIELDS = [%s]\n" %
            ((INDENT * 3), (",\n " + INDENT * 8).join(args)))

        input_types = []
        prefixes = []
        suffixes = []
        count = 0
        fields = self.fields
        # Skip the objective field so these lists stay aligned with
        # self.INPUT_FIELDS
        for key in [key[0] for key in input_fields
                    if key[0] != self.tree.objective_id]:
            input_type = ('None' if not fields[key]['datatype'] in
                          PYTHON_CONV
                          else PYTHON_CONV[fields[key]['datatype']])
            input_types.append(input_type)
            if 'prefix' in fields[key]:
                prefixes.append("%s: %s" % (count,
                                            repr(fields[key]['prefix'])))
            if 'suffix' in fields[key]:
                suffixes.append("%s: %s" % (count,
                                            repr(fields[key]['suffix'])))
            count += 1
        static_content = "%sself.INPUT_TYPES = [" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content,
                                 formatter.join(input_types),
                                 "]\n")
        static_content = "%sself.PREFIXES = {" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content,
                                 formatter.join(prefixes),
                                 "}\n")
        static_content = "%sself.SUFFIXES = {" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content,
                                 formatter.join(suffixes),
                                 "}\n")
        output += \
u"""            self.MISSING_TOKENS = ['?']
        except Exception, exc:
            sys.stderr.write(\"Cannot read csv\"
                             \" input. %s\\n\" % str(exc))

    def __iter__(self):
        \"\"\" Iterator method

        \"\"\"
        return self

    def next(self):
        \"\"\" Returns processed data in a list structure

        \"\"\"
        def normalize(value):
            \"\"\"Transforms to unicode and cleans missing tokens
            \"\"\"
            value = unicode(value.decode('utf-8'))
            return \"\" if value in self.MISSING_TOKENS else value

        def cast(function_value):
            \"\"\"Type related transformations
            \"\"\"
            function, value = function_value
            if not len(value):
                return None
            if function is None:
                return value
            else:
                return function(value)

        try:
            values = self.reader.next()
        except StopIteration:
            raise StopIteration()
        if len(values) < len(self.INPUT_FIELDS):
            sys.stderr.write(\"Found %s fields when %s were expected.\\n\" %
                             (len(values), len(self.INPUT_FIELDS)))
            raise StopIteration()
        else:
            values = values[0:len(self.INPUT_FIELDS)]
        try:
            values = map(normalize, values)
            for key in self.PREFIXES:
                prefix_len = len(self.PREFIXES[key])
                if values[key][0:prefix_len] == self.PREFIXES[key]:
                    values[key] = values[key][prefix_len:]
            for key in self.SUFFIXES:
                suffix_len = len(self.SUFFIXES[key])
                if values[key][-suffix_len:] == self.SUFFIXES[key]:
                    values[key] = values[key][0:-suffix_len]
            function_tuples = zip(self.INPUT_TYPES, values)
            values = map(cast, function_tuples)
            data = {}
            for i in range(len(values)):
                data.update({self.INPUT_FIELDS[i]: values[i]})
            return data
        except Exception, exc:
            sys.stderr.write(\"Error in data transformations. %s\\n\" %
                             str(exc))
            return False
\n\n
"""
        out.write(utf8(output))
        out.flush()

        self.tree.python(out, self.docstring(),
                         input_map=True,
                         ids_path=ids_path,
                         subtree=subtree)
        output = \
u"""
csv = CSVInput()
for values in csv:
    if not isinstance(values, bool):
        print u'%%s\\t%%s' %% (repr(values), repr(predict_%s(values)))
\n\n
""" % fields[self.tree.objective_id]['slug']
        out.write(utf8(output))
        out.flush()
Example #36
def summarize(model, out=sys.stdout, format=BRIEF):
    """Prints summary grouping distribution as class header and details

    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    tree = model.tree

    def extract_common_path(groups):
        """Extracts the common segment of the prediction path for a group

        """
        for group in groups:
            details = groups[group]['details']
            common_path = []
            if len(details) > 0:
                mcd_len = min([len(x[0]) for x in details])
                for i in range(0, mcd_len):
                    test_common_path = details[0][0][i]
                    for subgroup in details:
                        if subgroup[0][i] != test_common_path:
                            i = mcd_len
                            break
                    if i < mcd_len:
                        common_path.append(test_common_path)
            groups[group]['total'][0] = common_path
            if len(details) > 0:
                groups[group]['details'] = sorted(details,
                                                  key=lambda x: x[1],
                                                  reverse=True)

    def confidence_error(value, impurity=None):
        """Returns confidence for categoric objective fields
           and error for numeric objective fields
        """
        if value is None:
            return ""
        impurity_literal = ""
        if impurity is not None and impurity > 0:
            impurity_literal = "; impurity: %.2f%%" % (round(impurity, 4))
        objective_type = model.fields[model.objective_id]['optype']
        if objective_type == 'numeric':
            return " [Error: %s]" % value
        return " [Confidence: %.2f%%%s]" % (round(value, 4) * 100,
                                            impurity_literal)

    distribution = get_data_distribution(model)

    out.write(utf8("Data distribution:\n"))
    print_distribution(distribution, out=out)
    out.write(utf8("\n\n"))

    groups = group_prediction(model)
    predictions = get_prediction_distribution(model, groups)

    out.write(utf8("Predicted distribution:\n"))
    print_distribution(predictions, out=out)
    out.write(utf8("\n\n"))

    if model.field_importance:
        out.write(utf8("Field importance:\n"))
        print_importance(model, out=out)

    extract_common_path(groups)

    out.write(utf8("\n\nRules summary:"))

    node = get_node(tree)
    count = node[model.offsets["count"]]
    for group in [x[0] for x in predictions]:
        details = groups[group]['details']
        path = Path(groups[group]['total'][0])
        data_per_group = groups[group]['total'][1] * 1.0 / count
        pred_per_group = groups[group]['total'][2] * 1.0 / count
        out.write(
            utf8("\n\n%s : (data %.2f%% / prediction %.2f%%) %s" %
                 (group, round(data_per_group, 4) * 100,
                  round(pred_per_group, 4) * 100,
                  path.to_rules(model.fields, format=format))))

        if len(details) == 0:
            out.write(
                utf8("\n    The model will never predict this"
                     " class\n"))
        elif len(details) == 1:
            subgroup = details[0]
            out.write(
                utf8("%s\n" %
                     confidence_error(subgroup[2], impurity=subgroup[3])))
        else:
            out.write(utf8("\n"))
            for subgroup in details:
                pred_per_sgroup = subgroup[1] * 1.0 / \
                    groups[group]['total'][2]
                path = Path(subgroup[0])
                path_chain = path.to_rules(model.fields, format=format) if \
                    path.predicates else "(root node)"
                out.write(
                    utf8(
                        "    · %.2f%%: %s%s\n" %
                        (round(pred_per_sgroup, 4) * 100, path_chain,
                         confidence_error(subgroup[2], impurity=subgroup[3]))))

    out.flush()
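Usage sketch for this module-level summarize; BRIEF is the format constant it defaults to in the source, while the import wiring and model id are assumptions:

import sys
from bigml.model import Model  # assumed local model wrapper

local_model = Model("model/<id>")
summarize(local_model, out=sys.stdout)  # format=BRIEF by default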
Example #37
    def hadoop_python_mapper(self,
                             out=sys.stdout,
                             ids_path=None,
                             subtree=True):
        """Returns a hadoop mapper header to make predictions in python

        """
        input_fields = [(value, key) for (
            key,
            value) in sorted(self.inverted_fields.items(), key=lambda x: x[1])]
        parameters = [
            value for (key, value) in input_fields
            if key != self.tree.objective_id
        ]
        args = []
        for field in input_fields:
            slug = slugify(self.fields[field[0]]['name'])
            self.fields[field[0]].update(slug=slug)
            if field[0] != self.tree.objective_id:
                args.append("\"" + self.fields[field[0]]['slug'] + "\"")
        output = \
u"""#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import csv
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')


class CSVInput(object):
    \"\"\"Reads and parses csv input from stdin

       Expects a data section (without headers) with the following fields:
       %s

       Data is processed to fall into the corresponding input type by applying
       INPUT_TYPES, and per field PREFIXES and SUFFIXES are removed. You can
       also provide strings to be considered as no content markers in
       MISSING_TOKENS.
    \"\"\"
    def __init__(self, input=sys.stdin):
        \"\"\" Opens stdin and defines parsing constants

        \"\"\"
        try:
            self.reader = csv.reader(input, delimiter=',', quotechar='\"')
""" % ",".join(parameters)

        output += (u"\n%sself.INPUT_FIELDS = [%s]\n" %
                   ((INDENT * 3), (",\n " + INDENT * 8).join(args)))

        input_types = []
        prefixes = []
        suffixes = []
        count = 0
        fields = self.fields
        # Skip the objective field so these lists stay aligned with
        # self.INPUT_FIELDS
        for key in [
                key[0] for key in input_fields
                if key[0] != self.tree.objective_id
        ]:
            input_type = ('None' if not fields[key]['datatype'] in PYTHON_CONV
                          else PYTHON_CONV[fields[key]['datatype']])
            input_types.append(input_type)
            if 'prefix' in fields[key]:
                prefixes.append("%s: %s" %
                                (count, repr(fields[key]['prefix'])))
            if 'suffix' in fields[key]:
                suffixes.append("%s: %s" %
                                (count, repr(fields[key]['suffix'])))
            count += 1
        static_content = "%sself.INPUT_TYPES = [" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content, formatter.join(input_types),
                                 "]\n")
        static_content = "%sself.PREFIXES = {" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content, formatter.join(prefixes),
                                 "}\n")
        static_content = "%sself.SUFFIXES = {" % (INDENT * 3)
        formatter = ",\n%s" % (" " * len(static_content))
        output += u"\n%s%s%s" % (static_content, formatter.join(suffixes),
                                 "}\n")
        output += \
u"""            self.MISSING_TOKENS = ['?']
        except Exception, exc:
            sys.stderr.write(\"Cannot read csv\"
                             \" input. %s\\n\" % str(exc))

    def __iter__(self):
        \"\"\" Iterator method

        \"\"\"
        return self

    def next(self):
        \"\"\" Returns processed data in a list structure

        \"\"\"
        def normalize(value):
            \"\"\"Transforms to unicode and cleans missing tokens
            \"\"\"
            value = unicode(value.decode('utf-8'))
            return \"\" if value in self.MISSING_TOKENS else value

        def cast(function_value):
            \"\"\"Type related transformations
            \"\"\"
            function, value = function_value
            if not len(value):
                return None
            if function is None:
                return value
            else:
                return function(value)

        try:
            values = self.reader.next()
        except StopIteration:
            raise StopIteration()
        if len(values) < len(self.INPUT_FIELDS):
            sys.stderr.write(\"Found %s fields when %s were expected.\\n\" %
                             (len(values), len(self.INPUT_FIELDS)))
            raise StopIteration()
        else:
            values = values[0:len(self.INPUT_FIELDS)]
        try:
            values = map(normalize, values)
            for key in self.PREFIXES:
                prefix_len = len(self.PREFIXES[key])
                if values[key][0:prefix_len] == self.PREFIXES[key]:
                    values[key] = values[key][prefix_len:]
            for key in self.SUFFIXES:
                suffix_len = len(self.SUFFIXES[key])
                if values[key][-suffix_len:] == self.SUFFIXES[key]:
                    values[key] = values[key][0:-suffix_len]
            function_tuples = zip(self.INPUT_TYPES, values)
            values = map(cast, function_tuples)
            data = dict(zip(self.INPUT_FIELDS, values))
            return data
        except Exception, exc:
            sys.stderr.write(\"Error in data transformations. %s\\n\" %
                             str(exc))
            return False
\n\n
"""
        out.write(utf8(output))
        out.flush()

        self.tree.python(out,
                         self.docstring(),
                         input_map=True,
                         ids_path=ids_path,
                         subtree=subtree)
        output = \
u"""
csv = CSVInput()
for values in csv:
    if not isinstance(values, bool):
        print u'%%s\\t%%s' %% (repr(values), repr(predict_%s(values)))
\n\n
""" % fields[self.tree.objective_id]['slug']
        out.write(utf8(output))
        out.flush()
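The template above is emitted verbatim into the mapper script. As a minimal, self-contained sketch (in modern Python, with hypothetical field names, affixes and values), this is the per-row preprocessing the generated CSVInput class performs: missing-token cleanup, per-field prefix/suffix stripping, and type casting.

INPUT_FIELDS = ["sepal_length", "species"]
INPUT_TYPES = [float, None]        # None means "leave as string"
PREFIXES = {0: "~"}                # hypothetical: strip "~" from field 0
SUFFIXES = {1: "!"}                # hypothetical: strip "!" from field 1
MISSING_TOKENS = ["?"]

def process_row(values):
    """Normalizes missing tokens, strips affixes and casts one csv row."""
    values = ["" if value in MISSING_TOKENS else value for value in values]
    for index, prefix in PREFIXES.items():
        if values[index].startswith(prefix):
            values[index] = values[index][len(prefix):]
    for index, suffix in SUFFIXES.items():
        if values[index].endswith(suffix):
            values[index] = values[index][:-len(suffix)]
    casted = [None if value == "" else
              (value if cast is None else cast(value))
              for cast, value in zip(INPUT_TYPES, values)]
    return dict(zip(INPUT_FIELDS, casted))

print(process_row(["~5.1", "Iris-setosa!"]))
# {'sepal_length': 5.1, 'species': 'Iris-setosa'}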
Example #44
0
    def summarize(self, out=sys.stdout):
        """Prints summary grouping distribution as class header and details

        """
        tree = self.tree

        def extract_common_path(groups):
            """Extracts the common segment of the prediction path for a group

            """
            for group in groups:
                details = groups[group]['details']
                common_path = []
                if len(details) > 0:
                    mcd_len = min([len(x[0]) for x in details])
                    for i in range(0, mcd_len):
                        test_common_path = details[0][0][i]
                        # Reassigning the loop variable cannot stop a
                        # range-based for loop, so break out explicitly
                        # at the first position where the paths diverge.
                        if any(subgroup[0][i] != test_common_path
                               for subgroup in details):
                            break
                        common_path.append(test_common_path)
                groups[group]['total'][0] = common_path
                if len(details) > 0:
                    groups[group]['details'] = sorted(details,
                                                      key=lambda x: x[1],
                                                      reverse=True)

        def confidence_error(value):
            """Returns confidence for categoric objective fields
               and error for numeric objective fields
            """
            if value is None:
                return ""
            objective_type = self.fields[tree.objective_id]['optype']
            if objective_type == 'numeric':
                return u" [Error: %s]" % value
            else:
                return u" [Confidence: %.2f%%]" % (round(value, 4) * 100)

        distribution = self.get_data_distribution()

        out.write(u"Data distribution:\n")
        print_distribution(distribution, out=out)
        out.write(u"\n\n")

        groups = self.group_prediction()
        predictions = self.get_prediction_distribution(groups)

        out.write(u"Predicted distribution:\n")
        print_distribution(predictions, out=out)
        out.write(u"\n\n")

        if self.field_importance:
            out.write(u"Field importance:\n")
            print_importance(self, out=out)

        extract_common_path(groups)

        for group in [x[0] for x in predictions]:
            details = groups[group]['details']
            path = [
                prediction.to_rule(self.fields)
                for prediction in groups[group]['total'][0]
            ]
            data_per_group = groups[group]['total'][1] * 1.0 / tree.count
            pred_per_group = groups[group]['total'][2] * 1.0 / tree.count
            out.write(
                utf8(u"\n\n%s : (data %.2f%% / prediction %.2f%%) %s\n" %
                     (group, round(data_per_group, 4) * 100,
                      round(pred_per_group, 4) * 100, " and ".join(path))))

            if len(details) == 0:
                out.write(u"    The model will never predict this class\n")
            for j in range(0, len(details)):
                subgroup = details[j]
                pred_per_sgroup = subgroup[1] * 1.0 / groups[group]['total'][2]
                path = [
                    prediction.to_rule(self.fields)
                    for prediction in subgroup[0]
                ]
                path_chain = " and ".join(path) if len(path) else "(root node)"
                out.write(
                    utf8(u"    · %.2f%%: %s%s\n" %
                         (round(pred_per_sgroup, 4) * 100, path_chain,
                          confidence_error(subgroup[2]))))
        out.flush()
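A hedged usage sketch for the method above, assuming the BigML Python bindings' Model class and API credentials available in the environment; the model id is hypothetical.

from bigml.api import BigML
from bigml.model import Model

# BigML() reads BIGML_USERNAME / BIGML_API_KEY from the environment;
# the model id below is a placeholder.
local_model = Model("model/0123456789abcdef01234567", api=BigML())
local_model.summarize()  # writes the report to sys.stdout by default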
Example #45
0
def hadoop_python_mapper(model, out=sys.stdout, ids_path=None, subtree=True):
    """Generates a hadoop mapper header to make predictions in python

    """
    input_fields = [(value, key) for (key, value) in sorted(
        list(model.inverted_fields.items()), key=lambda x: x[1])]
    parameters = [
        value for (key, value) in input_fields if key != model.objective_id
    ]
    args = []
    for field in input_fields:
        slug = slugify(model.fields[field[0]]['name'])
        model.fields[field[0]].update(slug=slug)
        if field[0] != model.objective_id:
            args.append("\"" + model.fields[field[0]]['slug'] + "\"")

    with open(HADOOP_CSV_TEMPLATE) as template_handler:
        output = template_handler.read() % ",".join(parameters)

    output += "\n%sself.INPUT_FIELDS = [%s]\n" % \
        ((INDENT * 3), (",\n " + INDENT * 8).join(args))

    input_types = []
    prefixes = []
    suffixes = []
    count = 0
    fields = model.fields
    for key in [
            field[0] for field in input_fields
            if field[0] != model.objective_id
    ]:
        input_type = ('None' if fields[key]['datatype'] not in PYTHON_CONV else
                      PYTHON_CONV[fields[key]['datatype']])
        input_types.append(input_type)
        if 'prefix' in fields[key]:
            prefixes.append("%s: %s" % (count, repr(fields[key]['prefix'])))
        if 'suffix' in fields[key]:
            suffixes.append("%s: %s" % (count, repr(fields[key]['suffix'])))
        count += 1
    static_content = "%sself.INPUT_TYPES = [" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += "\n%s%s%s" % (static_content, formatter.join(input_types), "]\n")
    static_content = "%sself.PREFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += "\n%s%s%s" % (static_content, formatter.join(prefixes), "}\n")
    static_content = "%sself.SUFFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += "\n%s%s%s" % (static_content, formatter.join(suffixes), "}\n")

    with open(HADOOP_NEXT_TEMPLATE) as template_handler:
        output += template_handler.read()

    out.write(output)
    out.flush()

    tree_python(model.tree,
                model.offsets,
                model.fields,
                model.objective_id,
                False if not hasattr(model, "boosting") else model.boosting,
                out,
                docstring(model),
                ids_path=ids_path,
                subtree=subtree)

    output = \
"""
csv = CSVInput()
for values in csv:
    if not isinstance(values, bool):
        print u'%%s\\t%%s' %% (repr(values), repr(predict_%s(values)))
\n\n
""" % fields[model.objective_id]['slug']
    out.write(utf8(output))
    out.flush()
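A hedged driver sketch for hadoop_python_mapper(): generate a standalone streaming-mapper script from a local model. The function is assumed importable from its defining module, and the model id and output path are hypothetical.

from bigml.api import BigML
from bigml.model import Model

local_model = Model("model/0123456789abcdef01234567", api=BigML())
# Write the self-contained mapper script; it embeds the CSVInput class,
# the predict_* function for the objective field, and the stdin loop.
with open("mapper.py", "w") as handler:
    hadoop_python_mapper(local_model, out=handler)
# mapper.py can then be shipped as the -mapper of a Hadoop streaming job.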
Example #46
0
    def summarize(self, out=sys.stdout):
        """Prints summary grouping distribution as class header and details

        """
        tree = self.tree

        def extract_common_path(groups):
            """Extracts the common segment of the prediction path for a group

            """
            for group in groups:
                details = groups[group]['details']
                common_path = []
                if len(details) > 0:
                    mcd_len = min([len(x[0]) for x in details])
                    for i in range(0, mcd_len):
                        test_common_path = details[0][0][i]
                        # Reassigning the loop variable cannot stop a
                        # range-based for loop, so break out explicitly
                        # at the first position where the paths diverge.
                        if any(subgroup[0][i] != test_common_path
                               for subgroup in details):
                            break
                        common_path.append(test_common_path)
                groups[group]['total'][0] = common_path
                if len(details) > 0:
                    groups[group]['details'] = sorted(details,
                                                      key=lambda x: x[1],
                                                      reverse=True)

        def confidence_error(value, impurity=None):
            """Returns confidence for categoric objective fields
               and error for numeric objective fields
            """
            if value is None:
                return ""
            impurity_literal = ""
            if impurity is not None and impurity > 0:
                impurity_literal = "; impurity: %.2f%%" % \
                    (round(impurity, 4) * 100)
            objective_type = self.fields[tree.objective_id]['optype']
            if objective_type == 'numeric':
                return u" [Error: %s]" % value
            else:
                return u" [Confidence: %.2f%%%s]" % ((round(value, 4) * 100),
                                                     impurity_literal)

        distribution = self.get_data_distribution()

        out.write(u"Data distribution:\n")
        print_distribution(distribution, out=out)
        out.write(u"\n\n")

        groups = self.group_prediction()
        predictions = self.get_prediction_distribution(groups)

        out.write(u"Predicted distribution:\n")
        print_distribution(predictions, out=out)
        out.write(u"\n\n")

        if self.field_importance:
            out.write(u"Field importance:\n")
            print_importance(self, out=out)

        extract_common_path(groups)

        for group in [x[0] for x in predictions]:
            details = groups[group]['details']
            path = [prediction.to_rule(self.fields) for
                    prediction in groups[group]['total'][0]]
            data_per_group = groups[group]['total'][1] * 1.0 / tree.count
            pred_per_group = groups[group]['total'][2] * 1.0 / tree.count
            out.write(utf8(u"\n\n%s : (data %.2f%% / prediction %.2f%%) %s\n" %
                           (group,
                            round(data_per_group, 4) * 100,
                            round(pred_per_group, 4) * 100,
                            " and ".join(path))))

            if len(details) == 0:
                out.write(u"    The model will never predict this class\n")
            for j in range(0, len(details)):
                subgroup = details[j]
                pred_per_sgroup = subgroup[1] * 1.0 / groups[group]['total'][2]
                path = [prediction.to_rule(self.fields) for
                        prediction in subgroup[0]]
                path_chain = " and ".join(path) if len(path) else "(root node)"
                out.write(utf8(u"    · %.2f%%: %s%s\n" %
                               (round(pred_per_sgroup, 4) * 100,
                                path_chain,
                                confidence_error(subgroup[2],
                                                 impurity=subgroup[3]))))
        out.flush()
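A toy sketch of the common-path computation inside extract_common_path, with plain strings standing in for the predicate objects and hypothetical rule texts; each details entry mirrors the (path, count, confidence, impurity) layout used above.

details = [
    (["petal length > 2.45", "petal width > 1.75", "sepal width > 3"],
     10, 0.9, 0.0),
    (["petal length > 2.45", "petal width > 1.75", "sepal width <= 3"],
     5, 0.8, 0.0),
]
mcd_len = min(len(subgroup[0]) for subgroup in details)
common_path = []
for i in range(mcd_len):
    step = details[0][0][i]
    if any(subgroup[0][i] != step for subgroup in details):
        break
    common_path.append(step)
print(common_path)  # ['petal length > 2.45', 'petal width > 1.75']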
Example #47
0
    def predict(self, input_data, by_name=True,
                print_path=False, out=sys.stdout, with_confidence=False,
                missing_strategy=LAST_PREDICTION,
                add_confidence=False,
                add_path=False,
                add_distribution=False,
                add_count=False,
                add_median=False,
                add_next=False,
                add_min=False,
                add_max=False,
                multiple=None):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        print_path: Boolean, if True the rules that lead to the prediction
                    are printed
        out: output handler
        with_confidence: Boolean, if True, all the information in the node
                         (prediction, confidence, distribution and count)
                         is returned in a list format
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        add_confidence: Boolean, if True adds confidence to the dict output
        add_path: Boolean, if True adds path to the dict output
        add_distribution: Boolean, if True adds distribution info to the
                          dict output
        add_count: Boolean, if True adds the number of instances in the
                   node to the dict output
        add_median: Boolean, if True adds the median of the values in
                    the distribution
        add_next: Boolean, if True adds the field that determines next
                  split in the tree
        add_min: Boolean, if True adds the minimum value in the prediction's
                 distribution (for regressions only)
        add_max: Boolean, if True adds the maximum value in the prediction's
                 distribution (for regressions only)
        multiple: For categorical fields, it will return the categories
                  in the distribution of the predicted node as a
                  list of dicts:
                    [{'prediction': 'Iris-setosa',
                      'confidence': 0.9154,
                      'probability': 0.97,
                      'count': 97},
                     {'prediction': 'Iris-virginica',
                      'confidence': 0.0103,
                      'probability': 0.03,
                      'count': 3}]
                  The value of this argument can either be an integer
                  (maximum number of categories to be returned), or the
                  literal 'all', that will cause the entire distribution
                  in the node to be returned.

        """
        # Checks if this is a regression model, using PROPORTIONAL
        # missing_strategy
        if (self.tree.regression and missing_strategy == PROPORTIONAL and
                not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction = self.tree.predict(input_data,
                                       missing_strategy=missing_strategy)

        # Prediction path
        if print_path:
            out.write(utf8(u' AND '.join(prediction.path) + u' => %s \n' %
                           prediction.output))
            out.flush()
        output = prediction.output
        if with_confidence:
            output = [prediction.output,
                      prediction.confidence,
                      prediction.distribution,
                      prediction.count,
                      prediction.median]
        if multiple is not None and not self.tree.regression:
            output = []
            total_instances = float(prediction.count)
            distribution = enumerate(prediction.distribution)
            for index, [category, instances] in distribution:
                if ((isinstance(multiple, basestring) and multiple == 'all') or
                        (isinstance(multiple, int) and index < multiple)):
                    prediction_dict = {
                        'prediction': category,
                        'confidence': ws_confidence(category,
                                                    prediction.distribution),
                        'probability': instances / total_instances,
                        'count': instances}
                    output.append(prediction_dict)
        else:
            if (add_confidence or add_path or add_distribution or add_count or
                    add_median or add_next or add_min or add_max):
                output = {'prediction': prediction.output}
                if add_confidence:
                    output.update({'confidence': prediction.confidence})
                if add_path:
                    output.update({'path': prediction.path})
                if add_distribution:
                    output.update(
                        {'distribution': prediction.distribution,
                         'distribution_unit': prediction.distribution_unit})
                if add_count:
                    output.update({'count': prediction.count})
                if self.tree.regression and add_median:
                    output.update({'median': prediction.median})
                if add_next:
                    field = (None if len(prediction.children) == 0 else
                             prediction.children[0].predicate.field)
                    if field is not None and field in self.fields:
                        field = self.fields[field]['name']
                    output.update({'next': field})
                if self.tree.regression and add_min:
                    output.update({'min': prediction.min})
                if self.tree.regression and add_max:
                    output.update({'max': prediction.max})

        return output
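A hedged usage sketch for predict() with the multiple argument; the model id and input values are hypothetical, and the field names assume an iris-style model.

from bigml.api import BigML
from bigml.model import Model

local_model = Model("model/0123456789abcdef01234567", api=BigML())
# With `multiple`, categorical predictions come back as a list of dicts
# drawn from the predicted node's distribution.
print(local_model.predict({"petal length": 2.45, "sepal length": 5.1},
                          multiple=2))
# e.g. [{'prediction': 'Iris-setosa', 'confidence': 0.91,
#        'probability': 0.96, 'count': 48}, ...]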
Example #48
0
    def predict(self,
                input_data,
                by_name=True,
                print_path=False,
                out=sys.stdout,
                with_confidence=False,
                missing_strategy=LAST_PREDICTION,
                add_confidence=False,
                add_path=False,
                add_distribution=False,
                add_count=False,
                add_median=False,
                add_next=False,
                multiple=None):
        """Makes a prediction based on a number of field values.

        By default the input fields must be keyed by field name but you can use
        `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, True if input_data is keyed by names
        print_path: Boolean, if True the rules that lead to the prediction
                    are printed
        out: output handler
        with_confidence: Boolean, if True, all the information in the node
                         (prediction, confidence, distribution and count)
                         is returned in a list format
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        add_confidence: Boolean, if True adds confidence to the dict output
        add_path: Boolean, if True adds path to the dict output
        add_distribution: Boolean, if True adds distribution info to the
                          dict output
        add_count: Boolean, if True adds the number of instances in the
                   node to the dict output
        add_median: Boolean, if True adds the median of the values in
                    the distribution
        add_next: Boolean, if True adds the field that determines next
                  split in the tree
        multiple: For categorical fields, it will return the categories
                  in the distribution of the predicted node as a
                  list of dicts:
                    [{'prediction': 'Iris-setosa',
                      'confidence': 0.9154,
                      'probability': 0.97,
                      'count': 97},
                     {'prediction': 'Iris-virginica',
                      'confidence': 0.0103,
                      'probability': 0.03,
                      'count': 3}]
                  The value of this argument can either be an integer
                  (maximum number of categories to be returned), or the
                  literal 'all', that will cause the entire distribution
                  in the node to be returned.

        """
        # Checks if this is a regression model, using PROPORTIONAL
        # missing_strategy
        if (self.tree.regression and missing_strategy == PROPORTIONAL
                and not self.regression_ready):
            raise ImportError("Failed to find the numpy and scipy libraries,"
                              " needed to use proportional missing strategy"
                              " for regressions. Please install them before"
                              " using local predictions for the model.")
        # Checks and cleans input_data leaving the fields used in the model
        input_data = self.filter_input_data(input_data, by_name=by_name)

        # Strips affixes for numeric values and casts to the final field type
        cast(input_data, self.fields)

        prediction = self.tree.predict(input_data,
                                       missing_strategy=missing_strategy)

        # Prediction path
        if print_path:
            out.write(
                utf8(u' AND '.join(prediction.path) +
                     u' => %s \n' % prediction.output))
            out.flush()
        output = prediction.output
        if with_confidence:
            output = [
                prediction.output, prediction.confidence,
                prediction.distribution, prediction.count, prediction.median
            ]
        if multiple is not None and not self.tree.regression:
            output = []
            total_instances = float(prediction.count)
            distribution = enumerate(prediction.distribution)
            for index, [category, instances] in distribution:
                if ((isinstance(multiple, basestring) and multiple == 'all')
                        or (isinstance(multiple, int) and index < multiple)):
                    prediction_dict = {
                        'prediction': category,
                        'confidence': ws_confidence(
                            category, prediction.distribution),
                        'probability': instances / total_instances,
                        'count': instances}
                    output.append(prediction_dict)
        else:
            if (add_confidence or add_path or add_distribution or add_count
                    or add_median or add_next):
                output = {'prediction': prediction.output}
                if add_confidence:
                    output.update({'confidence': prediction.confidence})
                if add_path:
                    output.update({'path': prediction.path})
                if add_distribution:
                    output.update(
                        {'distribution': prediction.distribution,
                         'distribution_unit': prediction.distribution_unit})
                if add_count:
                    output.update({'count': prediction.count})
                if self.tree.regression and add_median:
                    output.update({'median': prediction.median})
                if add_next:
                    field = (None if len(prediction.children) == 0 else
                             prediction.children[0].predicate.field)
                    if field is not None and field in self.fields:
                        field = self.fields[field]['name']
                    output.update({'next': field})

        return output
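A hedged sketch of the dict-style output: setting any add_* flag turns the bare prediction into a dict with the requested extras. The model id and inputs are hypothetical.

from bigml.api import BigML
from bigml.model import Model

local_model = Model("model/0123456789abcdef01234567", api=BigML())
prediction = local_model.predict({"petal length": 4.5, "petal width": 1.5},
                                 add_confidence=True,
                                 add_path=True,
                                 add_next=True)
# e.g. {'prediction': 'Iris-versicolor', 'confidence': 0.92,
#       'path': ['petal length > 2.45', ...], 'next': 'petal width'}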
Example #49
0
def tree_python(tree,
                offsets,
                fields,
                objective_id,
                boosting,
                out,
                docstring_str,
                input_map=False,
                ids_path=None,
                subtree=True):
    """Writes a python function that implements the model.

    """
    args = []
    args_tree = []
    parameters = sort_fields(fields)
    if not input_map:
        input_map = len(parameters) > MAX_ARGS_LENGTH
    reserved_keywords = keyword.kwlist if not input_map else None
    prefix = "_" if not input_map else ""
    for field in parameters:
        field_name_to_show = fields[field[0]]['name'].strip()
        if field_name_to_show == "":
            field_name_to_show = field[0]
        slug = slugify(field_name_to_show,
                       reserved_keywords=reserved_keywords,
                       prefix=prefix)
        fields[field[0]].update(slug=slug)
        if not input_map:
            if field[0] != objective_id:
                args.append("%s=None" % (slug))
                args_tree.append("%s=%s" % (slug, slug))
    if input_map:
        args.append("data={}")
        args_tree.append("data=data")

    function_name = fields[objective_id]['slug'] if \
        not boosting else fields[boosting["objective_field"]]['slug']
    if prefix == "_" and function_name[0] == prefix:
        function_name = function_name[1:]
    if function_name == "":
        function_name = "field_" + objective_id
    python_header = "# -*- coding: utf-8 -*-\n"
    predictor_definition = ("def predict_%s" % function_name)
    depth = len(predictor_definition) + 1
    predictor = "%s(%s):\n" % (predictor_definition,
                               (",\n" + " " * depth).join(args))

    predictor_doc = (INDENT + "\"\"\" " + docstring_str + "\n" + INDENT +
                     "\"\"\"\n")
    body_fn = boosted_plug_in_body if boosting else plug_in_body
    body, term_analysis_predicates, item_analysis_predicates = \
        body_fn(tree, offsets, fields, objective_id,
                fields[objective_id]["optype"] == NUMERIC,
                input_map=input_map,
                ids_path=ids_path, subtree=subtree)
    terms_body = ""
    if term_analysis_predicates or item_analysis_predicates:
        terms_body = term_analysis_body(fields, term_analysis_predicates,
                                        item_analysis_predicates)
    predictor = python_header + predictor + \
        predictor_doc + terms_body + body

    predictor_model = "def predict"
    depth = len(predictor_model) + 1
    predictor += "\n\n%s(%s):\n" % (predictor_model,
                                    (",\n" + " " * depth).join(args))
    predictor += "%sprediction = predict_%s(%s)\n" % (
        INDENT, function_name, ", ".join(args_tree))

    if boosting is not None:
        predictor += "%sprediction.update({\"weight\": %s})\n" % \
            (INDENT, boosting.get("weight"))
        if boosting.get("objective_class") is not None:
            predictor += "%sprediction.update({\"class\": \"%s\"})\n" % \
                (INDENT, boosting.get("objective_class"))
    predictor += "%sreturn prediction" % INDENT

    out.write(utf8(predictor))
    out.flush()
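A hedged sketch of consuming tree_python(): write the generated module to disk, then import and call the predictor. The function name derives from the objective field's slug, so predict_species, the file name, and the local `model` (with its docstring() helper) are hypothetical here.

# `model` is assumed to be a local bigml Model wrapping a tree.
with open("species_predictor.py", "w") as handler:
    tree_python(model.tree, model.offsets, model.fields, model.objective_id,
                False if not hasattr(model, "boosting") else model.boosting,
                handler, docstring(model))

# The generated module exposes predict_<objective_slug>(...) plus a
# generic predict(...) wrapper.
from species_predictor import predict_species
print(predict_species(petal_length=2.4, petal_width=0.5))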