def summarize(self, out=sys.stdout):
    """Prints a summary of the cluster info

    Reports, in order: number of centroids, data distribution,
    per-centroid feature values, per-centroid distance statistics and
    the intercentroid distance measures.

    :param out: writable stream the report is written to
        (defaults to sys.stdout)
    """
    out.write(u"Cluster of %s centroids\n\n" % len(self.centroids))
    out.write(u"Data distribution:\n")
    print_distribution(self.get_data_distribution(), out=out)
    out.write(u"\n\n")
    # Report centroids in a stable, name-sorted order
    centroids_list = sorted(self.centroids, key=lambda x: x.name)
    out.write(u"Centroids features:\n")
    for centroid in centroids_list:
        out.write(utf8(u"\n%s: " % centroid.name))
        connector = ""
        for field_id, value in centroid.center.items():
            # Categorical (string) values are quoted in the output
            if isinstance(value, basestring):
                value = u"\"%s\"" % value
            out.write(
                utf8(u"%s%s: %s" % (connector,
                                    self.fields[field_id]['name'],
                                    value)))
            connector = ", "
    out.write(u"\n\n")
    out.write(u"Data distance statistics:\n\n")
    for centroid in centroids_list:
        centroid.print_statistics(out=out)
    out.write(u"Intercentroids distance:\n\n")
    for centroid in centroids_list:
        out.write(utf8(u"To centroid: %s\n" % centroid.name))
        for measure, result in self.centroids_distance(centroid):
            out.write(u"%s%s: %s\n" % (INDENT, measure, result))
        out.write(u"\n")
def summarize(self, out=sys.stdout):
    """Prints a summary of the cluster info

    Like the basic cluster summary, but the intercentroid distance
    section is only printed when there is more than one centroid.

    :param out: writable stream the report is written to
        (defaults to sys.stdout)
    """
    out.write(u"Cluster of %s centroids\n\n" % len(self.centroids))
    out.write(u"Data distribution:\n")
    print_distribution(self.get_data_distribution(), out=out)
    out.write(u"\n\n")
    # Report centroids in a stable, name-sorted order
    centroids_list = sorted(self.centroids, key=lambda x: x.name)
    out.write(u"Centroids features:\n")
    for centroid in centroids_list:
        out.write(utf8(u"\n%s: " % centroid.name))
        connector = ""
        for field_id, value in centroid.center.items():
            # Categorical (string) values are quoted in the output
            if isinstance(value, basestring):
                value = u"\"%s\"" % value
            out.write(utf8(u"%s%s: %s" % (connector,
                                          self.fields[field_id]['name'],
                                          value)))
            connector = ", "
    out.write(u"\n\n")
    out.write(u"Data distance statistics:\n\n")
    for centroid in centroids_list:
        centroid.print_statistics(out=out)
    # Intercentroid distances are meaningless with a single centroid
    if len(self.centroids) > 1:
        out.write(u"Intercentroids distance:\n\n")
        for centroid in centroids_list:
            out.write(utf8(u"To centroid: %s\n" % centroid.name))
            for measure, result in self.centroids_distance(centroid):
                out.write(u"%s%s: %s\n" % (INDENT, measure, result))
            out.write(u"\n")
def summarize(self, out=sys.stdout):
    """Prints a summary of the cluster info

    Extended report: header states whether the cluster is G-means or
    K-means, and the report includes the global distribution, cluster
    metrics (sum-of-squares), centroid features, distance distributions
    and intercentroid distances.

    :param out: writable stream the report is written to
        (defaults to sys.stdout)
    """
    report_header = ''
    if self.is_g_means:
        report_header = \
            u'G-means Cluster (critical_value=%d)' % self.critical_value
    else:
        report_header = u'K-means Cluster (k=%d)' % self.k
    out.write(report_header + ' with %d centroids\n\n' %
              len(self.centroids))
    out.write(u"Data distribution:\n")
    # "Global" is set as first entry
    self.print_global_distribution(out=out)
    print_distribution(self.get_data_distribution(), out=out)
    out.write(u"\n")
    # The global pseudo-centroid (if any) goes first, followed by the
    # real centroids in name order
    centroids_list = [self.cluster_global] if self.cluster_global else []
    centroids_list.extend(sorted(self.centroids, key=lambda x: x.name))
    out.write(u"Cluster metrics:\n")
    self.print_ss_metrics(out=out)
    out.write(u"\n")
    out.write(u"Centroids:\n")
    for centroid in centroids_list:
        out.write(utf8(u"\n%s%s: " % (INDENT, centroid.name)))
        connector = ""
        for field_id, value in centroid.center.items():
            # Categorical (string) values are quoted in the output
            if isinstance(value, basestring):
                value = u"\"%s\"" % value
            out.write(utf8(u"%s%s: %s" % (connector,
                                          self.fields[field_id]['name'],
                                          value)))
            connector = ", "
    out.write(u"\n\n")
    out.write(u"Distance distribution:\n\n")
    for centroid in centroids_list:
        centroid.print_statistics(out=out)
    out.write(u"\n")
    if len(self.centroids) > 1:
        out.write(u"Intercentroid distance:\n\n")
        # Skip the global pseudo-centroid for intercentroid distances
        centroids_list = (centroids_list[1:] if self.cluster_global else
                          centroids_list)
        for centroid in centroids_list:
            out.write(utf8(u"%sTo centroid: %s\n" % (INDENT,
                                                     centroid.name)))
            for measure, result in self.centroids_distance(centroid):
                out.write(u"%s%s: %s\n" % (INDENT * 2, measure, result))
            out.write(u"\n")
def summarize(self, out=sys.stdout):
    """Prints a summary of the cluster info

    Python 3 variant of the extended report: header states whether the
    cluster is G-means or K-means, followed by the global distribution,
    cluster metrics, centroid features, distance distributions and
    intercentroid distances.

    :param out: writable stream the report is written to
        (defaults to sys.stdout)
    """
    report_header = ''
    if self.is_g_means:
        report_header = \
            'G-means Cluster (critical_value=%d)' % self.critical_value
    else:
        report_header = 'K-means Cluster (k=%d)' % self.k
    out.write(report_header + ' with %d centroids\n\n' %
              len(self.centroids))
    out.write("Data distribution:\n")
    # "Global" is set as first entry
    self.print_global_distribution(out=out)
    print_distribution(self.get_data_distribution(), out=out)
    out.write("\n")
    # The global pseudo-centroid (if any) goes first, followed by the
    # real centroids in name order
    centroids_list = [self.cluster_global] if self.cluster_global else []
    centroids_list.extend(sorted(self.centroids, key=lambda x: x.name))
    out.write("Cluster metrics:\n")
    self.print_ss_metrics(out=out)
    out.write("\n")
    out.write("Centroids:\n")
    for centroid in centroids_list:
        out.write(utf8("\n%s%s: " % (INDENT, centroid.name)))
        connector = ""
        for field_id, value in list(centroid.center.items()):
            # Categorical (string) values are quoted in the output
            if isinstance(value, str):
                value = "\"%s\"" % value
            out.write(utf8("%s%s: %s" % (connector,
                                         self.fields[field_id]['name'],
                                         value)))
            connector = ", "
    out.write("\n\n")
    out.write("Distance distribution:\n\n")
    for centroid in centroids_list:
        centroid.print_statistics(out=out)
    out.write("\n")
    if len(self.centroids) > 1:
        out.write("Intercentroid distance:\n\n")
        # Skip the global pseudo-centroid for intercentroid distances
        centroids_list = (centroids_list[1:] if self.cluster_global else
                          centroids_list)
        for centroid in centroids_list:
            out.write(utf8("%sTo centroid: %s\n" % (INDENT,
                                                    centroid.name)))
            for measure, result in self.centroids_distance(centroid):
                out.write("%s%s: %s\n" % (INDENT * 2, measure, result))
            out.write("\n")
def list_fields(self, out):
    """Lists a description of the model's fields.

    The objective field is written first between angle brackets; every
    other field follows between square brackets, in sort_fields order.
    Returns the model's fields structure.
    """
    objective = self.fields[self.objective_field]
    out.write(utf8(u'<%-32s : %s>\n' % (objective['name'],
                                        objective['optype'])))
    out.flush()
    for key, val in sort_fields(self.fields):
        if key == self.objective_field:
            continue
        out.write(utf8(u'[%-32s : %s]\n' % (val['name'],
                                            val['optype'])))
        out.flush()
    return self.fields
def list_fields(self, out):
    """Lists a description of the model's fields.

    The objective field (looked up through `self.objective_id`) is
    written first between angle brackets; the remaining fields follow
    between square brackets, in sort_fields order.

    :param out: writable stream the listing is written to
    :return: the model's fields structure
    """
    out.write(utf8(u'<%-32s : %s>\n' % (
        self.fields[self.objective_id]['name'],
        self.fields[self.objective_id]['optype'])))
    out.flush()
    for field in [(val['name'], val['optype']) for key, val in
                  sort_fields(self.fields)
                  if key != self.objective_id]:
        out.write(utf8(u'[%-32s : %s]\n' % (field[0], field[1])))
        out.flush()
    return self.fields
def list_fields(model, out=sys.stdout):
    """Prints descriptions of the fields for this model.

    The objective field is printed first between angle brackets; the
    rest of the fields follow between square brackets, in sort_fields
    order. Returns the model's fields structure.
    """
    objective = model.fields[model.objective_id]
    out.write(utf8('<%-32s : %s>\n' % (objective['name'],
                                       objective['optype'])))
    out.flush()
    for key, val in sort_fields(model.fields):
        if key == model.objective_id:
            continue
        out.write(utf8('[%-32s : %s]\n' % (val['name'],
                                           val['optype'])))
        out.flush()
    return model.fields
def predict(self, input_data, by_name=True, print_path=False, out=sys.stdout, with_confidence=False, missing_strategy=LAST_PREDICTION): """Makes a prediction based on a number of field values. By default the input fields must be keyed by field name but you can use `by_name` to input them directly keyed by id. """ # Checks and cleans input_data leaving the fields used in the model input_data = self.filter_input_data(input_data, by_name=by_name) # Strips affixes for numeric values and casts to the final field type cast(input_data, self.fields) prediction_info = self.tree.predict(input_data, missing_strategy=missing_strategy) prediction, path, confidence, distribution, instances = prediction_info # Prediction path if print_path: out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction)) out.flush() if with_confidence: return [prediction, confidence, distribution, instances] return prediction
def python(self, out, docstring, input_map=False):
    """Writes a python function that implements the model.

    :param out: writable stream receiving the generated code
    :param docstring: docstring placed in the generated predictor
    :param input_map: when True, the generated predictor takes a single
        `data` dict instead of one keyword argument per field
    """
    args = []
    parameters = sort_fields(self.fields)
    if not input_map:
        # Fall back to the dict-style signature when there are too many
        # fields to list as keyword arguments
        input_map = len(parameters) > MAX_ARGS_LENGTH
    # When building keyword arguments, avoid python reserved words
    reserved_keywords = keyword.kwlist if not input_map else None
    prefix = "_" if not input_map else ""
    for field in [(key, val) for key, val in parameters]:
        slug = slugify(self.fields[field[0]]['name'],
                       reserved_keywords=reserved_keywords, prefix=prefix)
        self.fields[field[0]].update(slug=slug)
        if not input_map:
            if field[0] != self.objective_field:
                args.append("%s=None" % (slug))
    if input_map:
        args.append("data={}")
    predictor_definition = (u"def predict_%s" %
                            self.fields[self.objective_field]['slug'])
    # Align continuation arguments under the opening parenthesis
    depth = len(predictor_definition) + 1
    predictor = u"%s(%s):\n" % (predictor_definition,
                                (",\n" + " " * depth).join(args))
    predictor_doc = (INDENT + u"\"\"\" " + docstring + u"\n" +
                     INDENT + u"\"\"\"\n")
    body, term_analysis_predicates = self.python_body(input_map=input_map)
    terms_body = ""
    if term_analysis_predicates:
        terms_body = self.term_analysis_body(term_analysis_predicates)
    predictor += predictor_doc + terms_body + body
    out.write(utf8(predictor))
    out.flush()
def predict(self, input_data, by_name=True, print_path=False, out=sys.stdout, with_confidence=False, missing_strategy=LAST_PREDICTION): """Makes a prediction based on a number of field values. By default the input fields must be keyed by field name but you can use `by_name` to input them directly keyed by id. """ # Checks if this is a regression model, using PROPORTIONAL # missing_strategy if (self.tree.regression and missing_strategy == PROPORTIONAL and not self.regression_ready): raise ImportError("Failed to find the numpy and scipy libraries," " needed to use proportional missing strategy" " for regressions. Please install them before" " using local predictions for the model.") # Checks and cleans input_data leaving the fields used in the model input_data = self.filter_input_data(input_data, by_name=by_name) # Strips affixes for numeric values and casts to the final field type cast(input_data, self.fields) prediction_info = self.tree.predict(input_data, missing_strategy=missing_strategy) prediction, path, confidence, distribution, instances = prediction_info # Prediction path if print_path: out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction)) out.flush() if with_confidence: return [prediction, confidence, distribution, instances] return prediction
def hadoop_python_reducer(self, out=sys.stdout):
    """Returns a hadoop reducer to make predictions in python

    Writes to `out` a self-contained python 2 reducer script that reads
    tab-separated (values, prediction) lines from stdin and prints each
    distinct prediction with its occurrence count.
    """
    output = \
u"""#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys

count = 0
previous = None

def print_result(values, prediction, count):
    \"\"\"Prints input data and predicted value as an ordered list.

    \"\"\"
    result = \"[%s, %s]\" % (values, prediction)
    print u\"%s\\t%s\" % (result, count)

for line in sys.stdin:
    values, prediction = line.strip().split('\\t')
    if previous is None:
        previous = (values, prediction)
    if values != previous[0]:
        print_result(previous[0], previous[1], count)
        previous = (values, prediction)
        count = 0
    count += 1
if count > 0:
    print_result(previous[0], previous[1], count)
"""
    out.write(utf8(output))
    out.flush()
def python(self, out, docstring, input_map=False):
    """Writes a python function that implements the model.

    :param out: writable stream receiving the generated code
    :param docstring: docstring placed in the generated predictor
    :param input_map: when True, the generated predictor takes a single
        `data` dict instead of one keyword argument per field
    """
    args = []
    parameters = sort_fields(self.fields)
    if not input_map:
        # Fall back to the dict-style signature when there are too many
        # fields to list as keyword arguments
        input_map = len(parameters) > MAX_ARGS_LENGTH
    for field in [(key, val) for key, val in parameters]:
        slug = slugify(self.fields[field[0]]['name'])
        self.fields[field[0]].update(slug=slug)
        if not input_map:
            if field[0] != self.objective_field:
                args.append("%s=None" % (slug))
    if input_map:
        args.append("data={}")
    predictor_definition = (u"def predict_%s" %
                            self.fields[self.objective_field]['slug'])
    # Align continuation arguments under the opening parenthesis
    depth = len(predictor_definition) + 1
    predictor = u"%s(%s):\n" % (predictor_definition,
                                (",\n" + " " * depth).join(args))
    predictor_doc = (INDENT + u"\"\"\" " + docstring + u"\n" +
                     INDENT + u"\"\"\"\n")
    predictor += predictor_doc + self.python_body(input_map=input_map)
    out.write(utf8(predictor))
    out.flush()
def i_check_the_data_distribution(step, file):
    """Test step: renders the local model's data distribution as
    "[value,instances]" lines and compares it against the expected file.
    """
    distribution = g.get_data_distribution(world.local_model)
    distribution_str = ''
    for bin_value, bin_instances in distribution:
        distribution_str += "[%s,%s]\n" % (bin_value, bin_instances)
    world.output = utf8(distribution_str)
    i_check_if_the_output_is_like_expected_file(step, file)
def print_importance(instance, out=sys.stdout):
    """Print a field importance structure

    Writes one numbered line per field with its importance expressed as
    a percentage, starting at 1.
    """
    field_importance, fields = instance.field_importance_data()
    for count, (field, importance) in enumerate(field_importance, 1):
        out.write(utf8(u"    %s. %s: %.2f%%\n" % (count,
                                                  fields[field]["name"],
                                                  round(importance, 4) *
                                                  100)))
def hadoop_python_reducer(out=sys.stdout):
    """Generates a hadoop reducer to make predictions in python

    Reads the reducer template file and writes its contents to `out`.
    """
    with open(HADOOP_REDUCER_TEMPLATE) as template_handler:
        template = template_handler.read()
    out.write(utf8(template))
    out.flush()
def print_importance(out=sys.stdout):
    """Prints field importance

    Writes one numbered line per field in self.field_importance with
    its importance expressed as a percentage.

    NOTE(review): this def references `self` but does not declare it as
    a parameter. If it is a class method, calling it on an instance
    binds the instance to `out` and then fails with a NameError on
    `self`; it only works if the def is nested in a scope where `self`
    is already bound — confirm where this is defined.
    """
    count = 1
    for [field, importance] in self.field_importance:
        out.write(utf8(u"    %s. %s: %.2f%%\n" % (count,
                       self.tree.fields[field]['name'],
                       round(importance, 4) * 100)))
        count += 1
def rules(self, out):
    """Prints out an IF-THEN rule version of the tree.

    Ensures every field carries a `slug` entry before generating the
    rules, then writes the generated text to `out`.
    """
    for field_id, _ in sort_fields(self.fields):
        field_info = self.fields[field_id]
        field_info.update(slug=slugify(field_info['name']))
    out.write(utf8(self.generate_rules()))
    out.flush()
def list_fields(self, out):
    """Lists a description of the model's fields.

    Writes one "[name : optype]" line per field, in sort_fields order,
    and returns the model's fields structure.
    """
    for _, val in sort_fields(self.fields):
        out.write(utf8(u'[%-32s : %s]\n' % (val['name'],
                                            val['optype'])))
        out.flush()
    return self.fields
def print_distribution(distribution, out=sys.stdout):
    """Prints distribution data

    Each group is printed as a percentage of the total instance count,
    with singular/plural handling for "instance".

    Fix: the total was computed with `reduce(lambda x, y: x + y, ...)`,
    which is not a builtin in Python 3 and raises TypeError on an empty
    distribution. `sum` is the idiomatic equivalent and makes an empty
    distribution a no-op (nothing is printed).

    :param distribution: list of (group_label, instance_count) pairs
    :param out: writable stream (defaults to sys.stdout)
    """
    total = sum(group[1] for group in distribution)
    for group in distribution:
        out.write(
            utf8(u"    %s: %.2f%% (%d instance%s)\n" % (
                group[0],
                round(group[1] * 1.0 / total, 4) * 100,
                group[1],
                "" if group[1] == 1 else "s")))
def tableau(self, out, ids_path=None, subtree=True):
    """Writes a Tableau function that implements the model.

    Returns True when a body was generated and written, False when the
    generated body is empty.
    """
    body = self.tableau_body(ids_path=ids_path, subtree=subtree)
    if body:
        out.write(utf8(body))
        out.flush()
        return True
    return False
def i_check_the_predictions_distribution(step, file):
    """Test step: renders the local model's prediction distribution as
    "[group,instances]" lines and compares it against the expected file.
    """
    lines = []
    for group, instances in g.get_prediction_distribution(
            world.local_model):
        lines.append("[%s,%s]\n" % (group, instances))
    world.output = utf8("".join(lines))
    i_check_if_the_output_is_like_expected_file(step, file)
def predict(self, input_data, by_name=True, print_path=False, out=sys.stdout, with_confidence=False, missing_strategy=LAST_PREDICTION, add_confidence=False, add_path=False, add_distribution=False, add_count=False): """Makes a prediction based on a number of field values. By default the input fields must be keyed by field name but you can use `by_name` to input them directly keyed by id. """ # Checks if this is a regression model, using PROPORTIONAL # missing_strategy if (self.tree.regression and missing_strategy == PROPORTIONAL and not self.regression_ready): raise ImportError("Failed to find the numpy and scipy libraries," " needed to use proportional missing strategy" " for regressions. Please install them before" " using local predictions for the model.") # Checks and cleans input_data leaving the fields used in the model input_data = self.filter_input_data(input_data, by_name=by_name) # Strips affixes for numeric values and casts to the final field type cast(input_data, self.fields) prediction_info = self.tree.predict(input_data, missing_strategy=missing_strategy) prediction, path, confidence, distribution, instances = prediction_info # Prediction path if print_path: out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction)) out.flush() output = prediction if with_confidence: output = [prediction, confidence, distribution, instances] if add_confidence or add_path or add_distribution or add_count: output = {'prediction': prediction} if add_confidence: output.update({'confidence': confidence}) if add_path: rules = path output.update({'path': rules}) if add_distribution: output.update({'distribution': distribution}) if add_count: output.update({'count': instances}) return output
def python(self, out=sys.stdout):
    """Generates the code in python that creates the forecasts

    Writes to `out`: a usage docstring, a COMPONENTS constant with the
    per-field ETS model attributes, the helper code needed by the model
    names found (trivial, seasonal, per-trend submodels), a MODELS
    dispatch dict and the forecast function.
    """
    # ETS state attributes copied into the generated COMPONENTS constant
    attributes = [u"l", u"b", u"s", u"phi", u"value", u"slope"]
    components = {}
    model_components = {}
    model_names = []
    out.write(
        utf8(USAGE_DOC % (self.resource_id,
                          self.fields[self.objective_id]["name"])))
    output = [u"COMPONENTS = \\"]
    for field_id, models in self.ets_models.items():
        for model in models:
            final_state = model.get("final_state", {})
            attrs = {}
            for attribute in attributes:
                # Prefer the top-level attribute, fall back to the
                # model's final state
                if attribute in model:
                    attrs.update({attribute: model[attribute]})
                elif attribute in final_state:
                    attrs.update( \
                        {attribute: final_state[attribute]})
            model_names.append(model["name"])
            model_components[model["name"]] = attrs
        field_name = self.fields[field_id]["name"]
        if field_name not in components:
            components[field_name] = model_components
    # Render the components dict with pprint, indented one level
    partial_output = StringIO.StringIO()
    pprint.pprint(components, stream=partial_output)
    for line in partial_output.getvalue().split("\n"):
        output.append(u"%s%s" % (INDENT, line))
    out.write(utf8(u"\n".join(output)))
    model_names = list(set(model_names))
    if any(name in model_names for name in ["naive", "mean"]):
        out.write(utf8(TRIVIAL_MODEL))
    if any("," in name and name.split(",")[2] in ["A", "M"] for \
            name in model_names):
        out.write(utf8(SEASONAL_CODE))
    # Trend identifiers: second item of "A,B,C"-style names, plus the
    # plain (trivial) model names
    trends = [name.split(",")[1] for name in model_names if "," in name]
    trends.extend([name for name in model_names if "," not in name])
    trends = set(trends)
    models_function = []
    for trend in trends:
        models_function.append("\"%s\": _%s_forecast" % (trend, trend))
        out.write(utf8(SUBMODELS_CODE[trend]))
    out.write(utf8(u"\n\nMODELS = \\\n"))
    out.write(utf8("%s%s%s" % \
        (u" {", u",\n ".join(models_function), u"}")))
    out.write(utf8(FORECAST_FUNCTION))
def print_distribution(distribution, out=sys.stdout):
    """Prints distribution data

    Each group is printed as a percentage of the total instance count,
    with singular/plural handling for "instance".

    Fix: the total was computed with `reduce(lambda x, y: x + y, ...)`,
    which is not a builtin in Python 3 and raises TypeError on an empty
    distribution. `sum` is the idiomatic equivalent and makes an empty
    distribution a no-op (nothing is printed).

    :param distribution: list of (group_label, instance_count) pairs
    :param out: writable stream (defaults to sys.stdout)
    """
    total = sum(group[1] for group in distribution)
    for group in distribution:
        out.write(utf8(u"    %s: %.2f%% (%d instance%s)\n" % (
            group[0],
            round(group[1] * 1.0 / total, 4) * 100,
            group[1],
            "" if group[1] == 1 else "s")))
def print_importance(instance, out=sys.stdout):
    """Print a field importance structure

    Writes one numbered line per field with its importance expressed as
    a percentage, starting at 1.
    """
    field_importance, fields = instance.field_importance_data()
    for position, (field, importance) in enumerate(field_importance, 1):
        line = u"    %s. %s: %.2f%%\n" % (position,
                                          fields[field]['name'],
                                          round(importance, 4) * 100)
        out.write(utf8(line))
def python(self, out=sys.stdout):
    """Generates the code in python that creates the forecasts

    Writes to `out`: a usage docstring, a COMPONENTS constant with the
    per-field ETS model attributes, the helper code needed by the model
    names found (trivial, seasonal, per-trend submodels), a MODELS
    dispatch dict and the forecast function.
    """
    # ETS state attributes copied into the generated COMPONENTS constant
    attributes = [u"l", u"b", u"s", u"phi", u"value", u"slope"]
    components = {}
    model_components = {}
    model_names = []
    out.write(utf8(USAGE_DOC % (self.resource_id,
                                self.fields[self.objective_id]["name"])))
    output = [u"COMPONENTS = \\"]
    for field_id, models in self.ets_models.items():
        for model in models:
            final_state = model.get("final_state", {})
            attrs = {}
            for attribute in attributes:
                # Prefer the top-level attribute, fall back to the
                # model's final state
                if attribute in model:
                    attrs.update({attribute: model[attribute]})
                elif attribute in final_state:
                    attrs.update( \
                        {attribute: final_state[attribute]})
            model_names.append(model["name"])
            model_components[model["name"]] = attrs
        field_name = self.fields[field_id]["name"]
        if field_name not in components:
            components[field_name] = model_components
    # Render the components dict with pprint, indented one level
    partial_output = StringIO.StringIO()
    pprint.pprint(components, stream=partial_output)
    for line in partial_output.getvalue().split("\n"):
        output.append(u"%s%s" % (INDENT, line))
    out.write(utf8(u"\n".join(output)))
    model_names = set(model_names)
    if (any(name in model_names for name in ["naive", "mean"])):
        out.write(utf8(TRIVIAL_MODEL))
    if (any("," in name and name.split(",")[2] in ["A", "M"] for \
            name in model_names)):
        out.write(utf8(SEASONAL_CODE))
    # Trend identifiers: second item of "A,B,C"-style names, plus the
    # plain (trivial) model names
    trends = [name.split(",")[1] for name in model_names if "," in name]
    trends.extend([name for name in model_names if "," not in name])
    trends = set(trends)
    models_function = []
    for trend in trends:
        models_function.append("\"%s\": _%s_forecast" % (trend, trend))
        out.write(utf8(SUBMODELS_CODE[trend]))
    out.write(utf8(u"\n\nMODELS = \\\n"))
    out.write(utf8("%s%s%s" % \
        (u" {", u",\n ".join(models_function), u"}")))
    out.write(utf8(FORECAST_FUNCTION))
def centroid_features(centroid, field_ids, encode=True):
    """Returns features defining the centroid according to the list
    of common field ids that define the centroids.

    String values are passed through utf8 encoding when `encode` is
    True; all other values are returned as-is.
    """
    def _feature(field_id):
        value = centroid.center[field_id]
        if encode and isinstance(value, str):
            return utf8(value)
        return value

    return [_feature(field_id) for field_id in field_ids]
def tree_csv(model, file_name=None, leaves_only=False):
    """Outputs the node structure to a CSV file or array

    :param model: local model whose tree is exported
    :param file_name: when given, rows are written to this CSV file and
        the file name is returned; otherwise a list of rows is returned
    :param leaves_only: when True, only leaf nodes are exported
    :raises AttributeError: for boosting models, which are unsupported
    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    headers_names = []
    if model.regression:
        # Regression trees export objective, error and per-bin columns
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("error")
        max_bins = get_node(model.tree)[model.offsets["max_bins"]]
        for index in range(0, max_bins):
            headers_names.append("bin%s_value" % index)
            headers_names.append("bin%s_instances" % index)
    else:
        # Classification trees export objective, confidence, impurity
        # and one column per category in the root distribution
        headers_names.append(model.fields[model.objective_id]['name'])
        headers_names.append("confidence")
        headers_names.append("impurity")
        node = get_node(model.tree)
        for category, _ in node[model.offsets["distribution"]]:
            headers_names.append(category)

    nodes_generator = get_nodes_info(model, headers_names,
                                     leaves_only=leaves_only)
    if file_name is not None:
        with UnicodeWriter(file_name) as writer:
            writer.writerow([utf8(header) for header in headers_names])
            for row in nodes_generator:
                writer.writerow([
                    item if not isinstance(item, str) else utf8(item)
                    for item in row
                ])
        return file_name
    rows = []
    rows.append(headers_names)
    for row in nodes_generator:
        rows.append(row)
    return rows
def predict(self, input_data, by_name=True,
            print_path=False, out=sys.stdout, with_confidence=False):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.

    Fix: the type-casting fallback used a bare `except:`, which also
    swallows SystemExit/KeyboardInterrupt; narrowed to
    `except Exception:` while keeping the same error message.

    :param input_data: dict of field values the prediction is based on
    :param by_name: when True, keys of input_data are field names
    :param print_path: when True, the decision path is written to `out`
    :param out: stream used to print the path
    :param with_confidence: when True, returns a list with prediction,
        confidence, distribution and instances
    """
    # Strips None-valued entries before processing
    empty_fields = [(key, value) for (key, value) in input_data.items()
                    if value is None]
    for (key, value) in empty_fields:
        del input_data[key]
    # Map field names to ids, warning about unknown names
    if by_name:
        wrong_keys = [key for key in input_data.keys() if not key in
                      self.all_inverted_fields]
        if wrong_keys:
            LOGGER.error("Wrong field names in input data: %s" %
                         ", ".join(wrong_keys))
        input_data = dict(
            [[self.inverted_fields[key], value]
             for key, value in input_data.items()
             if key in self.inverted_fields])
    # Cast values whose python type does not match the field optype
    for (key, value) in input_data.items():
        if ((self.tree.fields[key]['optype'] == 'numeric' and
                isinstance(value, basestring)) or (
                self.tree.fields[key]['optype'] != 'numeric' and
                not isinstance(value, basestring))):
            try:
                input_data.update({key:
                                   map_type(self.tree.fields[key]
                                            ['optype'])(value)})
            except Exception:
                raise Exception(u"Mismatch input data type in field "
                                u"\"%s\" for value %s." %
                                (self.tree.fields[key]['name'],
                                 value))
    prediction_info = self.tree.predict(input_data)
    prediction, path, confidence, distribution, instances = prediction_info
    # Prediction path
    if print_path:
        out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
        out.flush()
    if with_confidence:
        return [prediction, confidence, distribution, instances]
    return prediction
def predict(self, input_data, by_name=True, print_path=False, out=sys.stdout, with_confidence=False): """Makes a prediction based on a number of field values. By default the input fields must be keyed by field name but you can use `by_name` to input them directly keyed by id. """ # Strips None values empty_fields = [(key, value) for (key, value) in input_data.items() if value is None] for (key, value) in empty_fields: del input_data[key] # Checks input_data keys against field names and filters the ones # used in the model if by_name: wrong_keys = [key for key in input_data.keys() if not key in self.all_inverted_fields] if wrong_keys: LOGGER.error("Wrong field names in input data: %s" % ", ".join(wrong_keys)) input_data = dict( [[self.inverted_fields[key], value] for key, value in input_data.items() if key in self.inverted_fields]) else: input_data = dict( [[key, value] for key, value in input_data.items() if key in self.tree.fields]) # Strips affixes for numeric values and casts to the final field type cast(input_data, self.tree.fields) prediction_info = self.tree.predict(input_data) prediction, path, confidence, distribution, instances = prediction_info # Prediction path if print_path: out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction)) out.flush() if with_confidence: return [prediction, confidence, distribution, instances] return prediction
def tree_rules(tree, offsets, objective_id, fields, out, ids_path=None,
               subtree=True):
    """Prints out an IF-THEN rule version of the tree.

    Ensures every field carries a `slug` entry before generating the
    rules, then writes the generated text to `out`.
    """
    for field_id, _ in sort_fields(fields):
        fields[field_id].update(slug=slugify(fields[field_id]['name']))
    rules_text = generate_rules(tree, offsets, objective_id, fields,
                                ids_path=ids_path, subtree=subtree)
    out.write(utf8(rules_text))
    out.flush()
def tree_tableau(tree, offsets, fields, objective_id, out, ids_path=None,
                 subtree=True, attr=DFT_ATTR):
    """Writes a Tableau function that implements the model.

    Returns True when a body was generated and written, False when the
    generated body is empty.
    """
    body = tableau_body(tree, offsets, fields, objective_id,
                        ids_path=ids_path, subtree=subtree, attr=attr)
    if body:
        out.write(utf8(body))
        out.flush()
        return True
    return False
def predict(self, input_data, by_name=True,
            print_path=False, out=sys.stdout, with_confidence=False):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.
    """
    # Keep only the fields the model uses, then normalize their types
    input_data = self.filter_input_data(input_data, by_name=by_name)
    cast(input_data, self.tree.fields)
    (prediction, path, confidence,
     distribution, instances) = self.tree.predict(input_data)
    # Optionally echo the decision path that led to the prediction
    if print_path:
        out.write(utf8(u' AND '.join(path) + u' => %s \n' % prediction))
        out.flush()
    if with_confidence:
        return [prediction, confidence, distribution, instances]
    return prediction
def hadoop_python_mapper(self, out=sys.stdout, ids_path=None,
                         subtree=True):
    """Returns a hadoop mapper header to make predictions in python

    Writes to `out` a self-contained python 2 script: a CSVInput class
    that parses stdin rows into the model's input fields (applying type
    conversions and per-field prefix/suffix stripping), the model's
    predict function (generated through self.tree.python) and a loop
    that prints one (values, prediction) pair per input row.
    """
    # Input fields as (field_id, name) pairs, sorted by column order
    input_fields = [(value, key) for (key, value) in
                    sorted(self.inverted_fields.items(),
                           key=lambda x: x[1])]
    parameters = [value for (key, value) in
                  input_fields if key != self.tree.objective_id]
    args = []
    for field in input_fields:
        slug = slugify(self.fields[field[0]]['name'])
        self.fields[field[0]].update(slug=slug)
        if field[0] != self.tree.objective_id:
            args.append("\"" + self.fields[field[0]]['slug'] + "\"")
    output = \
u"""#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import csv
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')


class CSVInput(object):
    \"\"\"Reads and parses csv input from stdin

       Expects a data section (without headers) with the following fields:
       %s

       Data is processed to fall into the corresponding input type by
       applying INPUT_TYPES, and per field PREFIXES and SUFFIXES are
       removed. You can also provide strings to be considered as no
       content markers in MISSING_TOKENS.
    \"\"\"
    def __init__(self, input=sys.stdin):
        \"\"\" Opens stdin and defines parsing constants

        \"\"\"
        try:
            self.reader = csv.reader(input, delimiter=',', quotechar='\"')
""" % ",".join(parameters)
    output += (
        u"\n%sself.INPUT_FIELDS = [%s]\n" %
        ((INDENT * 3), (",\n " + INDENT * 8).join(args)))
    input_types = []
    prefixes = []
    suffixes = []
    count = 0
    fields = self.fields
    # NOTE(review): `key` below iterates over (field_id, name) pairs, so
    # the comparison with objective_id can never match and the objective
    # field is not filtered out here (unlike the `args` loop above) —
    # confirm whether `key[0] != self.tree.objective_id` was intended.
    for key in [key[0] for key in input_fields
                if key != self.tree.objective_id]:
        input_type = ('None' if not fields[key]['datatype'] in
                      PYTHON_CONV
                      else PYTHON_CONV[fields[key]['datatype']])
        input_types.append(input_type)
        if 'prefix' in fields[key]:
            prefixes.append("%s: %s" % (count,
                                        repr(fields[key]['prefix'])))
        if 'suffix' in fields[key]:
            suffixes.append("%s: %s" % (count,
                                        repr(fields[key]['suffix'])))
        count += 1
    # Emit the INPUT_TYPES / PREFIXES / SUFFIXES constants, aligning
    # continuation lines under the opening bracket
    static_content = "%sself.INPUT_TYPES = [" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += u"\n%s%s%s" % (static_content,
                             formatter.join(input_types),
                             "]\n")
    static_content = "%sself.PREFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += u"\n%s%s%s" % (static_content,
                             formatter.join(prefixes),
                             "}\n")
    static_content = "%sself.SUFFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += u"\n%s%s%s" % (static_content,
                             formatter.join(suffixes),
                             "}\n")
    output += \
u"""            self.MISSING_TOKENS = ['?']
        except Exception, exc:
            sys.stderr.write(\"Cannot read csv\"
                             \" input. %s\\n\" % str(exc))

    def __iter__(self):
        \"\"\" Iterator method

        \"\"\"
        return self

    def next(self):
        \"\"\" Returns processed data in a list structure

        \"\"\"
        def normalize(value):
            \"\"\"Transforms to unicode and cleans missing tokens
            \"\"\"
            value = unicode(value.decode('utf-8'))
            return \"\" if value in self.MISSING_TOKENS else value

        def cast(function_value):
            \"\"\"Type related transformations
            \"\"\"
            function, value = function_value
            if not len(value):
                return None
            if function is None:
                return value
            else:
                return function(value)

        try:
            values = self.reader.next()
        except StopIteration:
            raise StopIteration()
        if len(values) < len(self.INPUT_FIELDS):
            sys.stderr.write(\"Found %s fields when %s were expected.\\n\" %
                             (len(values), len(self.INPUT_FIELDS)))
            raise StopIteration()
        else:
            values = values[0:len(self.INPUT_FIELDS)]
        try:
            values = map(normalize, values)
            for key in self.PREFIXES:
                prefix_len = len(self.PREFIXES[key])
                if values[key][0:prefix_len] == self.PREFIXES[key]:
                    values[key] = values[key][prefix_len:]
            for key in self.SUFFIXES:
                suffix_len = len(self.SUFFIXES[key])
                if values[key][-suffix_len:] == self.SUFFIXES[key]:
                    values[key] = values[key][0:-suffix_len]
            function_tuples = zip(self.INPUT_TYPES, values)
            values = map(cast, function_tuples)
            data = {}
            for i in range(len(values)):
                data.update({self.INPUT_FIELDS[i]: values[i]})
            return data
        except Exception, exc:
            sys.stderr.write(\"Error in data transformations. %s\\n\" %
                             str(exc))
            return False
\n\n
"""
    out.write(utf8(output))
    out.flush()
    # Append the generated predictor, taking a dict input
    self.tree.python(out, self.docstring(), input_map=True,
                     ids_path=ids_path, subtree=subtree)
    output = \
u"""
csv = CSVInput()
for values in csv:
    if not isinstance(values, bool):
        print u'%%s\\t%%s' %% (repr(values), repr(predict_%s(values)))
\n\n
""" % fields[self.tree.objective_id]['slug']
    out.write(utf8(output))
    out.flush()
def summarize(model, out=sys.stdout, format=BRIEF):
    """Prints summary grouping distribution as class header and details

    Writes, in order: the training-data distribution, the predicted
    distribution, the field importance (when present) and a per-class
    rules summary built from the model's prediction groups.

    model: local model object (must not be a boosting model)
    out: output handler the summary text is written to
    format: rule rendering format passed down to ``Path.to_rules``
    """
    if model.boosting:
        raise AttributeError("This method is not available for boosting"
                             " models.")
    tree = model.tree

    def extract_common_path(groups):
        """Extracts the common segment of the prediction path for a group

        Stores the shared path prefix in ``groups[group]['total'][0]`` and
        sorts each group's details by instance count, descending.
        """
        for group in groups:
            details = groups[group]['details']
            common_path = []
            if len(details) > 0:
                # the shortest path in the group bounds the common prefix
                mcd_len = min([len(x[0]) for x in details])
                for i in range(0, mcd_len):
                    test_common_path = details[0][0][i]
                    for subgroup in details:
                        if subgroup[0][i] != test_common_path:
                            # first mismatch ends the common prefix;
                            # setting i to mcd_len stops the outer loop
                            i = mcd_len
                            break
                    if i < mcd_len:
                        common_path.append(test_common_path)
            groups[group]['total'][0] = common_path
            if len(details) > 0:
                groups[group]['details'] = sorted(details,
                                                  key=lambda x: x[1],
                                                  reverse=True)

    def confidence_error(value, impurity=None):
        """Returns confidence for categoric objective fields
           and error for numeric objective fields
        """
        if value is None:
            return ""
        impurity_literal = ""
        if impurity is not None and impurity > 0:
            impurity_literal = "; impurity: %.2f%%" % (round(impurity, 4))
        objective_type = model.fields[model.objective_id]['optype']
        if objective_type == 'numeric':
            return " [Error: %s]" % value
        return " [Confidence: %.2f%%%s]" % (round(value, 4) * 100,
                                            impurity_literal)

    distribution = get_data_distribution(model)

    out.write(utf8("Data distribution:\n"))
    print_distribution(distribution, out=out)
    out.write(utf8("\n\n"))

    groups = group_prediction(model)
    predictions = get_prediction_distribution(model, groups)

    out.write(utf8("Predicted distribution:\n"))
    print_distribution(predictions, out=out)
    out.write(utf8("\n\n"))

    if model.field_importance:
        out.write(utf8("Field importance:\n"))
        print_importance(model, out=out)

    extract_common_path(groups)

    out.write(utf8("\n\nRules summary:"))

    # total number of training instances at the root node
    node = get_node(tree)
    count = node[model.offsets["count"]]
    for group in [x[0] for x in predictions]:
        details = groups[group]['details']
        path = Path(groups[group]['total'][0])
        # share of training data / of predictions falling in this group
        data_per_group = groups[group]['total'][1] * 1.0 / count
        pred_per_group = groups[group]['total'][2] * 1.0 / count
        out.write(
            utf8("\n\n%s : (data %.2f%% / prediction %.2f%%) %s" %
                 (group,
                  round(data_per_group, 4) * 100,
                  round(pred_per_group, 4) * 100,
                  path.to_rules(model.fields, format=format))))

        if len(details) == 0:
            out.write(
                utf8("\n    The model will never predict this"
                     " class\n"))
        elif len(details) == 1:
            # single subgroup: append its confidence/error inline
            subgroup = details[0]
            out.write(
                utf8("%s\n" % confidence_error(subgroup[2],
                                               impurity=subgroup[3])))
        else:
            out.write(utf8("\n"))
            for subgroup in details:
                pred_per_sgroup = subgroup[1] * 1.0 / \
                    groups[group]['total'][2]
                path = Path(subgroup[0])
                # an empty predicate list means the root node itself
                path_chain = path.to_rules(model.fields, format=format) if \
                    path.predicates else "(root node)"
                out.write(
                    utf8("    · %.2f%%: %s%s\n" %
                         (round(pred_per_sgroup, 4) * 100,
                          path_chain,
                          confidence_error(subgroup[2],
                                           impurity=subgroup[3]))))
    out.flush()
def hadoop_python_mapper(self, out=sys.stdout, ids_path=None,
                         subtree=True):
    """Returns a hadoop mapper header to make predictions in python

    Writes to ``out`` a self-contained python script: a ``CSVInput``
    reader class (built from inline templates and this model's fields),
    the model's prediction function (via ``self.tree.python``) and a
    driver loop that prints one prediction per csv input row.

    out: output handler the generated code is written to
    ids_path: node ids path passed down to the tree generator
    subtree: boolean passed down to the tree generator
    """
    # fields sorted by column order, as (column, field_id) pairs
    input_fields = [(value, key) for (key, value) in
                    sorted(self.inverted_fields.items(),
                           key=lambda x: x[1])]
    parameters = [value for (key, value) in
                  input_fields if key != self.tree.objective_id]
    args = []
    for field in input_fields:
        slug = slugify(self.fields[field[0]]['name'])
        self.fields[field[0]].update(slug=slug)
        if field[0] != self.tree.objective_id:
            args.append("\"" + self.fields[field[0]]['slug'] + "\"")

    # header template: the generated script's imports and the CSVInput
    # class opening (escaped \"\"\" produce docstrings in the output)
    output = \
u"""#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import csv
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')


class CSVInput(object):
    \"\"\"Reads and parses csv input from stdin

       Expects a data section (without headers) with the following fields:
       %s

       Data is processed to fall into the corresponding input type by
       applying INPUT_TYPES, and per field PREFIXES and SUFFIXES are
       removed. You can also provide strings to be considered as no
       content markers in MISSING_TOKENS.
    \"\"\"
    def __init__(self, input=sys.stdin):
        \"\"\" Opens stdin and defines parsing constants

        \"\"\"
        try:
            self.reader = csv.reader(input, delimiter=',', quotechar='\\"')
""" % ",".join(parameters)
    output += (u"\n%sself.INPUT_FIELDS = [%s]\n" %
               ((INDENT * 3), (",\n " + INDENT * 8).join(args)))
    input_types = []
    prefixes = []
    suffixes = []
    count = 0
    fields = self.fields
    for key in [key[0] for key in input_fields
                if key != self.tree.objective_id]:
        # map BigML datatypes to python converter names; 'None' when
        # there is no conversion to apply
        input_type = ('None' if not fields[key]['datatype'] in
                      PYTHON_CONV
                      else PYTHON_CONV[fields[key]['datatype']])
        input_types.append(input_type)
        if 'prefix' in fields[key]:
            prefixes.append("%s: %s" % (count,
                                        repr(fields[key]['prefix'])))
        if 'suffix' in fields[key]:
            suffixes.append("%s: %s" % (count,
                                        repr(fields[key]['suffix'])))
        count += 1
    static_content = "%sself.INPUT_TYPES = [" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += u"\n%s%s%s" % (static_content,
                             formatter.join(input_types),
                             "]\n")
    static_content = "%sself.PREFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += u"\n%s%s%s" % (static_content,
                             formatter.join(prefixes),
                             "}\n")
    static_content = "%sself.SUFFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += u"\n%s%s%s" % (static_content,
                             formatter.join(suffixes),
                             "}\n")
    # iterator/next template: normalizes, strips affixes, casts and
    # returns a dict per row; returns False when a row fails to parse
    output += \
u"""            self.MISSING_TOKENS = ['?']
        except Exception, exc:
            sys.stderr.write(\"Cannot read csv\"
                             \" input. %s\\n\" % str(exc))

    def __iter__(self):
        \"\"\" Iterator method

        \"\"\"
        return self

    def next(self):
        \"\"\" Returns processed data in a list structure

        \"\"\"
        def normalize(value):
            \"\"\"Transforms to unicode and cleans missing tokens
            \"\"\"
            value = unicode(value.decode('utf-8'))
            return \"\" if value in self.MISSING_TOKENS else value

        def cast(function_value):
            \"\"\"Type related transformations
            \"\"\"
            function, value = function_value
            if not len(value):
                return None
            if function is None:
                return value
            else:
                return function(value)

        try:
            values = self.reader.next()
        except StopIteration:
            raise StopIteration()
        if len(values) < len(self.INPUT_FIELDS):
            sys.stderr.write(\"Found %s fields when %s were expected.\\n\" %
                             (len(values), len(self.INPUT_FIELDS)))
            raise StopIteration()
        else:
            values = values[0:len(self.INPUT_FIELDS)]

        try:
            values = map(normalize, values)
            for key in self.PREFIXES:
                prefix_len = len(self.PREFIXES[key])
                if values[key][0:prefix_len] == self.PREFIXES[key]:
                    values[key] = values[key][prefix_len:]
            for key in self.SUFFIXES:
                suffix_len = len(self.SUFFIXES[key])
                if values[key][-suffix_len:] == self.SUFFIXES[key]:
                    values[key] = values[key][0:-suffix_len]
            function_tuples = zip(self.INPUT_TYPES, values)
            values = map(cast, function_tuples)
            data = {}
            for i in range(len(values)):
                data.update({self.INPUT_FIELDS[i]: values[i]})
            return data
        except Exception, exc:
            sys.stderr.write(\"Error in data transformations. %s\\n\" % str(exc))
            return False
\n\n
"""
    out.write(utf8(output))
    out.flush()

    # emit the prediction function for this tree
    self.tree.python(out, self.docstring(), input_map=True,
                     ids_path=ids_path, subtree=subtree)
    # driver loop: %% escapes survive the % formatting below as literal %
    output = \
u"""
csv = CSVInput()
for values in csv:
    if not isinstance(values, bool):
        print u'%%s\\t%%s' %% (repr(values), repr(predict_%s(values)))
\n\n
""" % fields[self.tree.objective_id]['slug']
    out.write(utf8(output))
    out.flush()
def summarize(self, out=sys.stdout):
    """Prints summary grouping distribution as class header and details

    Writes the data distribution, the predicted distribution, field
    importance (when present) and, per predicted class, the common rule
    path and each subgroup's share with its confidence or error.

    out: output handler the summary text is written to
    """
    tree = self.tree

    def extract_common_path(groups):
        """Extracts the common segment of the prediction path for a group

        Stores the shared path prefix in ``groups[group]['total'][0]``
        and sorts each group's details by instance count, descending.
        """
        for group in groups:
            details = groups[group]['details']
            common_path = []
            if len(details) > 0:
                # the shortest path in the group bounds the common prefix
                mcd_len = min([len(x[0]) for x in details])
                for i in range(0, mcd_len):
                    test_common_path = details[0][0][i]
                    for subgroup in details:
                        if subgroup[0][i] != test_common_path:
                            # mismatch ends the prefix; forces outer exit
                            i = mcd_len
                            break
                    if i < mcd_len:
                        common_path.append(test_common_path)
            groups[group]['total'][0] = common_path
            if len(details) > 0:
                groups[group]['details'] = sorted(details,
                                                  key=lambda x: x[1],
                                                  reverse=True)

    def confidence_error(value):
        """Returns confidence for categoric objective fields
           and error for numeric objective fields
        """
        if value is None:
            return ""
        objective_type = self.fields[tree.objective_id]['optype']
        if objective_type == 'numeric':
            return u" [Error: %s]" % value
        else:
            return u" [Confidence: %.2f%%]" % (round(value, 4) * 100)

    distribution = self.get_data_distribution()

    out.write(u"Data distribution:\n")
    print_distribution(distribution, out=out)
    out.write(u"\n\n")

    groups = self.group_prediction()
    predictions = self.get_prediction_distribution(groups)

    out.write(u"Predicted distribution:\n")
    print_distribution(predictions, out=out)
    out.write(u"\n\n")

    if self.field_importance:
        out.write(u"Field importance:\n")
        print_importance(self, out=out)

    extract_common_path(groups)

    for group in [x[0] for x in predictions]:
        details = groups[group]['details']
        # render the group's common path predicates as rules
        path = [prediction.to_rule(self.fields) for
                prediction in groups[group]['total'][0]]
        # share of training data / of predictions falling in this group
        data_per_group = groups[group]['total'][1] * 1.0 / tree.count
        pred_per_group = groups[group]['total'][2] * 1.0 / tree.count
        out.write(utf8(u"\n\n%s : (data %.2f%% / prediction %.2f%%) %s\n" %
                       (group,
                        round(data_per_group, 4) * 100,
                        round(pred_per_group, 4) * 100,
                        " and ".join(path))))
        if len(details) == 0:
            out.write(u"    The model will never predict this class\n")
        for j in range(0, len(details)):
            subgroup = details[j]
            pred_per_sgroup = subgroup[1] * 1.0 / groups[group]['total'][2]
            path = [prediction.to_rule(self.fields) for
                    prediction in subgroup[0]]
            # an empty path means the subgroup is the root node itself
            path_chain = " and ".join(path) if len(path) else "(root node)"
            out.write(utf8(u"    · %.2f%%: %s%s\n" %
                           (round(pred_per_sgroup, 4) * 100,
                            path_chain,
                            confidence_error(subgroup[2]))))
    out.flush()
def hadoop_python_mapper(model, out=sys.stdout, ids_path=None,
                         subtree=True):
    """Generates a hadoop mapper header to make predictions in python

    Reads the CSVInput reader code from the HADOOP_CSV_TEMPLATE and
    HADOOP_NEXT_TEMPLATE files, fills in this model's field constants,
    emits the tree's prediction function via ``tree_python`` and appends
    a driver loop printing one prediction per csv row.

    model: local model object providing fields, offsets and tree
    out: output handler the generated code is written to
    ids_path: node ids path passed down to ``tree_python``
    subtree: boolean passed down to ``tree_python``
    """
    # fields sorted by column order, as (column, field_id) pairs
    input_fields = [(value, key) for (key, value) in
                    sorted(list(model.inverted_fields.items()),
                           key=lambda x: x[1])]
    parameters = [value for (key, value) in
                  input_fields if key != model.objective_id]
    args = []
    for field in input_fields:
        slug = slugify(model.fields[field[0]]['name'])
        model.fields[field[0]].update(slug=slug)
        if field[0] != model.objective_id:
            args.append("\"" + model.fields[field[0]]['slug'] + "\"")

    # the csv template expects the parameter names via % formatting
    with open(HADOOP_CSV_TEMPLATE) as template_handler:
        output = template_handler.read() % ",".join(parameters)

    output += "\n%sself.INPUT_FIELDS = [%s]\n" % \
        ((INDENT * 3), (",\n " + INDENT * 8).join(args))

    input_types = []
    prefixes = []
    suffixes = []
    count = 0
    fields = model.fields
    for key in [field[0] for field in input_fields
                if field[0] != model.objective_id]:
        # map BigML datatypes to python converter names; 'None' when
        # there is no conversion to apply
        input_type = ('None' if not fields[key]['datatype'] in
                      PYTHON_CONV
                      else PYTHON_CONV[fields[key]['datatype']])
        input_types.append(input_type)
        if 'prefix' in fields[key]:
            prefixes.append("%s: %s" % (count,
                                        repr(fields[key]['prefix'])))
        if 'suffix' in fields[key]:
            suffixes.append("%s: %s" % (count,
                                        repr(fields[key]['suffix'])))
        count += 1
    static_content = "%sself.INPUT_TYPES = [" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += "\n%s%s%s" % (static_content,
                            formatter.join(input_types),
                            "]\n")
    static_content = "%sself.PREFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += "\n%s%s%s" % (static_content,
                            formatter.join(prefixes),
                            "}\n")
    static_content = "%sself.SUFFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += "\n%s%s%s" % (static_content,
                            formatter.join(suffixes),
                            "}\n")
    with open(HADOOP_NEXT_TEMPLATE) as template_handler:
        output += template_handler.read()

    out.write(output)
    out.flush()

    # NOTE(review): when the model has no `boosting` attribute this
    # passes False (not None) as the boosting argument — verify
    # tree_python handles a False value.
    tree_python(model.tree, model.offsets, model.fields,
                model.objective_id,
                False if not hasattr(model, "boosting") else model.boosting,
                out, docstring(model), ids_path=ids_path, subtree=subtree)

    # driver loop: %% escapes survive the % formatting below as literal %
    output = \
"""
csv = CSVInput()
for values in csv:
    if not isinstance(values, bool):
        print u'%%s\\t%%s' %% (repr(values), repr(predict_%s(values)))
\n\n
""" % fields[model.objective_id]['slug']
    out.write(utf8(output))
    out.flush()
def summarize(self, out=sys.stdout):
    """Prints summary grouping distribution as class header and details

    Variant that also reports node impurity next to the confidence for
    categorical objective fields.

    out: output handler the summary text is written to
    """
    tree = self.tree

    def extract_common_path(groups):
        """Extracts the common segment of the prediction path for a group

        Stores the shared path prefix in ``groups[group]['total'][0]``
        and sorts each group's details by instance count, descending.
        """
        for group in groups:
            details = groups[group]['details']
            common_path = []
            if len(details) > 0:
                # the shortest path in the group bounds the common prefix
                mcd_len = min([len(x[0]) for x in details])
                for i in range(0, mcd_len):
                    test_common_path = details[0][0][i]
                    for subgroup in details:
                        if subgroup[0][i] != test_common_path:
                            # mismatch ends the prefix; forces outer exit
                            i = mcd_len
                            break
                    if i < mcd_len:
                        common_path.append(test_common_path)
            groups[group]['total'][0] = common_path
            if len(details) > 0:
                groups[group]['details'] = sorted(details,
                                                  key=lambda x: x[1],
                                                  reverse=True)

    def confidence_error(value, impurity=None):
        """Returns confidence for categoric objective fields
           and error for numeric objective fields
        """
        if value is None:
            return ""
        impurity_literal = ""
        if impurity is not None and impurity > 0:
            impurity_literal = "; impurity: %.2f%%" % (round(impurity, 4))
        objective_type = self.fields[tree.objective_id]['optype']
        if objective_type == 'numeric':
            return u" [Error: %s]" % value
        else:
            return u" [Confidence: %.2f%%%s]" % ((round(value, 4) * 100),
                                                 impurity_literal)

    distribution = self.get_data_distribution()

    out.write(u"Data distribution:\n")
    print_distribution(distribution, out=out)
    out.write(u"\n\n")

    groups = self.group_prediction()
    predictions = self.get_prediction_distribution(groups)

    out.write(u"Predicted distribution:\n")
    print_distribution(predictions, out=out)
    out.write(u"\n\n")

    if self.field_importance:
        out.write(u"Field importance:\n")
        print_importance(self, out=out)

    extract_common_path(groups)

    for group in [x[0] for x in predictions]:
        details = groups[group]['details']
        # render the group's common path predicates as rules
        path = [prediction.to_rule(self.fields) for
                prediction in groups[group]['total'][0]]
        # share of training data / of predictions falling in this group
        data_per_group = groups[group]['total'][1] * 1.0 / tree.count
        pred_per_group = groups[group]['total'][2] * 1.0 / tree.count
        out.write(utf8(u"\n\n%s : (data %.2f%% / prediction %.2f%%) %s\n" %
                       (group,
                        round(data_per_group, 4) * 100,
                        round(pred_per_group, 4) * 100,
                        " and ".join(path))))
        if len(details) == 0:
            out.write(u"    The model will never predict this class\n")
        for j in range(0, len(details)):
            subgroup = details[j]
            pred_per_sgroup = subgroup[1] * 1.0 / groups[group]['total'][2]
            path = [prediction.to_rule(self.fields) for
                    prediction in subgroup[0]]
            # an empty path means the subgroup is the root node itself
            path_chain = " and ".join(path) if len(path) else "(root node)"
            out.write(utf8(u"    · %.2f%%: %s%s\n" %
                           (round(pred_per_sgroup, 4) * 100,
                            path_chain,
                            confidence_error(subgroup[2],
                                             impurity=subgroup[3]))))
    out.flush()
def predict(self, input_data, by_name=True,
            print_path=False, out=sys.stdout, with_confidence=False,
            missing_strategy=LAST_PREDICTION, add_confidence=False,
            add_path=False, add_distribution=False, add_count=False,
            add_median=False, add_next=False, add_min=False,
            add_max=False, multiple=None):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    print_path: Boolean, if True the rules that lead to the prediction
                are printed
    out: output handler
    with_confidence: Boolean, if True, all the information in the node
                     (prediction, confidence, distribution and count)
                     is returned in a list format
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    add_confidence: Boolean, if True adds confidence to the dict output
    add_path: Boolean, if True adds path to the dict output
    add_distribution: Boolean, if True adds distribution info to the
                      dict output
    add_count: Boolean, if True adds the number of instances in the
               node to the dict output
    add_median: Boolean, if True adds the median of the values in the
                distribution
    add_next: Boolean, if True adds the field that determines next
              split in the tree
    add_min: Boolean, if True adds the minimum value in the prediction's
             distribution (for regressions only)
    add_max: Boolean, if True adds the maximum value in the prediction's
             distribution (for regressions only)
    multiple: For categorical fields, it will return the categories
              in the distribution of the predicted node as a
              list of dicts:
                [{'prediction': 'Iris-setosa',
                  'confidence': 0.9154
                  'probability': 0.97
                  'count': 97},
                 {'prediction': 'Iris-virginica',
                  'confidence': 0.0103
                  'probability': 0.03,
                  'count': 3}]
              The value of this argument can either be an integer
              (maximum number of categories to be returned), or the
              literal 'all', that will cause the entire distribution
              in the node to be returned.
    """
    # Checks if this is a regression model, using PROPORTIONAL
    # missing_strategy
    if (self.tree.regression and missing_strategy == PROPORTIONAL and
            not self.regression_ready):
        raise ImportError("Failed to find the numpy and scipy libraries,"
                          " needed to use proportional missing strategy"
                          " for regressions. Please install them before"
                          " using local predictions for the model.")
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    prediction = self.tree.predict(input_data,
                                   missing_strategy=missing_strategy)

    # Prediction path
    if print_path:
        out.write(utf8(u' AND '.join(prediction.path) + u' => %s \n' %
                       prediction.output))
        out.flush()
    output = prediction.output
    if with_confidence:
        # legacy list format: [output, confidence, distribution,
        # count, median]
        output = [prediction.output,
                  prediction.confidence,
                  prediction.distribution,
                  prediction.count,
                  prediction.median]
    if multiple is not None and not self.tree.regression:
        # categorical only: expand the node distribution into a list of
        # per-category dicts (overrides the outputs built above)
        output = []
        total_instances = float(prediction.count)
        distribution = enumerate(prediction.distribution)
        for index, [category, instances] in distribution:
            if ((isinstance(multiple, basestring) and multiple == 'all') or
                    (isinstance(multiple, int) and index < multiple)):
                prediction_dict = {
                    'prediction': category,
                    'confidence': ws_confidence(category,
                                                prediction.distribution),
                    'probability': instances / total_instances,
                    'count': instances}
                output.append(prediction_dict)
    else:
        # dict output only when at least one add_* flag is set;
        # otherwise the bare prediction (or legacy list) is returned
        if (add_confidence or add_path or add_distribution or add_count
                or add_median or add_next or add_min or add_max):
            output = {'prediction': prediction.output}
            if add_confidence:
                output.update({'confidence': prediction.confidence})
            if add_path:
                output.update({'path': prediction.path})
            if add_distribution:
                output.update(
                    {'distribution': prediction.distribution,
                     'distribution_unit': prediction.distribution_unit})
            if add_count:
                output.update({'count': prediction.count})
            if self.tree.regression and add_median:
                output.update({'median': prediction.median})
            if add_next:
                # field of the first child's predicate, i.e. the next
                # split; None at a leaf
                field = (None if len(prediction.children) == 0 else
                         prediction.children[0].predicate.field)
                if field is not None and field in self.fields:
                    field = self.fields[field]['name']
                output.update({'next': field})
            if self.tree.regression and add_min:
                output.update({'min': prediction.min})
            if self.tree.regression and add_max:
                output.update({'max': prediction.max})

    return output
def predict(self, input_data, by_name=True,
            print_path=False, out=sys.stdout, with_confidence=False,
            missing_strategy=LAST_PREDICTION, add_confidence=False,
            add_path=False, add_distribution=False, add_count=False,
            add_median=False, add_next=False, multiple=None):
    """Makes a prediction based on a number of field values.

    By default the input fields must be keyed by field name but you can
    use `by_name` to input them directly keyed by id.

    input_data: Input data to be predicted
    by_name: Boolean, True if input_data is keyed by names
    print_path: Boolean, if True the rules that lead to the prediction
                are printed
    out: output handler
    with_confidence: Boolean, if True, all the information in the node
                     (prediction, confidence, distribution and count)
                     is returned in a list format
    missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                      missing fields
    add_confidence: Boolean, if True adds confidence to the dict output
    add_path: Boolean, if True adds path to the dict output
    add_distribution: Boolean, if True adds distribution info to the
                      dict output
    add_count: Boolean, if True adds the number of instances in the
               node to the dict output
    add_median: Boolean, if True adds the median of the values in the
                distribution
    add_next: Boolean, if True adds the field that determines next
              split in the tree
    multiple: For categorical fields, it will return the categories
              in the distribution of the predicted node as a
              list of dicts:
                [{'prediction': 'Iris-setosa',
                  'confidence': 0.9154
                  'probability': 0.97
                  'count': 97},
                 {'prediction': 'Iris-virginica',
                  'confidence': 0.0103
                  'probability': 0.03,
                  'count': 3}]
              The value of this argument can either be an integer
              (maximum number of categories to be returned), or the
              literal 'all', that will cause the entire distribution
              in the node to be returned.
    """
    # Checks if this is a regression model, using PROPORTIONAL
    # missing_strategy
    if (self.tree.regression and missing_strategy == PROPORTIONAL and
            not self.regression_ready):
        raise ImportError("Failed to find the numpy and scipy libraries,"
                          " needed to use proportional missing strategy"
                          " for regressions. Please install them before"
                          " using local predictions for the model.")
    # Checks and cleans input_data leaving the fields used in the model
    input_data = self.filter_input_data(input_data, by_name=by_name)

    # Strips affixes for numeric values and casts to the final field type
    cast(input_data, self.fields)

    prediction = self.tree.predict(input_data,
                                   missing_strategy=missing_strategy)

    # Prediction path
    if print_path:
        out.write(
            utf8(u' AND '.join(prediction.path) + u' => %s \n' %
                 prediction.output))
        out.flush()
    output = prediction.output
    if with_confidence:
        # legacy list format: [output, confidence, distribution,
        # count, median]
        output = [
            prediction.output, prediction.confidence,
            prediction.distribution, prediction.count, prediction.median
        ]
    if multiple is not None and not self.tree.regression:
        # categorical only: expand the node distribution into a list of
        # per-category dicts (overrides the outputs built above)
        output = []
        total_instances = float(prediction.count)
        distribution = enumerate(prediction.distribution)
        for index, [category, instances] in distribution:
            if ((isinstance(multiple, basestring) and multiple == 'all') or
                    (isinstance(multiple, int) and index < multiple)):
                prediction_dict = {
                    'prediction': category,
                    'confidence': ws_confidence(category,
                                                prediction.distribution),
                    'probability': instances / total_instances,
                    'count': instances
                }
                output.append(prediction_dict)
    else:
        # dict output only when at least one add_* flag is set;
        # otherwise the bare prediction (or legacy list) is returned
        if (add_confidence or add_path or add_distribution or add_count
                or add_median or add_next):
            output = {'prediction': prediction.output}
            if add_confidence:
                output.update({'confidence': prediction.confidence})
            if add_path:
                output.update({'path': prediction.path})
            if add_distribution:
                output.update({
                    'distribution': prediction.distribution,
                    'distribution_unit': prediction.distribution_unit
                })
            if add_count:
                output.update({'count': prediction.count})
            if self.tree.regression and add_median:
                output.update({'median': prediction.median})
            if add_next:
                # field of the first child's predicate, i.e. the next
                # split; None at a leaf
                field = (None if len(prediction.children) == 0 else
                         prediction.children[0].predicate.field)
                if field is not None and field in self.fields:
                    field = self.fields[field]['name']
                output.update({'next': field})

    return output
def tree_python(tree, offsets, fields, objective_id, boosting, out,
                docstring_str, input_map=False,
                ids_path=None, subtree=True):
    """Writes a python function that implements the model.

    Emits to ``out`` a ``predict_<slug>`` function generated from the
    tree (plus term-analysis helpers when needed) and a ``predict``
    wrapper that calls it and, for boosted trees, attaches the boosting
    weight and objective class to the prediction.

    tree: tree structure to translate
    offsets: node attribute offsets used by the plug-in body generators
    fields: model fields dict (updated in place with per-field slugs)
    objective_id: id of the objective field
    boosting: boosting attributes dict, or a falsy value (None/False)
              for non-boosted models
    out: output handler the generated code is written to
    docstring_str: docstring text for the generated predictor
    input_map: if True the generated function takes a single ``data``
               dict instead of one argument per field
    ids_path: node ids path passed down to the body generator
    subtree: boolean passed down to the body generator
    """
    args = []
    args_tree = []
    parameters = sort_fields(fields)
    # too many fields to pass one argument each: fall back to a dict
    if not input_map:
        input_map = len(parameters) > MAX_ARGS_LENGTH
    # when generating named arguments, python keywords must be avoided
    reserved_keywords = keyword.kwlist if not input_map else None
    prefix = "_" if not input_map else ""
    for field in parameters:
        field_name_to_show = fields[field[0]]['name'].strip()
        if field_name_to_show == "":
            # fall back to the field id when the name is blank
            field_name_to_show = field[0]
        slug = slugify(field_name_to_show,
                       reserved_keywords=reserved_keywords, prefix=prefix)
        fields[field[0]].update(slug=slug)
        if not input_map:
            if field[0] != objective_id:
                args.append("%s=None" % (slug))
                args_tree.append("%s=%s" % (slug, slug))
    if input_map:
        args.append("data={}")
        args_tree.append("data=data")

    # boosted models predict the boosting objective field instead
    function_name = fields[objective_id]['slug'] if \
        not boosting else fields[boosting["objective_field"]]['slug']
    if prefix == "_" and function_name[0] == prefix:
        function_name = function_name[1:]
    if function_name == "":
        function_name = "field_" + objective_id
    python_header = "# -*- coding: utf-8 -*-\n"
    predictor_definition = ("def predict_%s" % function_name)
    depth = len(predictor_definition) + 1
    predictor = "%s(%s):\n" % (predictor_definition,
                               (",\n" + " " * depth).join(args))
    predictor_doc = (INDENT + "\"\"\" " + docstring_str +
                     "\n" + INDENT + "\"\"\"\n")
    body_fn = boosted_plug_in_body if boosting else plug_in_body
    body, term_analysis_predicates, item_analysis_predicates = \
        body_fn(tree, offsets, fields, objective_id,
                fields[objective_id]["optype"] == NUMERIC,
                input_map=input_map, ids_path=ids_path, subtree=subtree)
    terms_body = ""
    if term_analysis_predicates or item_analysis_predicates:
        terms_body = term_analysis_body(fields,
                                        term_analysis_predicates,
                                        item_analysis_predicates)
    predictor = python_header + predictor + \
        predictor_doc + terms_body + body

    # wrapper with the same signature delegating to the predictor
    predictor_model = "def predict"
    depth = len(predictor_model) + 1
    predictor += "\n\n%s(%s):\n" % (predictor_model,
                                    (",\n" + " " * depth).join(args))
    predictor += "%sprediction = predict_%s(%s)\n" % ( \
        INDENT, function_name, ", ".join(args_tree))

    # Truthiness test (was `boosting is not None`): callers may pass
    # `boosting=False` when the model has no boosting attribute, and
    # `False is not None` is True, so the old test entered this branch
    # and crashed on `False.get("weight")`. Truthiness also matches how
    # `body_fn` and `function_name` are selected above.
    if boosting:
        predictor += "%sprediction.update({\"weight\": %s})\n" % \
            (INDENT, boosting.get("weight"))
        if boosting.get("objective_class") is not None:
            predictor += "%sprediction.update({\"class\": \"%s\"})\n" % \
                (INDENT, boosting.get("objective_class"))

    predictor += "%sreturn prediction" % INDENT
    out.write(utf8(predictor))
    out.flush()