def python(self, out, docstring, input_map=False):
    """Writes a python function that implements the model.

    """
    args = []
    parameters = sort_fields(self.fields)
    if not input_map:
        input_map = len(parameters) > MAX_ARGS_LENGTH
    for field in [(key, val) for key, val in parameters]:
        slug = slugify(self.fields[field[0]]['name'])
        self.fields[field[0]].update(slug=slug)
        if not input_map:
            if field[0] != self.objective_field:
                args.append("%s=None" % slug)
    if input_map:
        args.append("data={}")
    predictor_definition = (u"def predict_%s" %
                            self.fields[self.objective_field]['slug'])
    depth = len(predictor_definition) + 1
    predictor = u"%s(%s):\n" % (predictor_definition,
                                (",\n" + " " * depth).join(args))
    predictor_doc = (INDENT + u"\"\"\" " + docstring + u"\n" +
                     INDENT + u"\"\"\"\n")
    predictor += predictor_doc + self.python_body(input_map=input_map)
    out.write(utf8(predictor))
    out.flush()
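For illustration, the two calling conventions this method can generate look roughly like the sketch below. The field names, slugs and rule body are hypothetical stand-ins; the real ones come from the model's fields and from python_body(). (In the generated code both variants are named predict_<slug>; the second is renamed here only so the sketch runs as one file.)

# Hypothetical sketch of the generated output, not actual bindings output.

# Few fields: one keyword argument per input field, defaulting to None.
def predict_species(petal_length=None, petal_width=None):
    """ Predictor for species """
    if petal_length is not None and petal_length > 2.45:
        return 'Iris-versicolor'
    return 'Iris-setosa'

# input_map=True (or more than MAX_ARGS_LENGTH fields): a single dict.
def predict_species_from_map(data={}):
    """ Predictor for species """
    return predict_species(petal_length=data.get('petal_length'),
                           petal_width=data.get('petal_width'))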
def python(self, out, docstring, input_map=False):
    """Writes a python function that implements the model.

    """
    args = []
    parameters = sort_fields(self.fields)
    if not input_map:
        input_map = len(parameters) > MAX_ARGS_LENGTH
    reserved_keywords = keyword.kwlist if not input_map else None
    prefix = "_" if not input_map else ""
    for field in [(key, val) for key, val in parameters]:
        slug = slugify(self.fields[field[0]]['name'],
                       reserved_keywords=reserved_keywords, prefix=prefix)
        self.fields[field[0]].update(slug=slug)
        if not input_map:
            if field[0] != self.objective_field:
                args.append("%s=None" % slug)
    if input_map:
        args.append("data={}")
    predictor_definition = (u"def predict_%s" %
                            self.fields[self.objective_field]['slug'])
    depth = len(predictor_definition) + 1
    predictor = u"%s(%s):\n" % (predictor_definition,
                                (",\n" + " " * depth).join(args))
    predictor_doc = (INDENT + u"\"\"\" " + docstring + u"\n" +
                     INDENT + u"\"\"\"\n")
    body, term_analysis_predicates = self.python_body(input_map=input_map)
    terms_body = ""
    if term_analysis_predicates:
        terms_body = self.term_analysis_body(term_analysis_predicates)
    predictor += predictor_doc + terms_body + body
    out.write(utf8(predictor))
    out.flush()
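This version passes reserved_keywords and a prefix so that slugs never collide with Python keywords. slugify itself lives in the bindings' utilities; a minimal sketch of the behavior relied on here (hypothetical helper name, not the bindings' implementation) would be:

import keyword
import re


def slugify_sketch(name, reserved_keywords=None, prefix=''):
    """Lowercases `name`, replaces non-identifier characters with
    underscores and, if the result is a reserved keyword, prepends
    `prefix` so it stays usable as a Python argument name.
    Sketch only; the real slugify is the bindings' utility.
    """
    slug = re.sub(r'\W+', '_', name.lower())
    if reserved_keywords and slug in reserved_keywords:
        slug = prefix + slug
    return slug

# slugify_sketch("Petal length")               -> "petal_length"
# slugify_sketch("class", keyword.kwlist, "_") -> "_class"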
def rules(self, out):
    """Prints out an IF-THEN rule version of the tree.

    """
    for field in [(key, val) for key, val in sort_fields(self.fields)]:
        slug = slugify(self.fields[field[0]]['name'])
        self.fields[field[0]].update(slug=slug)
    out.write(utf8(self.generate_rules()))
    out.flush()
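A minimal usage sketch for this method, assuming the bindings' Model wrapper and a hypothetical model id (credentials are read from the environment):

import sys

from bigml.api import BigML
from bigml.model import Model

api = BigML()  # assumes BIGML_USERNAME / BIGML_API_KEY are set
local_model = Model(api.get_model('model/5143a51a37203f2cf7000956'))
local_model.rules(sys.stdout)  # writes the IF-THEN rules to stdout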
def evaluate(models_or_ensembles, datasets, api, args, resume,
             session_file=None, path=None, log=None, fields=None,
             dataset_fields=None, labels=None, all_labels=None,
             objective_field=None):
    """Evaluates a list of models or ensembles with the given dataset

    """
    output = args.predictions
    evaluation_files = []
    evaluations, resume = evaluations_process(
        models_or_ensembles, datasets, fields, dataset_fields, api, args,
        resume, session_file=session_file, path=path, log=log,
        labels=labels, all_labels=all_labels,
        objective_field=objective_field)
    if hasattr(args, 'multi_label') and args.multi_label:
        file_labels = [slugify(name) for name in
                       u.objective_field_names(models_or_ensembles, api)]
    for index in range(0, len(evaluations)):
        evaluation = evaluations[index]
        evaluation = r.get_evaluation(evaluation, api, args.verbosity,
                                      session_file)
        if r.shared_changed(args.shared, evaluation):
            evaluation_args = {"shared": args.shared}
            evaluation = r.update_evaluation(evaluation, evaluation_args,
                                             args, api=api, path=path,
                                             session_file=session_file)
        file_name = output
        if hasattr(args, 'multi_label') and args.multi_label:
            suffix = file_labels[index]
            file_name += "_%s" % suffix
            evaluation_files.append("%s.json" % file_name)
        if args.test_datasets or args.dataset_off:
            suffix = evaluation['resource'].replace('evaluation/', '_')
            file_name += "_%s" % suffix
            evaluation_files.append("%s.json" % file_name)
        r.save_evaluation(evaluation, file_name, api)
    if (hasattr(args, 'multi_label') and args.multi_label) or \
            args.test_datasets or args.dataset_off:
        mean_evaluation = average_evaluations(evaluation_files)
        r.save_evaluation(mean_evaluation, output, api)
    return resume
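A worked example of the file naming above, with hypothetical values: for --test-datasets or --dataset-off runs the evaluation id is appended to the output path (note the double underscore produced by replace() plus the "_%s" format):

output = "my_dir/evaluations"
resource = "evaluation/5143a51a37203f2cf7000972"  # hypothetical id
suffix = resource.replace('evaluation/', '_')  # "_5143a51a37203f2cf7000972"
file_name = output + "_%s" % suffix
# file_name + ".json" == "my_dir/evaluations__5143a51a37203f2cf7000972.json"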
def python(self, out):
    """Writes a python function that implements the model.

    """
    args = []
    for field in [(key, val) for key, val in sorted(
            self.fields.items(), key=lambda k: k[1]['column_number'])]:
        slug = slugify(self.fields[field[0]]['name'])
        self.fields[field[0]].update(slug=slug)
        default = None
        if self.fields[field[0]]['optype'] == 'numeric':
            default = self.fields[field[0]]['summary']['median']
        if field[0] != self.objective_field:
            args.append("%s=%s" % (slug, default))
    predictor = "def predict_%s(%s):\n" % (
        self.fields[self.objective_field]['slug'], ", ".join(args))
    predictor += self.python_body()
    out.write(predictor)
    out.flush()
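In this earlier variant the generated signature defaults numeric inputs to their training medians instead of None. Illustrative shape only (hypothetical iris-like fields and medians; the body stands in for python_body() output):

# Hypothetical sketch of the generated predictor, not actual output.
def predict_species(sepal_length=5.77, sepal_width=3.02,
                    petal_length=4.34, petal_width=1.32):
    if petal_length > 2.45:
        return 'Iris-versicolor'
    return 'Iris-setosa'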
def hadoop_python_mapper(self, out=sys.stdout, ids_path=None,
                         subtree=True):
    """Returns a hadoop mapper header to make predictions in python

    """
    input_fields = [(value, key) for (key, value) in
                    sorted(self.inverted_fields.items(),
                           key=lambda x: x[1])]
    parameters = [value for (key, value) in input_fields
                  if key != self.tree.objective_id]
    args = []
    for field in input_fields:
        slug = slugify(self.fields[field[0]]['name'])
        self.fields[field[0]].update(slug=slug)
        if field[0] != self.tree.objective_id:
            args.append("\"" + self.fields[field[0]]['slug'] + "\"")

    output = \
u"""#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import csv
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')


class CSVInput(object):
    \"\"\"Reads and parses csv input from stdin

       Expects a data section (without headers) with the following fields:
       %s

       Data is processed to fall into the corresponding input type by
       applying INPUT_TYPES, and per field PREFIXES and SUFFIXES are
       removed. You can also provide strings to be considered as no
       content markers in MISSING_TOKENS.
    \"\"\"
    def __init__(self, input=sys.stdin):
        \"\"\" Opens stdin and defines parsing constants

        \"\"\"
        try:
            self.reader = csv.reader(input, delimiter=',', quotechar='\"')
""" % ",".join(parameters)
    output += (u"\n%sself.INPUT_FIELDS = [%s]\n" %
               ((INDENT * 3), (",\n " + INDENT * 8).join(args)))
    input_types = []
    prefixes = []
    suffixes = []
    count = 0
    fields = self.fields
    # compare the field id (key[0]), not the (id, name) tuple, so the
    # objective field is actually excluded from the input type lists
    for key in [key[0] for key in input_fields
                if key[0] != self.tree.objective_id]:
        input_type = ('None' if not fields[key]['datatype'] in PYTHON_CONV
                      else PYTHON_CONV[fields[key]['datatype']])
        input_types.append(input_type)
        if 'prefix' in fields[key]:
            prefixes.append("%s: %s" % (count,
                                        repr(fields[key]['prefix'])))
        if 'suffix' in fields[key]:
            suffixes.append("%s: %s" % (count,
                                        repr(fields[key]['suffix'])))
        count += 1
    static_content = "%sself.INPUT_TYPES = [" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += u"\n%s%s%s" % (static_content,
                             formatter.join(input_types), "]\n")
    static_content = "%sself.PREFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += u"\n%s%s%s" % (static_content,
                             formatter.join(prefixes), "}\n")
    static_content = "%sself.SUFFIXES = {" % (INDENT * 3)
    formatter = ",\n%s" % (" " * len(static_content))
    output += u"\n%s%s%s" % (static_content,
                             formatter.join(suffixes), "}\n")
    output += \
u"""            self.MISSING_TOKENS = ['?']
        except Exception, exc:
            sys.stderr.write(\"Cannot read csv\"
                             \" input. %s\\n\" % str(exc))

    def __iter__(self):
        \"\"\" Iterator method

        \"\"\"
        return self

    def next(self):
        \"\"\" Returns processed data in a list structure

        \"\"\"
        def normalize(value):
            \"\"\"Transforms to unicode and cleans missing tokens

            \"\"\"
            value = unicode(value.decode('utf-8'))
            return \"\" if value in self.MISSING_TOKENS else value

        def cast(function_value):
            \"\"\"Type related transformations

            \"\"\"
            function, value = function_value
            if not len(value):
                return None
            if function is None:
                return value
            else:
                return function(value)

        try:
            values = self.reader.next()
        except StopIteration:
            raise StopIteration()
        if len(values) < len(self.INPUT_FIELDS):
            sys.stderr.write(\"Found %s fields when %s were expected.\\n\" %
                             (len(values), len(self.INPUT_FIELDS)))
            raise StopIteration()
        else:
            values = values[0:len(self.INPUT_FIELDS)]
        try:
            values = map(normalize, values)
            for key in self.PREFIXES:
                prefix_len = len(self.PREFIXES[key])
                if values[key][0:prefix_len] == self.PREFIXES[key]:
                    values[key] = values[key][prefix_len:]
            for key in self.SUFFIXES:
                suffix_len = len(self.SUFFIXES[key])
                if values[key][-suffix_len:] == self.SUFFIXES[key]:
                    values[key] = values[key][0:-suffix_len]
            function_tuples = zip(self.INPUT_TYPES, values)
            values = map(cast, function_tuples)
            data = {}
            for i in range(len(values)):
                data.update({self.INPUT_FIELDS[i]: values[i]})
            return data
        except Exception, exc:
            sys.stderr.write(\"Error in data transformations. %s\\n\" %
                             str(exc))
            return False
\n\n
"""
    out.write(utf8(output))
    out.flush()
    self.tree.python(out, self.docstring(), input_map=True,
                     ids_path=ids_path, subtree=subtree)
    output = \
u"""
csv = CSVInput()
for values in csv:
    if not isinstance(values, bool):
        print u'%%s\\t%%s' %% (repr(values), repr(predict_%s(values)))
\n\n
""" % fields[self.tree.objective_id]['slug']
    out.write(utf8(output))
    out.flush()