def i_check_topic_distributions(step, check_file):
    """Checks that the topic distributions output matches the expected file.

    Numeric values are compared after rounding to the precision available
    in both files.
    """
    check_file = res_filename(check_file)
    predictions_file = world.output
    import traceback
    try:
        with UnicodeReader(predictions_file) as predictions_file:
            with UnicodeReader(check_file) as check_file:
                for row in predictions_file:
                    check_row = check_file.next()
                    assert len(check_row) == len(row)
                    for index in range(len(row)):
                        dot = row[index].find(".")
                        decimal_places = 1
                        if dot > 0 or (check_row[index].find(".") > 0
                                       and check_row[index].endswith(".0")):
                            try:
                                decimal_places = min(
                                    len(row[index]),
                                    len(check_row[index])) - dot - 1
                                row[index] = round(float(row[index]),
                                                   decimal_places)
                                check_row[index] = round(
                                    float(check_row[index]), decimal_places)
                            except ValueError:
                                decimal_places = 1
                            assert_almost_equal(check_row[index], row[index],
                                                places=(decimal_places - 1))
                        else:
                            assert_equal(check_row[index], row[index])
    except Exception, exc:
        assert False, traceback.format_exc()

def i_check_forecasts(step, check_file):
    """Checks that the forecasts output matches the expected file.

    Numeric values are compared after rounding to the precision available
    in both files.
    """
    check_file = res_filename(check_file)
    forecasts_file = "%s_%s.csv" % (
        world.output, world.time_series["object"]["objective_field"])
    import traceback
    try:
        with UnicodeReader(forecasts_file) as forecasts_file:
            with UnicodeReader(check_file) as check_file:
                for row in forecasts_file:
                    check_row = check_file.next()
                    assert_equal(len(check_row), len(row))
                    for index in range(len(row)):
                        dot = row[index].find(".")
                        decimal_places = 1
                        if dot > 0 or (check_row[index].find(".") > 0
                                       and check_row[index].endswith(".0")):
                            try:
                                decimal_places = min(
                                    len(row[index]),
                                    len(check_row[index])) - dot - 1
                                row[index] = round(float(row[index]),
                                                   decimal_places)
                                check_row[index] = round(
                                    float(check_row[index]), decimal_places)
                            except ValueError:
                                decimal_places = 1
                            assert_almost_equal(check_row[index], row[index],
                                                places=(decimal_places - 1))
                        else:
                            assert_equal(check_row[index], row[index])
    except Exception, exc:
        assert False, traceback.format_exc()

def check_summary_like_expected(step, summary_file, expected_file):
    summary_contents = []
    expected_contents = []
    with UnicodeReader(res_filename(summary_file)) as summary_handler:
        for line in summary_handler:
            summary_contents.append(line)
    with UnicodeReader(res_filename(expected_file)) as expected_handler:
        for line in expected_handler:
            expected_contents.append(line)
    eq_(summary_contents, expected_contents)

def read_field_attributes(path):
    """Reads field attributes from a csv file to update source fields.

    A column number and a list of attributes separated by a comma per line.
    The expected structure is:
    column number, name, label, description

    For example:

    0,'first name','label for the first field','first field full description'
    1,'last name','label for the last field','last field full description'

    """
    field_attributes = {}
    try:
        with UnicodeReader(path, quotechar="'") as attributes_reader:
            for row in attributes_reader:
                attributes = {}
                if len(row) > 1:
                    for index in range(0, min(len(ATTRIBUTE_NAMES),
                                              len(row) - 1)):
                        attributes.update(
                            {ATTRIBUTE_NAMES[index]: row[index + 1]})
                    field_attributes.update({int(row[0]): attributes})
            return field_attributes
    except IOError:
        sys.exit("Error: cannot read field attributes %s" % path)

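# Illustrative usage sketch (assumption: "attributes.csv" is a local file
# following the structure documented above). The result maps each column
# number to a dict whose keys come from ATTRIBUTE_NAMES:
#
#     field_attributes = read_field_attributes("attributes.csv")
#     # e.g. {0: {'name': 'first name',
#     #           'label': 'label for the first field',
#     #           'description': 'first field full description'}, ...}
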
def read_objective_weights(path):
    """Reads objective weights from a CSV file in a class, weight format.

    The expected structure is:
    class name, weight

    For example:

    Iris-setosa,5
    Iris-versicolor,10

    """
    objective_weights = []
    try:
        with UnicodeReader(path, quotechar="'") as weights_reader:
            for row in weights_reader:
                if len(row) != 2:
                    sys.exit("Error: wrong objective field file syntax\n%s" %
                             ",".join(row))
                weights = row[:]
                try:
                    weights[1] = int(weights[1])
                except ValueError:
                    sys.exit("Error: wrong objective field file syntax\n%s" %
                             ",".join(row))
                objective_weights.append(weights)
            return objective_weights
    except IOError:
        sys.exit("Error: cannot read objective weights %s" % path)

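# Illustrative usage sketch (assumption: "weights.csv" is a local two-column
# file as described above). Each row becomes a [class_name, integer_weight]
# pair:
#
#     objective_weights = read_objective_weights("weights.csv")
#     # e.g. [['Iris-setosa', 5], ['Iris-versicolor', 10]]
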
def reset(self):
    """Starts a new csv reader object

    """
    try:
        self.training_set.close()
    except (IOError, AttributeError):
        pass
    try:
        self.training_reader = UnicodeReader(
            self.training_set, delimiter=self.training_separator,
            lineterminator="\n").open_reader()
    except IOError:
        sys.exit("Error: cannot read training %s" % self.training_set)

def read_votes(votes_files, to_prediction, data_locale=None):
    """Reads the votes found in the votes' files.

    Returns a list of MultiVote objects containing the list of predictions.
    The votes_files parameter should contain the path to the files where
    votes are stored.
    In the to_prediction parameter we expect the method of a local model
    object that casts the string prediction values read from the file to
    their real type. For instance

        >>> local_model = Model(model)
        >>> prediction = local_model.to_prediction("1")
        >>> isinstance(prediction, int)
        True
        >>> read_votes(["my_predictions_file"], local_model.to_prediction)

    data_locale should contain the string identification for the locale
    used in numeric formatting.

    """
    votes = []
    for order in range(0, len(votes_files)):
        votes_file = votes_files[order]
        index = 0
        with UnicodeReader(votes_file) as rdr:
            for row in rdr:
                prediction = to_prediction(row[0], data_locale=data_locale)
                if index > (len(votes) - 1):
                    votes.append(MultiVote([]))
                distribution = None
                instances = None
                if len(row) > 2:
                    distribution = ast.literal_eval(row[2])
                    instances = int(row[3])
                try:
                    confidence = float(row[1])
                except ValueError:
                    confidence = 0.0
                prediction_row = [prediction, confidence, order,
                                  distribution, instances]
                votes[index].append_row(prediction_row)
                index += 1
    return votes

def i_check_predictions(step, check_file):
    with UnicodeReader(world.output) as prediction_rows:
        with UnicodeReader(res_filename(check_file)) as test_rows:
            check_rows(prediction_rows, test_rows)

def new_fields_structure(self, csv_attributes_file=None,
                         attributes=None, out_file=None):
    """Builds the field structure needed to update a fields dictionary
    in a BigML resource.

    :param csv_attributes_file: (string) Path to a CSV file like the one
                                generated by summary_csv.
    :param attributes: (list) list of rows containing the attributes
                       information ordered as in the summary_csv output.
    :param out_file: (string) Path to a JSON file that will be used to
                     store the new fields structure. If None, the output
                     is returned as a dict.
    """
    if csv_attributes_file is not None:
        reader = UnicodeReader(csv_attributes_file).open_reader()
        attributes = [row for row in reader]
    new_fields_structure = {}
    if "field ID" in attributes[0] or "field column" in attributes[0]:
        # headers are used
        for index in range(1, len(attributes)):
            new_attributes = dict(zip(attributes[0], attributes[index]))
            if new_attributes.get("field ID"):
                field_id = new_attributes.get("field ID")
                if not field_id in self.fields.keys():
                    raise ValueError("Field ID %s not found"
                                     " in this resource" % field_id)
                del new_attributes["field ID"]
            else:
                field_column = int(new_attributes.get("field column"))
                if not field_column in self.field_columns:
                    raise ValueError("Field column %s not found"
                                     " in this resource" % field_column)
                field_id = self.field_id(field_column)
                del new_attributes["field column"]
            for attribute, value in new_attributes.items():
                if not attribute in UPDATABLE_HEADERS.keys():
                    del new_attributes[attribute]
                else:
                    new_attributes[UPDATABLE_HEADERS[attribute]] = \
                        new_attributes[attribute]
                    if attribute != UPDATABLE_HEADERS[attribute]:
                        del new_attributes[attribute]
            if "preferred" in new_attributes:
                new_attributes['preferred'] = json.loads(
                    new_attributes['preferred'])
            new_fields_structure[field_id] = new_attributes
    else:
        # assume the order given in the summary_csv method
        first_attribute = attributes[0][0]
        first_column_is_id = False
        try:
            field_id = self.field_id(int(first_attribute))
        except ValueError:
            field_id = first_attribute
            first_column_is_id = True
        if not field_id in self.fields:
            raise ValueError("The first column should contain either the"
                             " column or ID of the fields. Failed to find"
                             " %s as either of them." % field_id)
        headers = SUMMARY_HEADERS[2:7]
        headers = [UPDATABLE_HEADERS[header] for header in headers]
        try:
            for field_attributes in attributes:
                if field_attributes[6] is not None:
                    field_attributes[6] = json.loads(field_attributes[6])
                field_id = field_attributes[0] if first_column_is_id \
                    else self.field_id(int(field_attributes[0]))
                new_fields_structure[field_id] = \
                    dict(zip(headers, field_attributes[1: 6]))
        except ValueError:
            raise ValueError("The first column should contain either the"
                             " column or ID of the fields. Failed to find"
                             " %s as either of them." % field_id)
    if out_file is None:
        return {"fields": new_fields_structure}
    else:
        try:
            with open(out_file, "w") as out:
                json.dump({"fields": new_fields_structure}, out)
        except IOError, exc:
            raise IOError("Failed writing the fields structure file in"
                          " %s. Please, check your arguments." % out_file)

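# Illustrative usage sketch (assumptions: `fields` is an instance of the
# class this method belongs to and "fields_summary.csv" was produced by its
# summary_csv method; the field IDs shown are hypothetical). The returned
# dict holds only the updatable attributes per field:
#
#     changes = fields.new_fields_structure(
#         csv_attributes_file="fields_summary.csv")
#     # e.g. {"fields": {"000000": {"name": ..., "label": ...}, ...}}
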
class TstReader(object):
    """Retrieves csv info and builds an input data dict

    """
    def __init__(self, test_set, test_set_header, fields, objective_field,
                 test_separator=None):
        """Builds a generator from a csv file and the fields' model structure

           `test_set`: path to the test data file
           `test_set_header`: boolean, True means that headers are first row
                              in the file
           `fields`: Fields object with the expected fields structure.
           `objective_field`: field_id of the objective field
        """
        self.test_set = test_set
        if test_set.__class__.__name__ == "StringIO":
            self.encode = None
            self.test_set = UTF8Recoder(test_set, SYSTEM_ENCODING)
        else:
            self.encode = None if PYTHON3 else FILE_ENCODING
        self.test_set_header = test_set_header
        self.fields = fields
        if (objective_field is not None and
                not objective_field in fields.fields):
            try:
                objective_field = fields.field_id(objective_field)
            except ValueError, exc:
                sys.exit(exc)
        self.objective_field = objective_field
        if test_separator and not PYTHON3:
            test_separator = decode2(test_separator,
                                     encoding="string_escape")
        self.test_separator = (test_separator if test_separator is not None
                               else get_csv_delimiter())
        if len(self.test_separator) > 1:
            sys.exit("Only one character can be used as test data separator.")
        try:
            self.test_reader = UnicodeReader(
                self.test_set, delimiter=self.test_separator,
                lineterminator="\n").open_reader()
        except IOError:
            sys.exit("Error: cannot read test %s" % test_set)
        self.headers = None
        self.raw_headers = None
        self.exclude = []
        if test_set_header:
            self.headers = self.test_reader.next()
            # validate headers against model fields excluding objective_field,
            # that may be present or not
            if objective_field is not None:
                objective_field = fields.field_column_number(objective_field)
            try:
                fields_names = [
                    fields.fields[fields.field_id(i)]['name']
                    for i in sorted(fields.fields_by_column_number.keys())
                    if objective_field is None or i != objective_field]
            except ValueError, exc:
                sys.exit(exc)
            self.raw_headers = self.headers[:]
            self.exclude = [i for i in range(len(self.headers))
                            if not self.headers[i] in fields_names]
            self.exclude.reverse()
            if self.exclude:
                if len(self.headers) > len(self.exclude):
                    for index in self.exclude:
                        del self.headers[index]
                else:
                    raise Exception(
                        (u"No test field matches the model fields."
                         u"\nThe expected fields are:\n\n%s\n\n"
                         u"while the headers found in the test file are:"
                         u"\n\n%s\n\n"
                         u"Use --no-test-header flag if first line should"
                         u" not be interpreted as headers." %
                         (",".join(fields_names),
                          ",".join(self.headers))).encode("utf-8"))

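# Illustrative construction sketch (assumptions: "test.csv" is a local test
# file whose first row contains headers, and `fields` is the Fields object
# of the model used to predict). No objective field column is expected here:
#
#     test_reader = TstReader("test.csv", True, fields, None)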