def load(self, fpath):
    """Load alignment data from a CSV file located in the input directory.

    The file is read relative to ``config.input_directory``. After comment
    cells are skipped and rows stripped, the layout consumed here is:

    * first row: one expression per dimension (its length gives ``ndim``);
      each cell is parsed with ``parse(..., autovariables=True)``;
    * second row: the possible values of the last dimension (column
      headers);
    * every following row: ``ndim - 1`` row-header cells followed by the
      data cells.

    On success, sets ``self.expressions``, ``self.possible_values`` (one
    list of unique values per dimension) and ``self.probabilities`` (the
    data cells flattened row by row).

    Raises:
        Exception: if the file contains no rows, has no column header row,
            contains an empty cell, has duplicate column header values,
            has duplicate combinations of row header values, or if the
            number of data cells does not match the product of the number
            of possible values of each dimension.
    """
    # NOTE(review): function-level import, presumably to avoid a circular
    # import with exprparser -- confirm before moving it to module level.
    from exprparser import parse

    with open(os.path.join(config.input_directory, fpath), "rb") as f:
        reader = csv.reader(f)
        lines = skip_comment_cells(strip_rows(reader))
        # builtin next() instead of the Python-2-only .next() method; the
        # sentinel lets us report an empty file instead of leaking a bare
        # StopIteration to the caller.
        header = next(lines, None)
        if header is None:
            raise Exception("no data found in %s" % fpath)
        self.expressions = [parse(s, autovariables=True) for s in header]
        table = []
        for line in lines:
            if any(value == "" for value in line):
                raise Exception("empty cell found in %s" % fpath)
            # SECURITY: eval() executes whatever Python expression is in
            # the cell. Only load files from trusted sources.
            # NOTE(review): if cells are always plain literals,
            # ast.literal_eval would be a safer replacement -- confirm.
            table.append([eval(value) for value in line])

    ndim = len(header)

    # The first row after the header holds the values of the last dimension.
    if not table:
        raise Exception(
            "bad alignment data in '%s': missing column header row" % fpath
        )
    unique_last_d, dupe_last_d = unique_duplicate(table.pop(0))
    if dupe_last_d:
        print(
            "Duplicate column header value(s) (for '%s') in '%s': %s"
            % (header[-1], fpath, ", ".join(str(v) for v in dupe_last_d))
        )
        raise Exception(
            "bad alignment data in '%s': found %d "
            "duplicate column header value(s)" % (fpath, len(dupe_last_d))
        )

    # strip the ndim-1 first columns (this mutates the rows of `table`, so
    # that only data cells remain in them afterwards)
    headers = [[line.pop(0) for line in table] for _ in range(ndim - 1)]

    possible_values = [list(unique(values)) for values in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print("Duplicate row header value(s) in '%s':" % fpath)
            print(PrettyTable(dupe_combos))
            raise Exception(
                "bad alignment data in '%s': found %d "
                "duplicate row header value(s)" % (fpath, len(dupe_combos))
            )

    possible_values.append(unique_last_d)
    self.possible_values = possible_values
    # flatten the remaining 2d table of data cells into a 1d list
    self.probabilities = list(chain.from_iterable(table))
    num_possible_values = prod(len(values) for values in possible_values)
    if len(self.probabilities) != num_possible_values:
        raise Exception(
            "incoherent alignment data in '%s': %d data cells "
            "found while it should be %d based on the number "
            "of possible values in headers (%s)"
            % (
                fpath,
                len(self.probabilities),
                num_possible_values,
                " * ".join(str(len(values)) for values in possible_values),
            )
        )
def load_ndarray(fpath, celltype=None):
    """Load an n-dimensional labeled array from a CSV file.

    After comment cells are skipped and rows stripped, the layout consumed
    here is:

    * first row: one cell per dimension (its length gives ``ndim``);
    * second row: the possible values of the last dimension (column
      headers);
    * every following row: ``ndim - 1`` row-header cells followed by the
      data cells.

    Args:
        fpath: path of the CSV file to read.
        celltype: type used to convert the data cells. When None, it is
            detected from the data with ``detect_column_type``.

    Returns:
        A ``LabeledArray`` built from the data reshaped to one axis per
        dimension, the header row and the possible values of each
        dimension.

    Raises:
        Exception: if the file contains no rows, has no column header row,
            contains an empty cell, has duplicate column header values,
            has duplicate combinations of row header values, or if the
            number of data cells does not match the product of the number
            of possible values of each dimension.
    """
    # NOTE(review): another definition of load_ndarray appears later in this
    # file and replaces this one at import time -- confirm which to keep.
    print(" - reading", fpath)
    with open(fpath, "rb") as f:
        reader = csv.reader(f)
        line_stream = skip_comment_cells(strip_rows(reader))
        # builtin next() instead of the Python-2-only .next() method; the
        # sentinel lets us report an empty file instead of leaking a bare
        # StopIteration to the caller.
        header = next(line_stream, None)
        if header is None:
            raise Exception("no data found in %s" % fpath)
        str_table = []
        for line in line_stream:
            if any(value == '' for value in line):
                raise Exception("empty cell found in %s" % fpath)
            str_table.append(line)
    ndim = len(header)

    # handle last dimension header (horizontal values)
    if not str_table:
        raise Exception("bad data in '%s': missing column header row"
                        % fpath)
    last_d_header = str_table.pop(0)
    # auto-detect type of values for the last d and convert them
    last_d_pvalues = convert_1darray(last_d_header)

    unique_last_d, dupe_last_d = unique_duplicate(last_d_pvalues)
    if dupe_last_d:
        print("Duplicate column header value(s) (for '%s') in '%s': %s"
              % (header[-1], fpath,
                 ", ".join(str(v) for v in dupe_last_d)))
        raise Exception("bad data in '%s': found %d "
                        "duplicate column header value(s)"
                        % (fpath, len(dupe_last_d)))

    # handle other dimensions header

    # strip the ndim-1 first columns (this mutates the rows of `str_table`,
    # so that only data cells remain in them afterwards)
    headers = [[line.pop(0) for line in str_table]
               for _ in range(ndim - 1)]
    headers = [convert_1darray(pvalues_str) for pvalues_str in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print("Duplicate row header value(s) in '%s':" % fpath)
            print(PrettyTable(dupe_combos))
            # NOTE(review): "alignment" in this message is inconsistent
            # with the other messages of this function ("bad data",
            # "incoherent data"); left as-is in case callers match on it.
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)"
                            % (fpath, len(dupe_combos)))

    possible_values = [np.array(list(unique(pvalues)))
                       for pvalues in headers]
    possible_values.append(np.array(unique_last_d))

    shape = tuple(len(values) for values in possible_values)
    num_possible_values = prod(shape)

    # transform the 2d table into a 1d list
    str_table = list(chain.from_iterable(str_table))
    if len(str_table) != num_possible_values:
        raise Exception("incoherent data in '%s': %d data cells "
                        "found while it should be %d based on the number "
                        "of possible values in headers (%s)"
                        % (fpath,
                           len(str_table), num_possible_values,
                           ' * '.join(str(len(values))
                                      for values in possible_values)))

    # TODO: compare time with numpy built-in conversion:
    # if dtype is None, numpy tries to detect the best type itself
    # which it does a good job of if the values are already numeric values
    # if dtype is provided, numpy does a good job to convert from string
    # values.
    if celltype is None:
        celltype = detect_column_type(str_table)
    data = convert_1darray(str_table, celltype)
    array = np.array(data, dtype=celltype)
    return LabeledArray(array.reshape(shape), header, possible_values)
def load_ndarray(fpath, celltype=None):
    """Load an n-dimensional labeled array from a CSV file.

    After comment cells are skipped and rows stripped, the layout consumed
    here is:

    * first row: one cell per dimension (its length gives ``ndim``);
    * second row: the possible values of the last dimension (column
      headers);
    * every following row: ``ndim - 1`` row-header cells followed by the
      data cells.

    Args:
        fpath: path of the CSV file to read.
        celltype: type used to convert the data cells. When None, it is
            detected from the data with ``detect_column_type``.

    Returns:
        A ``LabeledArray`` built from the data reshaped to one axis per
        dimension, the header row and the possible values of each
        dimension.

    Raises:
        Exception: if the file contains no rows, has no column header row,
            contains an empty cell, has duplicate column header values,
            has duplicate combinations of row header values, or if the
            number of data cells does not match the product of the number
            of possible values of each dimension.
    """
    print(" - reading", fpath)
    with open(fpath, "rb") as f:
        reader = csv.reader(f)
        line_stream = skip_comment_cells(strip_rows(reader))
        # builtin next() instead of the Python-2-only .next() method; the
        # sentinel lets us report an empty file instead of leaking a bare
        # StopIteration to the caller.
        header = next(line_stream, None)
        if header is None:
            raise Exception("no data found in %s" % fpath)
        str_table = []
        for line in line_stream:
            if any(value == '' for value in line):
                raise Exception("empty cell found in %s" % fpath)
            str_table.append(line)
    ndim = len(header)

    # handle last dimension header (horizontal values)
    if not str_table:
        raise Exception("bad data in '%s': missing column header row"
                        % fpath)
    last_d_header = str_table.pop(0)
    # auto-detect type of values for the last d and convert them
    last_d_pvalues = convert_1darray(last_d_header)

    unique_last_d, dupe_last_d = unique_duplicate(last_d_pvalues)
    if dupe_last_d:
        print("Duplicate column header value(s) (for '%s') in '%s': %s"
              % (header[-1], fpath,
                 ", ".join(str(v) for v in dupe_last_d)))
        raise Exception("bad data in '%s': found %d "
                        "duplicate column header value(s)"
                        % (fpath, len(dupe_last_d)))

    # handle other dimensions header

    # strip the ndim-1 first columns (this mutates the rows of `str_table`,
    # so that only data cells remain in them afterwards)
    headers = [[line.pop(0) for line in str_table]
               for _ in range(ndim - 1)]
    headers = [convert_1darray(pvalues_str) for pvalues_str in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print("Duplicate row header value(s) in '%s':" % fpath)
            print(PrettyTable(dupe_combos))
            # NOTE(review): "alignment" in this message is inconsistent
            # with the other messages of this function ("bad data",
            # "incoherent data"); left as-is in case callers match on it.
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)"
                            % (fpath, len(dupe_combos)))

    possible_values = [np.array(list(unique(pvalues)))
                       for pvalues in headers]
    possible_values.append(np.array(unique_last_d))

    shape = tuple(len(values) for values in possible_values)
    num_possible_values = prod(shape)

    # transform the 2d table into a 1d list
    str_table = list(chain.from_iterable(str_table))
    if len(str_table) != num_possible_values:
        raise Exception(
            "incoherent data in '%s': %d data cells "
            "found while it should be %d based on the number "
            "of possible values in headers (%s)" %
            (fpath, len(str_table), num_possible_values, ' * '.join(
                str(len(values)) for values in possible_values)))

    # TODO: compare time with numpy built-in conversion:
    # if dtype is None, numpy tries to detect the best type itself
    # which it does a good job of if the values are already numeric values
    # if dtype is provided, numpy does a good job to convert from string
    # values.
    if celltype is None:
        celltype = detect_column_type(str_table)
    data = convert_1darray(str_table, celltype)
    array = np.array(data, dtype=celltype)
    return LabeledArray(array.reshape(shape), header, possible_values)