def lexicographical(row_ids, col_ids, sort_rows, sort_cols):
    dups = duplicates(row_ids) + duplicates(col_ids)
    if dups:
        return {'error_msg': 'Duplicate identifiers: {}'.format(dups)}
    rpart = argsort(row_ids) if sort_rows else list(irange(len(row_ids)))
    cpart = argsort(col_ids) if sort_cols else list(irange(len(col_ids)))
    rowp, colp = get_inverse_perm(rpart, cpart)
    return _pack(rowp, colp)
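# Most snippets in this file call a project-local duplicates() helper.
# A minimal stand-in consistent with how it is used (lexicographical()
# concatenates its results with '+', so it must return a list of repeated
# items) might look like the sketch below; this is an assumption, not the
# original implementation. Note that group_files_sat() further down expects
# a different variant that returns index positions instead.
from collections import Counter

def duplicates(iterable):
    # Each item that occurs more than once, reported once.
    return [item for item, count in Counter(iterable).items() if count > 1]

# Example: duplicates(['r1', 'r2', 'r1']) -> ['r1']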
from itertools import chain, izip  # Python 2

import networkx as nx

def to_bipartite_from_test_string(mat_str):
    # Unaware of the optional opt in the tuple (dm_decomp does not have opt)
    # duplicates and is_bipartite_node_set are helpers from the surrounding
    # module.
    rows = mat_str[0].split()
    cols_rowwise = [line.split() for line in mat_str[1].splitlines()]
    # check rows for typos
    eqs = set(rows)
    assert len(eqs) == len(rows), (sorted(eqs), sorted(rows))
    assert len(rows) == len(cols_rowwise)
    # check cols for typos
    all_cols = set(chain.from_iterable(cols for cols in cols_rowwise))
    both_row_and_col = sorted(eqs & all_cols)
    assert not both_row_and_col, both_row_and_col
    # check cols for duplicates
    for r, cols in izip(rows, cols_rowwise):
        dups = duplicates(cols)
        assert not dups, 'Duplicate column IDs {} in row {}'.format(dups, r)
    #print(rows)
    #print(cols_rowwise)
    g = nx.Graph()
    g.add_nodes_from(rows)
    g.add_nodes_from(all_cols)
    g.add_edges_from((r, c) for r, cols in izip(rows, cols_rowwise)
                            for c in cols)
    assert is_bipartite_node_set(g, eqs)
    return g, eqs
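# Usage sketch (hypothetical data): two equations 'a' and 'b' over columns
# 'x', 'y', 'z'. Assuming is_bipartite_node_set is
# networkx.algorithms.bipartite.is_bipartite_node_set and the duplicates()
# stand-in above is in scope, this should run under Python 2 (izip).
g, eqs = to_bipartite_from_test_string(('a b', 'x y\ny z'))
assert eqs == {'a', 'b'}
assert {tuple(sorted(e)) for e in g.edges()} == \
    {('a', 'x'), ('a', 'y'), ('b', 'y'), ('b', 'z')}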
from itertools import izip, repeat  # Python 2

def __add_arities(operators, arity):
    # Even-indexed entries of `operators` are the opcodes; ARITY is a
    # module-level {opcode: arity} mapping.
    opcodes = operators[::2]
    dups = duplicates(opcodes)
    assert not dups, dups
    already_added = set(opcodes) & set(ARITY)
    assert not already_added, already_added
    ARITY.update(izip(opcodes, repeat(arity)))
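# Usage sketch (hypothetical opcode table): registering two binary operators.
# __add_arities only reads the even-indexed entries (the opcodes); the
# odd-indexed entries (implementations here) are ignored by it.
import operator

ARITY = {}
__add_arities(['+', operator.add, '-', operator.sub], 2)
assert ARITY == {'+': 2, '-': 2}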
import logging
import re

import numpy as np

def group_files_sat(self, manifest):
    """
    Group all the satellite files from the downloaded manifest.

    :param manifest: satellite manifest from the retrieval step
    :return result: grouped and cleaned manifest
    """
    # Dict is a project-local dict subclass assumed to be in scope; this
    # duplicates() variant returns {label: [indices]}.
    if not manifest:
        return manifest
    result = Dict({})
    geore = re.compile(r'%s' % self.geo_prefix)
    pregeore = re.compile(r'%s' % self.pre_geo_prefix)
    firere = re.compile(r'%s' % self.fire_prefix)
    keys = np.array(list(manifest.keys()))
    labels = np.array([''.join(k.split('.')[1:3]) for k in keys])
    indexes = duplicates(labels)
    for k, ind in indexes.items():
        lenind = len(ind)
        if lenind != 2:
            logging.warning('group_files_sat: number of geo and fire granules %d different than 2' % lenind)
            if lenind < 2:
                logging.error('group_files_sat: geo or fire file is missing, number of granules %d different than 2' % lenind)
                continue
        geo = list(filter(geore.search, keys[ind]))
        pregeo = list(filter(pregeore.search, keys[ind]))
        fire = list(filter(firere.search, keys[ind]))
        if not geo:
            if pregeo:
                geok = pregeo[0]
            else:
                logging.error('group_files_sat: no geo data in the manifest')
                continue
        else:
            geok = geo[0]
        if fire:
            firek = fire[0]
        else:
            logging.error('group_files_sat: no fire data in the manifest')
            continue
        logging.info('group_files_sat: %s - geo %s and fire %s' % (k, geok, firek))
        try:
            r = Dict({
                'time_start_iso': manifest[geok]['time_start'],
                'time_end_iso': manifest[geok]['time_end'],
                'geo_url': manifest[geok]['url'],
                'geo_local_path': manifest[geok]['local_path'],
                'geo_description': manifest[geok]['dataset_id'],
                'fire_url': manifest[firek]['url'],
                'fire_local_path': manifest[firek]['local_path'],
                'fire_description': manifest[firek]['dataset_id']
            })
            result.update({k: r})
        except Exception as e:
            logging.error('group_files_sat: when creating manifest with error %s' % str(e))
            continue
    return result
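# group_files_sat() uses a duplicates() variant that maps each repeated label
# to the positions at which it occurs (it iterates .items() and indexes keys
# with the result). A minimal stand-in consistent with that usage, named
# differently here to avoid clashing with the list-returning sketch above;
# an assumption, not the original implementation. The lenind < 2 branch
# suggests the real helper may also report labels that occur only once.
from collections import defaultdict

def duplicates_indexes(values):
    # Map each value occurring more than once to the list of its indices.
    positions = defaultdict(list)
    for i, v in enumerate(values):
        positions[v].append(i)
    return {v: idx for v, idx in positions.items() if len(idx) > 1}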
import csv
import os
from itertools import chain

def load(self, fpath):
    from exprparser import parse
    # config, skip_comment_cells, strip_rows, unique_duplicate, unique,
    # duplicates, prod and PrettyTable are helpers from the surrounding
    # project (Python 2: lines.next()).
    with open(os.path.join(config.input_directory, fpath), "rb") as f:
        reader = csv.reader(f)
        lines = skip_comment_cells(strip_rows(reader))
        header = lines.next()
        self.expressions = [parse(s, autovariables=True) for s in header]
        table = []
        for line in lines:
            if any(value == "" for value in line):
                raise Exception("empty cell found in %s" % fpath)
            # note: eval() trusts the contents of the input file
            table.append([eval(value) for value in line])
    ndim = len(header)
    unique_last_d, dupe_last_d = unique_duplicate(table.pop(0))
    if dupe_last_d:
        print("Duplicate column header value(s) (for '%s') in '%s': %s"
              % (header[-1], fpath, ", ".join(str(v) for v in dupe_last_d)))
        raise Exception("bad alignment data in '%s': found %d "
                        "duplicate column header value(s)"
                        % (fpath, len(dupe_last_d)))
    # strip the ndim-1 first columns
    headers = [[line.pop(0) for line in table] for _ in range(ndim - 1)]
    possible_values = [list(unique(values)) for values in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print("Duplicate row header value(s) in '%s':" % fpath)
            print(PrettyTable(dupe_combos))
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)"
                            % (fpath, len(dupe_combos)))
    possible_values.append(unique_last_d)
    self.possible_values = possible_values
    self.probabilities = list(chain.from_iterable(table))
    num_possible_values = prod(len(values) for values in possible_values)
    if len(self.probabilities) != num_possible_values:
        raise Exception("incoherent alignment data in '%s': %d data cells "
                        "found while it should be %d based on the number "
                        "of possible values in headers (%s)"
                        % (fpath, len(self.probabilities),
                           num_possible_values,
                           " * ".join(str(len(values))
                                      for values in possible_values)))
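# unique_duplicate() above is another project helper; a stand-in consistent
# with how load() uses its two return values (ordered unique values, plus the
# values that repeat) might look like this; an assumption, not the original.
def unique_duplicate(values):
    # First occurrences in input order, and the repeated values in order of
    # first repetition.
    seen, uniques, dupes = set(), [], []
    for v in values:
        if v in seen:
            if v not in dupes:
                dupes.append(v)
        else:
            seen.add(v)
            uniques.append(v)
    return uniques, dupes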
import csv
from itertools import chain

import numpy as np

def load_ndarray(fpath, celltype=None):
    # skip_comment_cells, strip_rows, unique_duplicate, unique, duplicates,
    # prod, PrettyTable, convert_1darray, detect_column_type and LabeledArray
    # are helpers from the surrounding project (Python 2: line_stream.next()).
    print(" - reading", fpath)
    with open(fpath, "rb") as f:
        reader = csv.reader(f)
        line_stream = skip_comment_cells(strip_rows(reader))
        header = line_stream.next()
        str_table = []
        for line in line_stream:
            if any(value == '' for value in line):
                raise Exception("empty cell found in %s" % fpath)
            str_table.append(line)
    ndim = len(header)

    # handle last dimension header (horizontal values)
    last_d_header = str_table.pop(0)
    # auto-detect type of values for the last d and convert them
    last_d_pvalues = convert_1darray(last_d_header)
    unique_last_d, dupe_last_d = unique_duplicate(last_d_pvalues)
    if dupe_last_d:
        print("Duplicate column header value(s) (for '%s') in '%s': %s"
              % (header[-1], fpath, ", ".join(str(v) for v in dupe_last_d)))
        raise Exception("bad data in '%s': found %d "
                        "duplicate column header value(s)"
                        % (fpath, len(dupe_last_d)))

    # handle other dimensions header
    # strip the ndim-1 first columns
    headers = [[line.pop(0) for line in str_table] for _ in range(ndim - 1)]
    headers = [convert_1darray(pvalues_str) for pvalues_str in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print("Duplicate row header value(s) in '%s':" % fpath)
            print(PrettyTable(dupe_combos))
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)"
                            % (fpath, len(dupe_combos)))

    possible_values = [np.array(list(unique(pvalues))) for pvalues in headers]
    possible_values.append(np.array(unique_last_d))
    shape = tuple(len(values) for values in possible_values)
    num_possible_values = prod(shape)

    # transform the 2d table into a 1d list
    str_table = list(chain.from_iterable(str_table))
    if len(str_table) != num_possible_values:
        raise Exception("incoherent data in '%s': %d data cells "
                        "found while it should be %d based on the number "
                        "of possible values in headers (%s)"
                        % (fpath, len(str_table), num_possible_values,
                           ' * '.join(str(len(values))
                                      for values in possible_values)))

    # TODO: compare time with numpy built-in conversion:
    # if dtype is None, numpy tries to detect the best type itself
    # which it does a good job of if the values are already numeric values
    # if dtype is provided, numpy does a good job to convert from string
    # values.
    if celltype is None:
        celltype = detect_column_type(str_table)
    data = convert_1darray(str_table, celltype)
    array = np.array(data, dtype=celltype)
    return LabeledArray(array.reshape(shape), header, possible_values)
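# Usage sketch (hypothetical file name): with the project helpers in scope,
# loading a CSV whose leading columns carry the row labels and whose first
# data row carries the last-dimension labels, as parsed above.
#
#     arr = load_ndarray('data.csv', celltype=float)
#     # arr wraps a numpy array reshaped to one axis per header column,
#     # labeled with the possible values read from the headers.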