def process_files_column(infiles, options):
    """Processes the given files in ``column`` mode.

    Files will be processed sequentially. The output is a single line for
    each file where column i contains the result of the aggregation
    function for the column of the file.
    """
    for index, fname in enumerate(infiles):
        # Only the very first file is allowed to emit the header row
        stream = open_anything(fname)
        process_files_column_single(stream, options, index == 0)
def process_file(infile, options):
    """Processes the given file.

    Groups the values of each row under the row's first selected column
    (the key) and prints one output line per key: the key followed by all
    collected values, joined with ``options.out_delimiter``.

    :param infile: the input file name (anything ``open_anything`` accepts)
    :param options: parsed options; uses ``fields`` (1-based column
        indices), ``unique``, ``strip``, ``in_delimiter`` and
        ``out_delimiter``
    """
    # Calculate the (zero-based) column indices we are interested in
    if options.fields:
        col_idxs = [f - 1 for f in options.fields]
    else:
        col_idxs = None

    # Dictionary to map keys to values; a set silently drops duplicate
    # values when the user asked for unique ones
    if options.unique:
        keys_to_values = defaultdict(set)
    else:
        keys_to_values = defaultdict(list)

    # Some caching to avoid costly attribute lookups in the loop.
    # NOTE(review): the original also cached ``options.fields`` into an
    # unused local; that dead assignment has been removed.
    delim = options.in_delimiter
    join = options.out_delimiter.join

    # Set up characters to strip from lines
    chars_to_strip = " \t\r\n" if options.strip else "\r\n"

    for line in open_anything(infile):
        # Split the input line
        parts = line.strip(chars_to_strip).split(delim)

        # Select the relevant columns only
        if col_idxs:
            parts = sublist(parts, col_idxs)

        # If the row is empty, continue
        if not parts:
            continue

        # Store the row to its appropriate key
        if options.unique:
            keys_to_values[parts[0]].update(parts[1:])
        else:
            keys_to_values[parts[0]].extend(parts[1:])

    # Print the key-value pairs
    for key, values in keys_to_values.items():
        print(join(chain([key], values)))
def plot_file_on_figure(infile, figure, options):
    """Plots the dataset in the given file on the given figure."""
    table = TableWithHeaderIterator(
        open_anything(infile),
        delimiter=options.delimiter,
        every=options.every,
        fields=options.fields,
        strip=options.strip,
    )
    table.first_column_is_date = "x" in options.dates

    # Dispatch to the plotter matching the requested plot type
    plotter = globals()["plot_%s_from_table_iterator" % options.type]
    plotter(table, figure, options)

    # Add the title unless suppressed; fall back to the file name when no
    # explicit title was given (but never title a plot read from stdin)
    if not options.no_title:
        if options.title is not None:
            figure.suptitle(options.title)
        elif infile != "-":
            figure.suptitle(infile)
def remap_file(infile, mapper, options):
    """Remaps the entries in the given file using the given callable mapper."""
    # Hoist invariant attribute lookups out of the per-line loop
    delim = options.delimiter
    fields = options.fields

    for line in open_anything(infile):
        columns = line.strip().split(delim)
        mapped = []
        row_skipped = False

        # Column numbers are 1-based, matching ``options.fields``
        for col_no, value in enumerate(columns, 1):
            try:
                if col_no in fields:
                    mapped.append(mapper(value))
                else:
                    mapped.append(value)
            except KeyError:
                # The mapper had no entry for this value
                raise UnknownIDError(value)
            except SkipColumnException:
                # Drop just this column from the output row
                pass
            except SkipRowException:
                # Drop the whole row from the output
                row_skipped = True
                break

        if not row_skipped:
            print(delim.join(mapped))
def load_mapping(fname, options):
    """Loads a mapping from the given file and returns a dict-like object."""
    # Pick a dict flavour according to how missing keys should be handled
    action = options.missing_action
    if action == "fail":
        data = {}
    elif action == "warn":
        data = cautiousdict()
    elif action == "skip":
        data = skippingdict(exc=SkipRowException)
    elif action == "empty":
        data = skippingdict(exc=SkipColumnException)
    else:
        data = lenientdict()

    # Convert the 1-based field indices to 0-based list indices
    old_idx, new_idx = (f - 1 for f in options.mapping_fields)

    for row in open_anything(fname):
        parts = row.strip().split(options.mapping_delimiter)
        data[parts[old_idx]] = parts[new_idx]
    return data
def process_files_multiple(infiles, options):
    """Processes the given files in ``multiple`` mode.

    Files will be processed in parallel; row i of each file will be
    aggregated using the aggregation function into row i of the output.

    Header rows (rows whose selected columns are not all numeric) are
    copied from the first file only, on the assumption that every file
    carries the same header. Because rows are consumed with ``zip``,
    processing stops at the end of the shortest input file.

    :param infiles: list of input file names
    :param options: parsed options; uses ``fields``, ``in_delimiter``,
        ``out_delimiter`` and ``function`` (the aggregation callable)
    """
    # Calculate the (zero-based) column indices we are interested in
    if options.fields:
        col_idxs = [f - 1 for f in options.fields]
    else:
        col_idxs = None

    # Some caching to avoid costly attribute lookups in the loop.
    # NOTE(review): the original also cached ``options.fields`` into an
    # unused local; that dead assignment has been removed.
    delim = options.in_delimiter
    func = options.function
    join = options.out_delimiter.join

    # Flag to denote whether we have seen at least one row with numbers.
    # If not, we are still processing the headers.
    data_started = False

    for lines in zip(*[open_anything(f) for f in infiles]):
        # Split the input line
        lines = [line.strip().split(delim) for line in lines]

        # Select the relevant columns only
        if col_idxs:
            lines = [sublist(line, col_idxs) for line in lines]

        if not data_started:
            # Check whether this row contains numbers only (at least in the
            # columns we are interested in)
            if any(not only_numbers(line) for line in lines):
                # This is a header, print it from the first file, assuming
                # that the remaining files contain the same header
                if hasattr(func, "argout"):
                    # The aggregation function yields multiple outputs per
                    # input column; suffix each header with the output name
                    headers = []
                    for header in lines[0]:
                        headers.extend(
                            "%s_%s" % (header, arg) if arg else header
                            for arg in func.argout
                        )
                    print(join(headers))
                else:
                    print(join(lines[0]))
                continue
            else:
                # Yay, finally real data!
                data_started = True

        # Convert the columns of interest to floats
        lines = [[float(x) for x in line] for line in lines]

        # Print the output; an iterable result contributes several output
        # columns, a scalar result contributes one
        row = []
        for items in zip(*lines):
            result = func(items)
            if hasattr(result, "__iter__"):
                row.extend(str(item) for item in result)
            else:
                row.append(str(result))
        print(join(row))