def main(self):
    """Pivot the input CSV on the row/column specs and write the pivoted table as CSV."""
    # `--pivot-agg list` (or an absent aggregate) is a request for the help listing
    if not self.args.pivot_agg or self.args.pivot_agg == 'list':
        print_available_aggregates(self.output_file)
        return

    self.handle_standard_args()

    intable = agate.Table.from_csv(
        self.input_file,
        skip_lines=self.args.skip_lines,
        sniff_limit=self.args.sniff_limit,
        **self.reader_kwargs,
    )
    colnames = intable.column_names

    # instantiate the agate aggregation named by --pivot-agg
    aggspec = parse_aggregate_string_arg(self.args.pivot_agg, valid_columns=colnames)
    pivot_agg = aggspec.foo(*aggspec.args)

    # --pivot-rows: resolve identifiers to column names (None when absent or unresolved)
    pivot_row_names = None
    if self.args.pivot_rows:
        row_ids = parse_column_identifiers(
            self.args.pivot_rows,
            colnames,
            column_offset=self.get_column_offset(),
            excluded_columns=None,
        )
        if row_ids:
            pivot_row_names = [colnames[i] for i in row_ids]

    # --pivot-column: at most one column may be named
    pivot_col_name = None
    if self.args.pivot_column:
        _pcol_ids = parse_column_identifiers(
            self.args.pivot_column,
            colnames,
            column_offset=self.get_column_offset(),
            excluded_columns=None,
        )
        if _pcol_ids and len(_pcol_ids) > 1:
            raise ArgumentErrorTK(
                f'Only one --pivot-column is allowed, not {len(_pcol_ids)}: {_pcol_ids}'
            )
        if _pcol_ids:
            pivot_col_name = colnames[_pcol_ids[0]]

    outtable = intable.pivot(key=pivot_row_names, pivot=pivot_col_name, aggregation=pivot_agg)
    outtable.to_csv(self.output_file, **self.writer_kwargs)
def main(self):
    """Group the input CSV by one or more columns, apply the requested aggregates,
    and write the aggregated table as CSV."""
    # a bare/list aggregate spec is a request for the available-aggregates listing
    if self.args.aggregates in (['list'], ['']):
        print_available_aggregates(self.output_file)
        return

    self.handle_standard_args()

    rawtable = agate.Table.from_csv(
        self.input_file,
        skip_lines=self.args.skip_lines,
        sniff_limit=self.args.sniff_limit,
        **self.reader_kwargs,
    )
    column_names = rawtable.column_names
    self.aggregates = self.handle_aggregate_args(valid_columns=column_names)

    group_ids = parse_column_identifiers(
        self.args.columns,
        column_names,
        column_offset=self.get_column_offset(),
        excluded_columns=None,
    )

    # chain one group_by per grouping column (agate nests the groupings)
    gtable = rawtable
    for gname in (column_names[i] for i in group_ids):
        gtable = gtable.group_by(key=gname)

    # build (output_column_name, agate_aggregation) pairs; the name falls back to
    # a slug derived from the aggregate function and its arguments
    named_aggs = []
    for a in self.aggregates:
        if a.colname:
            colname = a.colname
        elif a.args:
            colname = f'{a.foo.__name__}_of_{slugify(a.args)}'
        else:
            colname = a.foo.__name__
        named_aggs.append((colname, a.foo(*a.args)))

    gtable.aggregate(named_aggs).to_csv(self.output_file, **self.writer_kwargs)
def filter_rows(
    rows: typeIterable,
    pattern_str: str,
    columns_str: str,
    column_names: list,
    default_column_ids: list,
    literal_match: bool,
    column_offset: int,
    inverse: bool,
    any_match: bool,
    # not_columns,
) -> FilteringCSVReader:
    """
    Wrap `rows` in a FilteringCSVReader that keeps (or, with `inverse`, drops)
    rows matching `pattern_str` in the selected columns.

    `columns_str`, when non-empty, names the columns to search (parsed relative to
    `column_names` and `column_offset`); otherwise `default_column_ids` is used.
    With `any_match`, one matching column suffices; otherwise all must match.
    """
    if literal_match:
        # literal mode: match the pattern string as plain text
        pattern = pattern_str
    else:
        # regex mode (the default): compile once up front
        # (fix: this branch was previously mislabeled "literal match")
        pattern = re.compile(pattern_str)

    if columns_str:
        expr_col_ids = parse_column_identifiers(
            columns_str,
            column_names,
            column_offset,
        )
    else:
        expr_col_ids = default_column_ids

    # every targeted column is tested against the same pattern
    epatterns = {eid: pattern for eid in expr_col_ids}

    return FilteringCSVReader(
        rows,
        header=False,
        patterns=epatterns,
        inverse=inverse,
        any_match=any_match,
    )
def main(self):
    """
    Filter the input CSV, keeping only rows that satisfy every comparison
    statement supplied on the command line, then write the surviving rows as CSV.

    Each statement (from handle_statements()) carries 'columns', 'operator',
    'operand', 'datatype', and 'bool_type' keys; the statements are stitched
    into a single boolean expression string that is eval'd once per row.
    """
    self.statements = self.handle_statements()
    self.handle_standard_args()
    # _input_name = self.args.input_path if self.args.input_path else 'stdin'
    # sys.stderr.write(f'{_input_name=}\n')

    def _build_conditional_foo(column_names, operator, operand, idnum) -> typeTuple:
        """
        Build one eval-able conditional fragment for statement number `idnum`.

        returns a 2-tuple:
            - funcstr: 'all(row[col] [OPERATOR] operand_<idnum> for col in column_names_<idnum>)'
            - funclocals: {operand_<idnum>: operand, column_names_<idnum>: column_names}
        The numbered labels keep each statement's operand/columns distinct when
        the fragments are concatenated into one expression.
        """
        operand_label = f"operand_{idnum}"
        colnames_label = f"column_names_{idnum}"
        funcstr = f"""all(row[col] {operator} {operand_label} for col in {colnames_label})"""
        funclocals = {operand_label: operand, colnames_label: column_names}
        return (funcstr, funclocals)

    rawtable = agate.Table.from_csv(
        self.input_file,
        skip_lines=self.args.skip_lines,
        sniff_limit=self.args.sniff_limit,
        **self.reader_kwargs,
    )
    column_names = rawtable.column_names

    # _input_name = self.args.input_path if self.args.input_path else 'stdin'
    # sys.stderr.write(f'{_input_name=}\n')
    # sys.stderr.write("Statements:\n")
    # for c in self.statements:
    #     sys.stderr.write(f"\t{c}\n")

    # Accumulate the combined boolean expression and the names it references.
    func_str = ""
    func_locals = {}
    for i, state in enumerate(self.statements):
        _col_ids = parse_column_identifiers(
            state['columns'],
            column_names,
            column_offset=self.get_column_offset(),
            excluded_columns=None)
        state_colnames = [column_names[i] for i in _col_ids] if _col_ids else None
        if not state_colnames:
            raise ValueError(
                f"Did not find valid column names in: {state['columns']}")
        # cast the operand string to the statement's declared datatype
        opvalue = _rough_cast_value(state['operand'], state['datatype'])
        fstr, flocals = _build_conditional_foo(state_colnames, state['operator'],
                                               opvalue, i)
        # the first statement stands alone; later ones are joined with their
        # lowercased bool_type (e.g. ' and ' / ' or ')
        func_str += fstr if state[
            'bool_type'] == 'FIRST' else f" {state['bool_type'].lower()} {fstr}"
        func_locals.update(flocals)

    # sys.stderr.write(f'{func_str=}\n\n')
    # for k, v in func_locals.items():
    #     sys.stderr.write(f'{k}: {v}\n')

    # NOTE(review): func_str is built from user-supplied CLI arguments and is
    # eval'd below — acceptable for a local CLI tool, but worth keeping in mind.
    # NOTE(review): this loop uses agate's private _rows/_row_names/_fork API;
    # presumably to preserve row names cheaply — confirm against agate version.
    xrows = []
    if rawtable._row_names is not None:
        rrow_names = []
    else:
        rrow_names = None
    for i, row in enumerate(rawtable._rows):
        # fresh namespace per row: the shared operand/column bindings plus `row`
        row_locals = func_locals.copy()
        row_locals.update({'row': row})
        test_row = eval(func_str, row_locals)
        if test_row:
            xrows.append(row)
            if rrow_names is not None:
                rrow_names.append(rawtable._row_names[i])

    xtable = rawtable._fork(xrows, row_names=rrow_names)

    # xtable = xtable.where(lambda row: )
    # # xtable = gtable.aggregate(g_aggs)
    xtable.to_csv(self.output_file, **self.writer_kwargs)
class CSVSed(JustTextUtility):
    """sed-like find/replace over CSV fields, with optional per-expression column scoping."""

    description = """Replaces all instances of [PATTERN] with [REPL]"""
    override_flags = ["f", "L", "blanks", "date-format", "datetime-format"]

    def add_arguments(self):
        # csvsed-specific flags, followed by the PATTERN/REPL/FILE positionals.
        self.argparser.add_argument(
            "-c",
            "--columns",
            dest="columns",
            help=
            'A comma separated list of column indices, names or ranges to be searched, e.g. "1,id,3-5".',
        )
        self.argparser.add_argument(
            "-E",
            "--expr",
            dest="expressions_list",
            # required=True,
            nargs="*",
            action="append",
            type=str,
            help=r"""
            When you want to do multiple sed_expressions:
                -E 'PATTERN' 'REPL' '[names_of_columns]'

            'names_of_columns' is a comma-delimited list of columns; it cannot refer to
            columns *not included* in the `-c/--columns` flag; leave blank to match all columns

            e.g.
            -E '(?i)\b(bob|bobby|rob)\b' 'Robert' 'first_name' \
            -E '^(?i)smith$' 'SMITH' 'last_name' \
            -E '(\d{2})-(\d{3})' '$1:$2' '' \
            """,
        )
        self.argparser.add_argument(
            "-m",
            "--match-literal",
            dest="literal_match",
            action="store_true",
            default=False,
            help=
            "By default, [PATTERN] is assumed to be a regex. Set this flag to make it a literal text find/replace",
        )
        self.argparser.add_argument(
            "-G",
            "--like-grep",
            dest="like_grep",
            action="store_true",
            default=False,
            help=
            """Only return rows in which [PATTERN] was a match (BEFORE any transformations) – i.e. like grep''s traditional behavior""",
        )
        self.argparser.add_argument(
            "-R",
            "--replace",
            dest="replace_value",
            action="store_true",
            default=False,
            help=
            "Replace entire field with [REPL], instead of just the substring matched by [PATTERN]",
        )
        self.argparser.add_argument(
            "--max",
            dest="max_match_count",
            action="store",
            default=0,
            type=int,
            help=
            "Max number of matches to replace PER FIELD. Default is 0, i.e. no limit",
        )
        self.argparser.add_argument(
            metavar="PATTERN",
            dest="first_pattern",
            type=str,
            # nargs='?',
            help="A pattern to search for",
        )
        self.argparser.add_argument(
            metavar="REPL",
            dest="first_repl",
            type=str,
            # nargs='?',
            help="A replacement pattern",
        )
        self.argparser.add_argument(
            metavar="FILE",
            nargs="?",
            dest="input_path",
            help=
            "The CSV file to operate on. If omitted, will accept input as piped data via STDIN.",
        )

    def run(self):
        """
        A wrapper around the main loop of the utility which handles opening and closing files.

        TK: This is copy-pasted from CSVKitUtil because we have to override 'f';
        maybe there's a way to refactor this...

        csvsed has special functionality, in which the presence of `-E/--expr`
        changes the command signature, i.e.
        from:   csvsed PATTERN REPL input.csv
        to:     csvsed -E 'PATTERN' 'REPL' 'COLUMNS' -E x y z input.csv
        """
        self.last_expr = []

        if not self.args.input_path:
            # then it must have been eaten by an -E flag; we assume the input file
            # is in last_expr[-1], where `last_expr` is the last member of
            # expressions_list
            # TODO: THIS IS CRAP
            if self.args.expressions_list:
                self.last_expr = self.args.expressions_list[-1]

                if len(self.last_expr) > 2:
                    # could be either 3 or 4: the trailing element is presumably
                    # the input path that -E swallowed — reclaim it.
                    self.args.input_path = self.last_expr.pop()
                elif len(self.last_expr) == 2:
                    # do nothing, but be warned that if there is no stdin,
                    # then -E might have eaten up the input_file argument
                    # and interpreted it as pattern
                    pass
                else:
                    # else, last_expr has an implied third argument, and
                    # input_path is hopefully stdin
                    self.args.input_path = None

        # # # error handling
        # # if self.args.pattern or self.args.repl:
        # #     self.argparser.error("If using -E/--expr, [PATTERN] and [REPL] arguments cannot be filled in")
        # # if not self.args.input_path and self.args.pattern and not self.args.repl:
        # #     self.args.input_path = self.args.pattern
        # #     delattr(self.args, 'pattern')
        # #     delattr(self.args, 'repl')
        # # elif self.args.input_path and self.args.pattern:
        # #     # if input_path was given AND self.args.pattern (i.e. any other positional args besides INPUT_PATH)
        # #     self.argparser.error(f"""Got an unexpected positional argument; either:
        # #         - More than 3 arguments for -E/--expr {exes[-1]}
        # #         - Or, a PATTERN argument, which is invalid when using -E/--expr
        # #     """)
        # # else:
        # #     self.argparser.error("Some other unhandled positional arg thingy [TODO]")
        # q

        self.input_file = self._open_input_file(self.args.input_path)

        try:
            with warnings.catch_warnings():
                # suppress agate's "Column names not specified" warning when the
                # user explicitly asked for --no-header-row handling
                if getattr(self.args, "no_header_row", None):
                    warnings.filterwarnings(
                        action="ignore",
                        message="Column names not specified",
                        module="agate",
                    )
                self.main()
        finally:
            self.input_file.close()

    def _handle_sed_expressions(self) -> typeList:
        """
        Normalize the positional PATTERN/REPL pair plus any -E/--expr triples into
        a list of [compiled_pattern, repl, column_ids] expressions.

        Relies on self.literal_match_mode, self.all_column_names, and
        self.column_offset being set before this is called.
        """
        # TODO: fix this spaghetti CRAP: maybe make expressions handle dicts/named tuples instead of lists
        first_col_str = self.args.columns if self.args.columns else ""
        first_expr = [
            self.args.first_pattern, self.args.first_repl, first_col_str
        ]
        expressions = [first_expr]
        if list_expressions := getattr(self.args, "expressions_list", []):
            for i, _e in enumerate(list_expressions):
                ex = _e.copy()

                if len(ex) < 2 or len(ex) > 3:
                    self.argparser.error(
                        f"-E/--expr takes 2 or 3 arguments; you provided {len(ex)}: {ex}"
                    )

                if len(ex) == 2:
                    # expression lacked a columns spec; inherit -c/--columns
                    ex.append(first_col_str)

                expressions.append(ex)

        for ex in expressions:
            # this branch re-loops through the_expressions and fixes any leading dashes in the repls
            if ex[1][0:2] == r"\-":
                ex[1] = ex[1][1:]
            # compile the pattern into a regex
            if not self.literal_match_mode:
                ex[0] = re.compile(ex[0])
            # set the column_ids
            ex[2] = parse_column_identifiers(ex[2], self.all_column_names,
                                             self.column_offset, None)

        return expressions