def awk_filter_map(data_desc, filter_strs, map_strs):
    """
    Build an awk command that filters rows and computes output columns.

    Every statement found in ``map_strs`` becomes a variable of the
    expression context.  Two bare names are treated specially:
    ``__all__`` (re)defines every field of ``data_desc`` as a pass-through
    column, while ``__rest__`` does the same only for the fields that are
    not defined in the context yet.  The statements found in
    ``filter_strs`` form the row filter; several statements are joined
    with ``&&``.

    Returns a ``(awk_cmd, desc)`` pair.  ``desc`` is the description
    produced by ``awk_filter_map_from_context`` with ``meta`` copied from
    ``data_desc``; when that description is falsy, ``data_desc`` itself is
    returned instead.

    >>> from tabkit.header import parse_header
    >>> awk, desc = awk_filter_map(
    ...     parse_header('# d p e s c m'),
    ...     ['e==157 and (s>100 or s in [15,30,45])'],
    ...     ['ctr=c/s', 'cpm=ctr*m']
    ... )
    >>> print desc
    DataDesc([DataField('ctr', 'any'), DataField('cpm', 'any')])
    >>> print awk.cmd_line()
    LC_ALL=C awk -F $'\\t' 'BEGIN{OFS="\\t";}{if((($3 == 157) && (($4 > 100) || (($4 == 15) || ($4 == 30) || ($4 == 45))))){ctr = ($5 / $4);print(ctr,(ctr * $6));}}'
    >>> awk, desc = awk_filter_map(parse_header('# a b'), [], ['__all__'])
    >>> print desc
    DataDesc([DataField('a', 'any'), DataField('b', 'any')])
    """
    context = ExprContext(data_desc)

    def passthrough(field_name):
        # Column that copies an input field to the output under its own name.
        return RowExprAssign(field_name, RowExprField(context, field_name))

    # Map expressions: each statement defines (or redefines) a context variable.
    for map_src in map_strs:
        for stmt in parse(map_src).body:
            bare_name = None
            if isinstance(stmt, _ast.Expr) and isinstance(stmt.value, _ast.Name):
                bare_name = stmt.value.id

            if bare_name == '__all__' or bare_name == '__rest__':
                # '__rest__' must not override variables defined earlier,
                # '__all__' overrides unconditionally.
                keep_existing = bare_name == '__rest__'
                for field in data_desc.fields:
                    if keep_existing and context.has_var(field.name):
                        continue
                    context.set_var(field.name, passthrough(field.name))
            else:
                assignment = parse_rowexpr(context, stmt)
                context.set_var(assignment.target, assignment)

    # Filter expressions: gather the statements of all filter strings.
    filter_nodes = []
    for filter_src in filter_strs:
        filter_nodes.extend(parse(filter_src).body)

    if len(filter_nodes) > 1:
        filter_expr = RowExprOp(
            '&&', [parse_expr(context, stmt) for stmt in filter_nodes]
        )
    elif len(filter_nodes) == 1:
        filter_expr = parse_expr(context, filter_nodes[0])
    else:
        filter_expr = None

    awk_cmd, output_desc = awk_filter_map_from_context(
        context, filter_expr, data_desc.order
    )
    if output_desc:
        output_desc.meta = data_desc.meta
    return awk_cmd, output_desc or data_desc
def awk_grp(data_desc, key_str, grp_expr_tuples, output_only_assigned=True, expose_groups=False):
    """
    Build an awk script that aggregates rows grouped by a key.

    The generated script computes the key of every row and compares it
    with the key remembered from the previous rows.  When the key changes,
    the grouping functions are finalized, the output row is printed
    (unless ``expose_groups`` is set), the remembered key is updated and
    the state of the 'grp' functions is re-initialized.
    NOTE(review): groups are detected by comparing consecutive rows, so
    the input is presumably expected to arrive sorted by key -- confirm
    against callers.

    Parameters:
        data_desc: description of the input; it is handed to the
            expression contexts and its ``order`` is passed on to
            ``awk_filter_map_from_context``.
        key_str: source text of the key expression(s).  A falsy value is
            replaced with the constant key ``'1'``; additionally, when it
            is exactly ``None`` the script sets ``__print_last`` to 1 so
            that the END block prints even if no rows were read.
        grp_expr_tuples: iterable of ``(grp_type, expr_str)`` pairs where
            ``grp_type`` is ``'acc'`` or ``'grp'``.  Both kinds end up in
            the output; 'acc' state is initialized only in BEGIN while
            'grp' state is also re-initialized on every key change.
        output_only_assigned: when true, key expressions that are not
            assignments take part in grouping but are not added to the
            output.  When false, such an expression must be a bare name,
            which is then used as the output name.
        expose_groups: when true, the output is printed for every input
            row and the END block is left empty; otherwise the output is
            printed on key change and in the END block.

    Returns:
        ``(awk, output_desc)`` -- the ``AwkScript`` and the description of
        its output.

    Raises:
        Exception: on a malformed key assignment, on an unassigned
            non-name key expression when ``output_only_assigned`` is
            false, on an unknown grouping type, or when no output fields
            are specified.
    """
    namer = Namer()
    acc_maker = GrpExprFuncMaker('__acc_', namer)
    grp_maker = GrpExprFuncMaker('__grp_', namer)
    key_ctx = ExprContext(data_desc, namer)
    row_ctx = ExprContext(data_desc, namer)
    acc_ctx = ExprContext(DataDesc([], []), namer)
    grp_ctx = ExprContext(DataDesc([], []), namer)
    out_ctx = ExprContext(DataDesc([], []), namer)

    # parse key expr
    keys = []
    key_ins_pos = 0
    for node in parse(key_str or '1').body:
        assigned_name = None
        if isinstance(node, _ast.Assign):
            if len(node.targets) != 1 or not isinstance(node.targets[0], _ast.Name):
                raise Exception('Bad assignment in %r' % (key_str,))
            expr = parse_expr(key_ctx, node.value)
            assigned_name = node.targets[0].id
        else:
            expr = parse_expr(key_ctx, node)
            if not output_only_assigned:
                # An unassigned key can only be output under its own name,
                # which requires it to be a bare name.
                if isinstance(node, _ast.Expr) and isinstance(node.value, _ast.Name):
                    assigned_name = node.value.id
                else:
                    raise Exception('Please assign expression to a variable in %r' % (key_str,))

        key_name = namer.get_name('__key', expr.tostr())
        key_row_name = namer.get_name('__row_key', expr.tostr())

        if assigned_name:
            out_ctx.set_var(
                key_name,
                RowExprAssign(key_name, expr)
            )
            # Key columns are inserted at the front of the output, in the
            # order they appear in key_str.
            out_ctx.set_var(
                assigned_name,
                RowExprAssign(assigned_name, RowExprVar(out_ctx, key_name)),
                insert_at=key_ins_pos
            )
            key_ins_pos += 1
            # Also register the key in the 'grp' context.
            grp_ctx.set_var(
                assigned_name,
                RowExprAssign(assigned_name, RowExprVar(out_ctx, key_name)),
            )

        if isinstance(expr, RowExprField):
            # force str assuming if node is field
            expr = RowExprOp('', [expr, RowExprConst("")])
        keys.append((expr, key_name, key_row_name))

    # parse grouping expressions
    for grp_type, expr_str in grp_expr_tuples:
        for ast_expr in parse(expr_str).body:
            if grp_type == 'acc':
                expr = parse_assign_grpexpr(acc_ctx, ast_expr, row_ctx, acc_maker)
                acc_ctx.set_var(expr.target, expr)
                out_ctx.set_var(expr.target, expr)
            elif grp_type == 'grp':
                expr = parse_assign_grpexpr(grp_ctx, ast_expr, row_ctx, grp_maker)
                grp_ctx.set_var(expr.target, expr)
                out_ctx.set_var(expr.target, expr)
            else:
                raise Exception('Unknown grouping type %r' % (grp_type,))

    # construct awk script
    print_awk, output_desc = awk_filter_map_from_context(
        out_ctx,
        order=data_desc.order,
    )
    if output_desc is None:
        raise Exception('No output fields specified')
    assert not print_awk.end

    init_grps = AwkBlock()
    init_accs = AwkBlock()
    calc_row_keys = AwkBlock()
    keys_changed = []
    update_keys = AwkBlock()
    update_grps = AwkBlock()
    update_accs = AwkBlock()
    end_grps = AwkBlock()

    for expr, name, row_name in keys:
        calc_row_keys.append(row_name + ' = ' + expr.tostr())
        update_keys.append(name + ' = ' + row_name)
        keys_changed.append(name + '!=' + row_name)

    for name, val in find_grp_funcs(grp_ctx):
        init_grps.append(val.init_str())
        update_grps.extend(val.update_str(recursive=True))
        end_grps.append(val.end_str())

    for name, val in find_grp_funcs(acc_ctx):
        init_accs.append(val.init_str())
        update_accs.extend(val.update_str(recursive=True))
        # Accumulators are finalized together with the 'grp' functions.
        end_grps.append(val.end_str())

    keys_changed_str = ' || '.join(keys_changed)

    awk = AwkScript(
        begin=(
            print_awk.begin
            + init_grps
            + init_accs
            # Without any key at all the END block prints even on empty input.
            + AwkBlock(['__print_last = ' + str(int(key_str is None))])
        ),
        end=AwkBlock() if expose_groups else AwkBlock([AwkHeadBlock(
            'if(NR!=0 || __print_last==1)',
            end_grps + print_awk.main
        )]),
        main=(
            calc_row_keys
            + AwkHeadBlock('if(NR==1)', update_keys)
            + AwkHeadBlock('else', AwkBlock([
                AwkHeadBlock(
                    'if(' + keys_changed_str + ')',
                    end_grps
                    + (print_awk.main if not expose_groups else AwkBlock())
                    + update_keys
                    + init_grps
                )])
            )
            + update_grps
            + update_accs
            + (print_awk.main if expose_groups else AwkBlock())
        )
    )
    return awk, output_desc