import argparse
import sys

import csv_unicode
import file_ops_common


def main():
    '''Output the intersection of two files, based on certain columns in the
    files. This is equivalent to the SQL JOIN operation. Loads the keys into
    memory, so it only works on small/medium-sized files. It outputs the full
    lines from the left file, optionally inserting columns from the right
    file into them.'''
    parent_argparser = file_ops_common.set_ops_parser()
    argparser = argparse.ArgumentParser(description=main.__doc__,
                                        parents=[parent_argparser])
    argparser.add_argument('--insert-cols', dest='insert_cols', default='',
                           help='Columns from the right file to insert into'
                           ' the left. If multiple rows from the right file'
                           ' match, only the first matching row is used')
    args = argparser.parse_args()

    left_key_cols = [int(col) for col in args.left_columns.split(',')]
    right_key_cols = [int(col) for col in args.right_columns.split(',')]
    insert_cols = ([int(col) for col in args.insert_cols.split(',')]
                   if args.insert_cols else [])

    # Go through the left file and collect the keys
    all_keys = dict()
    for cols in csv_unicode.UnicodeReader(open(args.left_file, 'r'),
                                          delimiter=args.left_delim):
        key = file_ops_common.get_key(cols, left_key_cols)
        if args.lower_case:
            key = key.lower()
        if key not in all_keys:
            all_keys[key] = []
        all_keys[key].append(cols)

    output = csv_unicode.UnicodeWriter(sys.stdout, delimiter=args.left_delim)

    # Go through the right file and output the matching left lines
    for cols in csv_unicode.UnicodeReader(open(args.right_file, 'r'),
                                          delimiter=args.right_delim):
        key = file_ops_common.get_key(cols, right_key_cols)
        if args.lower_case:
            key = key.lower()
        if key in all_keys:
            for line in all_keys[key]:
                insert_values = [cols[i] for i in insert_cols]
                line = line + insert_values
                output.writerow(line)
            # Pop the key so only the first matching right row is used
            all_keys.pop(key)
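
# The join above relies on file_ops_common.get_key to turn selected columns
# into a single dictionary key. That module is not shown here, so the sketch
# below is a hypothetical implementation under an assumed separator; the real
# helper may differ.

KEY_SEP = u'\x01'  # assumed separator, chosen as unlikely to appear in data


def get_key(cols, key_cols):
    '''Build a composite key by joining the selected column values.'''
    return KEY_SEP.join(cols[i] for i in key_cols)


def split_key(key):
    '''Recover the individual column values from a composite key.'''
    return key.split(KEY_SEP)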
import argparse
import sys
from collections import OrderedDict

import csv_unicode
import file_ops_common


def main():
    '''Output the diff of two files, based on certain columns in the files.
    Loads the keys into memory, so it only works on small/medium-sized files.
    It outputs the entire line from the left file.'''
    parent_argparser = file_ops_common.set_ops_parser()
    argparser = argparse.ArgumentParser(description=main.__doc__,
                                        parents=[parent_argparser])
    args = argparser.parse_args()

    left_key_cols = [int(col) for col in args.left_columns.split(',')]
    right_key_cols = [int(col) for col in args.right_columns.split(',')]

    # We use an ordered dict to maintain the original order of the lines
    all_keys = OrderedDict()

    # Go through the left file and collect the keys
    for cols in csv_unicode.UnicodeReader(open(args.left_file, 'r'),
                                          delimiter=args.left_delim):
        key = file_ops_common.get_key(cols, left_key_cols)
        if args.lower_case:
            key = key.lower()
        if key not in all_keys:
            all_keys[key] = []
        all_keys[key].append(cols)

    output = csv_unicode.UnicodeWriter(sys.stdout, delimiter=args.left_delim)

    # Go through the right file and remove those keys from all_keys
    for cols in csv_unicode.UnicodeReader(open(args.right_file, 'r'),
                                          delimiter=args.right_delim):
        key = file_ops_common.get_key(cols, right_key_cols)
        if args.lower_case:
            key = key.lower()
        all_keys.pop(key, None)

    # Output the remaining lines
    for key in all_keys:
        for line in all_keys[key]:
            output.writerow(line)
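
# Toy demonstration of the key-based diff semantics above (sample data, not
# from the source): lines whose key appears in the right file are dropped,
# the rest are kept in their original order. Keys here are simply column 0.

from collections import OrderedDict


def _diff_demo():
    left = [[u'a', u'1'], [u'b', u'2'], [u'a', u'3']]
    right = [[u'b', u'9']]

    keys = OrderedDict()
    for cols in left:
        keys.setdefault(cols[0], []).append(cols)
    for cols in right:
        keys.pop(cols[0], None)  # drop keys that also occur on the right

    for lines in keys.values():
        for line in lines:
            print(line)  # prints [u'a', u'1'] then [u'a', u'3']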
import sys
from collections import defaultdict

import csv_unicode
import file_ops_common


def main():
    '''Do SQL-like operations on a delimited text file.'''
    args = parse_args()

    # Parse the where clauses
    where_filters = {}
    if args.where_clauses:
        for clause in args.where_clauses:
            if '!=' in clause:
                clause_parts = clause.split('!=')
                where_filters[int(clause_parts[0])] = {
                    'op': 'NOT IN',
                    'vals': clause_parts[1].split(','),
                }
            elif '=' in clause:
                clause_parts = clause.split('=')
                where_filters[int(clause_parts[0])] = {
                    'op': 'IN',
                    'vals': clause_parts[1].split(','),
                }

    select_cols = [int(col) for col in args.select_cols.split(',')]
    aggregate_cols = [int(col) for col in args.aggregate_cols.split(',')]

    if args.verbose:
        print_query(args, select_cols, aggregate_cols, where_filters)

    # The structure where we save the aggregated values.
    # It is a nested dictionary of the form:
    #   Select Keys -> Aggregate Column -> Aggregate value
    aggregates = {}

    # Go through the input file and do the aggregation
    for cols in csv_unicode.UnicodeReader(open(args.file, 'r'),
                                          delimiter=args.delim):
        # Apply the where filters
        skip = False
        for col_num, filtr in where_filters.iteritems():
            if ((filtr['op'] == 'IN' and cols[col_num] not in filtr['vals'])
                    or (filtr['op'] == 'NOT IN'
                        and cols[col_num] in filtr['vals'])):
                skip = True
                break
        if skip:
            continue

        # Generate the key from the select columns
        key = file_ops_common.get_key(cols, select_cols)
        if key not in aggregates:
            aggregates[key] = defaultdict(int)

        # Go over each aggregate column and add it to the final structure
        for col_num in aggregate_cols:
            if args.agg_function == 'sum':
                aggregates[key][col_num] += float(cols[col_num])
            elif args.agg_function == 'count':
                aggregates[key][col_num] += 1
            else:
                sys.stderr.write('Invalid aggregate function: '
                                 + args.agg_function + '\n')
                sys.exit(-1)

    output = csv_unicode.UnicodeWriter(sys.stdout, delimiter=args.delim)

    # Output the aggregated rows
    for key in aggregates:
        op_cols = file_ops_common.split_key(key)
        # Output the aggregate columns in the order they appear in the input
        for col, value in sorted(aggregates[key].items()):
            op_cols += [unicode(value)]
        output.writerow(op_cols)
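
# Toy demonstration of the Select Keys -> Aggregate Column -> value structure
# built above (sample data, not from the source). The key format mirrors the
# assumed file_ops_common helpers sketched earlier; this is roughly
# SELECT col0, SUM(col2) ... GROUP BY col0.

from collections import defaultdict


def _aggregate_demo():
    rows = [[u'us', u'web', u'3'], [u'us', u'app', u'5'], [u'uk', u'web', u'2']]
    select_cols, aggregate_cols = [0], [2]

    aggregates = {}
    for cols in rows:
        key = u'\x01'.join(cols[i] for i in select_cols)  # assumed key format
        if key not in aggregates:
            aggregates[key] = defaultdict(int)
        for col_num in aggregate_cols:
            aggregates[key][col_num] += float(cols[col_num])  # 'sum' function

    # aggregates == {u'us': {2: 8.0}, u'uk': {2: 2.0}}
    return aggregates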