def main(argv=None):
    """Load the CSV named by the first argument and process its cells.

    argv -- argument list including the program name in position 0;
            sys.argv is used when argv is omitted or empty.

    The file at argv[1] is wrapped in a CSVWrapper and
    detect_and_strip_duplicates is handed to the wrapper's
    apply_global_cell_function. Nothing is saved or returned here.
    """
    args = argv or sys.argv
    wrapper = CSVWrapper(args[1])
    wrapper.apply_global_cell_function(detect_and_strip_duplicates)
def main(argv=None):
    if not argv:
        argv = sys.argv
    if len(sys.argv) == 3:
        CSV = sys.argv[1]
        if not CSV:
            print "Please supply a path to a csv file to process"
            exit()
        OUTDIR = sys.argv[2]
        if not OUTDIR:
            print "Please supply a path to an output directory"
            exit()
    else:
        print "Usage: python", sys.argv[0], "<csv_input> <output_dir>"
        print "First argument must be a path to a csv file to process"
        print "Second argument must be a path to a directory where the output files can be written (UNIX-style, ending in /)"
        exit()
        
    # load the csv
    print "Loading csv from " + CSV + " ..."
    c = CSVWrapper(CSV)
    print "done"
    print

    for col in interesting_cols.keys():
        print 'Saving interesting data wrt column', col, 'to', OUTDIR + filenames[col] 
        print 'Slicing off these columns:', ', '.join(interesting_cols[col])
        
        filtered_rows = c.filter_rows(col, should_be_empty=False)
        
        c.save(OUTDIR + filenames[col], interesting_cols[col], filtered_rows)
        
        print '... done'
        print

    for col, csv in filenames.items():
        tmp = CSVWrapper(OUTDIR + csv)
        print 'Number of items in', csv, '=', len(tmp.csv_dict['id']), '.'
Esempio n. 3
0
def roll(csv_rows, pivot_col, target_col):
    """Build a CSVWrapper from raw CSV rows.

    csv_rows   -- iterable of rows; the first row is taken as the header.
    pivot_col  -- accepted but not used by this function.
    target_col -- accepted but not used by this function.

    Every header cell becomes a key of the wrapper's csv_dict. For each later
    row the first cell is converted with int() and used as the record id; each
    cell of the row is tokenised with the wrapper's _tokenise and the
    resulting values are added under (column name, record id).

    Returns the wrapper after populate_ids() has been called on it.
    """
    wrapper = CSVWrapper()
    wrapper.csv_dict = {}
    names_by_position = {}

    rows = iter(csv_rows)

    # Header row: register one (initially empty) column per header cell.
    for position, name in enumerate(next(rows, [])):
        wrapper.csv_dict[name] = {}
        names_by_position[position] = name

    # Data rows: file every cell under its column name and the row's id.
    for row in rows:
        for position, cell in enumerate(row):
            name = names_by_position[position]
            column = wrapper.csv_dict[name]
            record_id = int(row[0])
            column.setdefault(record_id, [])
            wrapper.add_value(name, record_id, *wrapper._tokenise(cell))

    wrapper.populate_ids()
    return wrapper
Esempio n. 4
0
def roll(csv_rows, pivot_col, target_col):
    """Build a CSVWrapper from raw CSV rows.

    csv_rows   -- iterable of rows; the first row is taken as the header.
    pivot_col  -- accepted but not used by this function.
    target_col -- accepted but not used by this function.

    Every header cell becomes a key of the wrapper's csv_dict. For each later
    row the first cell is converted with int() and used as the record id; each
    cell of the row is tokenised with the wrapper's _tokenise and the
    resulting values are added under (column name, record id).

    Returns the wrapper after populate_ids() has been called on it.
    """
    wrapper = CSVWrapper()
    wrapper.csv_dict = {}
    names_by_position = {}

    rows = iter(csv_rows)

    # Header row: register one (initially empty) column per header cell.
    for position, name in enumerate(next(rows, [])):
        wrapper.csv_dict[name] = {}
        names_by_position[position] = name

    # Data rows: file every cell under its column name and the row's id.
    for row in rows:
        for position, cell in enumerate(row):
            name = names_by_position[position]
            column = wrapper.csv_dict[name]
            record_id = int(row[0])
            column.setdefault(record_id, [])
            wrapper.add_value(name, record_id, *wrapper._tokenise(cell))

    wrapper.populate_ids()
    return wrapper
def main(argv=None):
    if not argv:
        argv = sys.argv
    if len(sys.argv) == 3:
        CSV = sys.argv[1]
        if not CSV:
            print "Please supply a path to a csv file to process"
            exit()
        OUTDIR = sys.argv[2]
        if not OUTDIR:
            print "Please supply a path to an output directory"
            exit()
    else:
        print "Usage: python", sys.argv[0], "<csv_input> <output_dir>"
        print "First argument must be a path to a csv file to process"
        print "Second argument must be a path to a directory where the output files can be written (UNIX-style, ending in /)"
        exit()

    # load the csv
    print "Loading csv from " + CSV + " ..."
    c = CSVWrapper(CSV)
    print "done"
    print

    for col in interesting_cols.keys():
        print 'Saving interesting data wrt column', col, 'to', OUTDIR + filenames[
            col]
        print 'Slicing off these columns:', ', '.join(interesting_cols[col])

        filtered_rows = c.filter_rows(col, should_be_empty=False)

        c.save(OUTDIR + filenames[col], interesting_cols[col], filtered_rows)

        print '... done'
        print

    for col, csv in filenames.items():
        tmp = CSVWrapper(OUTDIR + csv)
        print 'Number of items in', csv, '=', len(tmp.csv_dict['id']), '.'
    OUTDIR = sys.argv[2]
    if not OUTDIR:
        print "Please supply a path to an output directory"
        exit()
else:
    print "Usage: python", sys.argv[0], "<input_dir> <output_dir>"
    print "First argument must be a path to a directory. The script will process the following filenames (produced by helpman.py): " + ", ".join(
        filenames)
    print "Second argument must be a path to a directory where the output files can be written (UNIX-style, ending in /)"
    exit()

# Stage 1: load every file named in `filenames` from INDIR into a CSVWrapper,
# keyed by its file name. INDIR is concatenated directly with the name, so it
# has to end with a path separator.
csvs = {}
for f in filenames:
    # load the csvs
    print "Loading csv from ", INDIR + f, " ..."
    csvs[f] = CSVWrapper(INDIR + f)
    print "done"
    print

# Stage 2: for each loaded file, drop the columns listed for it in
# `delete_cols` and save the result under the same file name in OUTDIR.
for filename, c in csvs.items():
    print 'Processing', filename
    print 'Deleting these columns:', ', '.join(delete_cols[filename])

    for col in delete_cols[filename]:
        c.delete_column(col)

    # The trailing comma keeps the following '... done' on the same output line.
    print 'Saving', OUTDIR + filename,
    c.save(OUTDIR + filename)
    print '... done'
    print
Esempio n. 7
0
def main(argv=None):
    if not argv:
        argv = sys.argv
    
    if len(sys.argv) == 4:
        argument_parse_error = \
        """ERROR: Please specify which file you want to work on, which is the pivot column and which is the target column. Your first argument should have this format: <pivot_column>,<target_column>:<input_file>.csv"""
        
        try:
            OPERATION = sys.argv[1]
            
            IN = sys.argv[2]
            IN = IN.split(':')
            PIVOT_COL = IN[0].split(',')[0]
            TARGET_COL = IN[0].split(',')[1]
            IN = IN[1]
            
            OUT = sys.argv[3]
                
        except IndexError:
            print argument_parse_error
            exit()
            
        if not PIVOT_COL or not TARGET_COL or not IN:
            print argument_parse_error
            exit()
            
    else:
        print "Usage: python", sys.argv[0], "(flatten|roll) <pivot_column>,<target_column>:<input_file>.csv <output_filename>.csv"
        print "1. First argument is which operation to perform on the data - flatten or roll up"
        print "2. The second argument has three components. Pivot column is the column which identifies individual records (usually something like 'id'). Target column is the one that you want flattened / rolled up. Finally, the input file is a UNIX-style path to the CSV you want to process."
        print "3. Third argument is a UNIX-style path to an output CSV file"
        exit()
        
    if OPERATION not in ['flatten', 'roll']:
        print 'ERROR: Operation to be performed needs to be either "flatten" or "roll" (no quotes)'
        exit()
    
    if OPERATION == 'flatten':
        print 'FLATTENING'
        
    elif OPERATION == 'roll':
        print 'ROLLING UP'
        
    print 'Loading', IN,
    
    if OPERATION == 'flatten':
        c = CSVWrapper(IN)
        
    elif OPERATION == 'roll':
        with open(IN, 'rb') as f:
            r = csv.reader(f)
            c = []
            for row in r:
                c.append(row)
                
    print '... done.'
    
    if OPERATION == 'flatten':
        print 'Flattening',
        c.filter_columns(PIVOT_COL, TARGET_COL)
        
        csv_rows = [[PIVOT_COL, TARGET_COL]] # results header row
        csv_rows += flatten(c, PIVOT_COL, TARGET_COL)
        
    elif OPERATION == 'roll':
        print 'Rolling up',
        c_out = roll(c, PIVOT_COL, TARGET_COL)
        
    print 'with pivot:', PIVOT_COL, 'and target:', TARGET_COL

    print 'Saving', OUT,
    
    if OPERATION == 'flatten':
        with open(OUT, "wb") as f:
            writer = csv.writer(f)
            writer.writerows(csv_rows)
            
    elif OPERATION == 'roll':
        c_out.save(OUT)

    print '... done.'
Esempio n. 8
0
def main(argv=None):
    if not argv:
        argv = sys.argv

    if len(sys.argv) == 4:
        argument_parse_error = \
        """ERROR: Please specify which file you want to work on, which is the pivot column and which is the target column. Your first argument should have this format: <pivot_column>,<target_column>:<input_file>.csv"""

        try:
            OPERATION = sys.argv[1]

            IN = sys.argv[2]
            IN = IN.split(':')
            PIVOT_COL = IN[0].split(',')[0]
            TARGET_COL = IN[0].split(',')[1]
            IN = IN[1]

            OUT = sys.argv[3]

        except IndexError:
            print argument_parse_error
            exit()

        if not PIVOT_COL or not TARGET_COL or not IN:
            print argument_parse_error
            exit()

    else:
        print "Usage: python", sys.argv[
            0], "(flatten|roll) <pivot_column>,<target_column>:<input_file>.csv <output_filename>.csv"
        print "1. First argument is which operation to perform on the data - flatten or roll up"
        print "2. The second argument has three components. Pivot column is the column which identifies individual records (usually something like 'id'). Target column is the one that you want flattened / rolled up. Finally, the input file is a UNIX-style path to the CSV you want to process."
        print "3. Third argument is a UNIX-style path to an output CSV file"
        exit()

    if OPERATION not in ['flatten', 'roll']:
        print 'ERROR: Operation to be performed needs to be either "flatten" or "roll" (no quotes)'
        exit()

    if OPERATION == 'flatten':
        print 'FLATTENING'

    elif OPERATION == 'roll':
        print 'ROLLING UP'

    print 'Loading', IN,

    if OPERATION == 'flatten':
        c = CSVWrapper(IN)

    elif OPERATION == 'roll':
        with open(IN, 'rb') as f:
            r = csv.reader(f)
            c = []
            for row in r:
                c.append(row)

    print '... done.'

    if OPERATION == 'flatten':
        print 'Flattening',
        c.filter_columns(PIVOT_COL, TARGET_COL)

        csv_rows = [[PIVOT_COL, TARGET_COL]]  # results header row
        csv_rows += flatten(c, PIVOT_COL, TARGET_COL)

    elif OPERATION == 'roll':
        print 'Rolling up',
        c_out = roll(c, PIVOT_COL, TARGET_COL)

    print 'with pivot:', PIVOT_COL, 'and target:', TARGET_COL

    print 'Saving', OUT,

    if OPERATION == 'flatten':
        with open(OUT, "wb") as f:
            writer = csv.writer(f)
            writer.writerows(csv_rows)

    elif OPERATION == 'roll':
        c_out.save(OUT)

    print '... done.'
Esempio n. 9
0
        rules.rule13a_publisher, rules.rule13b_publisher, rules.rule13c_publisher,
        # LOM
        rules.rule14a_lom,
        # Merge results from manual processing
        rules.rule15a_mergemanual, rules.rule15b_mergemanual,
        # general tidying
        rules.rule16a_general, rules.rule16c_general, rules.rule16d_general, 
        rules.rule16e_general, rules.rule16f_general
    ]

# Rules that the loop below skips when MAKE_RELEASE is set (currently none).
draft_only_rules = []
# Rules reserved for release runs. They become the whole rule list further
# down when a release is made without a specific RULE.
# NOTE(review): presumably these are skipped for draft runs - the draft branch
# of the loop is not visible here; confirm.
release_only_rules = [rules.rule13c_publisher, rules.rule15a_mergemanual, rules.rule15b_mergemanual, rules.rule16c_general, rules.rule16f_general]

# load the csv
print "Loading csv from " + CSV + " ..."
csv_wrapper = CSVWrapper(CSV)
print "done"
print

# Making a release with no single rule requested (RULE is falsy): replace the
# rule list with just the release-only rules.
if MAKE_RELEASE and not RULE:
    runrules = release_only_rules

# run through all the rules, passing in the wrapper each time
count_run = 0
for rule in runrules:

    # skip certain rules based on whether we're making a draft or release version
    if MAKE_RELEASE:
        if rule in draft_only_rules:
            continue
    else: # we're doing a draft