Esempio n. 1
0
def filterBadValues(in_filename, out_filename, keys):
    print 'filterBadValues', in_filename, out_filename, keys
    fin = open(in_filename, 'rt')
    fout = open(out_filename, 'wt')

    header = csv.readCsvLine(fin)
    csv.writeCsvRow(fout, header)
    print header

    data = csv.readCsvGen(fin)

    column_index = dict(zip(keys, [header.index(k) for k in keys]))

    num_rows = 0
    num_bad = 0
    for row in data:
        bad_row = False
        for k in keys:
            val = float(row[column_index[k]])
            if val < 0:
                bad_row = True
                num_bad += 1
        if not bad_row:
            csv.writeCsvRow(fout, row)
        num_rows += 1

    fin.close()
    fout.close()

    print in_filename, num_rows, 'rows'
    print out_filename, num_rows - num_bad, 'rows'
Esempio n. 2
0
def sampleCsv(in_filename, out_filename, ratio):
    """ Sample a csv file. """
    print 'sampleCsv', in_filename, out_filename, ratio
    fin = open(in_filename, 'rt')
    fout = open(out_filename, 'wt')

    header = csv.readCsvLine(fin)
    print 'header:', header
    csv.writeCsvRow(fout, header)


    data = csv.readCsvGen(fin)

    num_sampled = 0

    for irow,row in enumerate(data):
        if irow % 100000 == 0:
            print (irow,num_sampled),
        if num_sampled < ratio * irow:
            csv.writeCsvRow(fout, row)
            num_sampled += 1
          
    print

    fin.close()
    fout.close()

    print in_filename, irow, 'rows'
    print out_filename, num_sampled, 'rows'
    
    if True:
        fin = open(out_filename, 'rt')
        header = csv.readCsvLine(fin)
        data = csv.readCsvGen(fin)
        for irow,row in enumerate(data):
            if len(row) != len(header):
                print irow, len(row), row, len(header), header
            assert(len(row) == len(header))
        fin.close()