Beispiel #1
0
def getAllValueCounts(filename, keys):
    print 'getAllValueCounts', filename, keys
    f = open(filename, 'rt')
    header = csv.readCsvLine(f)
    data = csv.readCsvGen(f)
    print header

    column_index = dict(zip(keys, [header.index(k) for k in keys]))
    counts = dict(zip(keys, [{} for k in keys]))
    print 'indexes', column_index
    print ' counts', counts

    num_lines = 0
    for row in data:
        num_lines += 1
        for k in keys:
            val = row[column_index[k]]
            counts[k][val] = counts[k].get(val,0) + 1

    print filename, num_lines, 'lines'
    for k in keys:
        val = counts[k]
        print k, len(val), val
        total = sum(val.values())
        cumulative = 0.0
        if True:
            for v in sorted(val.keys(), key = lambda x: -val[x]):
                percent = val[v]*100.0/total
                cumulative += percent
                print '%5s %8d %3d%% %3d%%' % (v, val[v], round(percent), round(cumulative))
        print '%5s %8d %3d%% %3d%%' % ('total', total, round(sum([v*100.0/total for v in val.values()])), round(cumulative))
    f.close()
    return counts
Beispiel #2
0
def getAllStats(filename, keys, max_rows = sys.maxint):
    print 'getAllStats', filename, keys
    f = open(filename, 'rt')
    header = csv.readCsvLine(f)
    data = csv.readCsvGen(f)
    print header

    column_index = dict(zip(keys, [header.index(k) for k in keys]))
    stats = dict(zip(keys, [{'lo':sys.maxint, 'hi':-sys.maxint, 'mean': 0} for k in keys]))

    num_rows = 0
    for row in data:
        for k in keys:
            val = float(row[column_index[k]])
            s = stats[k]
            if stats[k]['lo'] > val:
                stats[k]['lo'] = val
            if stats[k]['hi'] < val:
                stats[k]['hi'] = val
            stats[k]['mean'] += val
        num_rows += 1
        if num_rows > max_rows:
            break
    for k in keys:
        stats[k]['mean'] = stats[k]['mean']/num_rows 

    print filename, num_rows, 'rows'
    for k in keys:
        print k, stats[k]
    f.close()
    return stats
Beispiel #3
0
def filterBadValues(in_filename, out_filename, keys):
    print 'filterBadValues', in_filename, out_filename, keys
    fin = open(in_filename, 'rt')
    fout = open(out_filename, 'wt')

    header = csv.readCsvLine(fin)
    csv.writeCsvRow(fout, header)
    print header

    data = csv.readCsvGen(fin)

    column_index = dict(zip(keys, [header.index(k) for k in keys]))

    num_rows = 0
    num_bad = 0
    for row in data:
        bad_row = False
        for k in keys:
            val = float(row[column_index[k]])
            if val < 0:
                bad_row = True
                num_bad += 1
        if not bad_row:
            csv.writeCsvRow(fout, row)
        num_rows += 1

    fin.close()
    fout.close()

    print in_filename, num_rows, 'rows'
    print out_filename, num_rows - num_bad, 'rows'
Beispiel #4
0
def getStats(filename):
    f = open(filename, 'rt')
    header = csv.readCsvLine(f)
    lines = csv.readCsvGen(f)
    num_rows = 0
    for l in lines:
        num_rows += 1
    print filename, num_rows, 'lines'
    f.close()
Beispiel #5
0
def sampleCsv(in_filename, out_filename, ratio):
    """ Sample a csv file. """
    print 'sampleCsv', in_filename, out_filename, ratio
    fin = open(in_filename, 'rt')
    fout = open(out_filename, 'wt')

    header = csv.readCsvLine(fin)
    print 'header:', header
    csv.writeCsvRow(fout, header)


    data = csv.readCsvGen(fin)

    num_sampled = 0

    for irow,row in enumerate(data):
        if irow % 100000 == 0:
            print (irow,num_sampled),
        if num_sampled < ratio * irow:
            csv.writeCsvRow(fout, row)
            num_sampled += 1
          
    print

    fin.close()
    fout.close()

    print in_filename, irow, 'rows'
    print out_filename, num_sampled, 'rows'
    
    if True:
        fin = open(out_filename, 'rt')
        header = csv.readCsvLine(fin)
        data = csv.readCsvGen(fin)
        for irow,row in enumerate(data):
            if len(row) != len(header):
                print irow, len(row), row, len(header), header
            assert(len(row) == len(header))
        fin.close()
Beispiel #6
0
def populateHistogram(filename, histo, max_rows):
    print 'populateHistogram', histo.keys()
    f = open(filename, 'rU')
    header = csv.readCsvLine(f)
    data = csv.readCsvGen(f)
    column_index = dict(zip(header, [header.index(k) for k in header]))

    for i,row in enumerate(data):
        for k in histo.keys():
            x = float(row[column_index[k]])
            bin = binarySearch(histo[k]['levels'], x)
            histo[k]['counts'][bin] += 1
        if i >= max_rows:
            break
    f.close()
    
    sales_histo = histo['sales']
    for i,count in enumerate(sales_histo['counts']):
        print '%4d: %7d %8.2f %8.2f' % (i, count, sales_histo['levels'][i], sales_histo['levels'][i+1]-sales_histo['levels'][i])
        assert(sales_histo['levels'][i+1] >= sales_histo['levels'][i])
Beispiel #7
0
def getHistogram(filename, keys, stats, max_rows = sys.maxint):
    """ Return a histogram of the form
        [(upper<i>, count<i>) for i=1..N]
    """
    print 'getHistogram', filename, keys, stats

    # Max equal width bins
    num_bins = 10 
    histo = dict(zip(keys, 
                 [{'counts':[0 for i in range(num_bins)],
                   'levels':[stats[k]['lo'] + i *(stats[k]['hi']-stats[k]['lo']) for i in range(num_bins+1)]}
                  for k in keys]))
    populateHistogram(filename, histo, max_rows)
    
    for num_bins in [20,40]:
        histo = makeHistogram(histo, num_bins)
        populateHistogram(filename, histo, max_rows)
        
    return histo

    for i,row in enumerate(data):
        for k in keys:
            val = float(row[column_index[k]])
            bin = int((num_bins-1)*(val-stats[k]['lo'])/(stats[k]['hi']-stats[k]['lo']))
            histo[k][bin] += 1
        if i >= max_rows:
            break
    f.close()
    num_rows = i
    print 'read', num_rows, 'to make', num_bins, 'equal depth bins'
    for k in keys:
        print k, stats[k]['lo'], stats[k]['hi'], histo[k]

    num_equal = 10
    equal_depth = dict(zip(keys, [[None for i in range(num_equal)] for k in keys]))

    # Make equal depth bins

    for k in keys:
        bin_num = 0
        cumulative = 0
        for i in range(num_equal):
            while cumulative/num_rows < i/num_equal:
               cumulative += histo[k][bin_num]
               bin_num += 1
            equal_depth[k][i] = [cumulative, 0]
            # print '  ', i, bin_num, cumulative
        print 'bin_num', bin_num, ' len(histo)', len(histo[k])
        assert(bin_num <= len(histo[k])-1)

    if False:
        f = open(filename, 'rU')
        header = csv.readCsvLine(f)
        data = csv.readCsvGen(f)
    
        for k in keys:
            print k, stats[k]['lo'], stats[k]['hi'], equal_depth[k]
        for i,row in enumerate(data):
            for k in ['sales']: # keys:
                val = float(row[column_index[k]])
                bin_num = binarySearch(equal_depth[k], val)
                equal_depth[k][bin_num][1] += 1
                print bin_num, val
            if i >= max_rows:
                break
        f.close()
        
        for k in keys:
            print '&&', k, len(equal_depth[k])
    
        return equal_depth