def getAllValueCounts(filename, keys): print 'getAllValueCounts', filename, keys f = open(filename, 'rt') header = csv.readCsvLine(f) data = csv.readCsvGen(f) print header column_index = dict(zip(keys, [header.index(k) for k in keys])) counts = dict(zip(keys, [{} for k in keys])) print 'indexes', column_index print ' counts', counts num_lines = 0 for row in data: num_lines += 1 for k in keys: val = row[column_index[k]] counts[k][val] = counts[k].get(val,0) + 1 print filename, num_lines, 'lines' for k in keys: val = counts[k] print k, len(val), val total = sum(val.values()) cumulative = 0.0 if True: for v in sorted(val.keys(), key = lambda x: -val[x]): percent = val[v]*100.0/total cumulative += percent print '%5s %8d %3d%% %3d%%' % (v, val[v], round(percent), round(cumulative)) print '%5s %8d %3d%% %3d%%' % ('total', total, round(sum([v*100.0/total for v in val.values()])), round(cumulative)) f.close() return counts
def getAllStats(filename, keys, max_rows = sys.maxint): print 'getAllStats', filename, keys f = open(filename, 'rt') header = csv.readCsvLine(f) data = csv.readCsvGen(f) print header column_index = dict(zip(keys, [header.index(k) for k in keys])) stats = dict(zip(keys, [{'lo':sys.maxint, 'hi':-sys.maxint, 'mean': 0} for k in keys])) num_rows = 0 for row in data: for k in keys: val = float(row[column_index[k]]) s = stats[k] if stats[k]['lo'] > val: stats[k]['lo'] = val if stats[k]['hi'] < val: stats[k]['hi'] = val stats[k]['mean'] += val num_rows += 1 if num_rows > max_rows: break for k in keys: stats[k]['mean'] = stats[k]['mean']/num_rows print filename, num_rows, 'rows' for k in keys: print k, stats[k] f.close() return stats
def filterBadValues(in_filename, out_filename, keys): print 'filterBadValues', in_filename, out_filename, keys fin = open(in_filename, 'rt') fout = open(out_filename, 'wt') header = csv.readCsvLine(fin) csv.writeCsvRow(fout, header) print header data = csv.readCsvGen(fin) column_index = dict(zip(keys, [header.index(k) for k in keys])) num_rows = 0 num_bad = 0 for row in data: bad_row = False for k in keys: val = float(row[column_index[k]]) if val < 0: bad_row = True num_bad += 1 if not bad_row: csv.writeCsvRow(fout, row) num_rows += 1 fin.close() fout.close() print in_filename, num_rows, 'rows' print out_filename, num_rows - num_bad, 'rows'
def getStats(filename): f = open(filename, 'rt') header = csv.readCsvLine(f) lines = csv.readCsvGen(f) num_rows = 0 for l in lines: num_rows += 1 print filename, num_rows, 'lines' f.close()
def sampleCsv(in_filename, out_filename, ratio): """ Sample a csv file. """ print 'sampleCsv', in_filename, out_filename, ratio fin = open(in_filename, 'rt') fout = open(out_filename, 'wt') header = csv.readCsvLine(fin) print 'header:', header csv.writeCsvRow(fout, header) data = csv.readCsvGen(fin) num_sampled = 0 for irow,row in enumerate(data): if irow % 100000 == 0: print (irow,num_sampled), if num_sampled < ratio * irow: csv.writeCsvRow(fout, row) num_sampled += 1 print fin.close() fout.close() print in_filename, irow, 'rows' print out_filename, num_sampled, 'rows' if True: fin = open(out_filename, 'rt') header = csv.readCsvLine(fin) data = csv.readCsvGen(fin) for irow,row in enumerate(data): if len(row) != len(header): print irow, len(row), row, len(header), header assert(len(row) == len(header)) fin.close()
def populateHistogram(filename, histo, max_rows): print 'populateHistogram', histo.keys() f = open(filename, 'rU') header = csv.readCsvLine(f) data = csv.readCsvGen(f) column_index = dict(zip(header, [header.index(k) for k in header])) for i,row in enumerate(data): for k in histo.keys(): x = float(row[column_index[k]]) bin = binarySearch(histo[k]['levels'], x) histo[k]['counts'][bin] += 1 if i >= max_rows: break f.close() sales_histo = histo['sales'] for i,count in enumerate(sales_histo['counts']): print '%4d: %7d %8.2f %8.2f' % (i, count, sales_histo['levels'][i], sales_histo['levels'][i+1]-sales_histo['levels'][i]) assert(sales_histo['levels'][i+1] >= sales_histo['levels'][i])
def getHistogram(filename, keys, stats, max_rows = sys.maxint): """ Return a histogram of the form [(upper<i>, count<i>) for i=1..N] """ print 'getHistogram', filename, keys, stats # Max equal width bins num_bins = 10 histo = dict(zip(keys, [{'counts':[0 for i in range(num_bins)], 'levels':[stats[k]['lo'] + i *(stats[k]['hi']-stats[k]['lo']) for i in range(num_bins+1)]} for k in keys])) populateHistogram(filename, histo, max_rows) for num_bins in [20,40]: histo = makeHistogram(histo, num_bins) populateHistogram(filename, histo, max_rows) return histo for i,row in enumerate(data): for k in keys: val = float(row[column_index[k]]) bin = int((num_bins-1)*(val-stats[k]['lo'])/(stats[k]['hi']-stats[k]['lo'])) histo[k][bin] += 1 if i >= max_rows: break f.close() num_rows = i print 'read', num_rows, 'to make', num_bins, 'equal depth bins' for k in keys: print k, stats[k]['lo'], stats[k]['hi'], histo[k] num_equal = 10 equal_depth = dict(zip(keys, [[None for i in range(num_equal)] for k in keys])) # Make equal depth bins for k in keys: bin_num = 0 cumulative = 0 for i in range(num_equal): while cumulative/num_rows < i/num_equal: cumulative += histo[k][bin_num] bin_num += 1 equal_depth[k][i] = [cumulative, 0] # print ' ', i, bin_num, cumulative print 'bin_num', bin_num, ' len(histo)', len(histo[k]) assert(bin_num <= len(histo[k])-1) if False: f = open(filename, 'rU') header = csv.readCsvLine(f) data = csv.readCsvGen(f) for k in keys: print k, stats[k]['lo'], stats[k]['hi'], equal_depth[k] for i,row in enumerate(data): for k in ['sales']: # keys: val = float(row[column_index[k]]) bin_num = binarySearch(equal_depth[k], val) equal_depth[k][bin_num][1] += 1 print bin_num, val if i >= max_rows: break f.close() for k in keys: print '&&', k, len(equal_depth[k]) return equal_depth