def rescale_data_file(path):
    """Rescale numeric feature columns of every '.prescale' file under
    *path*, writing the result to a sibling '.train' file.

    For each column listed in the '.columns' metadata whose type (per
    '.header') is 'num', the value is mapped through rescale() using the
    column's min/max from '.prescale.domains'; all other columns pass
    through unchanged.
    """
    for f in prep.gen_file_list(path):
        if not f.endswith('.prescale'):
            continue
        print('rescaling file: %s' % f)
        fpath = f.rsplit('/', 1)[0]
        cols = prep.get_feature_columns(fpath + '/.columns')
        domains = prep.read_domains(cols, fpath + '/.prescale.domains')
        header = prep.get_header(fpath + '/.header')
        scaled_file = f.replace('.prescale', '.train')
        # 'with' guarantees both handles close even if a row is malformed
        # (the old open/close pairs leaked on any exception)
        with open(f, 'r') as fin, open(scaled_file, 'w') as fout:
            for line in fin:
                row = line.strip().split('\t')
                for c in cols:
                    if prep.get_col_type(c, header) == 'num':
                        min_val = float(domains[c]['min'])
                        max_val = float(domains[c]['max'])
                        new_val = rescale(float(row[c]), min_val, max_val, 1e6)
                        row[c] = str(new_val)
                fout.write('\t'.join(row) + '\n')
def create_clustered_index(data_path, db_name, table_name, keys):
    """Create a composite index on keys[table_name] and CLUSTER the table
    on it.

    data_path is walked only to mirror the per-file loop used elsewhere;
    since denormalization every file maps to the single *table_name*, so
    each iteration recreates the same index.

    NOTE: identifiers are interpolated directly into the SQL text;
    psycopg2 cannot parameterize identifiers, so table/column names must
    come from trusted configuration.
    """
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)  # autocommit: CLUSTER cannot run inside a txn block
    cur = conn.cursor()
    for f in prep.gen_file_list(data_path):
        # this change is for denormalization: one shared table, not per-file
        t_name = table_name
        print('%s %s' % (t_name, table_name))
        col_names = ','.join(keys[table_name])
        idx_name = t_name + '_' + col_names.replace(',', '_')
        cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
        print('creating index %s %s' % (t_name, col_names))
        cur.execute('CREATE INDEX %s ON %s (%s)' % (idx_name, t_name, col_names))
        print('clustering index %s' % idx_name)
        cur.execute('CLUSTER %s USING %s' % (t_name, idx_name))
    cur.close()
    conn.close()  # was leaked: only the cursor was closed before
def convert_data_file(path):
    """Convert every '.txt' data file under *path* into a sibling
    '.prescale' file by passing each tab-separated row through
    convert_row() with the file's '.header' metadata.
    """
    for f in prep.gen_file_list(path):
        if not f.endswith('.txt'):
            continue
        print('converting file: %s' % f)
        header_file = f.rsplit('/', 1)[0] + '/.header'
        header = prep.get_header(header_file)
        # 'with' closes both files even if convert_row raises mid-stream
        with open(f, 'r') as fin, open(f.replace('.txt', '.prescale'), 'w') as fout:
            for line in fin:
                row = line.strip().split('\t')
                new_row = convert_row(row, header)
                fout.write('\t'.join(new_row) + '\n')
def create_table(data_path, table_name, db_name, mode):
    """(Re)create Postgres table(s) for the data files under *data_path*
    and bulk-load them with COPY.

    mode 'individual'/'I': a single table named *table_name*; note each
    file in the loop drops and recreates it, so only the last file's
    data survives (visible from t_name being constant in the loop).
    mode 'partitioned'/'P': a master table plus one INHERITS child table
    per data file.

    NOTE: identifiers/paths are interpolated into SQL; inputs must be
    trusted configuration.
    """
    if mode not in ('individual', 'I', 'partitioned', 'P'):
        # validate before opening a connection so we never leak one on exit
        print('unknown mode: %s' % mode)
        sys.exit(1)
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)  # autocommit
    cur = conn.cursor()
    schema = create_schema(data_path.strip('/') + '/.header')
    if mode in ('individual', 'I'):
        cur.execute("DROP TABLE IF EXISTS %s CASCADE;" % table_name)
        for f in prep.gen_file_list(data_path):
            file_name = f[f.rfind('/') + 1:]
            cur_path = f[:f.rfind('/')]
            full_path = os.path.abspath(cur_path)
            t_name = table_name
            cur.execute("DROP TABLE IF EXISTS %s;" % t_name)
            cur.execute("CREATE TABLE %s (%s);" % (t_name, schema))
            cur.execute("COPY %s FROM '%s';" % (t_name, full_path + '/' + file_name))
    else:  # partitioned
        # master table first; children inherit its schema
        cur.execute("DROP TABLE IF EXISTS %s CASCADE;" % table_name)
        cur.execute("CREATE TABLE %s (%s);" % (table_name, schema))
        for f in prep.gen_file_list(data_path):
            file_name = f[f.rfind('/') + 1:]
            cur_path = f[:f.rfind('/')]
            full_path = os.path.abspath(cur_path)
            t_name = table_name + '_' + file_name
            print('%s %s' % (t_name, table_name))
            cur.execute("DROP TABLE IF EXISTS %s;" % t_name)
            cur.execute("CREATE TABLE %s () INHERITS (%s);" % (t_name, table_name))
            cur.execute("COPY %s FROM '%s' ;" % (t_name, full_path + '/' + file_name))
    cur.close()
    conn.close()
def classify_data(k, cols, model, path):
    """Write model-assigned labels for every '.train' file under *path*
    into a sibling '.<k>.labels' file, one label per line.
    """
    for f in prep.gen_file_list(path):
        if not f.endswith('.train'):
            continue
        print('classifying %s' % f)
        label_file = f[:f.rfind('/')] + '/.' + str(k) + '.labels'
        # compute labels BEFORE opening the output: the old order left a
        # truncated/empty (and leaked) file if assign_labels raised
        labels = assign_labels(model, f, cols)
        with open(label_file, 'w') as fw:
            fw.write('\n'.join(str(x) for x in labels) + '\n')
def create_table(data_path, table_name, db_name, mode):
    """(Re)create Postgres table(s) for the data files under *data_path*
    and bulk-load them with COPY.

    mode 'individual'/'I': a single table named *table_name*; note each
    file in the loop drops and recreates it, so only the last file's
    data survives (visible from t_name being constant in the loop).
    mode 'partitioned'/'P': a master table plus one INHERITS child table
    per data file.

    NOTE: identifiers/paths are interpolated into SQL; inputs must be
    trusted configuration.
    """
    if mode not in ('individual', 'I', 'partitioned', 'P'):
        # validate before opening a connection so we never leak one on exit
        print('unknown mode: %s' % mode)
        sys.exit(1)
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)  # autocommit
    cur = conn.cursor()
    schema = create_schema(data_path.strip('/') + '/.header')
    if mode in ('individual', 'I'):
        cur.execute("DROP TABLE IF EXISTS %s CASCADE;" % table_name)
        for f in prep.gen_file_list(data_path):
            file_name = f[f.rfind('/') + 1:]
            cur_path = f[:f.rfind('/')]
            full_path = os.path.abspath(cur_path)
            t_name = table_name
            cur.execute("DROP TABLE IF EXISTS %s;" % t_name)
            cur.execute("CREATE TABLE %s (%s);" % (t_name, schema))
            cur.execute("COPY %s FROM '%s';" % (t_name, full_path + '/' + file_name))
    else:  # partitioned
        # master table first; children inherit its schema
        cur.execute("DROP TABLE IF EXISTS %s CASCADE;" % table_name)
        cur.execute("CREATE TABLE %s (%s);" % (table_name, schema))
        for f in prep.gen_file_list(data_path):
            file_name = f[f.rfind('/') + 1:]
            cur_path = f[:f.rfind('/')]
            full_path = os.path.abspath(cur_path)
            t_name = table_name + '_' + file_name
            print('%s %s' % (t_name, table_name))
            cur.execute("DROP TABLE IF EXISTS %s;" % t_name)
            cur.execute("CREATE TABLE %s () INHERITS (%s);" % (t_name, table_name))
            cur.execute("COPY %s FROM '%s' ;" % (t_name, full_path + '/' + file_name))
    cur.close()
    conn.close()
def classify_data_kmeans(k, cols, path, centers):
    """Assign each row of every '.train' file under *path* to its nearest
    center and write the labels to a sibling '.<k>.labels' file, one per
    line, reporting progress every 10000 rows.
    """
    for f in prep.gen_file_list(path):
        if not f.endswith('.train'):
            continue
        print('classifying %s' % f)
        label_file = f[:f.rfind('/')] + '/.' + str(k) + '.labels'
        # 'with' closes the output even if a row fails to classify
        with open(label_file, 'w') as fw:
            # enumerate replaces the hand-maintained 'prog' counter
            for prog, row in enumerate(prep.gen_file_stream(f, cols)):
                if prog % 10000 == 0:
                    print('progress: %d' % prog)
                label = assign_center(row, centers)
                fw.write(str(label) + '\n')
def sample_train_files(in_path):
    """Down-sample each '.train' file under *in_path* into a
    '<file>.<ratio>.sample' file, keeping each line with probability
    1/ratio; the integer ratio is read from the sibling '.ratio' file.
    """
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.train'):
            continue
        print('sampling %s' % f)
        fpath = f.rsplit('/', 1)[0]
        # the bare open(...).read() here used to leak the handle
        with open(fpath + '/.ratio') as fratio:
            sample_ratio = int(fratio.read())
        with open(f, 'r') as fr, open('%s.%d.sample' % (f, sample_ratio), 'w') as fw:
            for line in fr:
                # randint(n) draws from [0, n): keeps ~1/n of the lines
                if np.random.randint(sample_ratio) == 0:
                    fw.write(line)
def gen_labels_shipdate(in_path, k):
    """Bucket every row of each '.txt' file under *in_path* into one of k
    date ranges using the date in tab field 11 (YYYY-MM-DD), writing the
    labels to a sibling '.<k>.range.labels' file, one per line.

    Assumes dates fall in 1992-01-01 .. 1998-12-31 (the TPC-H shipdate
    range — confirm against the data generator).
    """
    mindate = dt.date(1992, 1, 1).toordinal()
    maxdate = dt.date(1998, 12, 31).toordinal()
    span = maxdate - mindate
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.txt'):
            continue
        path = f[:f.rfind('/')]
        with open(f, 'r') as fr, \
                open('%s/.%d.%s.labels' % (path, k, 'range'), 'w') as fw:
            print(fw)
            for line in fr:
                date = line.split('\t')[11].split('-')
                val = dt.date(int(date[0]), int(date[1]), int(date[2])).toordinal()
                # '//' keeps integer division; min() fixes the off-by-one
                # where val == maxdate produced label k (out of [0, k-1])
                label = min((val - mindate) * k // span, k - 1)
                fw.write(str(int(label)) + '\n')
def create_unclustered_index(data_path, db_name, table_name, mode='P'):
    """Create a plain (unclustered) index on every column except column 0
    and any field whose name contains 'comment', for each data file's
    table.

    mode 'P': per-file child tables named '<table_name>_<file>'
    mode 'I': the single table *table_name*

    NOTE: identifiers are interpolated into the SQL text; names must come
    from trusted configuration.
    """
    if mode not in ('P', 'I'):
        # validate up front instead of exiting mid-loop with an open connection
        print('unrecognized mode: %s' % mode)
        sys.exit(1)
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)  # autocommit
    cur = conn.cursor()
    for f in prep.gen_file_list(data_path):
        file_name = f[f.rfind('/') + 1:]
        cur_path = f[:f.rfind('/')]
        t_name = table_name + '_' + file_name if mode == 'P' else table_name
        print(t_name)
        header = prep.get_header(cur_path + '/.header')
        # skip column 0; index every remaining non-comment field
        for col in range(1, len(header)):
            field_name = header[col][0]
            if 'comment' in field_name:
                continue
            idx_name = '%s_%s_%s' % (t_name, str(col), field_name)
            cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
            print('creating (unclustered) index %s' % idx_name)
            cur.execute('CREATE INDEX %s ON %s (%s)' % (idx_name, t_name, field_name))
    cur.close()
    conn.close()  # was leaked: only the cursor was closed before
def gen_labels_shipdate(in_path, k):
    """Bucket every row of each '.txt' file under *in_path* into one of k
    date ranges using the date in tab field 11 (YYYY-MM-DD), writing the
    labels to a sibling '.<k>.range.labels' file, one per line.

    Assumes dates fall in 1992-01-01 .. 1998-12-31 (the TPC-H shipdate
    range — confirm against the data generator).
    """
    mindate = dt.date(1992, 1, 1).toordinal()
    maxdate = dt.date(1998, 12, 31).toordinal()
    span = maxdate - mindate
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.txt'):
            continue
        path = f[:f.rfind('/')]
        with open(f, 'r') as fr, \
                open('%s/.%d.%s.labels' % (path, k, 'range'), 'w') as fw:
            print(fw)
            for line in fr:
                date = line.split('\t')[11].split('-')
                val = dt.date(int(date[0]), int(date[1]), int(date[2])).toordinal()
                # '//' keeps integer division; min() fixes the off-by-one
                # where val == maxdate produced label k (out of [0, k-1])
                label = min((val - mindate) * k // span, k - 1)
                fw.write(str(int(label)) + '\n')
def gen_labels(in_path, mode):
    """Write a '.<k>.<mode>.labels' file next to every '.txt' file under
    *in_path*, one label per data line; k is read from the sibling '.k'
    file.

    mode 'random':   uniform label in [0, k-1]
    mode 'keyrange': contiguous blocks of ~line_count/k lines per label
    """
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.txt'):
            continue
        path = f[:f.rfind('/')]
        with open(f, 'r') as fh:
            line_count = sum(1 for line in fh)
        with open(path + '/.k') as fk:
            k = int(fk.read())
        # max(..., 1) guards k > line_count (was a ZeroDivisionError)
        part_len = max(line_count // k, 1)
        with open(f, 'r') as fr, \
                open('%s/.%d.%s.labels' % (path, k, mode), 'w') as fw:
            print(fw)
            for i, line in enumerate(fr):
                if mode == 'random':
                    label = random.randint(0, int(k) - 1)
                elif mode == 'keyrange':
                    # min() fixes the off-by-one: the trailing partial
                    # block (line_count % k lines) used to get label k
                    label = min(i // part_len, k - 1)
                else:
                    print('not implemented for %s' % mode)
                    return
                fw.write(str(label) + '\n')
def gen_labels(in_path, mode):
    """Write a '.<k>.<mode>.labels' file next to every '.txt' file under
    *in_path*, one label per data line; k is read from the sibling '.k'
    file.

    mode 'random':   uniform label in [0, k-1]
    mode 'keyrange': contiguous blocks of ~line_count/k lines per label
    """
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.txt'):
            continue
        path = f[:f.rfind('/')]
        with open(f, 'r') as fh:
            line_count = sum(1 for line in fh)
        with open(path + '/.k') as fk:
            k = int(fk.read())
        # max(..., 1) guards k > line_count (was a ZeroDivisionError)
        part_len = max(line_count // k, 1)
        with open(f, 'r') as fr, \
                open('%s/.%d.%s.labels' % (path, k, mode), 'w') as fw:
            print(fw)
            for i, line in enumerate(fr):
                if mode == 'random':
                    label = random.randint(0, int(k) - 1)
                elif mode == 'keyrange':
                    # min() fixes the off-by-one: the trailing partial
                    # block (line_count % k lines) used to get label k
                    label = min(i // part_len, k - 1)
                else:
                    print('not implemented for %s' % mode)
                    return
                fw.write(str(label) + '\n')
def create_unclustered_index(data_path, db_name, table_name, mode='P'):
    """Create a plain (unclustered) index on every column except column 0
    and any field whose name contains 'comment', for each data file's
    table.

    mode 'P': per-file child tables named '<table_name>_<file>'
    mode 'I': the single table *table_name*

    NOTE: identifiers are interpolated into the SQL text; names must come
    from trusted configuration.
    """
    if mode not in ('P', 'I'):
        # validate up front instead of exiting mid-loop with an open connection
        print('unrecognized mode: %s' % mode)
        sys.exit(1)
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)  # autocommit
    cur = conn.cursor()
    for f in prep.gen_file_list(data_path):
        file_name = f[f.rfind('/') + 1:]
        cur_path = f[:f.rfind('/')]
        t_name = table_name + '_' + file_name if mode == 'P' else table_name
        print(t_name)
        header = prep.get_header(cur_path + '/.header')
        # skip column 0; index every remaining non-comment field
        for col in range(1, len(header)):
            field_name = header[col][0]
            if 'comment' in field_name:
                continue
            idx_name = '%s_%s_%s' % (t_name, str(col), field_name)
            cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
            print('creating (unclustered) index %s' % idx_name)
            cur.execute('CREATE INDEX %s ON %s (%s)' % (idx_name, t_name, field_name))
    cur.close()
    conn.close()  # was leaked: only the cursor was closed before
def create_clustered_index(data_path, db_name, table_name, keys):
    """Create a composite index on keys[table_name] and CLUSTER the table
    on it.

    data_path is walked only to mirror the per-file loop used elsewhere;
    since denormalization every file maps to the single *table_name*, so
    each iteration recreates the same index.

    NOTE: identifiers are interpolated directly into the SQL text;
    psycopg2 cannot parameterize identifiers, so table/column names must
    come from trusted configuration.
    """
    conn = psycopg2.connect("dbname=%s port=11111" % db_name)
    conn.set_isolation_level(0)  # autocommit: CLUSTER cannot run inside a txn block
    cur = conn.cursor()
    for f in prep.gen_file_list(data_path):
        # this change is for denormalization: one shared table, not per-file
        t_name = table_name
        print('%s %s' % (t_name, table_name))
        col_names = ','.join(keys[table_name])
        idx_name = t_name + '_' + col_names.replace(',', '_')
        cur.execute('DROP INDEX IF EXISTS %s' % idx_name)
        print('creating index %s %s' % (t_name, col_names))
        cur.execute('CREATE INDEX %s ON %s (%s)' % (idx_name, t_name, col_names))
        print('clustering index %s' % idx_name)
        cur.execute('CLUSTER %s USING %s' % (t_name, idx_name))
    cur.close()
    conn.close()  # was leaked: only the cursor was closed before
def meta_count(file_path):
    """Run count_values() over every file listed under *file_path*."""
    for data_file in prep.gen_file_list(file_path):
        count_values(data_file)
def shuffle_data(in_path, out_path, mode='copy'):
    # Partition (or plain-copy) every '.txt' data file under in_path into
    # out_path/<mode>/, routing each data line to an output file named
    # after its cluster label.
    #
    # Modes:
    #   'copy'  - copy the data file and header unchanged
    #   'learn' - require a precomputed '.<k>.labels' file; exit if absent
    #   other   - use '.<k>.<mode>.labels'; generate random labels first
    #             if the file is missing
    out_path = out_path.strip('/')
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    # label -> open output handle; shared across ALL input files so lines
    # from different inputs with the same label land in one file
    files = {}
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.txt'):
            continue
        fpath = f[:f.rfind('/')]
        fout_path = out_path + '/' + mode
        if not os.path.exists(fout_path):
            os.makedirs(fout_path)
        print 'shuffling file %s into %s' % (f, fout_path)
        if not os.path.exists(fpath + '/.header'):
            # NOTE: '%' binds before '+', so this formats fpath and then
            # appends '/.header' — the message comes out right by luck
            print 'header file missing: %s' % fpath + '/.header'
            sys.exit(1)
        shu.copy(fpath + '/.header', fout_path)
        if mode == 'copy':
            # copy mode needs no labels or k; next file
            shu.copy(f, fout_path)
            continue
        # NOTE(review): this open() handle is never closed
        k = int(open(fpath + '/.k', 'r').read())
        shu.copy(fpath + '/.k', fout_path)
        if os.path.exists(fpath + '/.columns'):
            shu.copy(fpath + '/.columns', fout_path)
        if k == 1:
            # single cluster: everything goes to one 'whole' file
            shu.copy(f, fout_path + '/whole')
            continue
        if mode == 'learn':
            flabel = '%s/.%s.labels' % (fpath, k)
        else:
            flabel = '%s/.%s.%s.labels' % (fpath, k, mode)
        print flabel
        if not os.path.exists(flabel):
            if mode == 'learn':
                print 'ERROR: no label file found for %s' % f
                sys.exit(1)
            else:
                # side effect: expected to create flabel before we open it
                gen_random_labels(in_path, k, mode)
        # NOTE(review): fd/fl are never closed, and fl.next() raises
        # StopIteration if the label file has fewer lines than the data
        fd = open(f, 'r')
        fl = open(flabel, 'r')
        for line in fd:
            # data file and label file are consumed in lockstep
            label = fl.next().strip()
            if label not in files:
                fw = open('%s/%s' % (fout_path, label), 'w')
                files[label] = fw
            files[label].write(line)
    # close every per-label output once all input files are processed
    for w in files.values():
        w.close()
def shuffle_data(in_path, out_path, mode = 'copy'):
    # Partition (or plain-copy) every '.txt' data file under in_path into
    # out_path/<mode>/, routing each data line to an output file named
    # after its cluster label.
    #
    # Modes:
    #   'copy'  - copy the data file and header unchanged
    #   'learn' - require a precomputed '.<k>.labels' file; exit if absent
    #   other   - use '.<k>.<mode>.labels'; generate random labels first
    #             if the file is missing
    out_path = out_path.strip('/')
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    # label -> open output handle; shared across ALL input files so lines
    # from different inputs with the same label land in one file
    files = {}
    for f in prep.gen_file_list(in_path):
        if not f.endswith('.txt'):
            continue
        fpath = f[:f.rfind('/')]
        fout_path = out_path + '/' + mode
        if not os.path.exists(fout_path):
            os.makedirs(fout_path)
        print 'shuffling file %s into %s' % (f, fout_path)
        if not os.path.exists(fpath + '/.header'):
            # NOTE: '%' binds before '+', so this formats fpath and then
            # appends '/.header' — the message comes out right by luck
            print 'header file missing: %s' % fpath + '/.header'
            sys.exit(1)
        shu.copy(fpath + '/.header', fout_path)
        if mode == 'copy':
            # copy mode needs no labels or k; next file
            shu.copy(f, fout_path)
            continue
        # NOTE(review): this open() handle is never closed
        k = int(open(fpath + '/.k', 'r').read())
        shu.copy(fpath + '/.k', fout_path)
        if os.path.exists(fpath + '/.columns'):
            shu.copy(fpath + '/.columns', fout_path)
        if k == 1:
            # single cluster: everything goes to one 'whole' file
            shu.copy(f, fout_path + '/whole')
            continue
        if mode == 'learn':
            flabel = '%s/.%s.labels' % (fpath, k)
        else:
            flabel = '%s/.%s.%s.labels' % (fpath, k, mode)
        print flabel
        if not os.path.exists(flabel):
            if mode == 'learn':
                print 'ERROR: no label file found for %s' % f
                sys.exit(1)
            else:
                # side effect: expected to create flabel before we open it
                gen_random_labels(in_path, k, mode)
        # NOTE(review): fd/fl are never closed, and fl.next() raises
        # StopIteration if the label file has fewer lines than the data
        fd = open(f, 'r')
        fl = open(flabel, 'r')
        for line in fd:
            # data file and label file are consumed in lockstep
            label = fl.next().strip()
            if label not in files:
                fw = open('%s/%s' % (fout_path, label), 'w')
                files[label] = fw
            files[label].write(line)
    # close every per-label output once all input files are processed
    for w in files.values():
        w.close()