def gen_query():
    """Generate uniform query workloads from a data file.

    Command-line usage: argv[1] = input data file, argv[2] = output
    file prefix, argv[3] = number of queries.  Writes two workloads,
    one suffixed "row" and one suffixed "key".
    """
    in_file = sys.argv[1]
    out_prefix = sys.argv[2]
    n_queries = int(sys.argv[3])
    [header, data] = discretize.load_file(in_file)
    gen_queries_uniform_row(data, out_prefix + "row", n_queries)
    gen_queries_uniform_key(data, out_prefix + "key", n_queries)
def gen_query():
    """Build "row" and "key" uniform query sets from the CLI arguments.

    argv[1] is the raw data file, argv[2] the output prefix, and
    argv[3] the number of queries to generate per workload.
    """
    [header, data] = discretize.load_file(sys.argv[1])
    count = int(sys.argv[3])
    # Emit one workload per sampling strategy, distinguished by suffix.
    for suffix, generate in (("row", gen_queries_uniform_row),
                             ("key", gen_queries_uniform_key)):
        generate(data, sys.argv[2] + suffix, count)
def create_partitioned_db(file_name, force_split): # split this big into separate files print 'populating data to db' fname = ROOT_PATH + RAWDATA_PATH + file_name [header, data] = discretize.load_file(fname) labels = list(set(data[:,-1])) force_name = ("_sp_" + str(force_split)) if force_split > 0 else "" data_dir = file_name + force_name + ".splits" print data_dir if os.path.exists(data_dir): print "seems the directory: %s exists already..." % data_dir print "skipping this step" else: os.mkdir(data_dir) if force_split > 0: interval = int(math.ceil(data.shape[0] * 1.0 / force_split)) for i in range(force_split): subset = np.array(data[i*interval:min((i+1)*interval, data.shape[0]), :]) print subset.shape fw = open(data_dir + "/" + file_name + "_" + str(i), 'w') for row in subset: if len(row) != len(DB_SCHEMA.split(",")): print "unmatched length!" fw.write(','.join(row) + '\n') fw.close() else: print 'partition based on labels' for x in labels: print 'creating partition %s' % x # subset = np.array(list(list(y) for y in data if y[-1] == x)) print 'created subset' fw = open(data_dir + "/" + file_name + "_" + str(x), 'w') print 'opened file' for row in data: if row[-1] != x: continue if len(row) != len(DB_SCHEMA.split(",")): print "unmatched length!" fw.write(','.join(row) + '\n') fw.close() # load each file into db as a table dbname = file_name + force_name if not execute_status('pg_ctl start -D %s -l pg.log -o "-p 11111"' % PGDATA_PATH): print "databaes failed to start..." 
sys.exit(1) sleep(2) conn = psycopg2.connect("dbname=postgres port=11111") conn.set_isolation_level(0) # psql_proc = subprocess.Popen([POSTGRES_BIN_PATH+"psql", "-p 11111", "postgres"], # stdin=subprocess.PIPE, # stdout=subprocess.PIPE, # universal_newlines=True) cur = conn.cursor() cur.execute("DROP DATABASE IF EXISTS %s;" % dbname) cur.execute("CREATE DATABASE %s;" % dbname) cur.close() conn.close() conn = psycopg2.connect("dbname=%s port=11111" % dbname) conn.set_isolation_level(0) cur = conn.cursor() tables = os.listdir(data_dir) # create master table master_table = dbname cur.execute("DROP TABLE IF EXISTS %s;" % master_table) cur.execute("CREATE TABLE %s %s;" % (master_table, DB_SCHEMA)) for data_file in tables: cur.execute("DROP TABLE IF EXISTS %s;" % data_file) cur.execute("CREATE TABLE %s () INHERITS (%s);" % (data_file, master_table)) cur.execute("COPY %s FROM '%s' WITH (FORMAT CSV);" % (data_file, ROOT_PATH + RAWDATA_PATH + data_dir + "/" + data_file)) cur.close() conn.close() if not execute_status('pg_ctl stop -D %s' % PGDATA_PATH): print "databaes failed to stop..." sys.exit(1) return [header, data, dbname, tables]
def create_partitioned_db(file_name, force_split): # split this big into separate files print 'populating data to db' fname = ROOT_PATH + RAWDATA_PATH + file_name [header, data] = discretize.load_file(fname) labels = list(set(data[:, -1])) force_name = ("_sp_" + str(force_split)) if force_split > 0 else "" data_dir = file_name + force_name + ".splits" print data_dir if os.path.exists(data_dir): print "seems the directory: %s exists already..." % data_dir print "skipping this step" else: os.mkdir(data_dir) if force_split > 0: interval = int(math.ceil(data.shape[0] * 1.0 / force_split)) for i in range(force_split): subset = np.array( data[i * interval:min((i + 1) * interval, data.shape[0]), :]) print subset.shape fw = open(data_dir + "/" + file_name + "_" + str(i), 'w') for row in subset: if len(row) != len(DB_SCHEMA.split(",")): print "unmatched length!" fw.write(','.join(row) + '\n') fw.close() else: print 'partition based on labels' for x in labels: print 'creating partition %s' % x # subset = np.array(list(list(y) for y in data if y[-1] == x)) print 'created subset' fw = open(data_dir + "/" + file_name + "_" + str(x), 'w') print 'opened file' for row in data: if row[-1] != x: continue if len(row) != len(DB_SCHEMA.split(",")): print "unmatched length!" fw.write(','.join(row) + '\n') fw.close() # load each file into db as a table dbname = file_name + force_name if not execute_status( 'pg_ctl start -D %s -l pg.log -o "-p 11111"' % PGDATA_PATH): print "databaes failed to start..." 
sys.exit(1) sleep(2) conn = psycopg2.connect("dbname=postgres port=11111") conn.set_isolation_level(0) # psql_proc = subprocess.Popen([POSTGRES_BIN_PATH+"psql", "-p 11111", "postgres"], # stdin=subprocess.PIPE, # stdout=subprocess.PIPE, # universal_newlines=True) cur = conn.cursor() cur.execute("DROP DATABASE IF EXISTS %s;" % dbname) cur.execute("CREATE DATABASE %s;" % dbname) cur.close() conn.close() conn = psycopg2.connect("dbname=%s port=11111" % dbname) conn.set_isolation_level(0) cur = conn.cursor() tables = os.listdir(data_dir) # create master table master_table = dbname cur.execute("DROP TABLE IF EXISTS %s;" % master_table) cur.execute("CREATE TABLE %s %s;" % (master_table, DB_SCHEMA)) for data_file in tables: cur.execute("DROP TABLE IF EXISTS %s;" % data_file) cur.execute("CREATE TABLE %s () INHERITS (%s);" % (data_file, master_table)) cur.execute( "COPY %s FROM '%s' WITH (FORMAT CSV);" % (data_file, ROOT_PATH + RAWDATA_PATH + data_dir + "/" + data_file)) cur.close() conn.close() if not execute_status('pg_ctl stop -D %s' % PGDATA_PATH): print "databaes failed to stop..." sys.exit(1) return [header, data, dbname, tables]