def test_insert_value_into():
    """The generated INSERT statement matches the expected SQL verbatim."""
    generator = SQLGenerator(get_mock_table_md())
    produced = generator.insert_values_into(
        ["test", "test", "2020-12-08 20:03:16.759617"])
    expected = ("INSERT INTO test.test_table_delta "
                "VALUES ('test','test','2020-12-08 20:03:16.759617');")
    assert produced.strip() == expected.strip()
def test_copy_expert_file_not_exists():
    """Loading a non-existent source file must raise FileNotFoundError."""
    sql_gen = SQLGenerator(table_md=table_metadata_mock())
    with pytest.raises(FileNotFoundError):
        PgHook().copy_expert(query=sql_gen.copy_query(),
                             src_path="/tmp/notexists.csv")
def test_create_table_query():
    """CREATE TABLE statement is built correctly from the mocked metadata."""
    generator = SQLGenerator(get_mock_table_md())
    expected = """ CREATE TABLE IF NOT EXISTS test.test_table_delta(id varchar(300),event_type varchar(100),event_ts timestamp); """
    # Strip both sides so surrounding whitespace in the literal is ignored.
    assert generator.create_table_query().strip() == expected.strip()
def test_drop_table_query():
    """DROP TABLE statement is built correctly from the mocked metadata."""
    generator = SQLGenerator(get_mock_table_md())
    expected = """ DROP TABLE IF EXISTS test.test_table_delta; """
    # Strip both sides so surrounding whitespace in the literal is ignored.
    assert generator.drop_table().strip() == expected.strip()
def test_copy_query():
    """COPY ... FROM STDIN statement matches the expected SQL, ignoring spaces."""
    generator = SQLGenerator(get_mock_table_md())
    produced = generator.copy_query()
    expected = """COPY test.test_table_delta (id,event_type,event_ts) FROM STDIN WITH DELIMITER ',' CSV HEADER """
    # Spaces are removed from both sides because incidental whitespace in the
    # generated SQL would otherwise cause spurious assertion failures.
    assert produced.replace(" ", "") == expected.replace(" ", "")
def test_upsert_on_id_query():
    """Upsert produces the create-like / delete-matching-ids / insert sequence."""
    generator = SQLGenerator(get_mock_table_md())
    produced = generator.upsert_on_id()
    expected = """ CREATE TABLE IF NOT EXISTS test.test_table (LIKE test.test_table_delta); DELETE FROM test.test_table WHERE id IN (SELECT id FROM test.test_table_delta); INSERT INTO test.test_table SELECT * FROM test.test_table_delta; """
    # Spaces are removed from both sides because incidental whitespace in the
    # generated SQL would otherwise cause spurious assertion failures.
    assert produced.replace(" ", "") == expected.replace(" ", "")
def __load_to_pgres_callback(self, ch, method, properties, body):
    """Message-delivery callback: insert one JSON message into Postgres, then ack.

    :param ch: channel the message arrived on (used only for the ack)
    :param method: delivery metadata carrying the delivery tag
    :param properties: message properties (unused)
    :param body: raw JSON payload of the message
    """
    payload = json.loads(body)
    # Pull the configured fields out of the payload in declaration order.
    values = [payload[field] for field in self.fields]
    generator = SQLGenerator(self.table_md)
    PgHook().execute([
        generator.create_table_query(),  # create table if not exists for loading
        generator.insert_values_into(values=values),
    ])
    # Acknowledge only after the insert succeeded.
    ch.basic_ack(delivery_tag=method.delivery_tag)
def load_to_table(
    self,
    src_path: str,
    table_md: Optional[TableMD] = None,
    table_md_path: Optional[str] = None,
) -> None:
    """Load data to a designated table using table metadata yaml file to
    construct the table. Only CSV format is valid.

    :param src_path: Path to the file to load into the database (singular file)
    :type src_path: str
    :param table_md: Parsed YAML file containing table metadata
    :type table_md: Optional[TableMD]
    :param table_md_path: Path to a table metadata YAML file
    :type table_md_path: Optional[str]
    """
    if not table_md:
        # Fall back to parsing the metadata file when no parsed object given.
        table_md = TableMD(table_md_path=table_md_path)
    generator = SQLGenerator(table_md=table_md)
    # Recreate the table from scratch, then bulk-load the CSV via COPY.
    self.execute([generator.drop_table(), generator.create_table_query()])
    self.copy_expert(src_path=src_path, query=generator.copy_query())
    # Delta-configured tables additionally get merged into the base table.
    if table_md.delta_params:
        self.execute(generator.upsert_on_id())
def test_copy_expert():
    """Round-trip: write a CSV temp file, COPY it in, and read the row back."""
    hook = PgHook()
    md = table_metadata_mock()
    sql_gen = SQLGenerator(table_md=md)
    data_row = ["sarah", "fz234kal"]
    csv_lines = [["name", "id"], data_row]
    with NamedTemporaryFile(dir="/tmp", prefix=md.load_prefix, mode="w+") as tmp:
        tmp.write("".join(",".join(cells) + "\n" for cells in csv_lines))
        tmp.flush()  # make the contents visible to the COPY reader
        hook.copy_expert(src_path=tmp.name, query=sql_gen.copy_query())
        with hook.get_conn() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    f"SELECT * FROM {md.schema_name}.{md.table_name} WHERE name='sarah';"
                )
                assert list(cur.fetchall()[0]) == data_row
def batch_load_to_pgres(self):
    """Consume queue messages in batches of up to five and insert them into
    Postgres.

    An inactivity timeout covers the case where fewer than 5 messages are
    left in the queue: if no further messages arrive within 15 seconds, the
    timeout kicks in and the batch currently held in memory is processed.

    Loops forever; a KeyboardInterrupt closes the channel and connection
    cleanly before re-raising.
    """
    sql_gen = SQLGenerator(self.table_md)
    hook = PgHook()
    # Ensure the destination table exists before any inserts run.
    hook.execute(sql_gen.create_table_query())
    while True:
        try:
            connection, channel = self.__get_conn()
            batch = []
            # Get five messages and break out.
            for method_frame, properties, body in channel.consume(
                queue=self.queue, inactivity_timeout=15
            ):
                # method_frame is falsy when the inactivity timeout fired,
                # i.e. no more messages exist in the queue right now.
                if not method_frame:
                    break
                data = json.loads(body)
                row = [data[field] for field in self.fields]
                batch.append(sql_gen.insert_values_into(values=row))
                channel.basic_ack(method_frame.delivery_tag)
                # NOTE(review): assumes delivery tags start at 1 on the channel
                # returned by __get_conn(), so tag 5 == fifth message of this
                # batch — confirm __get_conn() opens a fresh channel each loop.
                if method_frame.delivery_tag == 5:
                    break
            # Requeing the rest of the messages after having pulled a batch
            channel.cancel()
            print("processing batch")
            hook.execute(batch)
        # Close the channel and the connection safely when interrupting so we
        # don't get hanging connections
        except KeyboardInterrupt:
            # safely
            channel.close()
            connection.close()
            raise
def main():
    """Benchmark driver: load datafiles, run MADlib functions, record timings.

    Usage: ./measure.py [nvme | ssd | hdd] [config.json file] [greenplum | postgres]

    For each benchmark entry in the JSON config this (1) creates and populates
    the input table from its datafile if it does not already exist, (2)
    measures disk IO time, (3) runs the MADlib function and splits its wall
    time into UDA / compute / data components using the duration files the
    instrumented database writes, and (4) appends one CSV row per benchmark,
    written out at the end (or on error/interrupt).
    """
    if len(sys.argv) != 4:
        print(
            'Usage: ./measure.py [nvme | ssd | hdd] [config.json file] [greenplum | postgres]'
        )
        sys.exit()
    disk = sys.argv[1]
    if disk not in ['nvme', 'ssd', 'hdd']:
        print(
            'Usage: ./measure.py [nvme | ssd | hdd] [config.json file] [greenplum | postgres]'
        )
        sys.exit()
    conf_file = sys.argv[2]
    config = read_config(conf_file)
    gen_datafiles(config)
    db = sys.argv[3]
    # Fixed: was the Python 2 statement form `print db`, inconsistent with the
    # print() calls used everywhere else in this function.
    print(db)
    # Only cold-cache columns are recorded; the '*_hot' columns were dropped
    # together with the disabled hot-cache run below.
    csv_headers = [
        'bench', 'total_cold', 'uda_cold', 'compute_cold', 'data_cold',
        'io_cold'
    ]
    csv_rows = []
    # connection and cursor
    conn, cur = connect_to_database(db, measurement_dir, csv_rows, csv_headers)
    print('*' * 32)
    for i, cfg in enumerate(config):
        print('[Bench {:d}] Running benchmark {:s}: '.format(i, cfg['bench']))
        sql = SQLGenerator(cfg, db)
        tablename = sql.tablename
        try:
            csv_row = {}
            csv_row['bench'] = tablename

            ##############
            # Cold Cache #
            ##############
            print('(1) Cold cache run...')
            # Copy data from the datafile into the table, once per table.
            if not table_exists(cur, sql.tablename):
                start = datetime.now()
                print('[{}] creating table ({})...'.format(
                    str(start), sql.tablename))
                print(sql.create_table())
                cur.execute(sql.create_table())
                # Force PLAIN storage on the wide column so its pages stay
                # in-line rather than TOASTed.
                if cfg['bench'] != 'lrmf':
                    cur.execute(
                        'alter table {} alter column features set storage plain;'
                        .format(sql.tablename))
                elif cfg['bench'] == 'lrmf':
                    # column val's storage should be plain?
                    cur.execute(
                        'alter table {} alter column val set storage plain;'.
                        format(sql.tablename))
                conn.commit()
                stop = datetime.now()
                elapsed = stop - start
                print('[{}] done creating table. Elapsed: {}'.format(
                    str(datetime.now()), elapsed.__str__()))
                with open(cfg['filename'], 'r') as f:
                    print('[{}] copying data from datafile ({}) to table...'.
                          format(str(datetime.now()), cfg['filename']))
                    # lrmf datafiles are (row, col, val) triples; all other
                    # benches are (label, feature-vector) pairs.
                    if cfg['bench'] == 'lrmf':
                        start = datetime.now()
                        cur.copy_expert(
                            "COPY " + tablename +
                            " (row, col, val) FROM STDIN CSV", f)
                        conn.commit()
                        stop = datetime.now()
                        elapsed = stop - start
                    else:
                        start = datetime.now()
                        cur.copy_expert(
                            "COPY " + tablename +
                            " (y, features) FROM STDIN CSV", f)
                        conn.commit()
                        stop = datetime.now()
                        elapsed = stop - start
                    print('[{}] done copying data. Elapsed: {}'.format(
                        str(datetime.now()), elapsed.__str__()))
            # NOTE(review): a previously disabled step restarted the database
            # and flushed the OS buffer cache here to guarantee a truly cold
            # cache; re-enable if strict cold-cache numbers are needed.

            # measure disk IO time
            io = float(measure_io(conn, cur, tablename, db))
            csv_row['io_cold'] = io

            # run madlib
            start = datetime.now()
            print('[{}] Running madlib function {}()...'.format(
                str(start), madlib_func_lookup[sql.bench]))
            # Fixed: was the Python 2 statement form `print sql.madlib_func()`.
            print(sql.madlib_func())
            cur.execute(sql.madlib_func())
            conn.commit()
            stop = datetime.now()
            elapsed = stop - start
            print('[{}] Done running madlib function. Elapsed: {}'.format(
                str(stop), elapsed.__str__()))
            # The result row holding the "Execution time: X ms" text sits at a
            # different offset per database.
            if db == 'greenplum':
                exectime_str = cur.fetchall()[11][0]
            elif db == 'postgres':
                exectime_str = cur.fetchall()[2][0]
            exectime = float(exectime_str.split()[2])

            # Get the uda, compute, and data time from duration.txt
            if db == 'greenplum':
                transfile = search_trans_file()
                if transfile is None:
                    # safeguard: fall back to the demo master data directory
                    transfile = '/home/joon/gpdb-5.1.0/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/trans.txt'
                with open(transfile, 'r') as f:
                    lastline = f.read().splitlines()[-1].split(',')
                uda_cumulative = float(lastline[0]) / 1000.0
                compute_cumulative = float(lastline[1]) / 1000.0
                duration_file = search_duration_file()
                with open(duration_file, 'r') as f:
                    durations = f.read().splitlines()[-1].split(',')
                udatime = float(durations[0]) / 1000.0  # us to ms
                computetime = float(durations[1]) / 1000.0  # us to ms
            elif db == 'postgres':
                cmd = 'tail -1 /home/postgres/duration.txt'
                # Read the last duration line as the postgres OS user.
                p = Popen(['sudo', 'su', '-c', cmd, 'postgres'], stdout=PIPE)
                lastline = p.stdout.read()
                durations = lastline.split(',')
                udatime = float(durations[0]) / 1000.0  # us to ms
                computetime = float(durations[1]) / 1000.0  # us to ms
                # delete trans.txt for setting 2 runs
                # Fixed: was the Python 2 statement form `print '...'`.
                print('[INFO] Deleting trans.txt file...')
                Popen(['sudo', 'rm', '/usr/local/pgsql/data/trans.txt'])

            # greenplum is weird: duration.txt alone is incomplete, so add the
            # cumulative totals taken from trans.txt above.
            if db == 'greenplum':
                udatime += uda_cumulative
                computetime += compute_cumulative
                print('deleting duration.txt file...' + duration_file)
                os.remove(duration_file)

            # NOTE(review): lrmf runs a single epoch; scaling exectime/udatime/
            # computetime by max_iter was considered here and left disabled.
            data_cold = exectime - (udatime + computetime) - io
            csv_row['total_cold'] = exectime
            csv_row['uda_cold'] = '{:.2f}'.format(udatime)
            csv_row['compute_cold'] = '{:.2f}'.format(computetime)
            csv_row['data_cold'] = '{:.2f}'.format(data_cold)
            # Fixed: was the Python 2 statement form `print ''`.
            print('')

            #############
            # Hot Cache #
            #############
            # (2) The hot-cache run — identical to the cold run but without a
            # database restart, filling the '*_hot' columns — is currently
            # disabled; only cold-cache measurements are reported.

            csv_rows.append(csv_row)
            drop_madlib_tables(conn, cur, sql)
            conn.commit()
            print('*' * 32)
        except psycopg2.Error as e:
            # Flush whatever has been collected so far before bailing out.
            print("[EXCEPTION] unable to execute query")
            print(e.pgerror)
            filename = gen_filename(conn, cur)
            write_to_file(filename, measurement_dir, csv_rows, csv_headers)
            try:
                sys.exit(0)
            except SystemExit:
                os._exit(0)
        except KeyboardInterrupt:
            print('Keyboard interrupt')
            write_to_file(disk, measurement_dir, csv_rows, csv_headers)
            try:
                sys.exit(0)
            except SystemExit:
                os._exit(0)
    # Normal completion: write all collected rows out once.
    filename = gen_filename(conn, cur)
    write_to_file(filename, measurement_dir, csv_rows, csv_headers)
host='/tmp/', password='******') except psycopg2.Error as e: print("[EXCEPTION] unable to conenct to database") print(e.pgerror) exit() cur = conn.cursor() config = measure.read_config( '/home/joon/tabla.db/measurements/config_set2.json') csv_headers = ['bench', 'tablesize', 'pagecount'] csv_rows = [] for cfg in config: sql = SQLGenerator(cfg) cur.execute(sql.create_table()) csv_row = {} with open(cfg['filename'], 'r') as f: if cfg['bench'] == 'lrmf': cur.copy_expert( "COPY " + sql.tablename + " (row, col, val) FROM STDIN CSV", f) else: cur.execute( 'alter table {} alter column features set storage plain;'. format(sql.tablename)) cur.copy_expert( "COPY " + sql.tablename + " (y, features) FROM STDIN CSV", f)