Example #1
def test_insert_value_into():
    md = get_mock_table_md()
    gen = SQLGenerator(md)
    query = gen.insert_values_into(
        ["test", "test", "2020-12-08 20:03:16.759617"])
    expected = "INSERT INTO test.test_table_delta VALUES ('test','test','2020-12-08 20:03:16.759617');"
    assert query.strip() == expected.strip()
Example #2
def test_copy_expert_file_not_exists():
    """Tests that when a file that does not exist is loaded (or tried to) then an exception is raised"""
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    sql_gen = SQLGenerator(table_md=table_md_mock)
    with pytest.raises(FileNotFoundError):
        hook.copy_expert(query=sql_gen.copy_query(),
                         src_path="/tmp/notexists.csv")
Example #3
def test_create_table_query():
    md = get_mock_table_md()
    gen = SQLGenerator(md)
    query = gen.create_table_query()
    expected = """
CREATE TABLE IF NOT EXISTS test.test_table_delta(id varchar(300),event_type varchar(100),event_ts timestamp);
    """
    assert query.strip() == expected.strip()
Example #4
def test_drop_table_query():
    md = get_mock_table_md()
    gen = SQLGenerator(md)
    query = gen.drop_table()
    expected = """
    DROP TABLE IF EXISTS test.test_table_delta;
    """
    assert query.strip() == expected.strip()
Example #5
def test_copy_query():
    md = get_mock_table_md()
    gen = SQLGenerator(md)
    query = gen.copy_query()
    expected = """COPY test.test_table_delta (id,event_type,event_ts) FROM STDIN
    WITH
    DELIMITER ','
    CSV HEADER
    """
    # compare with all spaces removed because whitespace/indentation differences can cause unexpected assertion failures
    assert query.replace(" ", "") == expected.replace(" ", "")
Example #6
def test_upsert_on_id_query():
    md = get_mock_table_md()
    gen = SQLGenerator(md)
    query = gen.upsert_on_id()
    expected = """
    CREATE TABLE IF NOT EXISTS test.test_table (LIKE test.test_table_delta);
    DELETE FROM test.test_table WHERE id
    IN (SELECT id FROM test.test_table_delta);
    INSERT INTO test.test_table SELECT * FROM test.test_table_delta;
"""
    # compare with all spaces removed because whitespace/indentation differences can cause unexpected assertion failures
    assert query.replace(" ", "") == expected.replace(" ", "")
Example #7
    def __load_to_pgres_callback(self, ch, method, properties, body):
        data = json.loads(body)
        # pull the values for the configured fields, in order; SQLGenerator wraps them in
        # single quotes when building the INSERT statement
        row = [data[field] for field in self.fields]
        hook = PgHook()
        sql_gen = SQLGenerator(self.table_md)
        queries = [
            sql_gen.create_table_query(),  # create table if not exists for loading
            sql_gen.insert_values_into(values=row),
        ]
        hook.execute(queries)
        ch.basic_ack(delivery_tag=method.delivery_tag)
Example #8
    def load_to_table(
        self,
        src_path: str,
        table_md: Optional[TableMD] = None,
        table_md_path: Optional[str] = None,
    ) -> None:
        """Load data to a designated table using table metadata yaml file to construct the table. Only CSV format
        is valid.
        :param src_path: Path to the file to load into the database (singular file)
        :type src_path: str
        :param table_md: Parsed YAML file containing table metadata
        :type table_md: Optional[TableMD]
        :param table_md_path: Path to a table metadata YAML file
        :type table_md_path: Optional[str]
        """
        if not table_md:
            table_md = TableMD(table_md_path=table_md_path)
        sql_generator = SQLGenerator(table_md=table_md)
        queries = [
            sql_generator.drop_table(),
            sql_generator.create_table_query(),
        ]
        self.execute(queries)

        self.copy_expert(src_path=src_path, query=sql_generator.copy_query())

        if table_md.delta_params:
            self.execute(sql_generator.upsert_on_id())
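
# Hedged usage sketch (added for illustration, not part of the source): assumes PgHook()
# picks up its default connection settings; both paths below are hypothetical examples.
hook = PgHook()
hook.load_to_table(src_path="/tmp/events.csv", table_md_path="/tmp/events_table_md.yml")
# The call drops and recreates the table, streams the CSV in via the generated COPY query,
# and, when the metadata defines delta_params, merges the delta table into the target
# with upsert_on_id().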
Example #9
def test_copy_expert():
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    sql_gen = SQLGenerator(table_md=table_md_mock)
    header = ["name", "id"]
    row = ["sarah", "fz234kal"]
    input_data = [header, row]

    with NamedTemporaryFile(dir="/tmp",
                            prefix=table_md_mock.load_prefix,
                            mode="w+") as f:
        for line in input_data:
            f.write(",".join(line) + "\n")
            f.flush()
        hook.copy_expert(src_path=f.name, query=sql_gen.copy_query())

    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT * FROM {table_md_mock.schema_name}.{table_md_mock.table_name} WHERE name='sarah';"
            )
            assert list(cur.fetchall()[0]) == row
Example #10
    def batch_load_to_pgres(self):
        """
        An inactivity timeout covers the case where fewer than 5 messages are left in the queue: if no further
        messages arrive within 15 seconds, the timeout kicks in and triggers processing of the batch currently
        held in memory.
        """
        sql_gen = SQLGenerator(self.table_md)
        hook = PgHook()
        hook.execute(sql_gen.create_table_query())

        while True:
            try:
                connection, channel = self.__get_conn()
                batch = []
                # Get five messages and break out.
                for method_frame, properties, body in channel.consume(
                    queue=self.queue, inactivity_timeout=15
                ):

                    # if no more messages exist in the queue, break out of the loop
                    if not method_frame:
                        break
                    data = json.loads(body)
                    row = [data[field] for field in self.fields]
                    batch.append(sql_gen.insert_values_into(values=row))
                    channel.basic_ack(method_frame.delivery_tag)

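                    # delivery tags start at 1 on each fresh channel (a new one is opened per loop),
                    # so tag 5 marks a full batch of five messages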
                    if method_frame.delivery_tag == 5:
                        break
                # Requeue the rest of the prefetched messages after having pulled a batch
                channel.cancel()
                print("processing batch")
                hook.execute(batch)

            # Close the channel and the connection cleanly on interrupt so we don't leave hanging connections
            except KeyboardInterrupt:
                channel.close()
                connection.close()
                raise
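
# Hedged standalone sketch (added for illustration, not part of the source): the same
# batching pattern with pika used directly; the host, queue name, and batch size below
# are assumptions.
import json

import pika

BATCH_SIZE = 5

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()

batch = []
for method_frame, properties, body in channel.consume("events", inactivity_timeout=15):
    if not method_frame:  # inactivity timeout hit: the queue is (temporarily) drained
        break
    batch.append(json.loads(body))
    channel.basic_ack(method_frame.delivery_tag)
    if len(batch) >= BATCH_SIZE:  # stop after a full batch
        break
channel.cancel()  # requeue anything prefetched but not yet delivered to this consumer
connection.close()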
Example #11
def main():
    if len(sys.argv) != 4:
        print(
            'Usage: ./measure.py [nvme | ssd | hdd] [config.json file] [greenplum | postgres]'
        )
        sys.exit()

    disk = sys.argv[1]
    if disk not in ['nvme', 'ssd', 'hdd']:
        print(
            'Usage: ./measure.py [nvme | ssd | hdd] [config.json file] [greenplum | postgres]'
        )
        sys.exit()

    conf_file = sys.argv[2]
    config = read_config(conf_file)
    gen_datafiles(config)

    db = sys.argv[3]
    print(db)

    # csv_headers = ['bench', 'total_cold', 'uda_cold', 'compute_cold',
    #                'data_cold', 'io_cold', 'total_hot', 'uda_hot',
    #                'compute_hot', 'data_hot', 'io_hot']
    csv_headers = [
        'bench', 'total_cold', 'uda_cold', 'compute_cold', 'data_cold',
        'io_cold'
    ]
    csv_rows = []

    # connection and cursor
    conn, cur = connect_to_database(db, measurement_dir, csv_rows, csv_headers)

    print('*' * 32)
    for i, cfg in enumerate(config):
        print('[Bench {:d}] Running benchmark {:s}: '.format(i, cfg['bench']))
        sql = SQLGenerator(cfg, db)
        tablename = sql.tablename
        try:
            csv_row = {}
            csv_row['bench'] = tablename

            ##############
            # Cold Cache #
            ##############
            print('(1) Cold cache run...')

            # Copy data from datafile to postgres table
            if not table_exists(cur, sql.tablename):
                start = datetime.now()
                print('[{}] creating table ({})...'.format(
                    str(start), sql.tablename))
                print(sql.create_table())
                cur.execute(sql.create_table())
                if cfg['bench'] != 'lrmf':
                    cur.execute(
                        'alter table {} alter column features set storage plain;'
                        .format(sql.tablename))
                else:
                    # column val's storage should be plain?
                    cur.execute(
                        'alter table {} alter column val set storage plain;'.
                        format(sql.tablename))
                conn.commit()
                stop = datetime.now()
                elapsed = stop - start
                print('[{}] done creating table. Elapsed: {}'.format(
                    str(datetime.now()), str(elapsed)))
                with open(cfg['filename'], 'r') as f:
                    print('[{}] copying data from datafile ({}) to table...'.
                          format(str(datetime.now()), cfg['filename']))
                    if cfg['bench'] == 'lrmf':
                        start = datetime.now()
                        cur.copy_expert(
                            "COPY " + tablename +
                            " (row, col, val) FROM STDIN CSV", f)
                        conn.commit()
                        stop = datetime.now()
                        elapsed = stop - start
                    else:
                        start = datetime.now()
                        cur.copy_expert(
                            "COPY " + tablename +
                            " (y, features) FROM STDIN CSV", f)
                        conn.commit()
                        stop = datetime.now()
                        elapsed = stop - start
                    print('[{}] done copying data. Elapsed: {}'.format(
                        str(datetime.now()), str(elapsed)))

            #continue # just to copy data

            # # The following actions are done only for cold cache.
            # # Must close connection before restarting.
            # disconnect_from_database(conn, cur)
            # print '[{}] After disconnect db, sleep for 5 sec...'.format(str(datetime.now()))
            # time.sleep(5)
            # print '[{}] done sleeping!'.format(str(datetime.now()))
            # restart_database(db)
            # print '[{}] After restart db, sleep for 5 sec...'.format(str(datetime.now()))
            # time.sleep(5)
            # print '[{}] done sleeping!'.format(str(datetime.now()))
            # conn, cur = connect_to_database(db, measurement_dir, csv_rows, csv_headers)

            # # OS buffer cache before cache flush
            # subprocess.call(['free', '-h'])
            # #flush_os_cache()
            # # buffer cache after flush
            # subprocess.call(['free', '-h'])

            # measure disk IO time
            io = float(measure_io(conn, cur, tablename, db))
            csv_row['io_cold'] = io

            # run madlib
            start = datetime.now()
            print('[{}] Running madlib function {}()...'.format(
                str(start), madlib_func_lookup[sql.bench]))
            print(sql.madlib_func())
            cur.execute(sql.madlib_func())
            conn.commit()
            stop = datetime.now()
            elapsed = stop - start
            print('[{}] Done running madlib function. Elapsed: {}'.format(
                str(stop), str(elapsed)))
            if db == 'greenplum':
                exectime_str = cur.fetchall()[11][0]
            elif db == 'postgres':
                exectime_str = cur.fetchall()[2][0]
            exectime = float(exectime_str.split()[2])

            # Get the uda, compute, and data time from duration.txt
            if db == 'greenplum':
                # if cfg['bench'] == 'lrmf':  # lrmf only uses master node for some reason
                #     transfile = '/home/joon/gpdb-5.1.0/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/trans.txt'
                # else:
                #     transfile = '/home/joon/gpdb-5.1.0/gpAux/gpdemo/datadirs/dbfast1/demoDataDir0/trans.txt'
                #transfile = '/home/joon/gpdb-5.1.0/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/trans.txt'
                transfile = search_trans_file()
                if transfile is None:  # safeguard
                    transfile = '/home/joon/gpdb-5.1.0/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/trans.txt'
                with open(transfile, 'r') as f:
                    lastline = f.read().splitlines()[-1].split(',')
                    uda_cumulative = float(lastline[0]) / 1000.0
                    compute_cumulative = float(lastline[1]) / 1000.0
                duration_file = search_duration_file()
                with open(duration_file, 'r') as f:
                    durations = f.read().splitlines()[-1].split(',')
                    udatime = float(durations[0]) / 1000.0  # us to ms
                    computetime = float(durations[1]) / 1000.0  # us to ms
            elif db == 'postgres':
                #cmd = 'tail -1 /usr/local/pgsql/data/duration.txt'
                cmd = 'tail -1 /home/postgres/duration.txt'
                p = Popen(['sudo', 'su', '-c', cmd, 'postgres'], stdout=PIPE)
                lastline = p.stdout.read()
                durations = lastline.split(',')
                udatime = float(durations[0]) / 1000.0  # us to ms
                computetime = float(durations[1]) / 1000.0  # us to ms
                # delete trans.txt for setting 2 runs
                print('[INFO] Deleting trans.txt file...')
                Popen(['sudo', 'rm', '/usr/local/pgsql/data/trans.txt'])

            # greenplum is weird
            if db == 'greenplum':
                udatime += uda_cumulative
                computetime += compute_cumulative
                print('deleting duration.txt file...' + duration_file)
                os.remove(duration_file)

            # lrmf only runs for 1 epoch, so we need to multiply by
            # however many epochs it's supposed to run for
            # if cfg['bench'] == 'lrmf':
            #     print '[DEBUG] lrmf epoch multiply: ' + str(sql.max_iter)
            #     exectime *= sql.max_iter
            #     udatime *= sql.max_iter
            #     computetime *= sql.max_iter

            data_cold = exectime - (udatime + computetime) - io
            csv_row['total_cold'] = exectime
            csv_row['uda_cold'] = '{:.2f}'.format(udatime)
            csv_row['compute_cold'] = '{:.2f}'.format(computetime)
            csv_row['data_cold'] = '{:.2f}'.format(data_cold)
            print('')

            #############
            # Hot Cache #
            #############
            # print('(2) Hot cache run...')
            # # madlib complains if madlib output tables already exist
            # drop_madlib_tables(conn, cur, sql)

            # # measure disk IO time
            # io = float(measure_io(conn, cur, tablename, db))
            # csv_row['io_hot'] = io

            # # run madlib
            # start = datetime.now()
            # print('[{}] Running madlib function {}()...'.format(str(start), madlib_func_lookup[sql.bench]))
            # cur.execute(sql.madlib_func())
            # conn.commit()
            # stop = datetime.now()
            # elapsed = stop - start
            # print('[{}] Done running madlib function. Elapsed: {}'.format(str(stop), elapsed.__str__()))

            # if db == 'greenplum':
            #     exectime_str = cur.fetchall()[11][0]
            # elif db == 'postgres':
            #     exectime_str = cur.fetchall()[2][0]
            # exectime = float(exectime_str.split()[2])

            # # Get the uda, compute, and data time from duration.txt
            # if db == 'greenplum':
            #     if cfg['bench'] == 'lrmf':  # lrmf only uses master node for some reason
            #         transfile = '/home/joon/gpdb-5.1.0/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/trans.txt'
            #     else:
            #         transfile = '/home/joon/gpdb-5.1.0/gpAux/gpdemo/datadirs/dbfast1/demoDataDir0/trans.txt'
            #     with open(transfile, 'r') as f:
            #         lastline = f.read().splitlines()[-1].split(',')
            #         uda_cumulative = float(lastline[0]) / 1000.0
            #         compute_cumulative = float(lastline[1]) / 1000.0
            #     duration_file = search_duration_file()
            #     with open(duration_file, 'r') as f:
            #         durations = f.read().splitlines()[-1].split(',')
            #         udatime = float(durations[0]) / 1000.0  # us to ms
            #         computetime = float(durations[1]) / 1000.0  # us to ms
            # elif db == 'postgres':
            #     #cmd = 'tail -1 /usr/local/pgsql/data/duration.txt'
            #     cmd = 'tail -1 /home/postgres/duration.txt'
            #     p = Popen(['sudo', 'su', '-c', cmd, 'postgres'], stdout=PIPE)
            #     lastline = p.stdout.read()
            #     durations = lastline.split(',')
            #     udatime = float(durations[0]) / 1000.0  # us to ms
            #     computetime = float(durations[1]) / 1000.0  # us to ms
            #     # delete trans.txt for setting 2 runs
            #     print '[INFO] Deleting trans.txt file...'
            #     Popen(['sudo', 'rm', '/usr/local/pgsql/data/trans.txt'])

            # # greenplum is weird
            # if db == 'greenplum':
            #     udatime += uda_cumulative
            #     computetime += compute_cumulative
            #     print('deleting duration.txt file...' + duration_file)
            #     print('')
            #     os.remove(duration_file)

            # # lrmf only runs for 1 epoch, so we need to multiply by
            # # however many epochs it's supposed to run for
            # # if cfg['bench'] == 'lrmf':
            # #     exectime *= sql.max_iter
            # #     udatime *= sql.max_iter
            # #     computetime *= sql.max_iter

            # data_hot = exectime - (udatime + computetime) - io
            # csv_row['total_hot'] = exectime
            # csv_row['uda_hot'] = '{:.2f}'.format(udatime)
            # csv_row['compute_hot'] = '{:.2f}'.format(computetime)
            # csv_row['data_hot'] = '{:.2f}'.format(data_hot)

            csv_rows.append(csv_row)
            drop_madlib_tables(conn, cur, sql)
            conn.commit()
            print('*' * 32)
        except psycopg2.Error as e:
            print("[EXCEPTION] unable to execute query")
            print(e.pgerror)
            filename = gen_filename(conn, cur)
            write_to_file(filename, measurement_dir, csv_rows, csv_headers)
            try:
                sys.exit(0)
            except SystemExit:
                os._exit(0)
        except KeyboardInterrupt:
            print('Keyboard interrupt')
            write_to_file(disk, measurement_dir, csv_rows, csv_headers)
            try:
                sys.exit(0)
            except SystemExit:
                os._exit(0)
        # finally:
        #     csv_rows.append(csv_row)
        #     # drop madlib and data tables
        #     drop_madlib_tables(conn, cur, sql)
        #     # cur.execute(sql.drop_table(sql.tablename))
        #     conn.commit()
        #     print('*' * 32)
        #     # csv_rows.append(csv_row)

    filename = gen_filename(conn, cur)
    write_to_file(filename, measurement_dir, csv_rows, csv_headers)
Example #12
                                host='/tmp/',
                                password='******')
    except psycopg2.Error as e:
        print("[EXCEPTION] unable to conenct to database")
        print(e.pgerror)
        exit()
    cur = conn.cursor()

    config = measure.read_config(
        '/home/joon/tabla.db/measurements/config_set2.json')

    csv_headers = ['bench', 'tablesize', 'pagecount']
    csv_rows = []

    for cfg in config:
        sql = SQLGenerator(cfg)
        cur.execute(sql.create_table())

        csv_row = {}
        with open(cfg['filename'], 'r') as f:
            if cfg['bench'] == 'lrmf':
                cur.copy_expert(
                    "COPY " + sql.tablename +
                    " (row, col, val) FROM STDIN CSV", f)
            else:
                cur.execute(
                    'alter table {} alter column features set storage plain;'.
                    format(sql.tablename))
                cur.copy_expert(
                    "COPY " + sql.tablename + " (y, features) FROM STDIN CSV",
                    f)