Example #1
    def __init__(self, s3key, s3sql, use_pandas, name, query_plan,
                 log_enabled):
        """Creates a new Table Scan operator using the given s3 object key and s3 select sql

        :param s3key: The object key to select against
        :param s3sql: The s3 select sql
        """

        super(SQLPandasTableScan, self).__init__(name, SQLTableScanMetrics(),
                                                 query_plan, log_enabled)

        self.s3 = query_plan.s3

        self.s3key = s3key
        self.s3sql = s3sql

        self.use_pandas = use_pandas
Example #2
    def __init__(self, s3key, s3sql, name, parts, partitioning_key, query_plan, log_enabled):
        """Creates a new Table Scan operator that executes the given query on the table given in s3key in parallel.
        The parallelism factor is passed in the parts parameter. The table partitioning will be based on the key
        passed in split_on_key parameter

        :param s3key: The object key to select against
        :param s3sql: The s3 select sql
        :param parts: The parallelism factor (number of downloading threads)
        :param partitioning_key: The key used to partition the table records in order to distributed over workers
        """

        super(SQLMultiprocessingParallelTableScan, self).__init__(name, SQLTableScanMetrics(), query_plan, log_enabled)

        self.s3key = s3key
        self.s3sql = s3sql
        self.parts = parts
        self.partitioning_key = partitioning_key
        self.log_enabled = log_enabled
        self.ranges = []
        self.is_streamed = False

        # Determine the range of the partitioning key values so that the splitting
        # intervals can be computed.
        # NOTE: range_sql is currently unused; the key bounds below are hardcoded
        # for the TPC-H lineitem table (l_orderkey).
        range_sql = '''
                    SELECT MIN(CAST({} AS int)) AS min_key, MAX(CAST({} AS int)) AS max_key
                    FROM S3Object
        '''.format(self.partitioning_key, self.partitioning_key)

        self.min_key = 1        # l_orderkey min value
        self.max_key = 6000000  # l_orderkey max value

        # Split the inclusive range [min_key, max_key] into `parts` contiguous intervals.
        self.part_size = (self.max_key - self.min_key + 1) // self.parts
        for part in range(self.parts):
            start = self.min_key + part * self.part_size
            end = start + self.part_size - 1
            self.ranges.append((start, end))
        # The last interval absorbs any remainder of the keyspace.
        self.ranges[-1] = (self.ranges[-1][0], self.max_key)

        # Shared, process-safe collections for the downloaded records and per-worker metrics.
        manager = Manager()
        self.records = manager.list()
        self.worker_metrics = manager.dict()
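
A minimal standalone sketch of the range-splitting logic above, with compute_ranges as a hypothetical helper (the l_orderkey bounds and a parallelism factor of 4 are just illustration):

def compute_ranges(min_key, max_key, parts):
    """Split the inclusive range [min_key, max_key] into `parts` contiguous intervals."""
    part_size = (max_key - min_key + 1) // parts
    ranges = []
    for part in range(parts):
        start = min_key + part * part_size
        ranges.append((start, start + part_size - 1))
    # The last interval absorbs any remainder of the keyspace.
    ranges[-1] = (ranges[-1][0], max_key)
    return ranges

print(compute_ranges(1, 6000000, 4))
# [(1, 1500000), (1500001, 3000000), (3000001, 4500000), (4500001, 6000000)]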
Example #3
    def __init__(self, s3key, s3sql, name, parts, log_enabled):
        """Creates a new Table Scan operator that executes the given query on the table given in s3key in parallel.
        The parallelism factor is passed in the parts parameter. The table partitioning will be based on the key
        passed in split_on_key parameter

        :param s3key: The object key to select against
        :param s3sql: The s3 select sql
        :param parts: The parallelism factor (number of downloading threads)
        """

        super(SQLRayParallelShardedTableScan, self).__init__(name, SQLTableScanMetrics(), log_enabled)

        self.s3key = s3key
        self.s3sql = s3sql
        self.parts = parts
        self.log_enabled = log_enabled
        self.ranges = []
        self.is_streamed = False
        self.records = []
        self.worker_metrics = {}

        # Start the Ray runtime so shard downloads can run as remote tasks.
        ray.init()
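
For orientation, a hedged sketch of the Ray fan-out this operator implies; download_shard is a hypothetical stand-in for the per-shard task (a real task would run the S3 Select query, as download_part does in the examples below):

import ray

ray.init()

@ray.remote
def download_shard(s3sql, part, part_key):
    # Placeholder body: run the S3 Select query for one shard and return its rows.
    return part, []

futures = [download_shard.remote('select * from S3Object s', part, 'shard_{}.csv'.format(part))
           for part in range(4)]
for part, rows in ray.get(futures):
    print('shard {} returned {} rows'.format(part, len(rows)))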
Example #4
    def __init__(self, s3key, s3sql, use_pandas, secure, use_native, name,
                 shards, shard_prefix, parallel_shards, query_plan,
                 log_enabled):
        """Creates a new Table Scan operator using the given s3 object key and s3 select sql
        :param s3key: The object key to select against
        :param s3sql: The s3 select sql
        """

        super(SQLShardedTableScan, self).__init__(name, SQLTableScanMetrics(),
                                                  query_plan, log_enabled)

        self.s3key = s3key
        self.s3sql = s3sql
        self.secure = secure
        self.field_names = None

        self.use_pandas = use_pandas
        self.use_native = use_native
        self.shard_prefix = shard_prefix
        self.shards = shards
        self.parallel_shards = parallel_shards
        self.query_plan = query_plan

        self.shard_scanner_ops = []

        # Create one child table scan per shard and route its output into this operator.
        for shard in shards:
            shard_key_name = self.get_part_key(shard)
            shard_table_scanner = self.query_plan.add_operator(
                SQLTableScan(shard_key_name, self.s3sql, self.use_pandas,
                             self.secure, self.use_native,
                             "shard_table_scan_{}".format(shard),
                             self.query_plan, self.log_enabled))
            shard_table_scanner.connect(self)
            self.shard_scanner_ops.append(shard_table_scanner)

            if self.query_plan.is_async and self.parallel_shards:
                shard_table_scanner.init_async(self.query_plan.queue)
Example #5
    def download_part(self, part, records_queue, stats):
        """Downloads one part of the table by restricting the query to this part's key range."""
        print('Started downloading part {}'.format(part))
        part_range = self.ranges[part]
        # Append this part's key-range predicate to the shared query; assumes
        # self.s3sql already ends in a WHERE clause.
        part_sql = self.s3sql + ' AND CAST({} AS int) >= {} AND CAST({} AS int) <= {}'.format(
            self.partitioning_key, part_range[0], self.partitioning_key, part_range[1])

        op_metrics = SQLTableScanMetrics()
        op_metrics.timer_start()

        cur = Cursor(self.query_plan.s3).select(self.s3key, part_sql)

        tuples = cur.execute()

        op_metrics.query_bytes = cur.query_bytes
        op_metrics.time_to_first_response = op_metrics.elapsed_time()

        for t in tuples:
            op_metrics.rows_returned += 1
            tuple_msg = TupleMessage(Tuple(t))
            records_queue.append(tuple_msg)

        del tuples

        op_metrics.bytes_scanned = cur.bytes_scanned
        op_metrics.bytes_processed = cur.bytes_processed
        op_metrics.bytes_returned = cur.bytes_returned
        op_metrics.time_to_first_record_response = cur.time_to_first_record_response
        op_metrics.time_to_last_record_response = cur.time_to_last_record_response

        op_metrics.timer_stop()
        stats[part] = op_metrics

        print('Finished downloading part {} read {} records'.format(part, op_metrics.rows_returned))
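
For example, with self.s3sql set to the placeholder query "select * from S3Object s where s.l_quantity < 10", partitioning_key 'l_orderkey', and part_range (1, 1500000), the query sent to S3 Select for this part would be:

select * from S3Object s where s.l_quantity < 10 AND CAST(l_orderkey AS int) >= 1 AND CAST(l_orderkey AS int) <= 1500000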
Example #6
    def __init__(self,
                 s3key,
                 s3sql,
                 bloom_filter_field_name,
                 format_,
                 use_pandas,
                 secure,
                 use_native,
                 name,
                 query_plan,
                 log_enabled,
                 fn=None):
        """

        :param s3key: The s3 key to select against
        :param s3sql:  The s3 sql to use
        :param bloom_filter_field_name: The field name to apply to the bloom filter predicate
        :param name: The name of the operator
        :param log_enabled: Whether logging is enabled
        """

        super(SQLTableScanBloomUse, self).__init__(name, SQLTableScanMetrics(),
                                                   query_plan, log_enabled)

        self.s3key = s3key
        self.s3sql = s3sql
        self.fn = fn

        if not use_native:
            if secure:
                # Default signed HTTPS client.
                cfg = Config(region_name="us-east-1",
                             parameter_validation=False,
                             max_pool_connections=10)
                session = Session()
                self.s3 = session.client('s3', config=cfg)
            else:
                # Unsigned, unencrypted client: payload signing and SSL are
                # disabled to reduce per-request overhead.
                cfg = Config(region_name="us-east-1",
                             parameter_validation=False,
                             max_pool_connections=10,
                             s3={'payload_signing_enabled': False})
                session = Session()
                self.s3 = session.client('s3',
                                         use_ssl=False,
                                         verify=False,
                                         config=cfg)
        # else:
        #     self.fast_s3 = scan

        self.use_native = use_native
        self.use_pandas = use_pandas

        self.__field_names = None
        self.__tuples = []

        self.__bloom_filters = []

        self.filter_fn = fn

        self.format_ = format_

        if isinstance(bloom_filter_field_name, str):
            self.__bloom_filter_field_name = bloom_filter_field_name
        else:
            raise TypeError(
                "Bloom filter field name is of type {}. Field name must be of type str to be "
                "used in a SQL predicate".format(type(bloom_filter_field_name)))
Example #7
    def download_part(self, part, part_key, records_queue, stats):
        """Downloads one shard of the table by running the query against the shard's own object key."""
        print('Started downloading part {} key {}'.format(part, part_key))

        op_metrics = SQLTableScanMetrics()
        op_metrics.timer_start()

        cur = Cursor().select(part_key, self.s3sql)

        tuples = cur.execute()

        op_metrics.query_bytes = cur.query_bytes
        op_metrics.time_to_first_response = op_metrics.elapsed_time()

        for t in tuples:
            op_metrics.rows_returned += 1
            tuple_msg = TupleMessage(Tuple(t))
            records_queue.append(tuple_msg)

        del tuples

        op_metrics.bytes_scanned = cur.bytes_scanned
        op_metrics.bytes_processed = cur.bytes_processed
        op_metrics.bytes_returned = cur.bytes_returned
        op_metrics.time_to_first_record_response = cur.time_to_first_record_response
        op_metrics.time_to_last_record_response = cur.time_to_last_record_response

        op_metrics.timer_stop()
        stats[part] = op_metrics

        print('Finished downloading part {} read {} records'.format(
            part, op_metrics.rows_returned))
Example #8
def download_part_local(s3sql, part, part_key):
    """Downloads one shard and returns its records, part number, and metrics, so it
    can be dispatched to worker processes."""
    print('Started downloading part {} key {}'.format(part, part_key))

    op_metrics = SQLTableScanMetrics()
    op_metrics.timer_start()

    cur = Cursor().select(part_key, s3sql)

    tuples = cur.execute()

    op_metrics.query_bytes = cur.query_bytes
    op_metrics.time_to_first_response = op_metrics.elapsed_time()

    records = []
    for t in tuples:
        op_metrics.rows_returned += 1
        # Unlike download_part, collect raw tuples rather than wrapping them in TupleMessages.
        records.append(Tuple(t))

    del tuples

    op_metrics.bytes_scanned = cur.bytes_scanned
    op_metrics.bytes_processed = cur.bytes_processed
    op_metrics.bytes_returned = cur.bytes_returned
    op_metrics.time_to_first_record_response = cur.time_to_first_record_response
    op_metrics.time_to_last_record_response = cur.time_to_last_record_response

    op_metrics.timer_stop()

    print('Finished downloading part {} read {} records'.format(part, op_metrics.rows_returned))

    return np.array(records), part, op_metrics
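
A hedged sketch of fanning download_part_local out over worker processes with multiprocessing.Pool; the query and shard keys are placeholders, and the operators above collect results through Manager-backed collections or Ray instead:

from multiprocessing import Pool

s3sql = 'select * from S3Object s'                  # placeholder query
part_keys = ['lineitem_0.csv', 'lineitem_1.csv']    # placeholder shard keys

with Pool(processes=len(part_keys)) as pool:
    results = pool.starmap(
        download_part_local,
        [(s3sql, part, key) for part, key in enumerate(part_keys)])

for records, part, metrics in results:
    print('part {}: {} records'.format(part, metrics.rows_returned))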