def __init__(self, s3key, s3sql, use_pandas, name, query_plan, log_enabled):
    """Creates a new Table Scan operator using the given s3 object key and s3 select sql

    :param s3key: The object key to select against
    :param s3sql: The s3 select sql
    """
    super(SQLPandasTableScan, self).__init__(name, SQLTableScanMetrics(), query_plan, log_enabled)

    self.s3 = query_plan.s3
    self.s3key = s3key
    self.s3sql = s3sql
    self.use_pandas = use_pandas
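# A hedged usage sketch: constructing this operator inside a query plan. The key name,
# SQL, and the surrounding QueryPlan setup are assumptions for illustration only; the
# add_operator pattern mirrors how scans are wired up elsewhere in this codebase.
#
#     query_plan = QueryPlan()
#     scan = query_plan.add_operator(
#         SQLPandasTableScan('lineitem.csv', 'select * from S3Object', True,
#                            'pandas_table_scan', query_plan, False))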
def __init__(self, s3key, s3sql, name, parts, partitioning_key, query_plan, log_enabled):
    """Creates a new Table Scan operator that executes the given query on the table given in s3key in parallel.
    The parallelism factor is passed in the parts parameter. The table partitioning is based on the key passed
    in the partitioning_key parameter.

    :param s3key: The object key to select against
    :param s3sql: The s3 select sql
    :param parts: The parallelism factor (number of downloading threads)
    :param partitioning_key: The key used to partition the table records in order to distribute them over workers
    """
    super(SQLMultiprocessingParallelTableScan, self).__init__(name, SQLTableScanMetrics(), query_plan, log_enabled)

    self.s3key = s3key
    self.s3sql = s3sql
    self.parts = parts
    self.partitioning_key = partitioning_key
    self.log_enabled = log_enabled
    self.is_streamed = False

    # SQL for determining the range of the partitioning key values, to be able to find the splitting
    # intervals. Currently unused: the bounds below are hardcoded for the lineitem l_orderkey column.
    range_sql = '''SELECT MIN(CAST({} AS int)) AS min_key, MAX(CAST({} AS int)) AS max_key FROM S3Object'''.format(
        self.partitioning_key, self.partitioning_key)

    self.min_key = 1  # l_orderkey min value
    self.max_key = 6000000  # l_orderkey max value

    # Split [min_key, max_key] into equal contiguous ranges, one per part. Integer division
    # is used so part_size stays an int; the last range is widened to absorb any remainder.
    self.ranges = []
    self.part_size = (self.max_key - self.min_key + 1) // self.parts
    for part in range(self.parts):
        start = part * self.part_size + 1
        end = start + self.part_size - 1
        self.ranges.append((start, end))
    self.ranges[-1] = (self.ranges[-1][0], self.max_key)

    # Shared state so worker processes can hand back records and per-part metrics
    manager = Manager()
    self.records = manager.list()
    self.worker_metrics = manager.dict()
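# A standalone illustration (hypothetical numbers) of the splitting scheme above. Note
# that, like the constructor, it assumes min_key == 1 when computing each range start.
def split_ranges(min_key, max_key, parts):
    part_size = (max_key - min_key + 1) // parts
    ranges = [(p * part_size + 1, (p + 1) * part_size) for p in range(parts)]
    ranges[-1] = (ranges[-1][0], max_key)  # widen the last range to absorb the remainder
    return ranges

assert split_ranges(1, 10, 3) == [(1, 3), (4, 6), (7, 10)]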
def __init__(self, s3key, s3sql, name, parts, log_enabled):
    """Creates a new Table Scan operator that executes the given query on the table given in s3key in parallel.
    The parallelism factor is passed in the parts parameter.

    :param s3key: The object key to select against
    :param s3sql: The s3 select sql
    :param parts: The parallelism factor (number of downloading threads)
    """
    super(SQLRayParallelShardedTableScan, self).__init__(name, SQLTableScanMetrics(), log_enabled)

    self.s3key = s3key
    self.s3sql = s3sql
    self.parts = parts
    self.log_enabled = log_enabled
    self.ranges = []
    self.is_streamed = False
    self.records = []
    self.worker_metrics = {}

    # Start the Ray runtime so remote download tasks can be scheduled
    ray.init()
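# A minimal sketch of the Ray fan-out pattern this operator relies on: each shard
# download becomes one remote task and ray.get() gathers the results. The task body
# and its arguments are assumptions; only ray.remote and ray.get are real Ray API.
import ray

@ray.remote
def scan_shard(part, part_key, s3sql):
    # In the operator this would run an S3 Select cursor against part_key and return
    # the records plus per-part metrics; a placeholder result is used here.
    return part, part_key, s3sql

# futures = [scan_shard.remote(p, key, sql) for p, key in enumerate(part_keys)]
# results = ray.get(futures)  # blocks until every shard task has finished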
def __init__(self, s3key, s3sql, use_pandas, secure, use_native, name, shards, shard_prefix, parallel_shards,
             query_plan, log_enabled):
    """Creates a new Table Scan operator using the given s3 object key and s3 select sql. The scan is split
    across the given shards, with one child SQLTableScan operator created per shard.

    :param s3key: The object key to select against
    :param s3sql: The s3 select sql
    :param shards: The shards to scan
    :param shard_prefix: The key prefix used to derive each shard's object key
    :param parallel_shards: Whether to run the shard scanners asynchronously
    """
    super(SQLShardedTableScan, self).__init__(name, SQLTableScanMetrics(), query_plan, log_enabled)

    self.s3key = s3key
    self.s3sql = s3sql
    self.secure = secure
    self.field_names = None
    self.use_pandas = use_pandas
    self.use_native = use_native
    self.shard_prefix = shard_prefix
    self.shards = shards
    self.parallel_shards = parallel_shards
    self.query_plan = query_plan
    self.shard_scanner_ops = []

    # Create one child table scan per shard and connect it to this operator
    for shard in shards:
        shard_key_name = self.get_part_key(shard)
        shard_table_scanner = self.query_plan.add_operator(
            SQLTableScan(shard_key_name, self.s3sql, self.use_pandas, self.secure, self.use_native,
                         "shard_table_scan_{}".format(shard), self.query_plan, self.log_enabled))
        shard_table_scanner.connect(self)
        self.shard_scanner_ops.append(shard_table_scanner)

        if self.query_plan.is_async and self.parallel_shards:
            shard_table_scanner.init_async(self.query_plan.queue)
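# get_part_key is defined elsewhere on this class; a plausible sketch of what it does,
# given how shard_prefix is stored above. This is an assumption for illustration, not
# the repo's actual implementation:
#
#     def get_part_key(self, shard):
#         # e.g. shard_prefix 'lineitem' and shard 3 -> 'lineitem.3'
#         return '{}.{}'.format(self.shard_prefix, shard)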
def download_part(self, part, records_queue, stats):
    print('Started downloading part {}'.format(part))

    # Narrow the base query to this part's key range
    part_range = self.ranges[part]
    part_sql = self.s3sql + ' and CAST({} AS int) >= {} AND CAST({} AS int) <= {}'.format(
        self.partitioning_key, part_range[0], self.partitioning_key, part_range[1])

    op_metrics = SQLTableScanMetrics()
    op_metrics.timer_start()

    cur = Cursor(self.query_plan.s3).select(self.s3key, part_sql)
    tuples = cur.execute()

    op_metrics.query_bytes = cur.query_bytes
    op_metrics.time_to_first_response = op_metrics.elapsed_time()

    for t in tuples:
        op_metrics.rows_returned += 1
        tuple_msg = TupleMessage(Tuple(t))
        records_queue.append(tuple_msg)

    del tuples

    op_metrics.bytes_scanned = cur.bytes_scanned
    op_metrics.bytes_processed = cur.bytes_processed
    op_metrics.bytes_returned = cur.bytes_returned
    op_metrics.time_to_first_record_response = cur.time_to_first_record_response
    op_metrics.time_to_last_record_response = cur.time_to_last_record_response
    op_metrics.timer_stop()

    stats[part] = op_metrics

    print('Finished downloading part {} read {} records'.format(part, op_metrics.rows_returned))
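# An illustration (assumed values) of the SQL produced for one part. The appended
# fragment begins with ' and ', so the base s3sql is expected to already contain a
# WHERE clause:
base_sql = "select * from S3Object s where s._4 = 'N'"  # hypothetical base query
partitioning_key, lo, hi = 'l_orderkey', 1, 1500000     # hypothetical part range
part_sql = base_sql + ' and CAST({} AS int) >= {} AND CAST({} AS int) <= {}'.format(
    partitioning_key, lo, partitioning_key, hi)
# -> "select * from S3Object s where s._4 = 'N' and CAST(l_orderkey AS int) >= 1
#     AND CAST(l_orderkey AS int) <= 1500000"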
def __init__(self, s3key, s3sql, bloom_filter_field_name, format_, use_pandas, secure, use_native, name,
             query_plan, log_enabled, fn=None):
    """
    :param s3key: The s3 key to select against
    :param s3sql: The s3 sql to use
    :param bloom_filter_field_name: The field name to apply to the bloom filter predicate
    :param name: The name of the operator
    :param log_enabled: Whether logging is enabled
    """
    super(SQLTableScanBloomUse, self).__init__(name, SQLTableScanMetrics(), query_plan, log_enabled)

    self.s3key = s3key
    self.s3sql = s3sql
    self.fn = fn

    if not use_native:
        if secure:
            cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10)
            session = Session()
            self.s3 = session.client('s3', config=cfg)
        else:
            # Skip SSL and payload signing when a secure connection is not required
            cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10,
                         s3={'payload_signing_enabled': False})
            session = Session()
            self.s3 = session.client('s3', use_ssl=False, verify=False, config=cfg)
    # else:
    #     self.fast_s3 = scan

    self.use_native = use_native
    self.use_pandas = use_pandas
    self.__field_names = None
    self.__tuples = []
    self.__bloom_filters = []
    self.filter_fn = fn
    self.format_ = format_

    # The field name is spliced into the SQL predicate, so it must be a string
    if isinstance(bloom_filter_field_name, str):
        self.__bloom_filter_field_name = bloom_filter_field_name
    else:
        raise Exception("Bloom filter field name is of type {}. Field name must be of type str to be "
                        "used in SQL predicate".format(type(bloom_filter_field_name)))
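# How the bloom filter field name might end up in the SQL predicate; this is a hedged
# sketch, not the operator's actual rendering logic. bloom_filter.sql_predicate is an
# assumed helper that expands the received filter into a membership test on the field:
#
#     predicate = bloom_filter.sql_predicate(self.__bloom_filter_field_name)
#     sql = self.s3sql + ' AND ' + predicate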
def download_part(self, part, part_key, records_queue, stats):
    print('Started downloading part {} key {}'.format(part, part_key))

    op_metrics = SQLTableScanMetrics()
    op_metrics.timer_start()

    cur = Cursor().select(part_key, self.s3sql)
    tuples = cur.execute()

    op_metrics.query_bytes = cur.query_bytes
    op_metrics.time_to_first_response = op_metrics.elapsed_time()

    for t in tuples:
        op_metrics.rows_returned += 1
        tuple_msg = TupleMessage(Tuple(t))
        records_queue.append(tuple_msg)

    del tuples

    op_metrics.bytes_scanned = cur.bytes_scanned
    op_metrics.bytes_processed = cur.bytes_processed
    op_metrics.bytes_returned = cur.bytes_returned
    op_metrics.time_to_first_record_response = cur.time_to_first_record_response
    op_metrics.time_to_last_record_response = cur.time_to_last_record_response
    op_metrics.timer_stop()

    stats[part] = op_metrics

    print('Finished downloading part {} read {} records'.format(part, op_metrics.rows_returned))
import numpy as np


def download_part_local(s3sql, part, part_key):
    print('Started downloading part {} key {}'.format(part, part_key))

    op_metrics = SQLTableScanMetrics()
    op_metrics.timer_start()

    cur = Cursor().select(part_key, s3sql)
    tuples = cur.execute()

    op_metrics.query_bytes = cur.query_bytes
    op_metrics.time_to_first_response = op_metrics.elapsed_time()

    # Collect raw tuples locally rather than wrapping them in TupleMessage envelopes
    records = []
    for t in tuples:
        op_metrics.rows_returned += 1
        tpl = Tuple(t)
        # tuple_msg = TupleMessage(tpl)
        # records.append(tuple_msg)
        records.append(tpl)

    del tuples

    op_metrics.bytes_scanned = cur.bytes_scanned
    op_metrics.bytes_processed = cur.bytes_processed
    op_metrics.bytes_returned = cur.bytes_returned
    op_metrics.time_to_first_record_response = cur.time_to_first_record_response
    op_metrics.time_to_last_record_response = cur.time_to_last_record_response
    op_metrics.timer_stop()

    print('Finished downloading part {} read {} records'.format(part, op_metrics.rows_returned))

    return np.array(records), part, op_metrics
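# A minimal sketch (assumed names) of driving download_part_local from a process pool
# and merging the per-part results. download_part_local is module-level, so it is
# picklable; Pool, part_keys and the merge order are illustrative assumptions.
from multiprocessing import Pool

def scan_all_parts(s3sql, part_keys):
    pool = Pool()
    async_results = [pool.apply_async(download_part_local, (s3sql, part, key))
                     for part, key in enumerate(part_keys)]
    pool.close()
    pool.join()
    results = [r.get() for r in async_results]  # each is (records, part, op_metrics)
    results.sort(key=lambda r: r[1])            # restore part order
    return [tpl for records, _, _ in results for tpl in records]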