def test_timer_loop(): num_sleeps = 100 sleep_duration = 0.1 t1 = Timer() t2 = Timer() t1.start() for i in range(0, num_sleeps): t2.start() time.sleep(sleep_duration) t2.stop() t1.stop() # print("Elapsed time {}".format(t1.elapsed())) # print("Elapsed time {}".format(t2.elapsed())) expected_elapsed_time = num_sleeps * sleep_duration assert expected_elapsed_time - 0.1 < t1.elapsed( ) < expected_elapsed_time + 0.1 assert expected_elapsed_time - 0.1 < t2.elapsed( ) < expected_elapsed_time + 0.1
class S3Get(object): def __init__(self): self.s3key = "lineitem.csv" self.timer = Timer() # self.query_bytes = 0 # self.bytes_scanned = 0 # self.bytes_processed = 0 self.bytes_returned = 0 self.records_events = 0 # self.rows_returned = 0 self.timer = Timer() self.time_to_first_record_response = None self.time_to_last_record_response = None def run(self): s3 = boto3.client('s3') self.timer.start() response = s3.get_object( Bucket='s3filter', Key=self.s3key ) streaming_body = response['Body'] bytes_ = streaming_body.read(1024) self.time_to_first_record_response = self.timer.elapsed() while bytes_: self.records_events += 1 self.bytes_returned += len(bytes_) bytes_ = streaming_body.read(1024) self.time_to_last_record_response = self.timer.elapsed() if streaming_body: streaming_body.close() elapsed_time = self.timer.elapsed() print("elapsed_time: {}".format(elapsed_time)) print("time_to_first_record_response: {}".format(self.time_to_first_record_response)) print("time_to_last_record_response: {}".format(self.time_to_last_record_response)) print("bytes_returned: {}".format(self.bytes_returned)) print("records_events: {}".format(self.records_events)) bytes_sec = self.bytes_returned / elapsed_time print("bytes_sec: {} (KB {:f}, MB {:f})".format(bytes_sec, bytes_sec / 1000.0, bytes_sec / 1000000))
def test_datetime_bench_py(): dt = "1970-01-02" timer = Timer() timer.start() for i in range(0, 100000): datetime_util.py_str_to_millis(dt) print(timer.elapsed())
def test_timer_start(): sleep_duration = 0.1 t = Timer() t.start() time.sleep(sleep_duration) # print("Elapsed time {}".format(t.elapsed())) expected_elapsed_time = 1 * sleep_duration assert expected_elapsed_time - 0.1 < t.elapsed( ) < expected_elapsed_time + 0.1
class OpMetrics(object): def __init__(self): self.__timer = Timer() def timer_start(self): self.__timer.start() def timer_stop(self): self.__timer.stop() def elapsed_time(self): return self.__timer.elapsed() def start_time(self): return self.__timer.start_time def timer_running(self): return self.__timer.running @staticmethod def print_metrics(op_list): for op in op_list: OpMetrics.pretty_print(op) @staticmethod def print_overall_metrics(op_list, name=None): total_elapsed = 0 for op in op_list: total_elapsed += op.op_metrics.elapsed_time() if name is None: name = " + ".join([o.name for o in op_list]) print('{}: {}'.format(total_elapsed, name)) @staticmethod def pretty_print(op): print("{}: {}".format(op, op.op_metrics)) def __repr__(self): return {'elapsed_time': round(self.elapsed_time(), 5)}.__repr__()
class S3Select(object): def __init__(self): self.s3key = "lineitem.csv" self.s3sql = "select * from s3Object" self.event_stream = None self.timer = Timer() self.time_to_first_record_response = None self.time_to_last_record_response = None self.query_bytes = 0 self.bytes_scanned = 0 self.bytes_processed = 0 self.bytes_returned = 0 self.records_events = 0 self.rows_returned = 0 self.timer = Timer() self.prev_record_str = None def parse_event(self, event): if 'Records' in event: # print("Records") elapsed_time = self.timer.elapsed() if self.time_to_first_record_response is None: self.time_to_first_record_response = elapsed_time self.time_to_last_record_response = elapsed_time # records_str = event['Records']['Payload'].decode('utf-8') # # records_str_rdr = io.StringIO(records_str) # # for record_str in records_str_rdr: # # # Check record ends with newline (excluding an escaped newline) # if record_str.endswith('\n') and not record_str.endswith('\\n'): # It's a complete record # # if prev_record_str is not None: # There was an incomplete record present in the last payload # # Append the current record to the previous incomplete record # # print("Appending: {} {}".format(prev_record_str, record_str)) # record_str = prev_record_str + record_str # prev_record_str = None # # # print("Complete: {}".format(record_str)) # # # Parse CSV # record_rdr = csv.reader([record_str]) # if PYTHON_3: # record = next(record_rdr) # else: # record = record_rdr.next() # # # Do something with the record # rows_returned += 1 # # else: # # It's an incomplete record, save for next iteration # # print("Incomplete: {}".format(record_str)) # prev_record_str = record_str self.records_events += 1 elif 'Stats' in event: self.bytes_scanned += event['Stats']['Details']['BytesScanned'] self.bytes_processed += event['Stats']['Details']['BytesProcessed'] self.bytes_returned += event['Stats']['Details']['BytesReturned'] print( "Stats Event: bytes scanned: {}, bytes processed: {}, bytes returned: {}" .format(self.bytes_scanned, self.bytes_processed, self.bytes_returned)) elif 'Progress' in event: # pass print("Progress Event") elif 'End' in event: print("End Event") # pass elif 'Cont' in event: # pass print("Cont Event") def run(self): self.query_bytes = len(self.s3key.encode('utf-8')) + len( self.s3sql.encode('utf-8')) s3 = boto3.client('s3') self.timer.start() response = s3.select_object_content(Bucket='s3filter', Key=self.s3key, ExpressionType='SQL', Expression=self.s3sql, InputSerialization={ 'CSV': { 'FileHeaderInfo': 'Use', 'RecordDelimiter': '|\n', 'FieldDelimiter': '|' } }, OutputSerialization={'CSV': {}}) event_stream = response['Payload'] map(self.parse_event, event_stream) if event_stream: event_stream.close() elapsed_time = self.timer.elapsed() print("elapsed_time: {}".format(elapsed_time)) print("time_to_first_record_response: {}".format( self.time_to_first_record_response)) print("time_to_last_record_response: {}".format( self.time_to_last_record_response)) print("query_bytes: {}".format(self.query_bytes)) print("bytes_scanned: {}".format(self.bytes_scanned)) print("bytes_processed: {}".format(self.bytes_processed)) print("bytes_returned: {}".format(self.bytes_returned)) print("records_events: {}".format(self.records_events)) print("rows_returned: {}".format(self.rows_returned)) bytes_sec = self.bytes_returned / elapsed_time print("bytes_sec: {} (KB {:f}, MB {:f})".format( bytes_sec, bytes_sec / 1000.0, bytes_sec / 1000000))
class QueryPlan(object): """Container for the operators in a query. Really a convenience class that allows the plan graph to be generated and the operator execution metrics. """ def __init__(self, system=None, operators=None, is_async=False, buffer_size=1024, use_shared_mem=False): # type: (WorkerSystem, list, bool, int, bool) -> None """ :param operators: """ self.system = system self.use_shared_mem = use_shared_mem self.__timer = Timer() self.total_elapsed_time = 0.0 self.__debug_timer = OrderedDict() self.completion_counter = 0 if operators is None: self.operators = OrderedDict() else: self.operators = OrderedDict() for o in operators: self.operators[o.name] = o self.is_async = is_async self.queue = Queue() self.buffer_size = buffer_size # Cost related metrics self.sql_scanned_bytes = 0 self.sql_returned_bytes = 0 self.returned_bytes = 0 self.num_http_get_requests = 0 def get_operator(self, name): return self.operators[name] def add_operator(self, operator): """Adds the operator to the list of operators in the plan. This method ensures the operators are sorted by operator name - which is important only for retrieving the root operators. :param operator: :return: """ if operator.name in self.operators: raise Exception("Cannot add multiple operators with same name. " "Operator '{}' already added".format(operator.name)) operator.set_query_plan(self) operator.set_buffer_size(self.buffer_size) self.operators[operator.name] = operator sorted_operators = sorted(self.operators.values(), key=lambda o: o.name) self.operators = OrderedDict((o.name, o) for o in sorted_operators) return operator def traverse_depth_first(self, from_operator, visited_operators): """Returns a sorted list of the operators according to where they appear in the query plan using depth first traversal, starting from the supplied operator. :param from_operator: First operator to start from :param visited_operators: Operators already traversed :return: Sorted list of operators """ if from_operator.name not in visited_operators: visited_operators[from_operator.name] = from_operator for c in from_operator.consumers: visited_operators = self.traverse_depth_first(c, visited_operators) return visited_operators @staticmethod def traverse_breadth_first(from_operators): """Returns a sorted list of the operators according to where they appear in the query plan using breadth first traversal, starting from the supplied operators. :param from_operators: The operators to start from :return: Sorted list of operators """ visited_operators = OrderedDict() operators_list = deque(from_operators) while operators_list: o = operators_list.popleft() if o.name not in visited_operators: visited_operators[o.name] = o # Add the consumers except any visited for c in o.consumers: if c.name not in visited_operators: operators_list.append(c) return visited_operators.values() def traverse_topological(self): """This method sorts the operators topgraphically and by operator name. This is to ensure that the operator metrics are written in the order that most closely aligns with the plan. Note that this is not the same order that operators execute in, which may be counterintuitive - e.g. A top operator can complete before a scan. :return: Sorted list of operators """ g = networkx.DiGraph() for o in self.operators.values(): g.add_node(o.name, value=o) for o in self.operators.values(): for c in o.consumers: g.add_edge(o.name, c.name) sorted_operators = list(map(lambda name: g.nodes[name]['value'], networkx.topological_sort(g))) return sorted_operators def traverse_depth_first_from_root(self): """Returns a sorted list of the operators according to where they appear in the query plan, starting from the root operators. :return: Sorted list of operators """ # Find the operators at the root of the graph, they will have no producers root_operators = self.find_root_operators() visited_operators = OrderedDict() for o in root_operators: visited_operators = self.traverse_depth_first(o, visited_operators) return visited_operators.values() def traverse_breadth_first_from_root(self): """Returns a sorted list of the operators according to where they appear in the query plan, starting from the root operators. :return: Sorted list of operators """ # Find the operators at the root of the graph, they will have no producers root_operators = self.find_root_operators() visited_operators = self.traverse_breadth_first(root_operators) return visited_operators def traverse_topological_from_root(self): visited_operators = self.traverse_topological() return visited_operators def find_root_operators(self): root_operators = [] for o in self.operators.values(): if not o.producers: root_operators.append(o) return root_operators def send(self, message, operator_name, sender_op): # NOTE: Have to check that the sender op is async otherwise won't be able to send any messages back to it # (which is needed to send dataframes) if self.use_shared_mem: if type(message) is list: for e in message[0]: if type(e) is pd.DataFrame: self.system.send(operator_name, DataFrameMessage(e), sender_op.worker) else: self.system.send(operator_name, e, sender_op.worker) else: self.system.send(operator_name, message, sender_op.worker) else: if type(message) is list and len(message) <= 0: return else: o = self.operators[operator_name] o.queue.put(cPickle.dumps(message, cPickle.HIGHEST_PROTOCOL)) def print_metrics(self): print("") print("Metrics") print("-------") print("") print("Plan") print("----") print("buffer_size: {}".format(self.buffer_size)) print("is_parallel: {}".format(self.is_async)) print("total_elapsed_time: {}".format(round(self.total_elapsed_time, 5))) print("") print("Cost") print("----") self.print_cost_metrics() print("") print("Operator Completion Time") print("------------------------") for k, v in self.__debug_timer.items(): print("{}: {}".format(k, v)) print("") print("Operators") print("---------") OpMetrics.print_metrics(self.traverse_topological_from_root()) # self.assert_operator_time_equals_plan_time() print("") def print_cost_metrics(self): scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")] for op in scan_operators: if type(op) is TableRangeAccess: self.returned_bytes += op.op_metrics.bytes_returned self.num_http_get_requests += op.op_metrics.num_http_get_requests elif type(op) is TableScan: self.returned_bytes += op.op_metrics.bytes_returned self.num_http_get_requests += op.op_metrics.num_http_get_requests elif type(op) is SQLTableScan: self.sql_returned_bytes += op.op_metrics.bytes_returned self.sql_scanned_bytes += op.op_metrics.bytes_scanned self.num_http_get_requests += op.op_metrics.num_http_get_requests elif type(op) is SQLTableScanBloomUse: self.sql_returned_bytes += op.op_metrics.bytes_returned self.sql_scanned_bytes += op.op_metrics.bytes_scanned self.num_http_get_requests += op.op_metrics.num_http_get_requests else: # raise Exception("Unrecognized scan operator {}".format(type(op))) pass print("sql_scanned_bytes: {}".format(self.sql_scanned_bytes)) print("sql_returned_bytes: {}".format(self.sql_returned_bytes)) print("returned_bytes: {}".format(self.returned_bytes)) print("num_http_get_requests: {}".format(self.num_http_get_requests)) print("") cost, bytes_scanned, bytes_returned, http_requests, rows = self.cost() computation_cost = self.computation_cost() data_cost = self.data_cost()[0] print("total_scanned_bytes: {} MB".format(bytes_scanned * BYTE_TO_MB)) print("total_returned_bytes: {} MB".format(bytes_returned * BYTE_TO_MB)) print("total_http_get_requests: {}".format(http_requests)) print("total_returned_rows: {}".format(rows)) print("computation_cost: ${0:.10f}".format(computation_cost)) print("data_cost: ${0:.10f}".format(data_cost)) print("total_cost: ${0:.10f}".format(cost)) def assert_operator_time_equals_plan_time(self): """Sanity check to make sure cumulative operator exec time approximately equals total plan exec time. We use a margin of error of 0.1 seconds to account for any time not captured during a context switch. Disabled when -O flag is set :return: None """ if __debug__: margin = 0.1 cum_op_time = 0.0 for o in self.operators.values(): cum_op_time += o.op_metrics.elapsed_time() upper_time_bound = self.total_elapsed_time * (1.0 + margin) lower_time_bound = self.total_elapsed_time * (1.0 - margin) assert lower_time_bound <= cum_op_time <= upper_time_bound, \ "Cumulative operator execution time {} is not within {} seconds of total plan execution time {}".format( margin, cum_op_time, self.total_elapsed_time) def write_graph(self, dir_, name): graph = Graph(name) for o in self.operators.values(): graph.add_operator(o) graph.write(dir_) def execute(self): if self.is_async: map(lambda o: o.init_async(self.queue, self.system, self.use_shared_mem), self.operators.values()) if self.use_shared_mem: self.system.start() else: map(lambda o: o.boot(), self.operators.values()) # Find the root operators root_operators = self.find_root_operators() self.__timer.start() # Start the root operators for ro in root_operators: ro.start() # Things become event driven from here, because we may be running async, so we essentially go # into an event loop with a state machine to keep track of where we are # async_operators = {name: o for name, o in filter(lambda (name, o): o.async_, self.operators.items())} if self.is_async: print("Operators: ") print(self.operators) operator_completions = {k: False for k, v in self.operators.items()} while not all(operator_completions.values()): completed_message = self.listen(OperatorCompletedMessage) operator_completions[completed_message.name] = True self.debug_time(completed_message.name) self.__timer.stop() self.total_elapsed_time = self.__timer.elapsed() if self.is_async: # retrieve metrics of all operators. This is important in async queries since the operator runs in a # different process. operators = self.traverse_topological_from_root() for o in operators: if o.async_: if self.use_shared_mem: self.system.send(o.name, EvalMessage("self.op_metrics"), None) else: p_message = pickle.dumps(EvalMessage("self.op_metrics")) o.queue.put(p_message) evaluated_msg = self.listen(EvaluatedMessage) # type: EvaluatedMessage o.op_metrics = evaluated_msg.val else: # Don't need to request metrics pass map(lambda op: op.set_completed(True), self.operators.values()) def listen(self, message_type): # type: (TypeVar[MessageBase]) -> MessageBase try: if self.use_shared_mem: msg = self.system.listen(message_type) return msg else: while True: p_item = self.queue.get() item = cPickle.loads(p_item) # print(item) if type(item) == message_type: return item else: # Not the message being listened for, warn and skip # This isn't exceptional, but the listener should be made aware that there are messages arriving that # are being ignored warning("While listening for message type {} received message type {} with contents {}".format(message_type, type(item), item)) except BaseException as e: tb = traceback.format_exc(e) print(tb) def stop(self): if self.is_async: if self.use_shared_mem: self.system.send_all(StopMessage()) self.system.join() self.system.close() else: map(lambda o: o.queue.put(cPickle.dumps(StopMessage())), filter(lambda o: o.async_, self.operators.values())) self.join() map(lambda o: o.queue.close(), filter(lambda o: o.async_, self.operators.values())) else: pass def join(self): return map(lambda o: o.join(), self.operators.values()) def get_phase_runtime(self, phase_keyword=None): phase_operators = [op for op in self.operators.values() if phase_keyword in op.name] op_start_times = [op.op_metrics.start_time() for op in phase_operators] op_end_times = [op.op_metrics.elapsed_time() + op.op_metrics.start_time() for op in phase_operators] return max(op_end_times) - min(op_start_times) def cost(self): """ calculates the overall query cost when runs on S3 by combining the cost of all scan operators in the query plus the computation cost based on the EC2 instance type and the query running time :return: the estimated cost of the whole query """ total_data_cost, total_scanned_bytes, total_returned_bytes, total_http_requests, total_rows = self.data_cost() total_compute_cost = self.computation_cost() return total_compute_cost + total_data_cost, total_scanned_bytes, total_returned_bytes, total_http_requests,\ total_rows def data_cost(self, ec2_region=None, phase_keyword=None): """ calculates the estimated query cost when runs on S3 by combining the cost of all scan operators in the query :return: the estimated cost of the whole query """ if phase_keyword is None: scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")] else: scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost") and \ phase_keyword in op.name] total_data_cost = 0.0 total_scanned_bytes = 0 total_returned_bytes = 0 total_http_requests = 0 total_rows = 0 for op in scan_operators: if op.is_completed(): total_data_cost += op.op_metrics.data_cost(ec2_region) total_returned_bytes += op.op_metrics.bytes_returned total_scanned_bytes += op.op_metrics.bytes_scanned total_http_requests += op.op_metrics.num_http_get_requests total_rows += op.op_metrics.rows_returned else: raise Exception("Can't calculate query cost while one or more scan operators {} are still executing" .format(op.name)) return total_data_cost, total_scanned_bytes, total_returned_bytes, total_http_requests, total_rows def data_transfer_cost(self, ec2_region=None, phase_keyword=None): if phase_keyword is None: scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")] else: scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost") and \ phase_keyword in op.name] total_data_transfer_cost = 0.0 total_bytes_returned = 0 total_returned_rows = 0 for op in scan_operators: if op.is_completed(): total_data_transfer_cost += op.op_metrics.data_transfer_cost() total_bytes_returned += op.op_metrics.bytes_returned total_returned_rows += op.op_metrics.rows_returned else: raise Exception("Can't calculate query cost while one or more scan operators {} are still executing" .format(op.name)) return total_bytes_returned, total_returned_rows, total_data_transfer_cost def data_scanning_cost(self, phase_keyword=None): if phase_keyword is None: scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")] else: scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost") and \ phase_keyword in op.name] total_data_scanning_cost = 0.0 total_bytes_scanned = 0 for op in scan_operators: if op.is_completed(): total_data_scanning_cost += op.op_metrics.data_scan_cost() total_bytes_scanned += op.op_metrics.bytes_scanned else: raise Exception("Can't calculate query cost while one or more scan operators {} are still executing" .format(op.name)) return total_bytes_scanned, total_data_scanning_cost def requests_cost(self, phase_keyword=None): if phase_keyword is None: scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost")] else: scan_operators = [op for op in self.operators.values() if hasattr(op.op_metrics, "cost") and \ phase_keyword in op.name] total_http_requests_cost = 0.0 total_http_requests = 0 for op in scan_operators: if op.is_completed(): total_http_requests_cost += op.op_metrics.requests_cost() total_http_requests += op.op_metrics.num_http_get_requests else: raise Exception("Can't calculate query cost while one or more scan operators {} are still executing" .format(op.name)) return total_http_requests, total_http_requests_cost def computation_cost(self, ec2_instance_type=None, os_type=None): """ calculates the estimated computation cost of the query :return: computation cost """ scan_op = None for op in self.operators.values(): if hasattr(op.op_metrics, "cost"): scan_op = op break return scan_op.op_metrics.computation_cost(self.total_elapsed_time, ec2_instance_type, os_type) def estimated_cost_for_config(self, ec2_region, ec2_instance_type=None, os_type=None): """ estimate the hypothetical cost if the query runs on EC2 instance in a different region :param ec2_region: AWSRegion to calculate the cost for :param ec2_instance_type: the code of an EC2 instance as defined by AWS (r4.8xlarge) :param os_type: the name of the os running on the host machine (Linux, Windows ... etc) :return: the hypothetical estimated cost for the provided region, instance type and os """ return self.computation_cost(ec2_instance_type, os_type) + self.data_cost(ec2_region) def debug_time(self, name): self.__debug_timer[name] = self.__timer.elapsed() def retrieve_sampling_threshold(self): sampling_filter_builder = [op for op in self.operators.values() if type(op) == TopKFilterBuild] if len(sampling_filter_builder) > 0: filter_builder = sampling_filter_builder[0] if filter_builder.async_: if self.use_shared_mem: self.system.send(filter_builder.name, EvalMessage("self.threshold"), None) else: p_message = pickle.dumps(EvalMessage("self.threshold")) filter_builder.queue.put(p_message) evaluated_msg = self.listen(EvaluatedMessage) # type: EvaluatedMessage filter_builder.threshold = evaluated_msg.val return filter_builder.threshold
class PandasCursor(object): """Represents a database cursor for managing the context of a fetch operation. Intended to be modelled after the Python DB API. All it supports at present is select, taking a s3 key and sql string to execute. The rows returned from s3 are returned as an iterator. Importantly supports streaming of records which gives more control over simply passing Python data structures around. """ def __init__(self, s3): self.s3 = s3 self.s3key = None self.s3sql = None self.event_stream = None self.table_local_file_path = None self.need_s3select = True self.timer = Timer() self.time_to_first_record_response = None self.time_to_last_record_response = None self.query_bytes = 0 self.bytes_scanned = 0 self.bytes_processed = 0 self.bytes_returned = 0 self.num_http_get_requests = 0 self.table_data = None self.input = Format.CSV def parquet(self): self.input = Format.PARQUET return self def csv(self): self.input = Format.CSV return self def select(self, s3key, s3sql): """Creates a select cursor :param s3sql: the sql to execute :param s3key: the s3 key to select against :return: the cursor """ self.s3key = s3key self.s3sql = s3sql # TODO:only simple SQL queries are considered. Nested and complex queries will need a lot of work to handle self.need_s3select = not (s3sql.lower().replace(';', '').strip() == 'select * from s3object') # There doesn't seem to be a way to capture the bytes sent to s3, but we can use this for comparison purposes self.query_bytes = len(self.s3key.encode('utf-8')) + len(self.s3sql.encode('utf-8')) return self def execute(self): """Executes the fetch operation. This is different to the DB API as it returns an iterable. Of course we could model that API more precisely in future. :return: An iterable of the records fetched """ """ All operations return a CSV file """ print("Executing Pandas cursor!") self.timer.start() self.input = Format.CSV config = TransferConfig( multipart_chunksize=8 * MB, multipart_threshold=8 * MB ) try: #h1 = http.client.HTTPConnection('127.0.0.1:5000') #h1 = http.client.HTTPConnection("3.87.65.94:5000") h1 = http.client.HTTPConnection(FILTER_IP) #print("Connected") h1.request('POST','/' + S3_BUCKET_NAME + '/' + self.s3key, body=self.s3sql) #print("Made request!") r = h1.getresponse() #print(r.read()) r2 = r.read()#.decode("utf-8") #print(r2) #LOST TO DO!! self.table_data = io.BytesIO() self.table_data.write(r2) #print("HELLO!") #print(self.table_data.getvalue().decode('utf-8')) self.num_http_get_requests = PandasCursor.calculate_num_http_requests(self.table_data, config) return self.parse_file() except Exception as e: print(e) #return self.parse_file() def parse_event_stream(self): """Generator that hands out records from the event stream lazily :return: Generator of the records returned from s3 """ prev_record_str = None records_str_rdr = cStringIO.StringIO() for event in self.event_stream: if 'Records' in event: elapsed_time = self.timer.elapsed() if self.time_to_first_record_response is None: self.time_to_first_record_response = elapsed_time self.time_to_last_record_response = elapsed_time # records_str = event['Records']['Payload'].decode('utf-8') records_str = event['Records']['Payload'] if prev_record_str is not None: records_str_rdr.write(prev_record_str) prev_record_str = None if records_str.endswith('\n') and not records_str.endswith('\\n'): records_str_rdr.write(records_str) else: last_newline_pos = records_str.rfind('\n', 0, len(records_str)) if last_newline_pos == -1: prev_record_str = records_str else: prev_record_str = records_str[last_newline_pos + 1:] records_str_rdr.write(records_str[:last_newline_pos + 1]) if records_str_rdr.tell() > 1024 * 1024 * 16: records_str_rdr.seek(0) df = pd.read_csv(records_str_rdr, header=None, prefix='_', dtype=numpy.str, engine='c', quotechar='"', na_filter=False, compression=None, low_memory=False) records_str_rdr.close() records_str_rdr = cStringIO.StringIO() yield df # Strangely the reading with the python csv reader and then loading into a dataframe "seems" faster than # reading csvs with pandas, agate is another option. need to test properly # record_rdr = agate.csv_py2.reader(records_str_rdr) # df = pd.DataFrame(list(record_rdr), dtype=str) # df = df.add_prefix('_') # record_rdr = csv.reader(records_str_rdr) # df = pd.DataFrame(list(record_rdr), dtype=str) # df = df.add_prefix('_') elif 'Stats' in event: self.bytes_scanned += event['Stats']['Details']['BytesScanned'] self.bytes_processed += event['Stats']['Details']['BytesProcessed'] self.bytes_returned += event['Stats']['Details']['BytesReturned'] # print("{} Stats Event: bytes scanned: {}, bytes processed: {}, bytes returned: {}" # .format(timeit.default_timer(), self.bytes_scanned, self.bytes_processed, self.bytes_returned)) elif 'Progress' in event: pass # print("{} Progress Event".format(timeit.default_timer())) elif 'End' in event: # print("{} End Event".format(timeit.default_timer())) if records_str_rdr.tell() > 0: records_str_rdr.seek(0) df = pd.read_csv(records_str_rdr, header=None, prefix='_', dtype=numpy.str, engine='c', quotechar='"', na_filter=False, compression=None, low_memory=False) records_str_rdr.flush() records_str_rdr.close() yield df else: yield pd.DataFrame() return elif 'Cont' in event: pass # print("{} Cont Event".format(timeit.default_timer())) elif 'RequestLevelError' in event: raise Exception(event) # print("{} Cont Event".format(timeit.default_timer())) else: raise Exception("Unrecognized event {}".format(event)) def parse_file(self): #try: if self.input is Format.CSV: if self.table_data and len(self.table_data.getvalue()) > 0: ip_stream = cStringIO.StringIO(self.table_data.getvalue().decode('utf-8')) elif os.path.exists(self.table_local_file_path): ip_stream = self.table_local_file_path else: return self.time_to_first_record_response = self.time_to_last_record_response = self.timer.elapsed() #print("Printing ipstream:") #print(ip_stream.getvalue()) for df in pd.read_csv(ip_stream, delimiter='|', header=None, prefix='_', dtype=numpy.str, engine='c', quotechar='"', na_filter=False, compression=None, low_memory=False, skiprows=1, chunksize=10 ** 7): # Get read bytes self.bytes_returned += ip_stream.tell() # drop last column since the line separator | creates a new empty column at the end of every record df_col_names = list(df) last_col = df_col_names[-1] df.drop(last_col, axis=1, inplace=True) yield df elif self.input is Format.PARQUET: table = pq.read_table(self.table_data) self.table_data = None yield table.to_pandas() else: raise Exception("Unrecognized input type '{}'".format(self.input)) #except Exception as e: # print("can not read table data at with error {}".format(e.message)) # raise e def close(self): """Closes the s3 event stream :return: None """ if self.event_stream: self.event_stream.close() if self.table_data: self.table_data.close() def save_table(self): """ Saves the table data to disk :return: """ if self.table_data: proj_dir = os.environ['PYTHONPATH'].split(":")[0] table_loc = os.path.join(proj_dir, TABLE_STORAGE_LOC) create_dirs(table_loc) self.table_local_file_path = os.path.join(table_loc, self.s3key) if not os.path.exists(self.table_local_file_path): create_file_dirs(self.table_local_file_path) with open(self.table_local_file_path, 'w') as table_file: table_file.write(self.table_data.getvalue()) @staticmethod def calculate_num_http_requests(table_data, config): if table_data is not None and len(table_data.getvalue()): shard_max_size = config.multipart_threshold return math.ceil(len(table_data.getvalue()) / (1.0 * shard_max_size)) return 1
class Cursor(object): """Represents a database cursor for managing the context of a fetch operation. Intended to be modelled after the Python DB API. All it supports at present is select, taking a s3 key and sql string to execute. The rows returned from s3 are returned as an iterator. Importantly supports streaming of records which gives more control over simply passing Python data structures around. """ def __init__(self, s3): self.s3 = s3 self.s3key = None self.s3sql = None self.need_s3select = True self.event_stream = None self.table_local_file_path = None self.timer = Timer() self.time_to_first_record_response = None self.time_to_last_record_response = None self.query_bytes = 0 self.bytes_scanned = 0 self.bytes_processed = 0 self.bytes_returned = 0 self.num_http_get_requests = 0 self.table_data = None def select(self, s3key, s3sql): """Creates a select cursor :param s3sql: the sql to execute :param s3key: the s3 key to select against :return: the cursor """ self.s3key = s3key self.s3sql = s3sql # TODO:only simple SQL queries are considered. Nested and complex queries will need a lot of work to handle self.need_s3select = not (s3sql.lower().replace(';', '').strip() == 'select * from s3object') # There doesn't seem to be a way to capture the bytes sent to s3, but we can use this for comparison purposes self.query_bytes = len(self.s3key.encode('utf-8')) + len( self.s3sql.encode('utf-8')) return self def execute(self): """Executes the fetch operation. This is different to the DB API as it returns an iterable. Of course we could model that API more precisely in future. :return: An iterable of the records fetched """ # print("Executing select_object_content") self.timer.start() if not self.need_s3select: proj_dir = os.environ['PYTHONPATH'].split(":")[0] table_loc = os.path.join(proj_dir, TABLE_STORAGE_LOC) if not os.path.exists(table_loc): os.makedirs(table_loc) self.table_local_file_path = os.path.join(table_loc, self.s3key) if not os.path.exists( self.table_local_file_path) or not USE_CACHED_TABLES: config = TransferConfig(multipart_chunksize=8 * MB, multipart_threshold=8 * MB) self.table_data = io.BytesIO() self.s3.download_fileobj(Bucket=S3_BUCKET_NAME, Key=self.s3key, Fileobj=self.table_data, Config=config) self.num_http_get_requests = Cursor.calculate_num_http_requests( self.table_data, config) return self.parse_file() else: # Note: # # CSV files use | as a delimiter and have a trailing delimiter so record delimiter is |\n # # NOTE: As responses are chunked the file headers are only returned in the first chunk. # We ignore them for now just because its simpler. It does mean the records are returned as a list # instead of a dict though (can change in future). # response = self.s3.select_object_content( Bucket=S3_BUCKET_NAME, Key=self.s3key, ExpressionType='SQL', Expression=self.s3sql, InputSerialization={ 'CSV': { 'FileHeaderInfo': 'Use', 'RecordDelimiter': '|\n', 'FieldDelimiter': '|' } }, OutputSerialization={'CSV': {}}) self.event_stream = response['Payload'] self.num_http_get_requests = 1 return self.parse_event_stream() def parse_event_stream(self): """Generator that hands out records from the event stream lazily :return: Generator of the records returned from s3 """ prev_record_str = None # first_record = True for event in self.event_stream: if 'Records' in event: elapsed_time = self.timer.elapsed() if self.time_to_first_record_response is None: self.time_to_first_record_response = elapsed_time self.time_to_last_record_response = elapsed_time records_str = event['Records']['Payload'].decode('utf-8') # print("{} Records Event {}".format(timeit.default_timer(), records_str)) # print("{} Records Event".format(timeit.default_timer())) # Note that the select_object_content service may return partial chunks so we cant simply pass the # str to the csv reader. We need to examine the string itself to see if it's an incomplete record # (it won't end with a newline if its incomplete) records_str_rdr = io.StringIO(records_str) for record_str in records_str_rdr: # Check record ends with newline (excluding an escaped newline) if record_str.endswith('\n') and not record_str.endswith( '\\n'): # It's a complete record if prev_record_str is not None: # There was an incomplete record present in the last payload # Append the current record to the previous incomplete record # print("Appending: {} {}".format(prev_record_str, record_str)) record_str = prev_record_str + record_str prev_record_str = None # print("Complete: {}".format(record_str)) # Parse CSV record_rdr = csv.reader([record_str]) if PYTHON_3: record = next(record_rdr) else: record = record_rdr.next() # print("Parsed: {}".format(record)) # if first_record: # # This will be the headers, keep them around # header = record # first_record = False # else: # record_dict = dict(zip(header, record)) # yield record_dict # TODO: Not sure how to handle timers exactly, while time will be taken yielding it may # genuinely be time that this method is waiting for data from s3. May be a case of measuring # both, something to think about. # self.timer.stop() yield record # self.timer.start() else: # It's an incomplete record, save for next iteration # print("Incomplete: {}".format(record_str)) prev_record_str = record_str elif 'Stats' in event: self.bytes_scanned += event['Stats']['Details']['BytesScanned'] self.bytes_processed += event['Stats']['Details'][ 'BytesProcessed'] self.bytes_returned += event['Stats']['Details'][ 'BytesReturned'] # print("{} Stats Event: bytes scanned: {}, bytes processed: {}, bytes returned: {}" # .format(timeit.default_timer(), self.bytes_scanned, self.bytes_processed, self.bytes_returned)) elif 'Progress' in event: pass # print("{} Progress Event".format(timeit.default_timer())) elif 'End' in event: # print("{} End Event".format(timeit.default_timer())) return elif 'Cont' in event: pass # print("{} Cont Event".format(timeit.default_timer())) elif 'RequestLevelError' in event: raise Exception(event) # print("{} Cont Event".format(timeit.default_timer())) else: raise Exception("Unrecognized event {}".format(event)) def parse_file(self): try: if self.table_data and len(self.table_data.getvalue()) > 0: ip_stream = io.StringIO( self.table_data.getvalue().decode('utf-8')) elif os.path.exists(self.table_local_file_path): ip_stream = open(self.table_local_file_path, 'r') else: return self.time_to_first_record_response = self.time_to_last_record_response = self.timer.elapsed( ) # with open(self.table_local_file_path, 'r', buffering=(2 << 16) + 8) as table_file: header_row = True for t in ip_stream: if header_row: header_row = False continue tup = t.split('|')[:-1] yield tup except Exception as e: print("can not open table file at {} with error {}".format( self.table_local_file_path, e.message)) raise e finally: ip_stream.close() def close(self): """Closes the s3 event stream :return: None """ if self.event_stream: self.event_stream.close() if self.table_data: self.table_data.close() def save_table(self): """ Saves the table data to disk :return: """ if self.table_data: proj_dir = os.environ['PYTHONPATH'].split(":")[0] table_loc = os.path.join(proj_dir, TABLE_STORAGE_LOC) create_dirs(table_loc) self.table_local_file_path = os.path.join(table_loc, self.s3key) if not os.path.exists(self.table_local_file_path): create_file_dirs(self.table_local_file_path) with open(self.table_local_file_path, 'w') as table_file: table_file.write(self.table_data.getvalue()) @staticmethod def calculate_num_http_requests(table_data, config): if table_data is not None and len(table_data.getvalue()): shard_max_size = config.multipart_threshold return math.ceil( len(table_data.getvalue()) / (1.0 * shard_max_size)) return 1
def run(use_pandas, secure, use_native): print("Cursor | Settings {}".format({ 'use_pandas': use_pandas, 'secure': secure, 'use_native': use_native })) key = 'sf1000-lineitem/lineitem_27.csv' sql = 'select * from S3Object limit 10000' boto3.set_stream_logger('', logging.DEBUG) if secure: cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10) session = Session() s3 = session.client('s3', config=cfg) else: cfg = Config(region_name="us-east-1", parameter_validation=False, max_pool_connections=10, s3={'payload_signing_enabled': False}) session = Session() s3 = session.client('s3', use_ssl=False, verify=False, config=cfg) if use_pandas and not use_native: cur = PandasCursor(s3).select(key, sql) dfs = cur.execute() rows_returned = 0 for df in dfs: rows_returned += len(df) print("{}".format({ 'elapsed_time': round(cur.timer.elapsed(), 5), 'rows_returned': rows_returned, 'query_bytes': cur.query_bytes, 'bytes_scanned': cur.bytes_scanned, 'bytes_processed': cur.bytes_processed, 'bytes_returned': "{} ({} MB / {} GB)".format( cur.bytes_returned, round(float(cur.bytes_returned) / 1000000.0, 5), round(float(cur.bytes_returned) / 1000000000.0, 5)), 'bytes_returned_per_sec': "{} ({} MB / {} GB)".format( round(float(cur.bytes_returned) / cur.timer.elapsed(), 5), round( float(cur.bytes_returned) / cur.timer.elapsed() / 1000000, 5), round( float(cur.bytes_returned) / cur.timer.elapsed() / 1000000000, 5)), 'time_to_first_record_response': None if cur.time_to_first_record_response is None else round( cur.time_to_first_record_response, 5), 'time_to_last_record_response': None if cur.time_to_last_record_response is None else round( cur.time_to_last_record_response, 5) })) elif use_pandas and use_native: closure = {'df': None, 'rows_returned': 0} def on_data(data): # print("|||") # print(type(data)) # print(data) # print("|||") # if closure['df'] is None: # closure['df'] = pd.DataFrame(data) # else: # closure['df'] = pd.concat([closure['df'], pd.DataFrame(data)], ignore_index=True, copy=False) closure['rows_returned'] += len(pd.DataFrame(data)) cur = NativeCursor(scan).select(key, sql) cur.execute(on_data) rows_returned = closure['rows_returned'] print("{}".format({ 'elapsed_time': round(cur.timer.elapsed(), 5), 'rows_returned': rows_returned, 'query_bytes': cur.query_bytes, 'bytes_scanned': cur.bytes_scanned, 'bytes_processed': cur.bytes_processed, 'bytes_returned': "{} ({} MB / {} GB)".format( cur.bytes_returned, round(float(cur.bytes_returned) / 1000000.0, 5), round(float(cur.bytes_returned) / 1000000000.0, 5)), 'bytes_returned_per_sec': "{} ({} MB / {} GB)".format( round(float(cur.bytes_returned) / cur.timer.elapsed(), 5), round( float(cur.bytes_returned) / cur.timer.elapsed() / 1000000, 5), round( float(cur.bytes_returned) / cur.timer.elapsed() / 1000000000, 5)), 'time_to_first_record_response': None if cur.time_to_first_record_response is None else round( cur.time_to_first_record_response, 5), 'time_to_last_record_response': None if cur.time_to_last_record_response is None else round( cur.time_to_last_record_response, 5) })) else: timer = Timer() timer.start() cur = Cursor(s3).select(key, sql) rows = cur.execute() for _ in rows: pass print(timer.elapsed()) print("Cursor | Done")
class NativeCursor(object): """Represents a database cursor for managing the context of a fetch operation. Intended to be modelled after the Python DB API. All it supports at present is select, taking a s3 key and sql string to execute. The rows returned from s3 are returned as an iterator. Importantly supports streaming of records which gives more control over simply passing Python data structures around. """ def __init__(self, fast_s3): self.fast_s3 = fast_s3 self.s3key = None self.s3sql = None self.timer = Timer() self.time_to_first_record_response = None self.time_to_last_record_response = None self.query_bytes = 0 self.bytes_scanned = 0 self.bytes_processed = 0 self.bytes_returned = 0 self.num_http_get_requests = 0 def select(self, s3key, s3sql): """Creates a select cursor :param s3sql: the sql to execute :param s3key: the s3 key to select against :return: the cursor """ self.s3key = s3key self.s3sql = s3sql # There doesn't seem to be a way to capture the bytes sent to s3, but we can use this for comparison purposes self.query_bytes = len(self.s3key.encode('utf-8')) + len( self.s3sql.encode('utf-8')) return self def execute(self, on_data): """Executes the fetch operation. This is different to the DB API as it returns an iterable. Of course we could model that API more precisely in future. :return: An iterable of the records fetched """ # print("Executing select_object_content") # self.last_df = None def on_records(np_array): # print("|||") # print(type(np_array)) # print(np_array) # print("|||") elapsed_time = self.timer.elapsed() if self.time_to_first_record_response is None: self.time_to_first_record_response = elapsed_time self.time_to_last_record_response = elapsed_time df = pd.DataFrame(np_array) df = df.add_prefix('_') # self.last_df = df on_data(df) self.timer.start() self.fast_s3.execute(self.s3key, self.s3sql, on_records) self.bytes_scanned = self.fast_s3.get_bytes_scanned() self.bytes_processed = self.fast_s3.get_bytes_processed() self.bytes_returned = self.fast_s3.get_bytes_returned()