def start(self):
    self.op_metrics.timer_start()

    it = IndexedTuple.build_default(self.col_defs)

    if self.log_enabled:
        print("{}('{}') | Sending field names: {}".format(
            self.__class__.__name__, self.name, it.field_names()))

    self.send(TupleMessage(Tuple(it.field_names())), self.consumers)

    for i in range(0, self.num_rows):
        if self.is_completed():
            break

        self.op_metrics.rows_returned += 1

        t = Tuple()
        for col_def in self.col_defs:
            col_val = col_def.generate()
            t.append(col_val)

        if self.log_enabled:
            print("{}('{}') | Sending field values: {}".format(
                self.__class__.__name__, self.name, t))

        self.send(TupleMessage(t), self.consumers)

    if not self.is_completed():
        self.complete()

    self.op_metrics.timer_stop()

def complete(self):
    """When all producers complete, the top-k tuples are passed to the next operators.

    :return: None
    """
    if not self.use_pandas:
        # If the number of tuples beyond the cut-off value is less than k, we need to add
        # some tuples from the sample set
        if len(self.heap) < self.max_tuples:
            self.on_receive([TupleMessage(t) for t in self.sample_tuples], self.name)

        for t in self.heap.get_topk(self.max_tuples, sort=True):
            if self.is_completed():
                break
            self.send(TupleMessage(t.tuple), self.consumers)

        self.heap.clear()
    else:
        if self.sort_expression.sort_order == 'ASC':
            self.global_topk_df = self.global_topk_df \
                .nsmallest(self.max_tuples, self.sort_expression.col_index) \
                .head(self.max_tuples)
        elif self.sort_expression.sort_order == 'DESC':
            self.global_topk_df = self.global_topk_df \
                .nlargest(self.max_tuples, self.sort_expression.col_index) \
                .head(self.max_tuples)

        self.send(self.global_topk_df, self.consumers)

    super(TopKTableScan, self).complete()

    self.op_metrics.timer_stop()

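# Illustrative sketch (standalone, not part of the operator): the pandas branch above keeps
# only the k best rows via DataFrame.nsmallest/nlargest; note nsmallest(k) already returns at
# most k rows, so the trailing .head(k) is a no-op safeguard. The column name 'price' and the
# data below are hypothetical.
#
# import pandas as pd
#
# df = pd.DataFrame({'price': [5.0, 1.0, 9.0, 3.0, 7.0]})
# top3_asc = df.nsmallest(3, 'price')   # rows with the 3 smallest prices (sort_order 'ASC')
# top3_desc = df.nlargest(3, 'price')   # rows with the 3 largest prices (sort_order 'DESC')
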
def execute_py_query(op):
    cur = Cursor(op.s3).select(op.s3key, op.s3sql)
    tuples = cur.execute()

    first_tuple = True
    for t in tuples:
        if op.is_completed():
            break

        op.op_metrics.rows_returned += 1

        if first_tuple:
            # Create and send the record field names
            it = IndexedTuple.build_default(t)
            first_tuple = False

            if op.log_enabled:
                print("{}('{}') | Sending field names: {}".format(
                    op.__class__.__name__, op.name, it.field_names()))

            op.send(TupleMessage(Tuple(it.field_names())), op.consumers)

        # if op.log_enabled:
        #     print("{}('{}') | Sending field values: {}".format(op.__class__.__name__, op.name, t))

        op.send(TupleMessage(Tuple(t)), op.consumers)

    return cur

def __on_receive_tuple(self, tuple_, producer_name):
    """Event handler for a received tuple. The first tuple seen is the field names; data
    tuples are routed to a single consumer chosen by the mapped field's value.

    :param tuple_: The received tuple
    :param producer_name: The producer the tuple came from
    :return: None
    """
    if self.field_names is None:
        self.field_names = tuple_
        self.send(TupleMessage(tuple_), self.consumers)
        self.producers_received[producer_name] = True
    else:
        if producer_name not in self.producers_received.keys():
            # Will be field names, skip
            self.producers_received[producer_name] = True
        else:
            it = IndexedTuple.build(tuple_, self.field_names)

            # Route to exactly one consumer: equal keys always map to the same consumer
            idx = int(it[self.map_field_name]) % len(self.consumers)

            self.op_metrics.rows_mapped += 1

            self.send(TupleMessage(tuple_), [self.consumers[idx]])

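# Illustrative sketch (standalone, hypothetical names): the map operator above partitions
# tuples across consumers by taking the mapped field's value modulo the number of consumers,
# so tuples with equal keys always land on the same downstream operator.
#
# consumers = ['consumer_0', 'consumer_1', 'consumer_2']
#
# def route(key_value, consumers):
#     # Deterministic partition: equal keys -> same consumer
#     return consumers[int(key_value) % len(consumers)]
#
# assert route(7, consumers) == 'consumer_1'
# assert route(7, consumers) == route(7, consumers)
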
def on_producer_completed(self, producer_name):
    """Event handler for a producer completion event.

    :param producer_name: The producer that completed.
    :return: None
    """
    if producer_name in self.producer_completions.keys():
        self.producer_completions[producer_name] = True

    if self.use_pandas:
        if not self.is_completed() and all(self.producer_completions.values()):
            if len(self.agg_df) > 0:
                self.send(DataFrameMessage(self.agg_df.agg(['sum'])), self.consumers)
            else:
                self.send(DataFrameMessage(pd.DataFrame()), self.consumers)
    else:
        if not self.is_completed() and all(self.producer_completions.values()):
            # Build and send the field names
            field_names = self.__build_field_names()
            self.send(TupleMessage(Tuple(field_names)), self.consumers)

            # Send the field values, if there are any
            if self.__expression_contexts is not None:
                field_values = self.__build_field_values()
                self.send(TupleMessage(Tuple(field_values)), self.consumers)

    Operator.on_producer_completed(self, producer_name)

def on_producer_completed(self, producer_name):
    """Handles the event where a producer has completed producing all the tuples it will
    produce. Once this occurs the tuples can be sent to consumers downstream.

    :param producer_name: The producer that has completed
    :return: None
    """
    if producer_name in self.producer_completions.keys():
        self.producer_completions[producer_name] = True
    else:
        raise Exception("Unrecognized producer {} has completed".format(producer_name))

    is_all_producers_done = all(self.producer_completions.values())
    if not is_all_producers_done:
        return

    if not self.use_pandas:
        # Send the field names
        lt = IndexedTuple.build_default(self.group_field_names + self.aggregate_expressions)
        self.send(TupleMessage(Tuple(lt.field_names())), self.consumers)

        for group_tuple, group_aggregate_contexts in self.group_contexts.items():
            if self.is_completed():
                break

            # Convert the aggregate contexts to their results
            group_fields = list(group_tuple)
            group_aggregate_values = list(v.result for v in group_aggregate_contexts.values())

            t_ = group_fields + group_aggregate_values
            self.send(TupleMessage(Tuple(t_)), self.consumers)
    else:
        # For a group-by reducer, aggregate one more time.
        if not self.is_completed() and len(self.producers) > 1:
            self.aggregate_df = self.pd_expr(self.aggregate_df)

        if not self.is_completed() and self.aggregate_df is not None:
            self.aggregate_df.reset_index(drop=True, inplace=True)

            # if self.log_enabled:
            #     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            #         print("{}('{}') | Sending grouped field values: \n{}"
            #               .format(self.__class__.__name__, self.name, self.aggregate_df))

            # self.send(TupleMessage(Tuple(list(self.aggregate_df))), self.consumers)
            self.send(DataFrameMessage(self.aggregate_df), self.consumers)

            del self.aggregate_df

    Operator.on_producer_completed(self, producer_name)

def __on_receive_tuple(self, tuple_, producer_name):
    """Handles the receipt of a tuple. The first tuple received is taken to be the field
    names and is forwarded; subsequent tuples are forwarded as long as they are not a
    duplicate header row.

    :param tuple_: The received tuple
    :return: None
    """
    if not self.field_names:
        # Collect and send field names through
        self.field_names = tuple_
        self.send(TupleMessage(tuple_), self.consumers)
    elif not is_header(tuple_):
        self.send(TupleMessage(tuple_), self.consumers)

def download_part(self, part, records_queue, stats):
    print('Started downloading part {}'.format(part))

    # Restrict this worker's scan to its slice of the partitioning key's range
    part_range = self.ranges[part]
    part_sql = self.s3sql + ' and CAST({} AS int) >= {} AND CAST({} AS int) <= {}'.format(
        self.partitioning_key, part_range[0], self.partitioning_key, part_range[1])

    op_metrics = SQLTableScanMetrics()
    op_metrics.timer_start()

    cur = Cursor(self.query_plan.s3).select(self.s3key, part_sql)
    tuples = cur.execute()

    op_metrics.query_bytes = cur.query_bytes
    op_metrics.time_to_first_response = op_metrics.elapsed_time()

    for t in tuples:
        op_metrics.rows_returned += 1
        tuple_msg = TupleMessage(Tuple(t))
        records_queue.append(tuple_msg)

    del tuples

    op_metrics.bytes_scanned = cur.bytes_scanned
    op_metrics.bytes_processed = cur.bytes_processed
    op_metrics.bytes_returned = cur.bytes_returned
    op_metrics.time_to_first_record_response = cur.time_to_first_record_response
    op_metrics.time_to_last_record_response = cur.time_to_last_record_response
    op_metrics.timer_stop()

    stats[part] = op_metrics

    print('Finished downloading part {} read {} records'.format(part, op_metrics.rows_returned))

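# Illustrative sketch (standalone): the range predicate appended to part_sql above partitions
# an S3 Select scan by a key column. A minimal version with boto3, assuming a CSV object with
# a header row; the bucket, key, and 'orderkey' column are hypothetical.
#
# import boto3
#
# def select_part(bucket, key, lo, hi):
#     s3 = boto3.client('s3')
#     sql = ("SELECT * FROM S3Object s "
#            "WHERE CAST(s.orderkey AS int) >= {} AND CAST(s.orderkey AS int) <= {}").format(lo, hi)
#     resp = s3.select_object_content(
#         Bucket=bucket, Key=key,
#         ExpressionType='SQL', Expression=sql,
#         InputSerialization={'CSV': {'FileHeaderInfo': 'Use'}},
#         OutputSerialization={'CSV': {}})
#     for event in resp['Payload']:
#         if 'Records' in event:
#             yield event['Records']['Payload']  # raw CSV bytes for this part
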
def download_part(self, part, part_key, records_queue, stats):
    print('Started downloading part {} key {}'.format(part, part_key))

    op_metrics = SQLTableScanMetrics()
    op_metrics.timer_start()

    cur = Cursor().select(part_key, self.s3sql)
    tuples = cur.execute()

    op_metrics.query_bytes = cur.query_bytes
    op_metrics.time_to_first_response = op_metrics.elapsed_time()

    for t in tuples:
        op_metrics.rows_returned += 1
        tuple_msg = TupleMessage(Tuple(t))
        records_queue.append(tuple_msg)

    del tuples

    op_metrics.bytes_scanned = cur.bytes_scanned
    op_metrics.bytes_processed = cur.bytes_processed
    op_metrics.bytes_returned = cur.bytes_returned
    op_metrics.time_to_first_record_response = cur.time_to_first_record_response
    op_metrics.time_to_last_record_response = cur.time_to_last_record_response
    op_metrics.timer_stop()

    stats[part] = op_metrics

    print('Finished downloading part {} read {} records'.format(part, op_metrics.rows_returned))

def join_field_names(self):
    """Examines the collected field names and joins them into a single list, left field names
    followed by right field names. The joined field names tuple is then sent.

    :return: None
    """
    joined_field_names = []

    # We can only emit field name tuples if we received tuples for both sides of the join
    if self.__l_field_names is not None and self.__r_field_names is not None:
        for field_name in self.__l_field_names:
            joined_field_names.append(field_name)

        for field_name in self.__r_field_names:
            joined_field_names.append(field_name)

        if self.log_enabled:
            print("{}('{}') | Sending field names [{}]".format(
                self.__class__.__name__, self.name, {'field_names': joined_field_names}))

        self.send(TupleMessage(Tuple(joined_field_names)), self.consumers)

def nested_loop(self):
    """Performs the join on data tuples using a nested loop joining algorithm. The joined
    tuples are each sent. Allows for the loop to be broken if the operator completes while
    executing.

    :return: None
    """
    # The join column indices are loop invariants, so resolve them once up front
    l_field_name_index = self.__l_field_names.index(self.join_expr.l_field)
    r_field_name_index = self.__r_field_names.index(self.join_expr.r_field)

    for l_tuple in self.__l_tuples:
        if self.is_completed():
            break

        for r_tuple in self.__r_tuples:
            if self.is_completed():
                break

            if l_tuple[l_field_name_index] == r_tuple[r_field_name_index]:
                t = l_tuple + r_tuple

                if self.log_enabled:
                    print("{}('{}') | Sending field values [{}]".format(
                        self.__class__.__name__, self.name, {'data': t}))

                self.op_metrics.rows_joined += 1
                self.send(TupleMessage(Tuple(t)), self.consumers)

def join_field_names(self):
    """Examines the collected field names and joins them into a single list, build-side field
    names followed by tuple-side field names. The joined field names tuple is then sent.

    :return: None
    """
    joined_field_names = []

    # We can only emit a field names tuple if we received field names for both sides of the
    # join; we may not always get them, as some reads may return an empty record set
    if self.build_field_names is not None and self.tuple_field_names is not None:
        for field_name in self.build_field_names:
            joined_field_names.append(field_name)

        for field_name in self.tuple_field_names:
            joined_field_names.append(field_name)

        if self.log_enabled:
            print("{} | {}('{}') | Sending field names [{}]".format(
                time.time(), self.__class__.__name__, self.name,
                {'field_names': joined_field_names}))

        self.send(TupleMessage(Tuple(joined_field_names)), self.consumers)

def start(self):
    self.op_metrics.timer_start()

    if self.parts == 1:
        self.records, part, part_op_metrics = download_part_local(
            self.s3sql, 0, self.s3key, self.records, self.worker_metrics)
        self.worker_metrics[part] = part_op_metrics
    else:
        # Each remote task yields three object refs: the records, the part id and the
        # per-part metrics (download_part_remote is declared with multiple return values)
        result_ids = [download_part_remote.remote(self.s3sql, part,
                                                  self.get_part_key('sf1000-lineitem', part))
                      for part in range(self.parts)]

        for result_id, part_id, part_op_metrics_id in result_ids:
            res_records = ray.get(result_id)
            part = ray.get(part_id)
            part_metrics = ray.get(part_op_metrics_id)

            self.worker_metrics[part] = part_metrics
            self.records.append(res_records)

            print('got {} records from part {}'.format(len(res_records), part))
            # self.send(msg, self.consumers)

        self.records = np.vstack(self.records)

    print("All parts finished")
    print('got {} records'.format(len(self.records)))

    # Note: only the first 10 records are forwarded downstream
    for rec in self.records[0:10]:
        self.send(TupleMessage(Tuple(rec)), self.consumers)

    self.complete()

    self.op_metrics.timer_stop()
    self.print_stats(to_file=self.s3key + '.' + str(self.parts) + '.stats.txt')

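# Illustrative sketch (standalone, hypothetical task body): the fan-out above relies on a Ray
# task declared to return three values, so each .remote() call yields three object refs that
# can be fetched independently with ray.get(). The keyword is num_returns in recent Ray
# releases (older releases name it num_return_vals).
#
# import ray
#
# ray.init(ignore_reinit_error=True)
#
# @ray.remote(num_returns=3)
# def download_part_sketch(part):
#     records = [[part, 'a'], [part, 'b']]  # stand-in for downloaded rows
#     metrics = {'rows': len(records)}      # stand-in for per-part metrics
#     return records, part, metrics
#
# refs = [download_part_sketch.remote(p) for p in range(2)]
# for records_ref, part_ref, metrics_ref in refs:
#     print(ray.get(part_ref), ray.get(metrics_ref))
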
def __send_field_names(self, tuple_):
    """Sends the field names tuple to consumers

    :param tuple_: The field names tuple
    :return: None
    """
    self.send(TupleMessage(tuple_), self.consumers)

def __on_receive_tuple(self, tuple_, producer_name):
    """Event handler for a received tuple

    :param tuple_: The received tuple
    :param producer_name: The producer the tuple came from
    :return: None
    """
    assert len(tuple_) > 0

    if self.field_names is None:
        self.field_names = tuple_
        self.producers_received[producer_name] = True
        self.send(TupleMessage(tuple_), self.consumers)
    else:
        if producer_name not in self.producers_received.keys():
            # This will be the field names tuple, skip it
            self.producers_received[producer_name] = True
        else:
            self.send(TupleMessage(tuple_), self.consumers)

def send_field_values(self, tuple_):
    """Sends a field values tuple

    :param tuple_: The tuple
    :return: None
    """
    if self.log_enabled:
        print("{}('{}') | Sending field values [{}]".format(
            self.__class__.__name__, self.name, {'data': tuple_}))

    self.send(TupleMessage(Tuple(tuple_)), self.consumers)

def send_field_names(self, tuple_):
    """Sends the field names tuple

    :param tuple_: The tuple
    :return: None
    """
    # Create and send the record field names
    lt = IndexedTuple.build_default(tuple_)
    labels = Tuple(lt.field_names())

    if self.log_enabled:
        print("{}('{}') | Sending field names [{}]".format(
            self.__class__.__name__, self.name, {'field_names': labels}))

    self.send(TupleMessage(labels), self.consumers)

def join_field_values(self):
    """Performs the join on data tuples using a hash join algorithm: each probe-side tuple is
    looked up in the prebuilt hash table and the joined tuples are each sent. Allows for the
    loop to be broken if the operator completes while executing.

    :return: None
    """
    # Check that we actually got tuple field names to join on; we may not have, as producers
    # may not have produced any
    if self.tuple_field_names is not None:
        outer_tuple_field_index = self.tuple_field_names.index(self.join_expr.r_field)

        for outer_tuple in self.tuples:
            if self.is_completed():
                break

            outer_tuple_field_value = outer_tuple[outer_tuple_field_index]
            inner_tuples = self.hashtable.get(outer_tuple_field_value, None)

            # if self.log_enabled:
            #     print("{}('{}') | Joining Outer: {} Inner: {}".format(
            #         self.__class__.__name__, self.name, outer_tuple, inner_tuples))

            if inner_tuples is not None:
                for inner_tuple in inner_tuples:
                    t = inner_tuple + outer_tuple

                    if self.log_enabled:
                        print("{} | {}('{}') | Sending field values [{}]".format(
                            time.time(), self.__class__.__name__, self.name, {'data': t}))

                    self.op_metrics.rows_joined += 1
                    self.send(TupleMessage(Tuple(t)), self.consumers)

def on_receive_tuple(self, tuple_):
    """Handles receipt of a tuple. Field names are stored and sent. Field values are placed
    into a sorted heap using the sort expressions to define the sort order.

    :param tuple_: The received tuple
    :return: None
    """
    if not self.field_names:
        # Collect and send field names through
        self.field_names = tuple_
        self.send(TupleMessage(tuple_), self.consumers)
    else:
        # Store the tuple in the sorted heap
        sortable_t = HeapSortableTuple(tuple_, self.field_names, self.sort_expressions)
        heappush(self.heap, sortable_t)

def on_producer_completed(self, producer_name):
    """Handles the event when a producer completes. When this happens the sorted tuples are
    emitted.

    :param producer_name: The producer that completed
    :return: None
    """
    # Drain the heap: tuples pop in sort order
    while self.heap:
        if self.is_completed():
            break

        t = heappop(self.heap).tuple
        self.send(TupleMessage(t), self.consumers)

    del self.heap

    Operator.on_producer_completed(self, producer_name)

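# Illustrative sketch (standalone): the sort operator above implements a streaming sort by
# wrapping each tuple in a comparable object and pushing it onto a binary heap; draining the
# heap with heappop then yields tuples in sort order. The names below are hypothetical.
#
# from heapq import heappush, heappop
#
# class SortableTuple:
#     def __init__(self, tuple_, key_index):
#         self.tuple = tuple_
#         self.key = tuple_[key_index]
#
#     def __lt__(self, other):
#         # heapq only needs __lt__ to order heap entries
#         return self.key < other.key
#
# heap = []
# for row in (('b', 2), ('a', 1), ('c', 3)):
#     heappush(heap, SortableTuple(row, 0))
#
# while heap:
#     print(heappop(heap).tuple)  # ('a', 1), then ('b', 2), then ('c', 3)
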
def on_receive_tuple(self, tuple_, _producer_name):
    if not self.field_names_index:
        self.field_names_index = IndexedTuple.build_field_names_index(tuple_)
        self.send(TupleMessage(tuple_), self.consumers)
        self.producers_received[_producer_name] = True
    else:
        if _producer_name not in self.producers_received.keys():
            # Will be field names, skip
            self.producers_received[_producer_name] = True
        else:
            if self.hashtable is None:
                self.hashtable = {}

            self.op_metrics.rows_processed += 1

            # Bucket the tuple under its join key value
            it = IndexedTuple(tuple_, self.field_names_index)
            itd = self.hashtable.setdefault(it[self.key], [])
            itd.append(tuple_)

def start(self):
    self.op_metrics.timer_start()

    if self.parts == 1:
        self.records = []
        self.worker_metrics = {}
        self.download_part(0, self.records, self.worker_metrics)
    else:
        # Note: for the child processes' appends to be visible here, self.records and
        # self.worker_metrics must be process-shareable containers (e.g. from
        # multiprocessing.Manager); plain lists and dicts are copied per process
        processes = []
        for part in range(self.parts):
            p = Process(target=self.download_part, args=(part, self.records, self.worker_metrics))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

    print("All parts finished with {} records".format(len(self.records)))

    first_tuple = True
    for msg in self.records:
        if first_tuple:
            # Create and send the record field names
            it = IndexedTuple.build_default(msg.tuple_)
            first_tuple = False

            if self.log_enabled:
                print("{}('{}') | Sending field names: {}".format(
                    self.__class__.__name__, self.name, it.field_names()))

            self.send(TupleMessage(Tuple(it.field_names())), self.consumers)

        self.send(msg, self.consumers)

    self.complete()

    self.op_metrics.timer_stop()
    self.print_stats(to_file=self.s3key + '.' + str(self.parts) + '.stats.txt')

    self.records[:] = []

def on_receive_tuple(self, tuple_):
    """Handles the receipt of a tuple. When the number of tuples reaches max it informs the
    producer to stop producing. This allows table scans to stop once the tuple limit has been
    reached. It also informs any consumers that it is done producing tuples.

    :param tuple_: The received tuple
    :return: None
    """
    if not self.first_tuple:
        self.current += 1
    else:
        # The first tuple is the field names; don't count it towards the limit
        self.first_tuple = False

    if self.current <= self.max_tuples:
        self.send(TupleMessage(tuple_), self.consumers)

        if self.current == self.max_tuples:
            # Limit reached: stop upstream production and complete
            if not self.is_completed():
                self.complete()

def execute_pandas_query(op):
    cur = PandasCursor(op.s3).select(op.s3key, op.s3sql)
    dfs = cur.execute()

    op.op_metrics.query_bytes = cur.query_bytes
    op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()

    first_tuple = True
    for df in dfs:
        assert len(df) > 0

        if first_tuple:
            assert len(df.columns.values) > 0
            op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
            first_tuple = False

            if op.log_enabled:
                print("{}('{}') | Sending field names: {}".format(
                    op.__class__.__name__, op.name, df.columns.values))

        op.op_metrics.rows_returned += len(df)
        op.send(df, op.consumers)

    return cur

def on_numpy_array(np_array):
    df = pd.DataFrame(np_array)

    if closure['first_tuple']:
        assert len(df.columns.values) > 0
        op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
        closure['first_tuple'] = False

        if op.log_enabled:
            print("{}('{}') | Sending field names: {}".format(
                op.__class__.__name__, op.name, df.columns.values))

        op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()

    op.op_metrics.rows_returned += len(df)

    if op.log_enabled:
        print("{}('{}') | Sending field values:".format(op.__class__.__name__, op.name))
        print(df)

    op.send(df, op.consumers)

def join_field_values(self):
    """Performs the join on data tuples using a hash join algorithm. The joined tuples are
    each sent. Allows for the loop to be broken if the operator completes while executing.

    :return: None
    """
    # Determine which direction the hash join should run. The larger relation should remain
    # as a list and the smaller relation should be hashed. If either of the relations is
    # empty then just return.
    if len(self.l_tuples) == 0 or len(self.r_tuples) == 0:
        return
    elif len(self.l_tuples) > len(self.r_tuples):
        l_to_r = True
    else:
        l_to_r = False

    if l_to_r:
        outer_tuples_list = self.l_tuples
        inner_tuples_list = self.r_tuples
        inner_tuple_field_name = self.join_expr.r_field
        inner_tuple_field_names = self.r_field_names
        outer_tuple_field_index = self.l_field_names.index(self.join_expr.l_field)
    else:
        outer_tuples_list = self.r_tuples
        inner_tuples_list = self.l_tuples
        inner_tuple_field_name = self.join_expr.l_field
        inner_tuple_field_names = self.l_field_names
        outer_tuple_field_index = self.r_field_names.index(self.join_expr.r_field)

    # Hash the tuples from the smaller set of tuples
    inner_tuples_dict = {}
    for t in inner_tuples_list:
        it = IndexedTuple.build(t, inner_tuple_field_names)
        itd = inner_tuples_dict.setdefault(it[inner_tuple_field_name], [])
        itd.append(t)

    for outer_tuple in outer_tuples_list:
        if self.is_completed():
            break

        outer_tuple_field_value = outer_tuple[outer_tuple_field_index]
        inner_tuples = inner_tuples_dict.get(outer_tuple_field_value, None)

        if inner_tuples is not None:
            for inner_tuple in inner_tuples:
                # Preserve left-to-right field order in the joined tuple
                if l_to_r:
                    t = outer_tuple + inner_tuple
                else:
                    t = inner_tuple + outer_tuple

                self.op_metrics.rows_joined += 1
                self.send(TupleMessage(Tuple(t)), self.consumers)

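# Illustrative sketch (standalone): the build/probe pattern used above. The smaller relation
# is hashed on its join key, then the larger relation probes that table; output field order
# stays left-relation-first regardless of which side was hashed. The data is hypothetical.
#
# left = [('apple', 1), ('banana', 2), ('cherry', 3)]   # larger relation: probe side
# right = [(1, 'red'), (3, 'dark red')]                 # smaller relation: build side
#
# # Build: hash the smaller relation on its key (field index 0)
# table = {}
# for r in right:
#     table.setdefault(r[0], []).append(r)
#
# # Probe: stream the larger relation, emitting left + right on key matches (field index 1)
# joined = [l + r for l in left for r in table.get(l[1], [])]
# print(joined)  # [('apple', 1, 1, 'red'), ('cherry', 3, 3, 'dark red')]
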
def on_receive_tuple(self, tuple_, producer_name):
    """Handles the receipt of a tuple. The tuple is mapped to a new tuple using the given
    projection expressions. The field names are modified according to the new field names in
    the projection expressions.

    :param producer_name: The producer the tuple came from
    :param tuple_: The received tuple
    :return: None
    """
    assert len(tuple_) > 0

    if not self.field_names_index:
        self.field_names_index = IndexedTuple.build_field_names_index(tuple_)

        # Map the old field names to the new
        projected_field_names = [e.new_field_name for e in self.project_exprs]

        if self.log_enabled:
            print("{}('{}') | Sending projected field names: from: {} to: {}".format(
                self.__class__.__name__, self.name, tuple_, projected_field_names))

        self.producers_received[producer_name] = True

        assert len(projected_field_names) == len(self.project_exprs)

        self.send(TupleMessage(Tuple(projected_field_names)), self.consumers)
    else:
        assert len(tuple_) == len(self.field_names_index)

        if producer_name not in self.producers_received.keys():
            # This will be the field names tuple, skip it
            self.producers_received[producer_name] = True
        else:
            # Perform the projection using the given expressions
            it = IndexedTuple(tuple_, self.field_names_index)
            projected_field_values = [e.expr(it) for e in self.project_exprs]

            self.op_metrics.rows_projected += 1

            if self.log_enabled:
                print("{}('{}') | Sending projected field values: from: {} to: {}".format(
                    self.__class__.__name__, self.name, tuple_, projected_field_values))

            assert len(projected_field_values) == len(self.project_exprs)

            self.send(TupleMessage(Tuple(projected_field_values)), self.consumers)

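# Illustrative sketch (standalone, hypothetical expressions): projection maps each input
# tuple to a new tuple by evaluating one expression per output field, and renames the fields
# via the expressions' new field names, mirroring the operator above.
#
# project_exprs = [
#     ('total', lambda row: row['price'] * row['qty']),
#     ('name_upper', lambda row: row['name'].upper()),
# ]
#
# row = {'name': 'widget', 'price': 2.5, 'qty': 4}
# projected_field_names = [name for name, _ in project_exprs]
# projected_field_values = [expr(row) for _, expr in project_exprs]
# print(projected_field_names, projected_field_values)  # ['total', 'name_upper'] [10.0, 'WIDGET']
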
def execute_pandas_query(op):
    # if op.use_native:
    #     cur = NativeCursor(op.fast_s3).select(op.s3key, op.s3sql)
    #     df = cur.execute()
    #
    #     op.op_metrics.query_bytes = cur.query_bytes
    #     op.op_metrics.rows_returned += len(df)
    #     op.op_metrics.bytes_returned += cur.bytes_returned
    #
    #     op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
    #     op.send(df, op.consumers)
    #
    #     return cur

    dfs = op.cur.execute()

    op.op_metrics.query_bytes = op.cur.query_bytes
    op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()

    first_tuple = True
    counter = 0
    for df in dfs:
        if first_tuple:
            assert len(df.columns.values) > 0
            op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
            first_tuple = False

            # if op.log_enabled:
            #     print("{}('{}') | Sending field names: {}"
            #           .format(op.__class__.__name__, op.name, df.columns.values))

        op.op_metrics.rows_returned += len(df)

        counter += 1
        if op.log_enabled:
            sys.stdout.write('.')
            if counter % 100 == 0:
                print("Rows {}".format(op.op_metrics.rows_returned))

        op.send(DataFrameMessage(df), op.consumers)

        # Alternative: batch small dataframes into a buffer before sending
        # buffer_ = pd.concat([buffer_, df], axis=0, sort=False, ignore_index=True, copy=False)
        # if len(buffer_) >= 8192:
        #     op.send(buffer_, op.consumers)
        #     buffer_ = pd.DataFrame()

    # if len(buffer_) > 0:
    #     op.send(buffer_, op.consumers)
    # del buffer_

    op.op_metrics.bytes_scanned = op.cur.bytes_scanned
    op.op_metrics.bytes_processed = op.cur.bytes_processed
    op.op_metrics.bytes_returned = op.cur.bytes_returned
    op.op_metrics.time_to_first_record_response = op.cur.time_to_first_record_response
    op.op_metrics.time_to_last_record_response = op.cur.time_to_last_record_response
    op.op_metrics.num_http_get_requests = op.cur.num_http_get_requests

    if not op.is_completed():
        op.complete()

    return op.cur