def sort_orderby(self, by, sort_type):
    """In-place sort of the underlying frame for ORDER BY.

    Args:
        by: list of column names to sort on, or None.
        sort_type: list of booleans, True for ASC and False for DESC,
            one per column in `by` (e.g. [True, False] -> [ASC, DESC]).
            If None, defaults to ASC for every column in `by`.

    Raises:
        KeyError: if a column in `by` is not in the projected columns.
    """
    if by is not None:
        # Bug fix: pandas requires len(ascending) == len(by); the old
        # default of [True] crashed for multi-column ORDER BY.
        if sort_type is None:
            sort_type = [True] * len(by)
        for column in by:
            if column not in self._frames.columns:
                LoggingManager().log(
                    'Can not orderby non-projected column: {}'.format(
                        column), LoggingLevel.ERROR)
                raise KeyError(
                    'Can not orderby non-projected column: {}'.format(
                        column))
        self._frames.sort_values(by, ascending=sort_type,
                                 ignore_index=True, inplace=True)
    else:
        LoggingManager().log(
            'Columns and Sort Type are required for orderby',
            LoggingLevel.WARNING)
def handle_request(transport, request_message):
    """Execute one client request and write back a framed response.

    On success the response wraps the query result batch; on any
    failure it wraps a single-row batch describing the error.
    """
    LoggingManager().log(
        'Receive request: --|' + str(request_message) + '|--')
    try:
        batch = execute_query_fetch_all(request_message)
    except Exception as e:
        LoggingManager().log(e, LoggingLevel.WARNING)
        batch = Batch(pd.DataFrame([{'error': str(e)}]))
        response = Response(status=ResponseStatus.FAIL, batch=batch)
    else:
        response = Response(status=ResponseStatus.SUCCESS, batch=batch)
    payload = response.to_json()
    # Prefix with the payload length so the client can frame a
    # potentially very large response.
    wire_bytes = (str(len(payload)) + '|' + payload).encode('ascii')
    LoggingManager().log('Response to client: --|' + str(response) + '|--\n'
                         + 'Length: ' + str(len(payload)))
    transport.write(wire_bytes)
    return response
def start_clients(client_count: int, host: str, port: int, loop,
                  stop_clients_future):
    """Start a set of eva clients and run them to completion.

    Args:
        client_count: number of clients (= connections) to open
        host: hostname of the server
        port: port where the server is running
        loop: event loop used to drive the client connections
        stop_clients_future: future for externally stopping the clients

    Returns:
        [tasks, exceptions, retries] summary of the client run.
    """
    LoggingManager().log('PID(' + str(os.getpid()) + ') attempting '
                         + str(client_count) + ' connections')

    # Get a reference to the event loop
    # loop = asyncio.get_event_loop()

    max_retry_count = 3

    # Create client tasks, one coroutine per requested connection
    client_coros = [
        start_client(loop, lambda: EvaClient(), host, port, max_retry_count)
        for i in range(client_count)
    ]

    # Start a set of clients: one outer task waiting on all client tasks
    clients = loop.create_task(
        asyncio.wait(
            [loop.create_task(client_coro) for client_coro in client_coros]))

    try:
        # NOTE(review): this rebinds the `stop_clients_future` parameter
        # to a new coroutine waiting on `clients`; the caller-supplied
        # future is never awaited here — confirm this is intentional.
        stop_clients_future = asyncio.wait([clients])
        loop.run_until_complete(stop_clients_future)
    except KeyboardInterrupt:
        LoggingManager().log("client process interrupted")
    finally:
        LoggingManager().log("client process shutdown")

    # tasks, exceptions, retries
    summary = [0, 0, 0]

    if clients.done():
        done, _ = clients.result()
        # A task ending with an exception counts as a failure; for
        # successful tasks the result is the retries left, so
        # (max_retry_count - result) is the retries consumed.
        exceptions = sum(1 for d in done if d.exception())
        retries = sum(max_retry_count - d.result()
                      for d in done if not d.exception())
        tasks = len(client_coros)
        LoggingManager().log(
            str(tasks) + ' tasks, ' + str(exceptions) + ' exceptions, '
            + str(retries) + ' retries')
        summary = [tasks, exceptions, retries]

    # Close loop
    loop.close()

    return summary
def handle_request(transport, request_message):
    """Parse, plan, execute one client request, then write the reply.

    The request text is parsed into a statement, converted to a logical
    plan, built into a physical plan, and executed; any failure yields
    a FAIL response with no batch.
    """
    LoggingManager().log(
        'Receive request: --|' + str(request_message) + '|--')
    output_batch = None
    response = None
    try:
        statement = Parser().parse(request_message)[0]
        logical_plan = StatementToPlanConvertor().visit(statement)
        physical_plan = PlanGenerator().build(logical_plan)
        output_batch = PlanExecutor(physical_plan).execute_plan()
    except Exception as e:
        LoggingManager().log(e, LoggingLevel.WARNING)
        response = Response(status=ResponseStatus.FAIL, batch=None)
    if response is None:
        response = Response(status=ResponseStatus.SUCCESS,
                            batch=output_batch)
    payload = response.to_json()
    # Length prefix lets the client frame a potentially large response.
    wire_bytes = (str(len(payload)) + '|' + payload).encode('ascii')
    LoggingManager().log('Response to client: --|' + str(response) + '|--\n'
                         + 'Length: ' + str(len(payload)))
    transport.write(wire_bytes)
    return response
def start_server(host: str, port: int, loop, socket_timeout: int,
                 stop_server_future):
    """Start the eva server and block until it is stopped.

    Server objects are asynchronous context managers.

    Args:
        host: hostname of the server
        port: port to listen on
        loop: event loop that drives the server
        socket_timeout: per-connection timeout passed to EvaServer
        stop_server_future: future for externally stopping the server
    """
    LoggingManager().log('Start Server', LoggingLevel.CRITICAL)

    # Register signal handler: external signals unwind via SystemExit
    def raiseSystemExit(_, __):
        raise SystemExit

    signals = [SIGINT, SIGTERM, SIGHUP, SIGUSR1]
    for handled_signal in signals:
        signal(handled_signal, raiseSystemExit)

    # Get a reference to the event loop
    # loop = asyncio.get_event_loop()

    # Start the eva server
    coro = loop.create_server(lambda: EvaServer(socket_timeout), host, port)
    server = loop.run_until_complete(coro)

    for socket in server.sockets:
        LoggingManager().log(
            'PID(' + str(os.getpid()) + ') serving on '
            + str(socket.getsockname()), LoggingLevel.CRITICAL)

    server_closed = loop.create_task(server.wait_closed())

    # Start the realtime status monitor
    monitor = loop.create_task(
        realtime_server_status(EvaServer, server_closed))

    try:
        # Block until the caller resolves/cancels stop_server_future
        loop.run_until_complete(stop_server_future)
    except KeyboardInterrupt:
        LoggingManager().log("Server process interrupted")
    finally:
        # Stop monitor
        monitor.cancel()
        # Close server
        server.close()
        # Stop event loop
        loop.run_until_complete(server.wait_closed())
        loop.close()
        LoggingManager().log("Successfully shutdown server.")
def init_db():
    """Ensure the catalog database exists, then create all tables."""
    engine = SQLConfig().engine
    db_url = engine.url
    if not database_exists(db_url):
        LoggingManager().log("Database does not exist, creating database.",
                             LoggingLevel.INFO)
        create_database(db_url)
    LoggingManager().log("Creating tables", LoggingLevel.INFO)
    BaseModel.metadata.create_all()
def connection_made(self, transport):
    """Record the new transport; abort it if timeouts can't be set."""
    self.transport = transport
    tag = "[ " + str(self.id) + " ]"
    if set_socket_io_timeouts(self.transport, 60, 0):
        LoggingManager().log(tag + " Connected to server")
    else:
        # Could not configure the socket -- drop the connection.
        self.transport.abort()
        LoggingManager().log(tag + " Could not set timeout")
def data_received(self, data):
    """Decode a client message; close on quit/exit, else dispatch it."""
    request_message = data.decode()
    LoggingManager().log('Request from client: --|'
                         + str(request_message) + '|--')
    if request_message not in ("quit", "exit"):
        LoggingManager().log('Handle request')
        asyncio.create_task(
            handle_request(self.transport, request_message))
    else:
        LoggingManager().log('Close client socket')
        return self.transport.close()
def visitCreateUdf(self, ctx: evaql_parser.CreateUdfContext):
    """Build a CreateUDFStatement from a CREATE UDF parse tree.

    Walks the children of the context collecting the udf name, the
    IF NOT EXISTS flag, the INPUT/OUTPUT column definitions, the udf
    type, and the implementation path.

    Returns:
        CreateUDFStatement on success, or None if handling any child
        fails.
    """
    udf_name = None
    if_not_exists = False
    input_definitions = []
    output_definitions = []
    impl_path = None
    udf_type = None

    for child in ctx.children:
        try:
            if isinstance(child, TerminalNode):
                continue
            rule_idx = child.getRuleIndex()
            if rule_idx == evaql_parser.RULE_udfName:
                udf_name = self.visit(ctx.udfName())
            elif rule_idx == evaql_parser.RULE_ifNotExists:
                if_not_exists = True
            elif rule_idx == evaql_parser.RULE_createDefinitions:
                # There should be 2 createDefinition:
                # idx 0 describing udf INPUT
                # idx 1 describing udf OUTPUT
                if len(ctx.createDefinitions()) != 2:
                    LoggingManager().log('UDF Input or Output Missing',
                                         LoggingLevel.ERROR)
                input_definitions = self.visit(ctx.createDefinitions(0))
                output_definitions = self.visit(ctx.createDefinitions(1))
            elif rule_idx == evaql_parser.RULE_udfType:
                udf_type = self.visit(ctx.udfType())
            elif rule_idx == evaql_parser.RULE_udfImpl:
                impl_path = self.visit(ctx.udfImpl()).value
        except Exception:
            # Bug fix: was `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit. Stop parsing --
            # something bad happened.
            LoggingManager().log('CREATE UDF Failed', LoggingLevel.ERROR)
            return None

    stmt = CreateUDFStatement(
        udf_name, if_not_exists, input_definitions,
        output_definitions, impl_path, udf_type)
    return stmt
def _shutdown_catalog(self):
    """Gracefully shut down the catalog manager.

    Currently this only drops the catalog database.
    """
    LoggingManager().log("Shutting catalog", LoggingLevel.INFO)
    drop_db()
def init_spark_session(self, application_name, spark_master=None):
    """Create (or reuse) a SparkSession configured from eva settings.

    :param application_name: name used for this Spark application.
    :param spark_master: A master parameter used by spark session
        builder. Use default value (None) to use system environment
        configured spark cluster. Use 'local[*]' to run on a local box.
    :return: spark_session: A spark session
    """
    eva_spark_conf = SparkConf()
    # Copy every configured pyspark property into the Spark conf.
    pyspark_config = self._config.get_value('pyspark', 'property')
    for key, value in pyspark_config.items():
        eva_spark_conf.set(key, value)

    builder = SparkSession.builder \
        .appName(application_name) \
        .config(conf=eva_spark_conf)
    if spark_master:
        builder.master(spark_master)

    # Gets an existing SparkSession or, if there is no existing one,
    # creates a new one based on the options set in this builder.
    self._session = builder.getOrCreate()

    # Keep Spark's log4j verbosity in sync with eva's logging level.
    spark_context = self._session.sparkContext
    spark_context.setLogLevel(LoggingManager().getLog4JLevel())
async def realtime_server_status(protocol, server_closed):
    """Periodically log connection/error counts for `protocol`.

    `protocol` must provide `__connections__` and `__errors__`
    attributes. Monitoring stops once `server_closed` completes or is
    cancelled.
    """
    previous_connections, previous_errors = 0, 0
    while not (server_closed.done() or server_closed.cancelled()):
        current_connections = protocol.__connections__
        current_errors = protocol.__errors__
        # Only report when something changed since the last tick.
        if (current_connections, current_errors) != \
                (previous_connections, previous_errors):
            previous_connections = current_connections
            previous_errors = current_errors
            LoggingManager().log(
                "Status: "
                + "connections: " + str(previous_connections) + " "
                + "errors: " + str(previous_errors),
                LoggingLevel.INFO)
        # Report changes every 1~s
        await asyncio.sleep(1)
def get_petastorm_column(df_column):
    """Translate an eva column descriptor into a petastorm UnischemaField.

    Returns None (and logs an error) for unrecognized column types.

    Reference:
    https://github.com/uber/petastorm/blob/master/petastorm/
    tests/test_common.py
    """
    column_type = df_column.type
    column_name = df_column.name
    column_is_nullable = df_column.is_nullable
    column_array_dimensions = df_column.array_dimensions

    if column_type == ColumnType.NDARRAY:
        return UnischemaField(column_name, np.uint8,
                              column_array_dimensions, NdarrayCodec(),
                              column_is_nullable)

    # Scalar types share the same construction pattern; dispatch on a
    # (numpy dtype, spark type) table.
    scalar_types = {
        ColumnType.INTEGER: (np.int32, IntegerType),
        ColumnType.FLOAT: (np.float64, FloatType),
        ColumnType.TEXT: (np.str_, StringType),
    }
    if column_type in scalar_types:
        numpy_type, spark_type = scalar_types[column_type]
        return UnischemaField(column_name, numpy_type, (),
                              ScalarCodec(spark_type()),
                              column_is_nullable)

    LoggingManager().log("Invalid column type: " + str(column_type),
                         LoggingLevel.ERROR)
    return None
def init_spark_session(self, application_name, spark_master=None):
    """Set up a SparkSession with eva's default Spark configuration.

    :param application_name: name used for this Spark application.
    :param spark_master: A master parameter used by spark session
        builder. Use default value (None) to use system environment
        configured spark cluster. Use 'local[*]' to run on a local box.
    :return: spark_session: A spark session
    """
    eva_spark_conf = SparkConf()
    eva_spark_conf.set('spark.logConf', 'true')
    # Enable Arrow optimization to speed up conversion between pandas
    # and spark dataframes.
    # https://docs.databricks.com/spark/latest/spark-sql/spark-pandas.html
    eva_spark_conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')

    builder = SparkSession.builder \
        .appName(application_name) \
        .config(conf=eva_spark_conf)
    if spark_master:
        builder.master(spark_master)

    # Gets an existing SparkSession or, if there is no existing one,
    # creates a new one based on the options set in this builder.
    self._session = builder.getOrCreate()

    # Mirror eva's logging level into Spark's log4j.
    self._session.sparkContext.setLogLevel(
        LoggingManager().getLog4JLevel())
def visit_select(self, statement: SelectStatement):
    """Convert a SelectStatement into a logical plan tree.

    Arguments:
        statement {SelectStatement} -- [input select statement]
    """
    video = statement.from_table
    if video is None:
        LoggingManager().log('From entry missing in select statement',
                             LoggingLevel.ERROR)
        return None

    if isinstance(video, SelectStatement):
        # Nested query: plan the inner select, then wrap its plan in a
        # derived-get operator.
        self.visit_select(video)
        inner_plan = self._plan
        self._plan = LogicalQueryDerivedGet()
        self._plan.append_child(inner_plan)
    elif isinstance(video, TableRef):
        # Plain table reference
        self.visit_table_ref(video)

    # Filter Operator
    predicate = statement.where_clause
    if predicate is not None:
        self._visit_select_predicate(predicate)

    # Projection operator
    # ToDO: add support for SELECT STAR
    select_columns = statement.target_list
    if select_columns is not None:
        self._visit_projection(select_columns)
def get_table_bindings(self, database_name: str, table_name: str,
                       column_names: List[str] = None
                       ) -> Tuple[int, List[int]]:
    """Fetch catalog bindings for a table and (optionally) its columns.

    Args:
        database_name: currently not in use
        table_name: the table that is being referred to
        column_names: the column names of the table for which bindings
            are required

    Returns:
        metadata_id of the table and a list of column ids
    """
    metadata_id = self._dataset_service.dataset_by_name(table_name)
    if column_names is None:
        return metadata_id, []
    if not isinstance(column_names, list):
        LoggingManager().log(
            "CatalogManager::get_table_binding() expected list",
            LoggingLevel.WARNING)
    column_ids = self._column_service.columns_by_dataset_id_and_names(
        metadata_id, column_names)
    return metadata_id, column_ids
def execute(self):
    """Apply matching implementation rules to the root expression.

    Matching rules are tried in descending promise order; every match
    produces a new physical group expression whose inputs are then
    scheduled for optimization, and the child groups are scheduled for
    optimization as well.
    """
    implementation_rules = RulesManager().implementation_rules
    valid_rules = []
    for rule in implementation_rules:
        if rule.top_match(self.root_expr.opr):
            valid_rules.append(rule)

    # Bug fix: `sorted()` returns a new list; the original discarded the
    # result, so rules were not actually tried in promise order (compare
    # the rewrite task, which assigns the sorted list).
    valid_rules = sorted(valid_rules, key=lambda x: x.promise(),
                         reverse=True)

    for rule in valid_rules:
        binder = Binder(self.root_expr, rule.pattern,
                        self.optimizer_context.memo)
        for match in iter(binder):
            if not rule.check(match, self.optimizer_context):
                continue
            LoggingManager().log(
                'In Optimize physical expression,'
                'Rule {} matched for {}'.format(rule, self.root_expr),
                LoggingLevel.INFO)
            after = rule.apply(match, self.optimizer_context)
            new_expr = GroupExpression(after, self.root_expr.group_id,
                                       self.root_expr.children)
            # LoggingManager().log('After rewiting {}'.format(new_expr),
            #                      LoggingLevel.INFO)
            self.optimizer_context.memo.add_group_expr(new_expr)
            # Optimize inputs for this physical expr
            self.optimizer_context.task_stack.push(
                OptimizeInputs(new_expr, self.optimizer_context))

    # Optimize the child groups
    for child_id in self.root_expr.children:
        self.optimizer_context.task_stack.push(
            OptimizeGroup(child_id, self.optimizer_context))
def test_interaction(self):
    """End-to-end check: clients connect, run, and shut down cleanly."""
    host = "0.0.0.0"
    port = 5432
    client_count = 1

    LoggingManager().setEffectiveLevel(LoggingLevel.DEBUG)

    def timeout_server():
        # need a more robust mechanism for when to cancel the future
        time.sleep(2)
        self.stop_clients_future.cancel()

    watchdog = threading.Thread(target=timeout_server)
    watchdog.daemon = True
    watchdog.start()

    summary = start_clients(client_count=client_count, host=host,
                            port=port, loop=self.loop,
                            stop_clients_future=self.stop_clients_future)

    # Every client task ran, and none of them raised.
    self.assertEqual(summary[0], client_count)
    self.assertEqual(summary[1], 0)
def create_column_metadata(col_list: List[ColumnDefinition]):
    """Create column metadata for the input parsed column list.

    This function will not commit the provided columns into the catalog
    table; it only returns an in-memory list of column objects.

    Arguments:
        col_list {List[ColumnDefinition]} -- parsed col list to be created
    """
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]

    result_list = []
    for col in col_list:
        if col is None:
            LoggingManager().log(
                "Empty column while creating column metadata",
                LoggingLevel.ERROR)
            result_list.append(col)
            # Bug fix: skip to the next column; the original fell
            # through and dereferenced `col.name` on None.
            continue
        result_list.append(
            CatalogManager().create_column_metadata(
                col.name, col.type, col.array_type, col.dimension))

    return result_list
def load(self):
    """Yield FrameBatch objects read from the underlying video file.

    Starts at `self.offset` (if set), skips frames according to
    `self.skip_frames`, stops after `self.limit` frames, and groups
    frames into batches of `self.batch_size`.
    """
    video = cv2.VideoCapture(self.video_metadata.file)
    video_start = self.offset if self.offset else 0
    video.set(cv2.CAP_PROP_POS_FRAMES, video_start)

    LoggingManager().log("Loading frames", LoggingLevel.CRITICAL)

    _, frame = video.read()
    frame_ind = video_start - 1

    info = None
    if frame is not None:
        (height, width, num_channels) = frame.shape
        info = FrameInfo(height, width, num_channels, ColorSpace.BGR)

    frames = []
    while frame is not None:
        frame_ind += 1
        eva_frame = Frame(frame_ind, frame, info)
        if self.skip_frames > 0 and frame_ind % self.skip_frames != 0:
            _, frame = video.read()
            continue

        frames.append(eva_frame)
        if self.limit and frame_ind >= self.limit:
            # Bug fix: `return FrameBatch(...)` inside a generator only
            # sets the StopIteration value -- callers iterating never
            # saw this batch. Yield it, then stop.
            yield FrameBatch(frames, info)
            return
        if len(frames) % self.batch_size == 0:
            yield FrameBatch(frames, info)
            frames = []
        _, frame = video.read()

    if frames:
        # Bug fix: same as above -- emit the trailing partial batch.
        yield FrameBatch(frames, info)
def column_definition_to_udf_io(
        col_list: List[ColumnDefinition], is_input: bool):
    """Create a UdfIO object for each column definition provided.

    Arguments:
        col_list(List[ColumnDefinition]): parsed input/output definitions
        is_input(bool): true if input else false
    """
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]

    result_list = []
    for col in col_list:
        if col is None:
            LoggingManager().log(
                "Empty column definition while creating udf io",
                LoggingLevel.ERROR)
            result_list.append(col)
            # Bug fix: the original fell through and accessed `col.name`
            # on None; skip to the next definition instead.
            continue
        result_list.append(
            CatalogManager().udf_io(col.name, col.type,
                                    array_type=col.array_type,
                                    dimensions=col.dimension,
                                    is_input=is_input))

    return result_list
def delete_column(cls, column_list):
    """Delete every column object in `column_list`.

    Args:
        column_list: iterable of column objects exposing `delete()`.

    Raises:
        Exception: the original failure is re-raised if any individual
            delete fails.
    """
    try:
        for column in column_list:
            column.delete()
    except Exception:
        # Fixed typo in the log message ("detele" -> "delete"); re-raise
        # the original exception instead of the bare Exception class so
        # callers keep the cause and traceback.
        LoggingManager().log("delete column failed", LoggingLevel.ERROR)
        raise
    return None
def data_received(self, data):
    """Decode the incoming bytes and store them as the latest chunk."""
    chunk = data.decode()
    LoggingManager().log("[ " + str(self.id) + " ]"
                         + " Response from server: --|"
                         + str(chunk) + "|--")
    self._response_chunk = chunk
def __init__(self):
    """Initialize client state and claim the next connection id."""
    self.done = asyncio.Future()
    self.transport = None
    # Use the class-wide connection counter as this client's id, then
    # bump it for the next client.
    self.id = EvaClient.__connections__
    EvaClient.__connections__ += 1
    LoggingManager().log("[ " + str(self.id) + " ]" + " Init Client")
def bootstrap_catalog(self):
    """Bootstrap the catalog by initializing its backing database."""
    # eva_dir = ConfigurationManager().get_value("core", "location")
    # output_url = os.path.join(eva_dir, CATALOG_DIR)
    # LoggingManager().log("Bootstrapping catalog" + str(output_url),
    #                      LoggingLevel.INFO)
    LoggingManager().log("Bootstrapping catalog", LoggingLevel.INFO)
    init_db()
def execute(self):
    """We apply rewrite rules in a top down fashion.

    Right now we are applying rules aggressively. Later when we have
    more rules it might be a better idea to push optimization task to
    a queue.
    """
    rewrite_rules = RulesManager().rewrite_rules
    # Collect rules that match the root operator and have not yet been
    # explored for this expression.
    valid_rules = []
    for rule in rewrite_rules:
        if not self.root_expr.is_rule_explored(rule.rule_type) and \
                rule.top_match(self.root_expr.opr):
            valid_rules.append(rule)

    # sort the rules by promise
    valid_rules = sorted(valid_rules, key=lambda x: x.promise(),
                         reverse=True)

    for rule in valid_rules:
        binder = Binder(self.root_expr, rule.pattern,
                        self.optimizer_context.memo)
        for match in iter(binder):
            if not rule.check(match, self.optimizer_context):
                continue
            # Mark explored so this rule is not re-applied to this
            # expression on a later pass.
            self.root_expr.mark_rule_explored(rule.rule_type)
            LoggingManager().log(
                'In TopDown, Rule {} matched for {}'.format(
                    rule, self.root_expr), LoggingLevel.INFO)
            after = rule.apply(match, self.optimizer_context)
            # Replace the root with the rewritten expression in the memo
            # and continue from the new root.
            new_expr = self.optimizer_context.xform_opr_to_group_expr(
                opr=after,
                root_group_id=self.root_expr.group_id,
                is_root=True,
                copy_opr=False)
            self.root_expr = new_expr
            LoggingManager().log(
                'After rewiting {}'.format(self.root_expr),
                LoggingLevel.INFO)
            # Re-schedule rewriting of the new root (aggressive
            # application of the rule set).
            self.optimizer_context.task_stack.push(
                TopDownRewrite(self.root_expr, self.optimizer_context))

    # Recurse into each child group's first logical expression.
    for child in self.root_expr.children:
        child_expr = self.optimizer_context.memo.groups[child] \
            .logical_exprs[0]
        self.optimizer_context.task_stack.push(
            TopDownRewrite(child_expr, self.optimizer_context))
def exec(self) -> Iterator[Batch]:
    """Yield batches from every child executor (UNION ALL semantics)."""
    if self.node.all is False:
        LoggingManager().log('Only UNION ALL is supported now.',
                             LoggingLevel.WARNING)
    # We should have only two children
    for child in self.children:
        yield from child.exec()
def frames(self, values):
    """Set the underlying frame, storing columns in sorted name order.

    Anything other than a pandas DataFrame is rejected with ValueError.
    """
    if not isinstance(values, DataFrame):
        LoggingManager().log('Batch constructor not properly called!',
                             LoggingLevel.DEBUG)
        raise ValueError('Batch constructor not properly called. \
                Expected pandas.DataFrame')
    self._frames = values[sorted(values.columns)]
    self._batch_size = len(values)
def connection_lost(self, exc, exc2=None):
    """Tear down the transport and resolve `self.done`.

    `exc` is the transport-supplied error (None on a clean close);
    `exc2` captures any error raised while aborting the transport.
    Exactly one of set_exception/set_result is called on `self.done`.
    """
    LoggingManager().log("[ " + str(self.id) + " ]"
                         + " Disconnected from server")
    try:
        self.transport.abort()  # free sockets early, free sockets often
        self.transport = None
    except Exception as e:
        LoggingManager().exception(e)
        exc2 = e
    finally:
        if exc or exc2:
            # Count the failure and surface it through the future.
            EvaClient.__errors__ += 1
            self.done.set_exception(exc or exc2)
            self.done.exception()  # remove _tb_logger
        else:
            self.done.set_result(None)
def delete(self):
    """Delete this object from the session and commit the change.

    Raises:
        Exception: the original failure is re-raised if the delete or
            commit fails.
    """
    try:
        db_session.delete(self)
        self._commit()
    except Exception:
        LoggingManager().log("Object couldn't be deleted",
                             LoggingLevel.ERROR)
        # Re-raise the original exception; the old code raised the bare
        # Exception class, discarding the cause and traceback.
        raise