def __init__(self, ingestion_id: str, ingestion_type: str, request_content: str):
    """
    Initialise the bookkeeping state of one ingestion request and log its arrival.

    :param ingestion_id: Identifier of the request; stored as ``self.id``.
    :param ingestion_type: Type of the request; only used in the log records written here.
    :param request_content: Content of the request; stored as ``self.request_body``.
    """
    initial_status = GiraffeEventType.STARTED_PROCESSING_REQUEST

    self.status = initial_status
    self.map_status_to_timestamp: Dict[str, str] = {}
    self.id = ingestion_id
    self.log = log_helper.get_logger(logger_name=__name__)
    self.request_body = request_content

    # Start is stamped now; the end markers stay empty until something else fills them in.
    self.start_timestamp = datetime.now()
    self.start_time_unix = self.now_as_unix_timestamp()
    self.end_timestamp = None
    self.end_time_unix = None

    # Empty progress-tracking containers.
    self.map_source_to_model = {}
    self.map_redis_key_to_cardinality = {}
    self.map_redis_key_to_processed_amount = {}
    self.counters = {}
    self.finished_keys = []
    self.errors = []

    # Go through set_status() as well, after all containers above exist.
    self.set_status(status=initial_status)

    arrival_message = f'Processing request-id: {ingestion_id} ({ingestion_type}) {request_content}'
    self.log.info(arrival_message)

    admin_record = {
        Field.request_id: ingestion_id,
        Field.request_type: ingestion_type,
        Field.request: request_content,
    }
    self.log.admin(admin_record)
def __init__(self, event_dispatcher: EventDispatcher, config=config_helper.get_config()):
    """
    Open the py2neo graph handle and the bolt driver and verify that Neo4j answers.

    :param event_dispatcher: Dispatcher stored as ``self.event_dispatcher``.
    :param config: Configuration object providing ``neo_host_address``, ``neo_username``,
        ``neo_password`` and ``neo_bolt_uri``.
    :raises TechnicalError: When reading the kernel start time raises ``ServiceUnavailable``.
    """
    self.config = config
    self.event_dispatcher = event_dispatcher
    self.log = log_helper.get_logger(logger_name=f'{self.__class__.__name__}_{threading.current_thread().name}')

    # Connecting py2neo
    self.graph = Graph(
        uri=config.neo_host_address,
        user=config.neo_username,
        password=config.neo_password
    )

    # Connecting official bolt-driver
    self._driver = GraphDatabase.driver(uri=config.neo_bolt_uri, auth=(config.neo_username, config.neo_password))

    try:
        # Reading the kernel start time doubles as the availability probe.
        db_kernel_start = self.graph.database.kernel_start_time
    except ServiceUnavailable as unavailable_error:
        # The bolt driver is already open at this point and the atexit hook below
        # is never registered on this path, so release it before bailing out.
        self._driver.close()
        raise TechnicalError(
            f'Neo4j does not seem to be active at {config.neo_host_address}'
        ) from unavailable_error

    self.log.debug(f'Neo4j is active since {db_kernel_start}.')
    atexit.register(self._driver.close)
    self.indices_cache_label_property: List[Tuple[str, str]] = []
def __init__(self, event_dispatcher: EventDispatcher, config=config_helper.get_config()):
    """
    Create an empty task registry and subscribe to the dispatcher's events.

    :param event_dispatcher: Dispatcher on which ``self.on_giraffe_event`` is registered.
    :param config: Configuration object stored as ``self.config``.
    """
    self.lock = threading.Lock()
    self.config = config

    monitor_log = log_helper.get_logger(logger_name=__name__)
    self.log = monitor_log
    monitor_log.debug('Progress-Monitor started.')

    # Request objects tracked by this monitor, keyed by a string.
    self.all_tasks: Dict[str, IngestionRequest] = {}

    event_dispatcher.register_callback(callback=self.on_giraffe_event)
def __init__(self, mode: CommunicatorMode, host: str = 'localhost', port: int = 65432):
    """
    Create the TCP socket and the mode-specific state, then start the communicator.

    :param mode: ``CommunicatorMode.CLIENT`` selects the client set-up; any other value
        selects the server set-up.
    :param host: Host stored as ``self._host``.
    :param port: Port stored as ``self._port``.
    """
    self.lock = threading.Lock()
    self._host = host
    self._port = port
    self._mode = mode
    self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    if self._mode == CommunicatorMode.CLIENT:
        self._is_client = True
        self.socket.settimeout(30)  # seconds
        self.log = log_helper.get_logger(logger_name='Client-Communicator')
        # An actual (empty) buffer instance; previously the bytearray *type* itself
        # was stored here, which cannot hold any data.
        self.buffer = bytearray()
    else:
        self._is_client = False
        self.thread_pool = ThreadPoolExecutor(max_workers=2)
        self.listeners = []
        self.log = log_helper.get_logger(logger_name='Server-Communicator')

    # NOTE(review): registered for both modes — confirm against the original layout.
    atexit.register(self.stop)
    self.__start()
def __init__(self, monitoring_host: str = 'localhost', monitoring_port: int = 65432):
    """
    Start with no listeners and create a server-mode communicator.

    :param monitoring_host: Host handed to the communicator.
    :param monitoring_port: Port handed to the communicator.
    """
    self.listeners: List[Callable] = []
    self.log = log_helper.get_logger(logger_name='Event-Dispatcher')

    server = Communicator(
        mode=CommunicatorMode.SERVER,
        host=monitoring_host,
        port=monitoring_port,
    )
    self.tcp_server: Communicator = server
def __init__(self, event_dispatcher: EventDispatcher, env: EnvProvider):
    """
    Create the redis client from the environment's configuration.

    :param event_dispatcher: Dispatcher stored as ``self.event_dispatcher``.
    :param env: Environment provider supplying ``config`` and ``spark_helper``.
    """
    # The logger name carries both the concrete class and the creating thread.
    thread_name = threading.current_thread().name
    self.log = log_helper.get_logger(logger_name=f'{self.__class__.__name__}_{thread_name}')
    self.log.debug('Initialising redis driver.')

    self.config = env.config
    redis_client = redis.StrictRedis(
        host=self.config.redis_host_address,
        port=self.config.redis_port,
        decode_responses=True,
    )
    self.driver: Redis = redis_client

    self.spark_helper: SparkHelper = env.spark_helper
    self.event_dispatcher = event_dispatcher

    # Close the client when the interpreter exits.
    atexit.register(self.driver.close)
def __init__(self, env: EnvProvider, multi_helper: MultiHelper, event_dispatcher: EventDispatcher):
    """
    Create the Neo4j and redis wrappers; ``self.is_ready`` reports whether that succeeded.

    :param env: Environment provider supplying ``config``.
    :param multi_helper: Helper stored as ``self.multi_helper``.
    :param event_dispatcher: Dispatcher forwarded to both database wrappers.
    """
    self.is_ready = False
    self.event_dispatcher = event_dispatcher
    self.log = log_helper.get_logger(logger_name=self.__class__.__name__)
    self.config = env.config

    try:
        self.neo_db: NeoDB = NeoDB(
            config=self.config,
            event_dispatcher=self.event_dispatcher,
        )
        self.redis_db: RedisDB = RedisDB(
            env=env,
            event_dispatcher=self.event_dispatcher,
        )
        self.multi_helper: MultiHelper = multi_helper
        self.is_ready = True
    except Exception as init_failure:
        # Any failure is logged with its traceback and leaves the instance not ready.
        self.log.error(init_failure, exc_info=True)
        self.is_ready = False

    # Published on the class itself, not on the instance.
    operations = (
        self.config.nodes_ingestion_operation,
        self.config.edges_ingestion_operation,
    )
    IngestionManager.supported_operations = operations
def test_delete_nodes_by_property(config_helper, neo, logger):
    """
    Delete the test-label nodes whose ``name`` is ``Arafat`` and check the node counts add up.
    """
    prepare_neo(config_helper=config_helper, log=logger, neo=neo)
    timing_log: Logger = log_helper.get_logger(logger_name='Timing...')
    graph_db: neo_db.NeoDB = neo
    label = config_helper.test_labels[0]
    sleep(3)

    count_query = f"MATCH(n:{label}) return count(n) as count"

    nodes_before = graph_db.pull_query(query=count_query).value()[0]
    timing_log.info(f'Before deletion there are {nodes_before} items.')

    # Only the deletion call itself is timed.
    stopwatch = Timer()
    stopwatch.start()
    deletion_summary: dict = graph_db.delete_nodes_by_properties(
        label=label,
        property_name_value_tuples=[('name', 'Arafat')],
    )
    elapsed = stopwatch.stop()
    timing_log.info(f'Time elapsed: {elapsed} seconds')

    nodes_after = graph_db.pull_query(query=count_query).value()[0]
    timing_log.info(f'After deletion there are {nodes_after} items.')

    removed = deletion_summary['total']
    assert removed > 0
    assert nodes_before - nodes_after == removed
def __init__(self, env: EnvProvider, progress_monitor: ProgressMonitor, event_dispatcher: EventDispatcher):
    """
    Build the coordinator's collaborators; ``self.is_ready`` reports whether that succeeded.

    :param env: Environment provider supplying the configuration and the data providers.
    :param progress_monitor: Monitor stored as ``self.progress_monitor``.
    :param event_dispatcher: Dispatcher forwarded to the redis wrapper and the ingestion manager.
    """
    self.event_dispatcher = event_dispatcher
    self.progress_monitor = progress_monitor
    self.is_ready = False
    self.log = log_helper.get_logger(logger_name=__name__)

    # noinspection PyBroadException
    try:
        self.redis_db = RedisDB(event_dispatcher=self.event_dispatcher, env=env)
        self.data_and_model_provider = env.data_and_model_provider
        self.thread_pool = ThreadPool()
        self.data_to_graph_translator = env.data_to_graph_entities_provider
        self.multi_helper = MultiHelper(config=env.config)
        self.im = IngestionManager(
            env=env,
            multi_helper=self.multi_helper,
            event_dispatcher=self.event_dispatcher,
        )
        self.is_ready = True
    except Exception as init_failure:
        # Any failure is logged with its traceback and leaves the instance not ready.
        self.log.error(init_failure, exc_info=True)
        self.is_ready = False
def __init__(self, cmd_line_args=None):
    """
    Load the configuration, attach a rotating file log handler and pick the providers.

    :param cmd_line_args: Optional sequence whose first item exposes ``config_ini``;
        when absent, or when ``config_ini`` is ``None``, the default configuration is used.
    :raises NotImplementedError: When the configured execution environment is ``cortex``.
    """
    self.log = log_helper.get_logger(logger_name=__name__)

    if cmd_line_args is None:
        ini_path = None
    else:
        ini_path = cmd_line_args[0].config_ini

    if ini_path is not None:
        validate_is_file(file_path=ini_path)
        self.config = config_helper.get_config(configurations_ini_file_path=ini_path)
    else:
        self.config = config_helper.get_config()

    # -------
    self.logging_file_path = os.path.join(self.config.logs_storage_folder, 'giraffe.log')
    rotating_handler = RotatingFileHandler(
        filename=self.logging_file_path,
        mode='a',
        maxBytes=1_000_000,
        backupCount=3,
        encoding='utf-8',
        delay=False,
    )
    rotating_handler.setFormatter(log_helper.log_row_format)
    log_helper.add_handler(handler=rotating_handler)
    atexit.register(log_helper.stop_listener)

    # -------
    self.execution_env = self.config.execution_environment

    # -------
    if self.execution_env == 'cortex':
        raise NotImplementedError('Implemented in cortex')

    if self.execution_env != 'dev':
        self.log.info(f'Unexpected value in configuration file for execution_environment: {self.execution_env}')
        sys.exit(1)

    # Only the 'dev' environment reaches this point; it runs on mock providers.
    self.data_and_model_provider: DataAndModelProvider = MockDataAndModelProvider()
    self.data_to_graph_entities_provider: DataToGraphEntitiesProvider = MockDataToGraphEntitiesProvider()
    self.spark_helper = DevSparkHelper(config=self.config)
from waitress import serve

# Module-level names, declared here by annotation only; both are assigned in the
# __main__ block below.
coordinator: Coordinator
progress_monitor: ProgressMonitor

if __name__ == '__main__':
    # The only command-line option is an optional path to a configuration .ini file.
    parser = argparse.ArgumentParser(description='Front Desk')
    parser.add_argument('--config_ini', type=str, required=False, default=None)
    # parse_known_args() returns a (namespace, remaining_args) tuple; the whole tuple
    # is handed to EnvProvider.
    args = parser.parse_known_args()
    env = EnvProvider(cmd_line_args=args)
    event_dispatcher = EventDispatcher()
    log = log_helper.get_logger(__name__)
    log.info(f'Execution environment: {env.execution_env}')
    # Not used within this excerpt — presumably returned by request handlers; verify.
    not_acceptable_error_code = 406
    log.debug('Initializing coordinator module.')
    progress_monitor = ProgressMonitor(event_dispatcher=event_dispatcher, config=env.config)
    # Run the monitor's dump_and_clear_memory when the interpreter exits.
    atexit.register(progress_monitor.dump_and_clear_memory)
    coordinator = Coordinator(env=env, progress_monitor=progress_monitor, event_dispatcher=event_dispatcher)
    # Abort when the coordinator reports it is not ready.
    if not coordinator.is_ready:
        log.error('Failed initializing coordinator component - aborting.')
        sys.exit(-1)
def logger() -> Logger:
    """Return the logger named ``Testing-Suite``."""
    suite_logger = log_helper.get_logger('Testing-Suite')
    return suite_logger
def __init__(self, handlers: List[DbLogHandler]):
    """
    Keep the given handlers and start with an empty job-id to name mapping.

    :param handlers: Handlers stored as ``self.handlers``.
    """
    self.map_job_id_to_name = {}
    self.handlers = handlers

    own_name = self.__class__.__name__
    self.log = log_helper.get_logger(logger_name=own_name)
def __init__(self, config: ConfigHelper):
    """
    Create a thread pool sized by the configuration and an empty list of futures.

    :param config: Configuration object; its ``thread_pool_size`` sets the worker count.
    """
    self.config = config

    pool_size = config.thread_pool_size
    self.thread_executor = ThreadPoolExecutor(max_workers=pool_size)

    self.log = log_helper.get_logger(logger_name='Multi-Helper')
    self.futures: List[Future] = []
def __init__(
        self,
        configurations_ini_file_path: str = default_configurations_file):
    """
    Read the configuration .ini file and expose its values as attributes.

    Each known section is handled independently: when a section is present its
    values are copied onto ``self``; when it is absent a warning is logged and the
    corresponding attributes are left unset.

    :param configurations_ini_file_path: Path to the .ini file; it is validated with
        ``validate_is_file`` before being read.
    :raises KeyError: When a section is present but lacks one of the keys read here.
    :raises ValueError: When a value that is converted with ``int`` is not numeric.
    """
    self.log = log_helper.get_logger(logger_name=__name__)
    validate_is_file(file_path=configurations_ini_file_path)

    neo4j_section = 'NEO4J'
    redis_section = 'REDIS'
    testing_section = 'TESTING'
    giraffe_section = 'GIRAFFE'
    general_section = 'GENERAL'
    spark_section = 'SPARK'
    elastic_section = 'ELASTICSEARCH'

    config_file_path = os.path.abspath(configurations_ini_file_path)
    self.log.info(f'Configuration file: {config_file_path}')

    self.config = configparser.ConfigParser()
    self.config.read(configurations_ini_file_path)
    self.log.debug(
        f'Found the following configuration sections: {self.config.sections()}'
    )

    # General settings
    # The guard checks the GENERAL section itself; it previously tested for NEO4J,
    # so GENERAL was read (or skipped) depending on an unrelated section.
    if self.config.has_section(section=general_section):
        self.string_encoding = self.config[general_section]['string_encoding']
    else:
        self.log.warning(
            f'No configuration found for section {general_section}')

    # Reading Neo4j connection details from configuration-file
    if self.config.has_section(section=neo4j_section):
        neo4j = self.config[neo4j_section]
        self.neo_host_address = neo4j['HOST']
        self.neo_username = neo4j['USERNAME']
        self.neo_password = neo4j['PASSWORD']
        self.neo_bolt_port = neo4j['BOLT_PORT']
        # Built here because it depends on the two values above; building it
        # unconditionally raised AttributeError when the section was missing.
        self.neo_bolt_uri = f'bolt://{self.neo_host_address}:{self.neo_bolt_port}'
    else:
        self.log.warning(
            f'No configuration found for section {neo4j_section}')

    if self.config.has_section(section=redis_section):
        # Reading REDIS connection details from configuration-file
        redis_cfg = self.config[redis_section]
        self.redis_host_address = redis_cfg['HOST']
        self.redis_username = redis_cfg['USERNAME']
        self.redis_password = redis_cfg['PASSWORD']
        self.redis_port = redis_cfg['PORT']
        self.redis_stream_milliseconds_block = redis_cfg['STREAM_BLOCK_MILLISECONDS']
    else:
        self.log.warning(
            f'No configuration found for section {redis_section}')

    if self.config.has_section(section=testing_section):
        # Unit-Testing settings
        testing = self.config[testing_section]
        self.test_labels = testing['test_labels'].split(',')
        self.test_edge_type = testing['test_edge_type']
        self.test_property = testing['test_property']
        self.number_of_test_nodes = int(testing['number_of_test_nodes'])
        self.number_of_test_edges = int(testing['number_of_test_edges'])
        self.test_chunk_size = int(testing['test_chunk_size'])
        self.test_job_name = testing['test_request_name']
        self.test_elasticsearch_index = testing['test_elasticsearch_index']
        self.test_redis_table_prefix = testing['test_redis_table_prefix']
        self.test_redis_stream_name = testing['test_redis_stream_name']
        self.test_front_desk_address = testing['test_front_desk_address']
    else:
        self.log.warning(
            f'No configuration found for section {testing_section}')

    if self.config.has_section(section=giraffe_section):
        # Giraffe logic
        giraffe = self.config[giraffe_section]
        self.nodes_ingestion_operation = giraffe['nodes_ingestion_operation']
        self.edges_ingestion_operation = giraffe['edges_ingestion_operation']
        self.key_separator = giraffe['key_separator']
        self.uid_property = giraffe['unique_identifier_property_name']
        self.from_uid_property = giraffe['from_uid_property_name']
        self.to_uid_property = giraffe['to_uid_property_name']
        self.edge_type_property = giraffe['edge_type_property_name']
        self.deletion_batch_size = giraffe['deletion_batch_size']
        self.model_vertex_id_prop_name = giraffe['model_vertex_id_prop_name']
        self.model_edge_id_prop_name = giraffe['model_edge_id_prop_name']
        self.edge_to_identifier_name = giraffe['edge_to_identifier_name']
        self.inception_models_rest_address = giraffe['inception_models_rest_address']
        self.inception_data_sources_rest_address = giraffe['inception_data_sources_rest_address']
        self.data_source_parts_separator = giraffe['data_source_parts_separator']
        self.expected_number_of_source_parts = int(giraffe['expected_number_of_source_parts'])
        self.front_desk_port = int(giraffe['front_desk_port'])
        self.redis_stream_name = giraffe['redis_stream_name']
        self.ingestion_endpoint = giraffe['ingestion_endpoint'].strip()
        self.redis_get_all_endpoint = giraffe['redis_get_all_endpoint'].strip()
        # NOTE(review): eval() executes whatever expression the .ini file contains.
        # That is only acceptable while the configuration file is fully trusted;
        # if these values are plain literals, ast.literal_eval would be the safer
        # choice. The same applies to every eval() call below.
        self.request_mandatory_field_names = eval(
            giraffe['request_type_mandatory_field_name'])
        self.logs_storage_folder = giraffe['logs_storage_folder']
        self.progress_monitor_dump_folder = giraffe['progress_monitor_dump_folder']
        self.admin_db_table_name = giraffe['admin_db_table_name']
        self.required_request_fields: Dict[str, Dict] = eval(
            giraffe['required_request_fields'])
        self.hash_uid_column = eval(giraffe['hash_uid_column'])
        self.front_desk_ip = giraffe['front_desk_ip']
        self.execution_environment = giraffe['execution_environment']
        self.logs_structured_prefix = giraffe['logs_structured_prefix']
        self.logs_fluentd_host = giraffe['logs_fluentd_host']
        self.logs_fluentd_port = int(giraffe['logs_fluentd_port'])
        self.thread_pool_size = int(giraffe['thread_pool_size'])
        self.property_names_to_index = eval(giraffe['property_names_to_index'])
    else:
        self.log.warning(
            f'No configuration found for section {giraffe_section}')

    if self.config.has_section(section=spark_section):
        # Spark logic
        spark = self.config[spark_section]
        self.external_jars_folder = spark['external_jars']
        self.spark_app_name = spark['app_name']
    else:
        self.log.warning(
            f'No configuration found for section {spark_section}')

    if self.config.has_section(section=elastic_section):
        # ElasticSearch logic
        self.es_host_address = self.config[elastic_section]['HOST']
    else:
        self.log.warning(
            f'No configuration found for section {elastic_section}')