Ejemplo n.º 1
0
    def __init__(self, ingestion_id: str, ingestion_type: str,
                 request_content: str):
        """Initialise the tracking state of one ingestion request and log it.

        Args:
            ingestion_id: Identifier of the request; stored as ``self.id``.
            ingestion_type: Type of the request; only used for logging here.
            request_content: Request text; stored as ``self.request_body``.
        """
        self.status = GiraffeEventType.STARTED_PROCESSING_REQUEST
        self.map_status_to_timestamp: Dict[str, str] = dict()
        self.id = ingestion_id

        self.log = log_helper.get_logger(logger_name=__name__)

        self.request_body = request_content

        # Start markers are captured now; end markers stay empty for the moment.
        self.start_timestamp = datetime.now()
        self.start_time_unix = self.now_as_unix_timestamp()
        self.end_timestamp = self.end_time_unix = None

        # Bookkeeping containers, all initially empty.
        self.map_source_to_model = dict()
        self.map_redis_key_to_cardinality = dict()
        self.map_redis_key_to_processed_amount = dict()
        self.counters = dict()

        self.finished_keys, self.errors = [], []

        self.set_status(status=GiraffeEventType.STARTED_PROCESSING_REQUEST)

        # Announce the request both as free text and as a structured record.
        self.log.info(f'Processing request-id: {ingestion_id} ({ingestion_type}) {request_content}')
        admin_record = {
            Field.request_id: ingestion_id,
            Field.request_type: ingestion_type,
            Field.request: request_content,
        }
        self.log.admin(admin_record)
Ejemplo n.º 2
0
    def __init__(self, event_dispatcher: EventDispatcher, config=config_helper.get_config()):
        """Connect to Neo4j via py2neo and the bolt driver and verify it responds.

        Args:
            event_dispatcher: Stored on the instance as ``self.event_dispatcher``.
            config: Configuration object providing ``neo_host_address``,
                ``neo_username``, ``neo_password`` and ``neo_bolt_uri``.

        Raises:
            TechnicalError: If reading the database kernel start time raises
                ``ServiceUnavailable``.
        """
        # NOTE(review): the default for `config` is evaluated once, when this
        # function is defined, not on every call - confirm that is intended.
        self.config = config
        self.event_dispatcher = event_dispatcher
        self.log = log_helper.get_logger(logger_name=f'{self.__class__.__name__}_{threading.current_thread().name}')

        # Connecting py2neo

        self.graph = Graph(
                uri=config.neo_host_address,
                user=config.neo_username,
                password=config.neo_password
        )

        # Connecting official bolt-driver

        self._driver = GraphDatabase.driver(uri=config.neo_bolt_uri,
                                            auth=(config.neo_username, config.neo_password))

        try:
            db_kernel_start = self.graph.database.kernel_start_time
        except ServiceUnavailable as service_error:
            # The bolt driver already exists at this point but is not yet
            # registered with atexit; close it so a failed construction does
            # not leak it.
            self._driver.close()
            raise TechnicalError(
                f'Neo4j does not seem to be active at {config.neo_host_address}'
            ) from service_error
        self.log.debug(f'Neo4j is active since {db_kernel_start}.')

        # Make sure the bolt driver is closed when the interpreter exits.
        atexit.register(self._driver.close)

        self.indices_cache_label_property: List[Tuple[str, str]] = []
Ejemplo n.º 3
0
 def __init__(self,
              event_dispatcher: EventDispatcher,
              config=config_helper.get_config()):
     """Create an empty progress monitor and subscribe it to the dispatcher.

     Args:
         event_dispatcher: Dispatcher on which ``self.on_giraffe_event`` is
             registered as a callback.
         config: Configuration object, stored as ``self.config``.
     """
     self.lock = threading.Lock()
     self.config = config
     # No ingestion requests are tracked yet.
     self.all_tasks: Dict[str, IngestionRequest] = dict()

     monitor_log = log_helper.get_logger(logger_name=__name__)
     self.log = monitor_log
     monitor_log.debug('Progress-Monitor started.')

     # Subscribe last, once every attribute above exists.
     event_dispatcher.register_callback(callback=self.on_giraffe_event)
Ejemplo n.º 4
0
 def __init__(self, mode: CommunicatorMode, host: str = 'localhost', port: int = 65432):
     """Create a TCP/IPv4 stream socket endpoint in client or server mode and start it.

     In ``CommunicatorMode.CLIENT`` mode the socket gets a 30 second timeout
     and an empty receive buffer. In any other mode a two-worker thread pool
     and an empty listener list are created instead. ``self.stop`` is
     registered with ``atexit`` in both cases.

     Args:
         mode: Selects client or server behaviour.
         host: Host name or address, stored as ``self._host``.
         port: Port number, stored as ``self._port``.
     """
     self.lock = threading.Lock()
     self._host = host
     self._port = port
     self._mode = mode
     self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     if self._mode == CommunicatorMode.CLIENT:
         self._is_client = True
         self.socket.settimeout(30)  # seconds
         self.log = log_helper.get_logger(logger_name='Client-Communicator')
         # An empty buffer instance; previously the `bytearray` type itself
         # was stored, which cannot hold any data.
         self.buffer = bytearray()
     else:
         self._is_client = False
         self.thread_pool = ThreadPoolExecutor(max_workers=2)
         self.listeners = []
         self.log = log_helper.get_logger(logger_name='Server-Communicator')
     atexit.register(self.stop)
     self.__start()
Ejemplo n.º 5
0
 def __init__(self,
              monitoring_host: str = 'localhost',
              monitoring_port: int = 65432):
     """Create a dispatcher with no listeners and a server-mode communicator.

     Args:
         monitoring_host: Host passed to the server-mode ``Communicator``.
         monitoring_port: Port passed to the server-mode ``Communicator``.
     """
     self.listeners: List[Callable] = list()
     self.log = log_helper.get_logger(logger_name='Event-Dispatcher')

     server = Communicator(mode=CommunicatorMode.SERVER,
                           host=monitoring_host,
                           port=monitoring_port)
     self.tcp_server: Communicator = server
Ejemplo n.º 6
0
 def __init__(self, event_dispatcher: EventDispatcher, env: EnvProvider):
     """Create the Redis client from the environment's configuration.

     Args:
         event_dispatcher: Stored as ``self.event_dispatcher``.
         env: Supplies ``config`` (Redis host and port) and ``spark_helper``.
     """
     # The logger name combines the class name with the current thread name.
     logger_name = f'{self.__class__.__name__}_{threading.current_thread().name}'
     self.log = log_helper.get_logger(logger_name=logger_name)
     self.log.debug('Initialising redis driver.')

     self.config = env.config
     settings = self.config
     self.driver: Redis = redis.StrictRedis(host=settings.redis_host_address,
                                            port=settings.redis_port,
                                            decode_responses=True)

     self.spark_helper: SparkHelper = env.spark_helper
     self.event_dispatcher = event_dispatcher

     # Close the Redis client when the interpreter exits.
     atexit.register(self.driver.close)
Ejemplo n.º 7
0
 def __init__(self, env: EnvProvider, multi_helper: MultiHelper,
              event_dispatcher: EventDispatcher):
     """Create the database wrappers and record whether that succeeded.

     Any exception raised while building the wrappers is logged and leaves
     ``self.is_ready`` at ``False`` instead of propagating.

     Args:
         env: Supplies ``config`` and is passed on to ``RedisDB``.
         multi_helper: Stored as ``self.multi_helper``.
         event_dispatcher: Stored and passed on to both database wrappers.
     """
     self.is_ready = False
     self.event_dispatcher = event_dispatcher
     self.log = log_helper.get_logger(logger_name=self.__class__.__name__)
     self.config = env.config
     try:
         self.neo_db: NeoDB = NeoDB(config=self.config,
                                    event_dispatcher=self.event_dispatcher)
         self.redis_db: RedisDB = RedisDB(env=env,
                                          event_dispatcher=self.event_dispatcher)
         self.multi_helper: MultiHelper = multi_helper
     except Exception as init_error:
         self.log.error(init_error, exc_info=True)
         self.is_ready = False
     else:
         # Reached only when every component above was created.
         self.is_ready = True

     # Published on the class itself, not on this instance.
     operations = (self.config.nodes_ingestion_operation,
                   self.config.edges_ingestion_operation)
     IngestionManager.supported_operations = operations
Ejemplo n.º 8
0
def test_delete_nodes_by_property(config_helper, neo, logger):
    """Check that deleting nodes by property removes exactly the reported number.

    Counts the nodes carrying the first test label before and after deleting
    those whose ``name`` is ``'Arafat'``, and compares the difference with the
    ``total`` reported by the deletion call.
    """
    prepare_neo(config_helper=config_helper, log=logger, neo=neo)
    log: Logger = log_helper.get_logger(logger_name='Timing...')

    db: neo_db.NeoDB = neo
    test_label = config_helper.test_labels[0]
    sleep(3)

    count_query = f"MATCH(n:{test_label}) return count(n) as count"

    def count_nodes():
        # First value of the count query's result.
        return db.pull_query(query=count_query).value()[0]

    nodes_before = count_nodes()
    log.info(f'Before deletion there are {nodes_before} items.')

    stopwatch = Timer()
    stopwatch.start()
    deletion_result: dict = db.delete_nodes_by_properties(
        label=test_label,
        property_name_value_tuples=[('name', 'Arafat')])
    elapsed = stopwatch.stop()
    log.info(f'Time elapsed: {elapsed} seconds')

    nodes_after = count_nodes()
    log.info(f'After deletion there are {nodes_after} items.')

    deleted_count = deletion_result['total']
    assert deleted_count > 0
    assert nodes_before - nodes_after == deleted_count
Ejemplo n.º 9
0
 def __init__(self, env: EnvProvider, progress_monitor: ProgressMonitor,
              event_dispatcher: EventDispatcher):
     """Build the coordinator's collaborators and record whether that succeeded.

     Any exception raised while building them is logged and leaves
     ``self.is_ready`` at ``False`` instead of propagating.

     Args:
         env: Supplies the providers and ``config`` used below.
         progress_monitor: Stored as ``self.progress_monitor``.
         event_dispatcher: Stored and passed on to ``RedisDB`` and
             ``IngestionManager``.
     """
     self.event_dispatcher = event_dispatcher
     self.progress_monitor = progress_monitor
     self.is_ready = False
     self.log = log_helper.get_logger(logger_name=__name__)
     # noinspection PyBroadException
     try:
         self.redis_db = RedisDB(event_dispatcher=self.event_dispatcher, env=env)
         self.data_and_model_provider = env.data_and_model_provider
         self.thread_pool = ThreadPool()
         self.data_to_graph_translator = env.data_to_graph_entities_provider
         self.multi_helper = MultiHelper(config=env.config)
         self.im = IngestionManager(
             env=env,
             multi_helper=self.multi_helper,
             event_dispatcher=self.event_dispatcher,
         )
     except Exception as init_error:
         self.log.error(init_error, exc_info=True)
         self.is_ready = False
     else:
         # Reached only when every collaborator above was created.
         self.is_ready = True
Ejemplo n.º 10
0
    def __init__(self, cmd_line_args=None):
        """Load the configuration, attach a rotating log file and select providers.

        Args:
            cmd_line_args: Optional sequence whose first element exposes a
                ``config_ini`` attribute. When it is ``None``, or that
                attribute is ``None``, the default configuration is loaded.

        Raises:
            NotImplementedError: If the configured execution environment is
                ``'cortex'``.
            SystemExit: With code 1 if the configured execution environment is
                neither ``'dev'`` nor ``'cortex'``.
        """
        self.log = log_helper.get_logger(logger_name=__name__)

        config_ini_file = None if cmd_line_args is None else cmd_line_args[0].config_ini
        if config_ini_file is None:
            self.config = config_helper.get_config()
        else:
            validate_is_file(file_path=config_ini_file)
            self.config = config_helper.get_config(configurations_ini_file_path=config_ini_file)

        # -------

        # Rotating log file: up to 1,000,000 bytes per file, three backups.
        self.logging_file_path = os.path.join(self.config.logs_storage_folder, 'giraffe.log')
        file_handler = RotatingFileHandler(filename=self.logging_file_path,
                                           mode='a',
                                           maxBytes=1_000_000,
                                           backupCount=3,
                                           encoding='utf-8',
                                           delay=False)
        file_handler.setFormatter(log_helper.log_row_format)
        log_helper.add_handler(handler=file_handler)

        atexit.register(log_helper.stop_listener)

        # -------

        self.execution_env = self.config.execution_environment

        # -------

        if self.execution_env == 'dev':
            self.data_and_model_provider: DataAndModelProvider = MockDataAndModelProvider()
            self.data_to_graph_entities_provider: DataToGraphEntitiesProvider = MockDataToGraphEntitiesProvider()
            self.spark_helper = DevSparkHelper(config=self.config)
        elif self.execution_env == 'cortex':
            raise NotImplementedError('Implemented in cortex')
        else:
            # This path terminates the process, so report it as an error
            # rather than as an informational message.
            self.log.error(f'Unexpected value in configuration file for execution_environment: {self.execution_env}')
            sys.exit(1)
Ejemplo n.º 11
0
from waitress import serve

# Module-level names declared here and assigned in the start-up section below.
coordinator: Coordinator
progress_monitor: ProgressMonitor

# Start-up sequence: parse arguments, build the environment, then create the
# progress monitor and the coordinator; abort if the coordinator is not ready.
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Front Desk')
    parser.add_argument('--config_ini', type=str, required=False, default=None)
    # parse_known_args returns a (namespace, remaining-arguments) pair; the
    # pair itself is handed to EnvProvider.
    args = parser.parse_known_args()

    env = EnvProvider(cmd_line_args=args)

    event_dispatcher = EventDispatcher()
    log = log_helper.get_logger(__name__)

    log.info(f'Execution environment: {env.execution_env}')

    # NOTE(review): not referenced in the visible part of this script.
    not_acceptable_error_code = 406

    log.debug('Initializing coordinator module.')
    progress_monitor = ProgressMonitor(event_dispatcher=event_dispatcher,
                                       config=env.config)
    # Run the monitor's dump_and_clear_memory when the interpreter exits.
    atexit.register(progress_monitor.dump_and_clear_memory)
    coordinator = Coordinator(env=env,
                              progress_monitor=progress_monitor,
                              event_dispatcher=event_dispatcher)
    # The coordinator reports construction failure through is_ready.
    if not coordinator.is_ready:
        log.error('Failed initializing coordinator component - aborting.')
        sys.exit(-1)
Ejemplo n.º 12
0
def logger() -> Logger:
    """Return the logger named 'Testing-Suite' obtained from ``log_helper``."""
    suite_logger = log_helper.get_logger('Testing-Suite')
    return suite_logger
Ejemplo n.º 13
0
 def __init__(self, handlers: List[DbLogHandler]):
     """Store the given handlers and start with an empty job-id-to-name map.

     Args:
         handlers: Handlers kept as ``self.handlers``.
     """
     self.log = log_helper.get_logger(logger_name=self.__class__.__name__)
     self.handlers = handlers
     self.map_job_id_to_name = dict()
Ejemplo n.º 14
0
 def __init__(self, config: ConfigHelper):
     """Create the thread pool executor sized from the configuration.

     Args:
         config: Provides ``thread_pool_size``, used as the executor's
             ``max_workers``; also stored as ``self.config``.
     """
     self.config = config
     pool_size = config.thread_pool_size
     self.thread_executor = ThreadPoolExecutor(max_workers=pool_size)
     self.log = log_helper.get_logger(logger_name='Multi-Helper')
     # No futures are tracked yet.
     self.futures: List[Future] = list()
Ejemplo n.º 15
0
    def __init__(
            self,
            configurations_ini_file_path: str = default_configurations_file):
        """Read an INI configuration file and expose its values as attributes.

        Each known section is read only when it is present in the file. A
        missing section is reported with a warning and the attributes that
        would have come from it are left unset.

        Args:
            configurations_ini_file_path: Path of the INI file to read. It is
                checked with ``validate_is_file`` before being parsed.
        """
        self.log = log_helper.get_logger(logger_name=__name__)

        validate_is_file(file_path=configurations_ini_file_path)

        neo4j_section = 'NEO4J'
        redis_section = 'REDIS'
        testing_section = 'TESTING'
        giraffe_section = 'GIRAFFE'
        general_section = 'GENERAL'
        spark_section = 'SPARK'
        elastic_section = 'ELASTICSEARCH'

        config_file_path = os.path.abspath(configurations_ini_file_path)
        self.log.info(f'Configuration file: {config_file_path}')
        self.config = configparser.ConfigParser()

        self.config.read(configurations_ini_file_path)

        self.log.debug(
            f'Found the following configuration sections: {self.config.sections()}'
        )

        # General settings

        # Guard on the GENERAL section itself: this is the section that is
        # read here and the one named in the warning.
        if self.config.has_section(section=general_section):
            general = self.config[general_section]
            self.string_encoding = general['string_encoding']
        else:
            self.log.warning(
                f'No configuration found for section {general_section}')

        # Reading Neo4j connection details from configuration-file

        if self.config.has_section(section=neo4j_section):
            neo4j = self.config[neo4j_section]
            self.neo_host_address = neo4j['HOST']
            self.neo_username = neo4j['USERNAME']
            self.neo_password = neo4j['PASSWORD']
            self.neo_bolt_port = neo4j['BOLT_PORT']
            # Built inside the guard: it needs the host and port read above,
            # which do not exist when the section is missing.
            self.neo_bolt_uri = f'bolt://{self.neo_host_address}:{self.neo_bolt_port}'
        else:
            self.log.warning(
                f'No configuration found for section {neo4j_section}')

        if self.config.has_section(section=redis_section):
            # Reading REDIS connection details from configuration-file
            redis_settings = self.config[redis_section]
            self.redis_host_address = redis_settings['HOST']
            self.redis_username = redis_settings['USERNAME']
            self.redis_password = redis_settings['PASSWORD']
            self.redis_port = redis_settings['PORT']
            self.redis_stream_milliseconds_block = redis_settings[
                'STREAM_BLOCK_MILLISECONDS']
        else:
            self.log.warning(
                f'No configuration found for section {redis_section}')

        if self.config.has_section(section=testing_section):
            # Unit-Testing settings
            testing = self.config[testing_section]
            self.test_labels = testing['test_labels'].split(',')
            self.test_edge_type = testing['test_edge_type']
            self.test_property = testing['test_property']
            self.number_of_test_nodes = int(testing['number_of_test_nodes'])
            self.number_of_test_edges = int(testing['number_of_test_edges'])
            self.test_chunk_size = int(testing['test_chunk_size'])
            self.test_job_name = testing['test_request_name']
            self.test_elasticsearch_index = testing['test_elasticsearch_index']
            self.test_redis_table_prefix = testing['test_redis_table_prefix']
            self.test_redis_stream_name = testing['test_redis_stream_name']
            self.test_front_desk_address = testing['test_front_desk_address']
        else:
            self.log.warning(
                f'No configuration found for section {testing_section}')

        if self.config.has_section(section=giraffe_section):
            # Giraffe logic
            giraffe = self.config[giraffe_section]
            self.nodes_ingestion_operation = giraffe['nodes_ingestion_operation']
            self.edges_ingestion_operation = giraffe['edges_ingestion_operation']
            self.key_separator = giraffe['key_separator']
            self.uid_property = giraffe['unique_identifier_property_name']
            self.from_uid_property = giraffe['from_uid_property_name']
            self.to_uid_property = giraffe['to_uid_property_name']
            self.edge_type_property = giraffe['edge_type_property_name']
            self.deletion_batch_size = giraffe['deletion_batch_size']
            self.model_vertex_id_prop_name = giraffe['model_vertex_id_prop_name']
            self.model_edge_id_prop_name = giraffe['model_edge_id_prop_name']
            self.edge_to_identifier_name = giraffe['edge_to_identifier_name']
            self.inception_models_rest_address = giraffe[
                'inception_models_rest_address']
            self.inception_data_sources_rest_address = giraffe[
                'inception_data_sources_rest_address']
            self.data_source_parts_separator = giraffe[
                'data_source_parts_separator']
            self.expected_number_of_source_parts = int(
                giraffe['expected_number_of_source_parts'])
            self.front_desk_port = int(giraffe['front_desk_port'])
            self.redis_stream_name = giraffe['redis_stream_name']
            self.ingestion_endpoint = giraffe['ingestion_endpoint'].strip()
            self.redis_get_all_endpoint = giraffe[
                'redis_get_all_endpoint'].strip()
            # NOTE(review): the eval() calls in this section execute text
            # taken from the configuration file as Python code. That is only
            # safe while the file is fully trusted; consider
            # ast.literal_eval if the values are plain literals.
            self.request_mandatory_field_names = eval(
                giraffe['request_type_mandatory_field_name'])
            self.logs_storage_folder = giraffe['logs_storage_folder']
            self.progress_monitor_dump_folder = giraffe[
                'progress_monitor_dump_folder']
            self.admin_db_table_name = giraffe['admin_db_table_name']
            self.required_request_fields: Dict[str, Dict] = eval(
                giraffe['required_request_fields'])
            self.hash_uid_column = eval(giraffe['hash_uid_column'])
            self.front_desk_ip = giraffe['front_desk_ip']
            self.execution_environment = giraffe['execution_environment']
            self.logs_structured_prefix = giraffe['logs_structured_prefix']
            self.logs_fluentd_host = giraffe['logs_fluentd_host']
            self.logs_fluentd_port = int(giraffe['logs_fluentd_port'])
            self.thread_pool_size = int(giraffe['thread_pool_size'])
            self.property_names_to_index = eval(
                giraffe['property_names_to_index'])
        else:
            self.log.warning(
                f'No configuration found for section {giraffe_section}')

        if self.config.has_section(section=spark_section):
            # Spark logic
            spark = self.config[spark_section]
            self.external_jars_folder = spark['external_jars']
            self.spark_app_name = spark['app_name']
        else:
            self.log.warning(
                f'No configuration found for section {spark_section}')

        if self.config.has_section(section=elastic_section):
            # ElasticSearch logic
            self.es_host_address = self.config[elastic_section]['HOST']
        else:
            self.log.warning(
                f'No configuration found for section {elastic_section}')