def create_buffer_table(client: ClickhouseHelper, db, table,
                        table_map) -> bool:
    logging = get_basic_utilities().get(LOGGER)
    buffer_table = f'{db}.{table}_Buffer'
    if client.is_table_exists(buffer_table):
        return True
    # noinspection PyBroadException
    try:
        # noinspection SqlDialectInspection
        table_schema_query = '''
            SELECT create_table_query, engine_full
            FROM system.tables
            WHERE database = %(db)s AND name = %(table)s
        '''
        result = client.execute(query=table_schema_query,
                                params={
                                    'db': db,
                                    'table': table
                                })
        schema, engine_details = result[0]
        schema = schema.replace(engine_details, '')
        schema = schema.replace(f'{db}.{table}', f'{buffer_table}')
        buffer = table_map['buffer']
        buffer_schema = f"{schema} Buffer({db}, {table}, {buffer['num_layers']}, {buffer['min_time']}, " \
                        f"{buffer['max_time']}, {buffer['min_rows']}, {buffer['max_rows']}, {buffer['min_bytes']}, " \
                        f"{buffer['max_bytes']})"
        client.execute(query=buffer_schema)
        return True
    except Exception as e:
        logging.error(f'failed to create {buffer_table}: {e}',
                      exc_info=True)
        return False
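# A minimal illustrative sketch (not from the source): the shape of the
# `buffer` section that create_buffer_table reads from a table map, mirroring
# the ClickHouse Buffer engine parameters. The concrete values are assumptions.
example_table_map = {
    'buffer': {
        'num_layers': 16,
        'min_time': 10, 'max_time': 100,
        'min_rows': 10000, 'max_rows': 1000000,
        'min_bytes': 10000000, 'max_bytes': 100000000,
    }
}
# With db='analytics' and table='events', the generated DDL would end roughly with:
#   ENGINE = Buffer(analytics, events, 16, 10, 100, 10000, 1000000, 10000000, 100000000)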
def on_consumer_failure(consumer_task: Task, e, trace):
    config, mail_client = get_basic_utilities().get_utils((CONFIG, SMTP_CLIENT))
    logging = get_logger()
    alert_config = config['alert']
    logging.error(f'{consumer_task.name} consumer failed: error {e}: {trace}')
    mail_client.send(alert_config["sender"], alert_config["receivers"],
                     f'CH-Sync: {consumer_task.name} consumer failed')
def get_logger():
    config, logging = get_basic_utilities().get_utils((CONFIG, LOGGER))
    if 'logs_path' in config['logs']:
        logs_path = str(
            pathlib.Path(config['logs']['logs_path']).joinpath('producer'))
        return prepare_logger(logs_path, os.getenv('env'))
    return logging
def on_consumer_terminate(consumer_task: Task):
    config, mail_client = get_basic_utilities().get_utils((CONFIG, SMTP_CLIENT))
    logging = get_logger()
    alert_config = config['alert']
    logging.error(f'{consumer_task.name} consumer terminated')
    mail_client.send(alert_config["sender"], alert_config["receivers"],
                     f'CH-Sync: {consumer_task.name} consumer terminated')
def main():
    utils = get_basic_utilities()
    config = utils.get(CONFIG)
    logging = get_logger()
    logging.info('starting the consumers')

    redis_config = config['redis']

    # start the supported consumers
    supported_consumers = get_supported_consumers()
    max_read_fails_allowed = config['consumer']['max_read_fails_allowed']
    min_up_time = config['consumer']['min_up_time']
    consumer_tasks = []
    for consumer in supported_consumers:
        task = Task(func=data_consumer, args=(consumer,), kwargs={}, name=consumer,
                    err_call_back=on_consumer_failure, term_call_back=on_consumer_terminate,
                    max_restarts=max_read_fails_allowed, min_up_time=min_up_time,
                    restart_delay=config['consumer']['restart_delay'],
                    redis=get_redis_client(redis_config['host'], redis_config['port'], redis_config['db']))
        task.start()
        consumer_tasks.append(task)

    # handle abnormal termination
    signal.signal(signal.SIGINT, lambda *x: exit_gracefully(consumer_tasks))
    signal.signal(signal.SIGTERM, lambda *x: exit_gracefully(consumer_tasks))

    # wait for all tasks to finish
    tasks_not_active = threading.Event()
    check_tasks_completed(consumer_tasks, tasks_not_active)
    tasks_not_active.wait()

    logging.info('consumers finished')
def handle_consumers(consumers, option):
    console = Console()
    utils = get_basic_utilities()
    config, logging = utils.get_utils((CONFIG, LOGGER))
    redis_config = config['redis']
    redis_helper = get_singleton_redis_client(redis_config['host'],
                                              redis_config['port'],
                                              redis_config['db'])
    task_manager = TaskManager(redis_helper)
    operation = options_map[option]
    active_consumers = []
    for consumer in consumers:
        status = task_manager.get_task_info(consumer)
        if not status or status == Status.COMPLETE.name:
            console.print(f'[bold red] {consumer} not running[/bold red]')
        else:
            active_consumers.append(consumer)
    if len(active_consumers) < 1:
        return
    if option == 'INFO':
        task_manager.display_info(active_consumers)
    if option == 'STATUS':
        task_manager.display_status(active_consumers)
    if option in (Status.ACTIVE.name, Status.INACTIVE.name, Status.RESTARTING.name):
        for consumer in active_consumers:
            # noinspection PyArgumentList
            _ = operation(task_manager, consumer)
        task_manager.display_status(active_consumers)
    return
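# `options_map` is referenced above but not defined in this snippet. A minimal
# sketch of what it could look like, assuming TaskManager exposes start_task
# and stop_task (seen elsewhere in this code) plus a hypothetical restart_task:
options_map = {
    Status.ACTIVE.name: lambda manager, consumer: manager.start_task(consumer),
    Status.INACTIVE.name: lambda manager, consumer: manager.stop_task(consumer),
    Status.RESTARTING.name: lambda manager, consumer: manager.restart_task(consumer),
}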
def delete_topics(topics, time_out=10):
    config = get_basic_utilities().get(CONFIG)
    kafka_config = config['kafka']
    server = f"{kafka_config['host']}:{kafka_config['port']}"
    admin_client = KafkaAdminClient(bootstrap_servers=server)
    consumer = KafkaConsumer(bootstrap_servers=server)
    active_topics = [topic for topic in consumer.topics() if topic in topics]
    admin_client.delete_topics(topics=active_topics)
    stop = threading.Event()
    all_deleted = False

    def is_deleted():
        nonlocal all_deleted
        while not stop.is_set():
            current_topics = consumer.topics()
            for topic in active_topics:
                if topic in current_topics:
                    stop.wait(timeout=1)
                    break
            else:
                stop.set()
                all_deleted = True

    thread = threading.Thread(name='delete-topics', target=is_deleted)
    thread.start()
    stop.wait(timeout=time_out)
    stop.set()
    thread.join()
    return all_deleted
def loader(load_all, exclude, collections, store_tick, batch_size):
    click.confirm('clickhouse table will be re-created with new data', abort=True)

    utils = get_basic_utilities()
    logging = utils.get(LOGGER)

    if not load_all and not collections:
        logging.error('input not provided')
        return False

    enabled_consumers = get_supported_consumers()

    if load_all:
        all_collections = enabled_consumers
        collections = all_collections if exclude is None else [col for col in all_collections if
                                                               col not in exclude.split(',')]
    else:
        collections = collections.split(',')

    for collection in collections:
        # noinspection PyBroadException
        try:
            load_collection_data(collection, store_tick, batch_size)
        except Exception:
            logging.error(f'clickhouse {collection} loader failed: {traceback.format_exc()}')
            return False

    return True
def get_supported_consumers():
    utils = get_basic_utilities()
    config = utils.get(CONFIG)
    collections = config['producer']['sync']
    exclude = config['consumer']['exclude']
    return [
        collection for collection in collections if collection not in exclude
    ]
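# For illustration only, a hedged sketch of the config sections this function
# reads (collection names are hypothetical):
#
#   producer:
#     sync: [orders, users, events]
#   consumer:
#     exclude: [events]
#
# With that config, get_supported_consumers() would return ['orders', 'users'].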
def create_test_table(table):
    config = get_basic_utilities().get(CONFIG)
    ch_client = get_ch_client_with_dict_config(config['clickhouse'])
    table_map = get_table_map(table)
    assert table_map is not None
    if ch_client.is_table_exists(f'{table_map["clickhouse"]}'):
        return True
    clickhouse_table = table_map['table_create']
    ch_client.execute(query=clickhouse_table)
    return True
def main():
    utils = get_basic_utilities()
    config, alert = utils.get_utils((CONFIG, SMTP_CLIENT))
    logging = get_logger()
    alert_config = config['alert']
    logging.info('log-producer started')
    # noinspection PyBroadException
    try:
        producer()
    except Exception:
        logging.error(f"producer failed: {traceback.format_exc()}")
        alert.send(alert_config["sender"], alert_config["receivers"],
                   "CH-Sync: Producer failed")
def cleanup():
    yield None
    config = get_basic_utilities().get(CONFIG)
    arango_client = get_singleton_arango_client(config['arango'])
    collection = 'test'
    collection_exists = collection in arango_client.db.collections
    if collection_exists:
        arango_client.db.collections[collection].delete()
        del arango_client.db.collections[collection]
    clickhouse: ClickhouseHelper = get_singleton_ch_client(
        config['clickhouse'])
    clickhouse.drop_table_if_exists(collection)
    clickhouse.drop_table_if_exists(f'{collection}_Buffer')
def get_table_map(table):
    config = get_basic_utilities().get(CONFIG)
    table_config = load_schema_mapper(f'{table}.yaml')
    schema = {
        'arango': table,
        'clickhouse': table_config['table_name'],
        'clickhouse_db': config['clickhouse']['database'],
        'table_create': table_config['table'],
        'schema': table_config['schema']
    }
    if 'buffer' in table_config:
        schema['buffer'] = table_config['buffer']
    if 'topic_config' in table_config:
        schema['topic_config'] = table_config['topic_config']
    return schema
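# A hypothetical `<table>.yaml` schema mapper that would satisfy the keys read
# above; the table name, columns and values are assumptions for illustration:
#
#   table_name: Events
#   table: |
#     CREATE TABLE IF NOT EXISTS analytics.Events (
#         _key String,
#         _ver UInt64,
#         name String
#     ) ENGINE = MergeTree() ORDER BY _key
#   schema:
#     primary_key: _key
#   buffer:
#     num_layers: 16
#     min_time: 10
#     max_time: 100
#     min_rows: 10000
#     max_rows: 1000000
#     min_bytes: 10000000
#     max_bytes: 100000000
#   topic_config:
#     retention.ms: '604800000'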
def validate_table_names(_, __, value):
    config = get_basic_utilities().get(CONFIG)
    allowed_tables = sorted(config['producer']['sync'])
    if value.strip() == '':
        tables = []
    else:
        tables = [table.strip() for table in value.split(',')]
    not_allowed = []
    for table in tables:
        if table not in allowed_tables:
            not_allowed.append(table)
    if len(not_allowed) > 0:
        raise click.BadParameter(
            'tables {} are not allowed.\nAllowed tables:\n{}'.format(
                ', '.join(not_allowed), ',\n'.join(allowed_tables)))
    return tables
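# A hedged sketch of how the callback above could be attached to a click
# option; the command and option names here are illustrative, not from the source:
@click.command()
@click.option('--tables', default='', callback=validate_table_names,
              help='comma separated list of tables to synchronize')
def sync_cli(tables):
    # `tables` arrives here already validated and split into a list
    click.echo(f'syncing: {tables}')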
def create_topic(table):
    config, logging = get_basic_utilities().get_utils((CONFIG, LOGGER))
    table_map = get_table_map_by_arango_collection(table)
    if not table_map:
        return False
    kafka_config = config['kafka']
    admin_client = KafkaAdminClient(
        bootstrap_servers=f"{kafka_config['host']}:{kafka_config['port']}", )

    # create kafka topic
    custom_topic_configs = table_map.get('topic_config', {})
    topic_config = {
        'name': table,
        'num_partitions': 1,
        'replication_factor': 1,
        'topic_configs': custom_topic_configs
    }
    new_topic = NewTopic(**topic_config)
    admin_client.create_topics([new_topic])
    logging.info(f'{table} topic created')

    return True
def get_supported_producers():
    utils = get_basic_utilities()
    config = utils.get(CONFIG)
    collections = config['producer']['sync']
    return collections
def test_get_basic_utilities(basic_utilities):
    get_basic_utilities()
    assert True
def data_consumer(consumer_name, stop_event: threading.Event):
    config = get_basic_utilities().get(CONFIG)
    logging = get_logger()

    logging.info(f'{consumer_name} started')

    # initialize necessary config
    kafka_config, consumer_config = config['kafka'], config['consumer']

    # initialize redis
    redis_config = config['redis']
    redis_helper = get_singleton_redis_client(redis_config['host'], redis_config['port'], redis_config['db'])
    initial_tick = get_initial_tick_of_consumer(redis_helper, consumer_name)

    # initialize clickhouse
    ch_client = get_ch_client_with_dict_config(config['clickhouse'])
    table_map = get_table_map_by_arango_collection(consumer_name)
    if table_map is None:
        logging.error('table map is not available')
        return False

    use_buffer = 'buffer' in table_map
    ch_table = f"{table_map['clickhouse']}_Buffer" if use_buffer else table_map['clickhouse']
    primary_key = table_map['schema']['primary_key']

    # create buffer table if not present
    if use_buffer:
        created = create_buffer_table(ch_client, table_map['clickhouse_db'], table_map['clickhouse'], table_map)
        if not created:
            logging.error('failed to create buffer table')
            return False

    # initialize kafka consumer
    consumer = custom_connect_consumer(kafka_config['host'], kafka_config['port'], consumer_name, consumer_name)
    idle, max_records, time_out = (consumer_config['idle'], consumer_config['kafka_max_records'],
                                   consumer_config['kafka_poll_time_out'])

    # noinspection PyBroadException
    try:
        while not stop_event.is_set():
            logging.info(f'{consumer_name}: polling for messages')
            msg_pack = consumer.poll(timeout_ms=time_out, max_records=max_records)

            for topic, messages in msg_pack.items():

                # _ = handle_messages([m.value for m in messages], consumer_name)
                documents = [{'offset': m.offset, 'doc': m.value} for m in messages]
                documents = pre_process_documents(initial_tick, documents)

                # set to none to skip initial tick validation
                if len(documents) > 0:
                    initial_tick = None

                # transform the documents
                documents, errors = transform_documents(table_map['schema'], documents)
                log_error_documents(errors)

                # insert the documents
                processed_count = bulk_insert_documents(ch_client, ch_table, documents)

                # log the documents
                for document in documents:
                    # logging.debug(f'{consumer_name}: message: {document}')
                    logging.info(f'{consumer_name}: processed: {document[primary_key]}, ver: {document["_ver"]}')

                logging.info(f'{consumer_name}: processed {processed_count} docs')

            # update offset in kafka
            consumer.commit()

            # idle the process
            is_messages_consumed = all_messages_consumed(consumer)
            if is_messages_consumed:
                logging.info(f'{consumer_name} process idle')
                stop_event.wait(timeout=idle)

    except Exception as e:
        consumer.close()
        raise e

    logging.info(f'{consumer_name} exited gracefully')
def test_get_basic_utilities_singleton(basic_utilities):
    new_basic_utilities = get_basic_utilities()
    assert new_basic_utilities is basic_utilities
def basic_utilities():
    return get_basic_utilities()
def producer():
    utils = get_basic_utilities()
    config, alert = utils.get_utils((CONFIG, SMTP_CLIENT))
    logging = get_logger()
    producer_config = config['producer']

    arango_wal_client = get_wal_client({**config['arango'], **config['wal']})
    redis_helper = get_singleton_redis_client(config['redis']['host'],
                                              config['redis']['port'],
                                              config['redis']['db'])

    last_tick_file = open('last-tick.txt', 'w')

    arango_collections = get_arango_collections(config['arango'])
    collections_id_dict = {
        collection: meta.globallyUniqueId
        for collection, meta in arango_collections.items()
        if collection in producer_config['sync']
    }
    id_to_collection_dict = get_id_collection_map(collections_id_dict)

    logging.info(f'listening collections: {list(collections_id_dict.keys())}')

    # set last-tick as first tick during only the first start
    init_tick = set_tick_if_not_set(arango_wal_client, redis_helper)
    if init_tick:
        logging.info(f'stored initial tick: {init_tick}')

    _ = config['producer']['reader_batch']
    writer_timeout = config['producer']['writer_timeout']

    log_writer = get_log_writer(config['kafka']['host'],
                                config['kafka']['port'], key_encode,
                                json_encode)()

    exit_event = threading.Event()
    _ = Terminate(exit_event)

    while not exit_event.is_set():
        last_tick = get_last_processed_tick(redis_helper)
        logging.info(f'last processed tick: {last_tick}')

        logs_collector = collect_logs(arango_wal_client, last_tick, None,
                                      collections_id_dict)
        logs_generator = LogGenerator(logs_collector)

        for docs in logs_generator:

            tick_start = docs['content'][0]['tick'] if len(
                docs['content']) > 0 else None

            if not docs['from_present']:
                logging.error(
                    f'ticks lost: asked for {last_tick} but got {tick_start}')

            # store in kafka
            log_writer.bulk_write(
                prepare_kafka_documents(id_to_collection_dict, docs,
                                        writer_timeout))
            log_writer.flush()

            # update tick only if valid
            if int(docs['last_included']) > 0:
                if updated_last_processed_tick(redis_helper,
                                               docs['last_included']):
                    update_file_last_tick(last_tick_file,
                                          docs['last_included'])
                    # if is_processed set to False then the data batch will be processed again
                    # setting False always will lead to infinite loop
                    logs_generator.is_processed(True)

            logging.info(
                f'processed {f"{tick_start}-" if tick_start else ""}{docs["last_included"]}: '
                f'overall {len(docs["content"])} docs')

            # handle termination call
            if exit_event.is_set():
                break

        logging.info('sleeping')
        exit_event.wait(timeout=config['producer']['idle'])

    logging.info('producer terminated gracefully')
def get_redis_client():
    config = get_basic_utilities().get(CONFIG)
    redis_helper = get_singleton_redis_client(config['redis']['host'],
                                              config['redis']['port'],
                                              config['redis']['db'])
    return redis_helper
def synchronizer(tables, clear):
    config, logging = get_basic_utilities().get_utils((CONFIG, LOGGER))
    redis_config = config['redis']
    redis_helper = get_singleton_redis_client(redis_config['host'],
                                              redis_config['port'],
                                              redis_config['db'])

    try:
        generate_config_file()
    except Exception as e:
        logging.error(f'unable to generate pm2 config file: {e}',
                      exc_info=True)
        return False

    pm2_config_path = get_config_path()
    producer_process = PM2('arango-producer', pm2_config_path)
    consumer_process = PM2('clickhouse-consumer', pm2_config_path)
    task_manager = TaskManager(redis_helper)

    # clear redis cache db if specified
    if clear:
        redis_helper.client.flushdb()
        logging.info('redis cache cleared')
    else:
        # delete consumer specific keys
        for table in tables:
            for key in redis_helper.client.keys(f'{table}*'):
                redis_helper.client.delete(key)

    # stop the producer process
    if not producer_process.stop():
        logging.error('unable to stop producer')
        return False

    # stop the consumer process
    for table in tables:
        consumer_active = task_manager.ping(table)
        if consumer_active:
            result = task_manager.stop_task(table)
            if result == Status.INACTIVE.name:
                logging.info(f'stopped the consumer {table}')
            else:
                logging.error(f'unable to stop consumer {table}')
                return False
        else:
            logging.info(f'consumer {table} not active')

    # delete topics
    all_deleted = delete_topics(tables)
    if not all_deleted:
        logging.error(f'unable to delete all kafka topics')
        return False

    # create topic
    for table in tables:
        created = create_topic(table)
        if not created:
            logging.error(f'unable to create topic: {table}')
            return False

    # start producer process
    if not producer_process.start():
        logging.error('unable to start producer')
        return False

    # sync existing collection data
    for table in tables:
        is_data_loaded = load_collection_data(collection=table,
                                              store_tick=True,
                                              batch_size=100000)
        if is_data_loaded:
            logging.info('existing data loaded to clickhouse')
        else:
            logging.error(f'failed to load {table} data')
            return False

        # start the consumer
        if task_manager.ping(table):
            result = task_manager.start_task(table)
            if result == Status.ACTIVE.name:
                logging.info(f'{table} consumer process started')
            else:
                logging.error('unable to start consumer, restarting using pm2')

    if consumer_process.restart():
        logging.info('pm2 consumer restarted')
    else:
        logging.error('unable to restart pm2 consumers')
        return False

    return True
def load_collection_data(collection, store_tick, batch_size):
    basic_utils = get_basic_utilities()
    config, logging, mail_client = basic_utils.get_utils((CONFIG, LOGGER, SMTP_CLIENT))

    arango_client = get_singleton_arango_client(config['arango'])

    clickhouse: ClickhouseHelper = get_singleton_ch_client(config['clickhouse'])
    clickhouse_client: Client = clickhouse.client

    clickhouse_table_map = get_table_map_by_arango_collection(collection)
    clickhouse_table, clickhouse_db, clickhouse_table_schema = (
        clickhouse_table_map['clickhouse'], clickhouse_table_map['clickhouse_db'], clickhouse_table_map['schema'])

    # prepare the tables
    clickhouse_temp_table = f'{clickhouse_table}Temp'
    clickhouse.drop_table_if_exists(f'{clickhouse_db}.{clickhouse_temp_table}')
    temp_table, table_created = create_temporary_table(clickhouse_client, clickhouse_table_map['table_create'],
                                                       f'{clickhouse_db}.{clickhouse_table}',
                                                       f'{clickhouse_db}.{clickhouse_temp_table}')
    logging.info(f'temporary table created for {clickhouse_table}')

    # store current tick for the table in redis
    if store_tick:
        wal_client = get_wal_client({**config['arango'], **config['wal']})
        last_tick = wal_client.get_last_tick()
        redis_config = config['redis']
        redis_helper = get_singleton_redis_client(redis_config['host'], redis_config['port'], redis_config['db'])
        redis_helper.client.set(f'{collection}:last-tick', last_tick['tick'])
        logging.info(f'stored current wal tick: {last_tick}')

    logging.info('collect documents from arango')
    processed_documents = 0
    errors = 0
    for documents in get_all_documents(db_client=arango_client, col_name=collection, batch_size=batch_size):
        logging.info(f'documents collected from arango: {len(documents)} docs')

        # map the documents from arango to clickhouse document
        for i in range(len(documents)):
            try:
                documents[i] = convert_to_ch_dict_using_schema(clickhouse_table_schema, documents[i])
            except (TypeError, ValueError, KeyError):
                logging.document(f'doc: {documents[i]}')
                logging.document(f'error: {traceback.format_exc()}')
                documents[i] = None
                errors += 1

        # filter invalid documents
        documents = [doc for doc in documents if doc is not None]

        if len(documents) > 0:
            total_insertion = clickhouse.bulk_dict_doc_insert(documents, temp_table, list(documents[0].keys()),
                                                              batch_size)
            logging.info(f'populated data on clickhouse: {total_insertion} docs')
            processed_documents += total_insertion
        logging.info(f'overall processed documents: {processed_documents} docs')

    logging.info('data populated on temporary table')
    clickhouse.drop_table_if_exists(f'{clickhouse_db}.{clickhouse_table}')
    logging.info(f'dropped table {clickhouse_table}')
    clickhouse.rename_table(temp_table, f'{clickhouse_db}.{clickhouse_table}')
    logging.info('table populated successfully')
    logging.info(f'Incompatible documents: {errors}')

    # prepare buffer table
    if 'buffer' in clickhouse_table_map:
        clickhouse_buffer = f'{clickhouse_table}_Buffer'
        clickhouse.drop_table_if_exists(f'{clickhouse_db}.{clickhouse_buffer}')
        logging.info(f'dropped table {clickhouse_buffer}')
        create_buffer_table(clickhouse, clickhouse_db, clickhouse_table, clickhouse_table_map)
        logging.info('buffer table created successfully')

    return True
def get_test_table():
    config = get_basic_utilities().get(CONFIG)
    arango_client = get_singleton_arango_client(config['arango'])
    if 'test' in arango_client.db.collections:
        return arango_client.db.collections['test']
    return arango_client.db.createCollection(name='test')