Beispiel #1
0
def _group_events(event_list, schema, table):
    """Group events by ``(action, action_core)`` for a batched insert.

    Each event dict is copied with ``schema`` and ``table`` keys added.
    Groups are returned as a list of lists, in the order each
    ``(action, action_core)`` pair was first seen.
    """
    grouped = {}
    for event in event_list:
        # A tuple key cannot collide the way concatenated strings can
        # (e.g. 'a' + 'bc' vs 'ab' + 'c').
        key = (event['action'], event['action_core'])
        grouped.setdefault(key, []).append(
            dict(event, schema=schema, table=table))
    return list(grouped.values())


def consume(args):
    """Consume events for one schema/table from Kafka and insert them in batches.

    Reads ``args.schema``, ``args.table`` and ``args.skip_error``. The consumer
    is assigned the single partition configured in ``settings.PARTITIONS``
    under the key ``'<schema>.<table>'`` and commits offsets manually after
    each batch has been handed to ``writer.insert_event``.

    A batch is flushed when either:

    * the consumer lag exceeds ``settings.INSERT_NUMS`` and at least
      ``settings.INSERT_NUMS`` events are buffered, or
    * the lag is at or below that threshold, ``settings.INSERT_INTERVAL`` is
      positive, and the oldest buffered event is at least that many seconds
      old (``event_unixtime`` is compared against ``time.time() * 10**6``,
      i.e. it is treated as microseconds).

    The flush condition is only evaluated when a message arrives.

    Raises:
        AssertionError: if ``schema`` / ``table`` are not listed in
            ``settings.SCHEMAS`` / ``settings.TABLES``.
        ValueError: if no partition is configured for ``'<schema>.<table>'``.
        SystemExit: with status 1 when an insert fails and ``skip_error`` is
            falsy.
    """
    # Local import so the block does not depend on a file-level ``import sys``.
    import sys

    schema = args.schema
    table = args.table
    skip_error = args.skip_error
    # Explicit raises instead of ``assert`` so validation survives ``python -O``;
    # the exception type and messages are unchanged.
    if schema not in settings.SCHEMAS:
        raise AssertionError('schema must in settings.SCHEMAS')
    if table not in settings.TABLES:
        raise AssertionError('table must in settings.TABLES')

    group_id = f'{schema}.{table}'
    topic = settings.KAFKA_TOPIC
    partition = settings.PARTITIONS.get(group_id)
    if partition is None:
        # Fail early with a clear message rather than assigning an invalid
        # TopicPartition(topic, None).
        raise ValueError(
            f'no partition configured for {group_id} in settings.PARTITIONS')

    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    try:
        tp = TopicPartition(topic, partition)
        consumer.assign([tp])

        event_list = []

        logger.info(
            f'success consume topic:{topic},partition:{partition},schema:{schema},table:{table}'
        )

        pk = reader.get_primary_key(schema, table)

        for msg in consumer:  # type:ConsumerRecord
            # Lazy %-formatting: the record is only rendered if DEBUG is on.
            logger.debug('kafka msg:%s', msg)
            event_list.append(msg.value)
            len_event = len(event_list)

            # highwater is the offset of the *next* message to be produced,
            # hence the ``- 1``. It may be None if not yet known; treat that
            # as "no lag" and fall back to the time-based flush.
            high_water = consumer.highwater(tp)
            lag = 0 if high_water is None else (high_water - 1) - msg.offset

            if lag > settings.INSERT_NUMS:
                # ``>=`` rather than ``==``: the buffer may already be larger
                # than INSERT_NUMS if it grew while the lag was low, and must
                # still be flushed once the lag becomes high.
                should_insert = len_event >= settings.INSERT_NUMS
            else:
                waited = (int(time.time() * 10**6) -
                          event_list[0]['event_unixtime']) / 10**6
                should_insert = waited >= settings.INSERT_INTERVAL > 0

            if not should_insert:
                continue

            result = writer.insert_event(
                _group_events(event_list, schema, table), settings.SKIP_TYPE,
                settings.SKIP_DELETE_TB_NAME, schema, table, pk)
            if not result:
                if not skip_error:
                    logger.error('insert event error!')
                    # Non-zero status so supervisors see the failure; the bare
                    # ``exit()`` helper from ``site`` would exit with status 0.
                    sys.exit(1)
                logger.warning(
                    f'insert event error, skip {len_event} events!')

            event_list = []
            consumer.commit()
            logger.info(f'commit success {len_event} events!')
    finally:
        # Release the consumer's network resources however the loop ends.
        consumer.close()