コード例 #1
0
ファイル: etl.py プロジェクト: CO-CONNECT/co-connect-tools
def _process_list_data(ctx):
    """Run the ETL over every data entry in the config, then optionally poll.

    The function reads ``clean``, ``rules``, ``bclink_helpers``, ``conf`` and
    ``listen_for_changes`` from ``ctx.obj`` (presumably a click context --
    confirm against the caller).  The config file named by ``conf`` is
    re-loaded on every pass; it must provide a ``rules`` path and a ``data``
    list whose items each carry an ``input`` (either a list of files or a
    directory to scan for CSV files).

    A pass re-executes the ETL when any of the following is detected:

    * the config points at a different rules file,
    * the current rules file's modification time changed,
    * the ``data`` list differs from what has already been processed.

    Unless ``listen_for_changes`` is ``False`` the function never returns; it
    sleeps between passes and keeps polling.

    Raises:
        NotADirectoryError: if a non-list ``input`` is not an existing
            directory.
    """
    logger = Logger("_process_list_data")
    logger.info("ETL process has begun")

    poll_seconds = 5

    data = []  # entries that have already been handed to _execute
    rules = ctx.obj['rules']
    bclink_helpers = ctx.obj['bclink_helpers']
    config_file = ctx.obj['conf']
    conf = _load_config(config_file)
    rules_file = conf['rules']
    rules_file_last_modified = os.path.getmtime(rules_file)

    bclink_helpers.print_summary()
    display_msg = True
    # the clean flag is only forwarded to the very first _execute call
    _clean = ctx.obj['clean']

    while True:

        re_execute = False
        # reset on every pass so the check after execution never relies on a
        # value left over from an earlier iteration
        change_in_rules = False
        try:
            conf = _load_config(config_file)
        except Exception as e:
            # display_msg is False only while idling after a completed pass,
            # so a broken config is reported once rather than on every poll
            if not display_msg:
                logger.critical(e)
                logger.error(
                    f"You've misconfigured your file '{config_file}'!! Please fix!"
                )
            time.sleep(poll_seconds)
            display_msg = True
            continue

        current_rules_file = conf['rules']
        new_rules_file = rules_file != current_rules_file
        if new_rules_file:
            # the config now points at a different rules file
            logger.info(
                f"Detected a new rules file.. old was '{rules_file}' and new is '{current_rules_file}'"
            )
            rules_file = current_rules_file
            rules = coconnect.tools.load_json_delta(rules_file, rules)
            rules_file_last_modified = os.path.getmtime(rules_file)
            re_execute = True
        else:
            # same file: look for an update via its modification time
            new_rules_file_last_modified = os.path.getmtime(current_rules_file)
            change_in_rules = rules_file_last_modified != new_rules_file_last_modified
            if change_in_rules:
                logger.info(
                    f"Detected a change/update in the rules file '{rules_file}'"
                )
                rules = coconnect.tools.load_json_delta(
                    current_rules_file, rules)
                # remember the new timestamp, otherwise the same edit would be
                # re-detected (and the ETL re-run) on every subsequent poll
                rules_file_last_modified = new_rules_file_last_modified
                re_execute = True

        current_data = conf['data']
        if data != current_data:
            logger.debug(f"old {data}")
            logger.debug(f"new {current_data}")
            new_data = [obj for obj in current_data if obj not in data]
            logger.info(f"New data found! {new_data}")
            re_execute = True
        else:
            # no new entries: a rules change is applied to everything
            # processed so far
            new_data = data

        logger.debug(f"re-execute {re_execute}")
        if re_execute:
            # snapshot taken before execution; presumably _execute may mutate
            # the items it is given -- TODO confirm
            current_data = copy.deepcopy(new_data)
            for item in new_data:
                if isinstance(item['input'], list):
                    inputs = item['input']
                else:
                    input_folder = item['input']
                    if not os.path.isdir(input_folder):
                        raise NotADirectoryError(
                            f"{input_folder} is not a directory containing files!"
                        )
                    inputs = coconnect.tools.get_files(input_folder,
                                                       type='csv')
                filtered_rules = coconnect.tools.remove_missing_sources_from_rules(
                    rules, inputs)

                _execute(ctx, data=item, rules=filtered_rules, clean=_clean)
                _clean = False

            data += [x for x in current_data if x not in data]
            display_msg = True

        if new_rules_file or change_in_rules:
            # the delta was only meant for this pass; reload the full rules so
            # that data inserted later is processed with the complete set
            rules = coconnect.tools.load_json(current_rules_file)

        # explicit comparison kept on purpose: a missing/None value keeps
        # listening, exactly as before
        if ctx.obj['listen_for_changes'] == False:  # noqa: E712
            break

        if display_msg:
            logger.info(
                f"Finished!... Listening for changes to data in {config_file}")
            display_msg = False

        time.sleep(poll_seconds)