Example #1
def prepare_target_schema(msg):
    """
    Task to create target star schema
    """
    start_time = datetime.datetime.now()
    conf = _get_conf(msg)
    tenant = conf[mk.TENANT_NAME]
    schema = conf[mk.TARGET_DB_SCHEMA]

    create_target_schema_for_batch(tenant, schema)

    end_time = datetime.datetime.now()

    # Create benchmark object and record benchmark
    benchmark = BatchTableBenchmark(msg[mk.GUID_BATCH],
                                    msg[mk.LOAD_TYPE],
                                    prepare_target_schema.name,
                                    start_time,
                                    end_time,
                                    task_id=str(
                                        prepare_target_schema.request.id),
                                    working_schema=schema,
                                    tenant=tenant)
    benchmark.record_benchmark()
    return msg
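Every example in this collection wraps the timed work in a BatchTableBenchmark and then calls record_benchmark(). The class itself is not part of these snippets; the following is only a minimal sketch of the interface the call sites assume (the keyword names are taken from the calls above and below, everything else is hypothetical):

class BatchTableBenchmark(object):
    """Sketch of the benchmark record the examples assume; not the real implementation."""

    def __init__(self, guid_batch, load_type, udl_phase, start_timestamp, end_timestamp,
                 udl_phase_step='', udl_phase_step_status='', size_records=0, size_units=0,
                 task_id=None, working_schema='', tenant=None, udl_leaf=False,
                 input_file='', error_desc=None, stack_trace=None):
        # Capture every field so record_benchmark() can persist one row per UDL phase.
        self.fields = dict(locals())
        self.fields.pop('self')

    def record_benchmark(self):
        # Hypothetical: insert self.fields as a row into the UDL batch/benchmark table.
        pass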
Example #2
def handle_record_upsert(msg):
    '''
    Match records in the current batch against the production database, so that
    existing records are updated rather than duplicated.
    '''
    logger.info('LOAD_FROM_INT_TO_STAR: detect duplications in target tables.')
    start_time = datetime.datetime.now()
    conf = _get_conf(msg)
    affected_rows = handle_duplicates_in_dimensions(conf[mk.TENANT_NAME],
                                                    conf[mk.GUID_BATCH])
    finish_time = datetime.datetime.now()

    # Create benchmark object and record benchmark
    udl_phase_step = 'Delete duplicate record in dim tables'
    benchmark = BatchTableBenchmark(msg[mk.GUID_BATCH],
                                    msg[mk.LOAD_TYPE],
                                    handle_record_upsert.name,
                                    start_time,
                                    finish_time,
                                    udl_phase_step=udl_phase_step,
                                    size_records=affected_rows,
                                    task_id=str(
                                        handle_record_upsert.request.id),
                                    working_schema=conf[mk.TARGET_DB_SCHEMA],
                                    tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    return msg
Example #3
def explode_data_to_dim_table_task(msg, conf, source_table, dim_table,
                                   column_mapping, column_types):
    """
    This is the celery task to move data from one integration table to one dim table.
    :param msg: pipeline message for the current batch
    :param conf: configuration dict for the batch
    :param source_table: name of the source integration table
    :param dim_table: name of the target dim table
    :param column_mapping: mapping of dim table columns to integration table columns
    :param column_types: column type definitions for the dim table
    :return: the incoming msg, unchanged
    """
    logger.info('LOAD_FROM_INT_TO_STAR: migrating source table <%s> to <%s>' %
                (source_table, dim_table))
    start_time = datetime.datetime.now()
    affected_rows = explode_data_to_dim_table(conf, source_table, dim_table,
                                              column_mapping, column_types)
    finish_time = datetime.datetime.now()
    _time_as_seconds = calculate_spend_time_as_second(start_time, finish_time)

    # Create benchmark object and record benchmark
    udl_phase_step = 'INT --> DIM:' + dim_table
    benchmark = BatchTableBenchmark(
        conf[mk.GUID_BATCH],
        conf[mk.LOAD_TYPE],
        explode_data_to_dim_table_task.name,
        start_time,
        finish_time,
        udl_phase_step=udl_phase_step,
        size_records=affected_rows[0],
        task_id=str(explode_data_to_dim_table_task.request.id),
        working_schema=conf[mk.TARGET_DB_SCHEMA],
        udl_leaf=True,
        tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()
    return msg
Example #4
def handle_deletions(msg):
    '''
    This is the celery task that matches against the production database to find
    records deleted in this batch.
    The batch is identified by guid_batch in the incoming message.
    '''
    logger.info('LOAD_FROM_INT_TO_STAR: Handle deletions in target tables.')
    start_time = datetime.datetime.now()
    guid_batch = msg[mk.GUID_BATCH]
    # pass down the affected row from previous stage
    udl_phase_step = 'HANDLE DELETION IN FACT'
    conf = _get_conf(msg)
    conf[mk.UDL_PHASE_STEP] = udl_phase_step

    # handle updates and deletes
    handle_updates_and_deletes(conf)
    finish_time = datetime.datetime.now()

    # Create benchmark object and record benchmark
    benchmark = BatchTableBenchmark(guid_batch,
                                    msg[mk.LOAD_TYPE],
                                    handle_deletions.name,
                                    start_time,
                                    finish_time,
                                    udl_phase_step=udl_phase_step,
                                    tenant=msg[mk.TENANT_NAME],
                                    task_id=str(handle_deletions.request.id),
                                    working_schema=conf[mk.TARGET_DB_SCHEMA])
    benchmark.record_benchmark()

    # Outgoing message to be piped to the next task in the pipeline
    outgoing_msg = {}
    outgoing_msg.update(msg)
    return outgoing_msg
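The star-schema tasks above (prepare_target_schema, handle_record_upsert, handle_deletions) all read their settings through _get_conf(msg), which is not shown in any example. A hypothetical reconstruction, limited to the keys those tasks actually read from conf, might look like this:

def _get_conf(msg):
    # Hypothetical helper: copy the batch-level keys the INT-to-star tasks need
    # from the pipeline message into a plain configuration dict.
    return {
        mk.GUID_BATCH: msg[mk.GUID_BATCH],
        mk.TENANT_NAME: msg[mk.TENANT_NAME],
        # Falling back to the batch guid as the working schema mirrors the SR
        # loader example further down; the default here is an assumption.
        mk.TARGET_DB_SCHEMA: msg.get(mk.TARGET_DB_SCHEMA, msg[mk.GUID_BATCH]),
    }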
def task(msg):
    start_time = datetime.datetime.now()
    logger.info(task.name)
    logger.info('LOAD_CSV_TO_STAGING: Loading file <%s> ' %
                (msg[mk.FILE_TO_LOAD]))
    guid_batch = msg[mk.GUID_BATCH]
    conf = generate_conf_for_loading(msg[mk.FILE_TO_LOAD], msg[mk.ROW_START],
                                     msg[mk.LOAD_TYPE], msg[mk.HEADERS],
                                     guid_batch, msg[mk.TENANT_NAME])
    load_file(conf)
    end_time = datetime.datetime.now()

    # Record benchmark
    benchmark = BatchTableBenchmark(msg[mk.GUID_BATCH],
                                    msg[mk.LOAD_TYPE],
                                    task.name,
                                    start_time,
                                    end_time,
                                    task_id=str(task.request.id),
                                    working_schema=conf[mk.TARGET_DB_SCHEMA],
                                    udl_leaf=True,
                                    size_records=msg[mk.SIZE_RECORDS],
                                    tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    return msg
def task(incoming_msg):
    start_time = datetime.datetime.now()
    guid_batch = incoming_msg.get(mk.GUID_BATCH)
    tenant_directory_paths = incoming_msg.get(mk.TENANT_DIRECTORY_PATHS)
    expanded_dir = tenant_directory_paths.get(mk.EXPANDED)

    load_type = get_load_type(expanded_dir)

    logger.info('W_GET_LOAD_TYPE: Load type is <%s>' % load_type)
    end_time = datetime.datetime.now()

    # benchmark
    benchmark = BatchTableBenchmark(guid_batch,
                                    load_type,
                                    task.name,
                                    start_time,
                                    end_time,
                                    task_id=str(task.request.id),
                                    tenant=incoming_msg.get(mk.TENANT_NAME))
    benchmark.record_benchmark()

    # Outgoing message to be piped to the file validator
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    outgoing_msg.update({mk.LOAD_TYPE: load_type})
    if load_type == Constants.LOAD_TYPE_ASSESSMENT:
        assessment_type = get_assessment_type(expanded_dir)
        outgoing_msg.update({mk.ASSESSMENT_TYPE: assessment_type})
    # Update UDL stats
    update_udl_stats_by_batch_guid(guid_batch,
                                   {UdlStatsConstants.LOAD_TYPE: load_type})
    return outgoing_msg
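Each worker returns an outgoing message that becomes the incoming message of the next stage (here, the file validator). In Celery that hand-off is normally expressed as a chain of task signatures; the sketch below shows the pattern with placeholder task names rather than the actual UDL pipeline wiring:

from celery import chain

# Placeholder task objects; in the real pipeline each stage is a registered
# Celery task like the task(incoming_msg) functions in these examples.
pipeline = chain(
    w_get_load_type.s(initial_msg),   # returns outgoing_msg with mk.LOAD_TYPE set
    w_file_validator.s(),             # receives that dict as its incoming_msg
    w_file_splitter.s(),
)
pipeline.delay()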
def task(incoming_msg):
    """
    This is the celery task to decrypt the source file
    """
    start_time = datetime.datetime.now()
    file_to_decrypt = incoming_msg[mk.FILE_TO_DECRYPT]
    passphrase = udl2_conf['passphrase']
    guid_batch = incoming_msg[mk.GUID_BATCH]
    gpghome = udl2_conf['gpg_home']
    tenant_directory_paths = incoming_msg[mk.TENANT_DIRECTORY_PATHS]
    decrypt_to_dir = tenant_directory_paths[mk.DECRYPTED]
    load_type = incoming_msg[mk.LOAD_TYPE]

    logger.info('W_FILE_DECRYPTER: received file {file} with guid_batch {guid_batch}'.format(file=file_to_decrypt,
                                                                                             guid_batch=guid_batch))
    logger.info('W_FILE_DECRYPTER: Decrypt to {dir}'.format(dir=decrypt_to_dir))

    status, decrypted_file = decrypt_file(file_to_decrypt, decrypt_to_dir, passphrase, gpghome)
    logger.info('Decrypted file: {file}'.format(file=decrypted_file))
    logger.info('Decryption status: {status}'.format(status=str(status)))

    finish_time = datetime.datetime.now()

    # Benchmark
    benchmark = BatchTableBenchmark(guid_batch, load_type, task.name, start_time, finish_time, task_id=str(task.request.id), tenant=incoming_msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    # Outgoing message to be piped to the file expander
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    outgoing_msg.update({mk.FILE_TO_EXPAND: decrypted_file})
    return outgoing_msg
Example #8
def task(incoming_msg):
    start_time = datetime.datetime.now()
    guid_batch = incoming_msg[mk.GUID_BATCH]
    load_type = incoming_msg[mk.LOAD_TYPE]

    tenant_directory_paths = incoming_msg[mk.TENANT_DIRECTORY_PATHS]
    expanded_dir = tenant_directory_paths[mk.EXPANDED]

    sfv = SimpleFileValidator(load_type)
    error_map = {}
    for file_name in os.listdir(expanded_dir):
        error_map[file_name] = sfv.execute(expanded_dir, file_name, guid_batch)

    # TODO: Add logic that checks error list and writes to a log/db/etc
    for input_file in error_map.keys():
        errors = error_map[input_file]
        if len(errors) == 0:
            logger.info('FILE VALIDATOR: Validated file <%s> and found no errors.' % (os.path.join(expanded_dir, input_file)))
        else:
            # TODO: Jump to ERROR_TASK
            for error in errors:
                logger.error('ERROR: ' + str(error))
            raise Exception('simple file validator error: %s' % errors)

    end_time = datetime.datetime.now()

    # benchmark
    benchmark = BatchTableBenchmark(guid_batch, incoming_msg[mk.LOAD_TYPE], task.name, start_time, end_time, task_id=str(task.request.id),
                                    tenant=incoming_msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    # Outgoing message to be piped to the file splitter
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    return outgoing_msg
Example #9
def explode_data_to_fact_table_task(msg, conf, source_table, fact_table,
                                    column_mapping, column_types):
    """
    This is the celery task to move data from one integration table to one fact table.
    :param msg: pipeline message for the current batch
    :param conf: configuration dict for the batch
    :param source_table: name of the source integration table
    :param fact_table: name of the target fact table
    :param column_mapping: mapping of fact table columns to integration table columns
    :param column_types: column type definitions for the fact table
    :return: the incoming msg, unchanged
    """
    logger.info('LOAD_FROM_INT_TO_STAR: migrating source table <%s> to <%s>' %
                (source_table, fact_table))
    start_time = datetime.datetime.now()
    affected_rows = explode_data_to_fact_table(conf, source_table, fact_table,
                                               column_mapping, column_types)
    finish_time = datetime.datetime.now()

    benchmark = BatchTableBenchmark(
        conf[mk.GUID_BATCH],
        conf[mk.LOAD_TYPE],
        explode_data_to_fact_table_task.name,
        start_time,
        finish_time,
        udl_phase_step='INT --> FACT:' + fact_table,
        size_records=affected_rows,
        task_id=str(explode_data_to_fact_table_task.request.id),
        tenant=msg[mk.TENANT_NAME],
        working_schema=conf[mk.TARGET_DB_SCHEMA],
        udl_leaf=True)
    benchmark.record_benchmark()
    return msg
def task(msg):
    start_time = datetime.datetime.now()
    logger.info("LOAD_SR_INTEGRATION_TO_TARGET: Migrating data from SR integration tables to target tables.")
    guid_batch = msg[mk.GUID_BATCH]
    load_type = msg[mk.LOAD_TYPE]

    source_tables = [Constants.UDL2_INTEGRATION_TABLE(load_type), Constants.UDL2_JSON_INTEGRATION_TABLE(load_type)]
    target_table = Constants.SR_TARGET_TABLE

    target_schema = msg[mk.TARGET_DB_SCHEMA] if mk.TARGET_DB_SCHEMA in msg else msg[mk.GUID_BATCH]
    conf = generate_conf(guid_batch, msg[mk.PHASE], load_type, msg[mk.TENANT_NAME], target_schema)
    affected_rows = move_data_from_int_tables_to_target_table(conf, task.name, source_tables, target_table)

    end_time = datetime.datetime.now()

    # benchmark
    benchmark = BatchTableBenchmark(guid_batch, load_type, task.name, start_time, end_time, task_id=str(task.request.id),
                                    working_schema="", size_records=affected_rows[0], tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    notification_data = {mk.TOTAL_ROWS_LOADED: affected_rows[0]}
    merge_to_udl2stat_notification(guid_batch, notification_data)
    outgoing_msg = {}
    outgoing_msg.update(msg)
    outgoing_msg.update(notification_data)
    return outgoing_msg
def task(msg):
    start_time = datetime.datetime.now()
    guid_batch = msg.get(mk.GUID_BATCH)
    load_type = msg.get(mk.LOAD_TYPE)
    tenant_name = msg.get(mk.TENANT_NAME)
    tenant_directory_paths = msg.get(mk.TENANT_DIRECTORY_PATHS)
    expanded_dir = tenant_directory_paths.get(mk.EXPANDED)
    json_file = file_util.get_file_type_from_dir('.json', expanded_dir)
    logger.info('LOAD_JSON_TO_INTEGRATION: Loading json file <%s>' % json_file)
    conf = generate_conf_for_loading(json_file, guid_batch, load_type,
                                     tenant_name)
    affected_rows = load_json(conf)
    end_time = datetime.datetime.now()

    # record benchmark
    benchmark = BatchTableBenchmark(guid_batch,
                                    load_type,
                                    task.name,
                                    start_time,
                                    end_time,
                                    task_id=str(task.request.id),
                                    working_schema=conf[mk.TARGET_DB_SCHEMA],
                                    size_records=affected_rows,
                                    tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()
    return msg
def task(msg):
    start_time = datetime.datetime.now()
    guid_batch = msg.get(mk.GUID_BATCH)
    load_type = msg.get(mk.LOAD_TYPE)
    logger.info('FILE_CONTENT_VALIDATOR: Running Content validations for '
                'batch {guid_batch}'.format(guid_batch=guid_batch))
    errors = ContentValidator().execute(
        conf=get_content_validator_conf(guid_batch, load_type))
    end_time = datetime.datetime.now()

    if len(errors) == 0:
        logger.info(
            'FILE_CONTENT_VALIDATOR: Validated batch {guid_batch} '
            'and found no content errors.'.format(guid_batch=guid_batch))
    else:
        raise UDL2InvalidJSONCSVPairException(
            'Assessment guid mismatch between Json/Csv pair for '
            'batch {guid_batch}'.format(guid_batch=guid_batch))

    benchmark = BatchTableBenchmark(guid_batch,
                                    msg.get(mk.LOAD_TYPE),
                                    task.name,
                                    start_time,
                                    end_time,
                                    task_id=str(task.request.id),
                                    tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()
    return msg
def handle_deletions(msg):
    '''
    This is the celery task that matches against the production database to find
    records deleted in this batch.
    The batch is identified by guid_batch in the incoming message.
    '''
    logger.info('LOAD_FROM_INT_TO_STAR: Handle deletions in target tables.')
    start_time = datetime.datetime.now()
    guid_batch = msg[mk.GUID_BATCH]
    # pass down the affected row from previous stage
    udl_phase_step = 'HANDLE DELETION IN FACT'
    conf = _get_conf(msg)
    conf[mk.UDL_PHASE_STEP] = udl_phase_step

    # handle updates and deletes
    handle_updates_and_deletes(conf)
    finish_time = datetime.datetime.now()

    # Create benchmark object and record benchmark
    benchmark = BatchTableBenchmark(guid_batch, msg[mk.LOAD_TYPE], handle_deletions.name, start_time, finish_time,
                                    udl_phase_step=udl_phase_step, tenant=msg[mk.TENANT_NAME],
                                    task_id=str(handle_deletions.request.id), working_schema=conf[mk.TARGET_DB_SCHEMA])
    benchmark.record_benchmark()

    # Outgoing message to be piped to the next task in the pipeline
    outgoing_msg = {}
    outgoing_msg.update(msg)
    return outgoing_msg
Example #14
def task(msg):
    start_time = msg.get(mk.START_TIMESTAMP)
    end_time = datetime.datetime.now()
    load_type = msg.get(mk.LOAD_TYPE)
    guid_batch = msg.get(mk.GUID_BATCH)

    # infer overall pipeline_status based on previous pipeline_state
    pipeline_status = NotificationConstants.FAILURE if mk.PIPELINE_STATE in msg and msg.get(
        mk.PIPELINE_STATE) == 'error' else NotificationConstants.SUCCESS
    benchmark = BatchTableBenchmark(guid_batch,
                                    load_type,
                                    'UDL_COMPLETE',
                                    start_time,
                                    end_time,
                                    udl_phase_step_status=pipeline_status,
                                    tenant=msg.get(mk.TENANT_NAME))
    benchmark.record_benchmark()

    # record batch stats to udl stats table
    # this will be used by migration script to move the data from pre-prod to prod
    report_batch_to_udl_stats(msg, end_time, pipeline_status)
    # report the batch metrics in Human readable format to the UDL log
    report_udl_batch_metrics_to_log(msg, end_time, pipeline_status)
    # update udl_stat for notification
    if pipeline_status == NotificationConstants.SUCCESS:
        merge_to_udl2stat_notification(
            guid_batch, {
                NotificationConstants.UDL_PHASE_STEP_STATUS: pipeline_status,
                NotificationConstants.UDL_PHASE: 'UDL_COMPLETE'
            })
    return msg
Example #15
    def on_failure(self, exc, task_id, args, kwargs, einfo):
        logger.exception('Task failed: ' + self.name + ', task id: ' + task_id)
        msg = args[0]
        batch_guid = msg.get(mk.GUID_BATCH)
        load_type = msg.get(mk.LOAD_TYPE)
        failure_time = datetime.datetime.now()
        udl_phase_step = ''
        working_schema = ''
        if isinstance(exc, UDLException):
            udl_phase_step = exc.udl_phase_step
            working_schema = exc.working_schema
        benchmark = BatchTableBenchmark(
            batch_guid,
            load_type,
            udl_phase=self.name,
            start_timestamp=failure_time,
            end_timestamp=failure_time,
            udl_phase_step=udl_phase_step,
            working_schema=working_schema,
            udl_phase_step_status=Constants.FAILURE,
            task_id=str(self.request.id),
            error_desc=str(exc),
            stack_trace=einfo.traceback)
        benchmark.record_benchmark()

        # Write to udl stats table on exceptions
        update_udl_stats_by_batch_guid(batch_guid, {
            UdlStatsConstants.LOAD_STATUS:
            UdlStatsConstants.UDL_STATUS_FAILED
        })
        merge_to_udl2stat_notification(
            batch_guid, {
                Constants.UDL_PHASE_STEP_STATUS: Constants.FAILURE,
                Constants.ERROR_DESC: str(exc)
            })

        # Write to ERR_LIST
        try:
            exc.insert_err_list(failure_time)
        except Exception:
            pass
        err_msg = {}
        err_msg.update(msg)
        err_msg.update({mk.PIPELINE_STATE: 'error'})

        error_handler_chain = self.__get_pipeline_error_handler_chain(
            err_msg, self.name)
        if error_handler_chain is not None:
            error_handler_chain.delay()

        # Send email on exception
        email_info = {
            "task_id": task_id,
            "batch_guid": batch_guid,
            "load_type": load_type,
            "udl_phase": self.name,
            "udl_phase_step": udl_phase_step,
            "failure_time": failure_time
        }
        send_email_from_template(email_info)
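This on_failure handler is a method, so it lives on a task class rather than a plain task function. The usual Celery way to share such a handler across every stage is a common base Task class passed to the task decorator; a sketch of that wiring (the app name, class name and task name are assumptions) looks like this:

from celery import Celery, Task

app = Celery('udl2')  # hypothetical application name


class UdlBaseTask(Task):
    """Hypothetical base class carrying the shared on_failure handler shown above."""
    abstract = True

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        # Record a FAILURE benchmark, mark the batch failed in udl stats,
        # kick off the error-handler chain, and send the failure email.
        ...


@app.task(base=UdlBaseTask, name='udl2.W_file_arrived.task')
def task(incoming_msg):
    ...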
Example #16
def task(incoming_msg):
    """
    Celery task that handles clean-up of files created during the UDL process.
    It currently cleans up the work zone, removing all files generated as part of this batch.
    @param incoming_msg: the message received from the penultimate step in the UDL process. Contains all params needed
    """
    start_time = datetime.datetime.now()
    guid_batch = incoming_msg.get(mk.GUID_BATCH)
    load_type = incoming_msg.get(mk.LOAD_TYPE)

    # do the cleanup
    post_etl.cleanup(incoming_msg)
    finish_time = datetime.datetime.now()

    # Benchmark
    benchmark = BatchTableBenchmark(guid_batch,
                                    load_type,
                                    task.name,
                                    start_time,
                                    finish_time,
                                    task_id=str(task.request.id),
                                    tenant=incoming_msg.get(mk.TENANT_NAME))
    benchmark.record_benchmark()

    # Outgoing message to be piped to All Done
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    return outgoing_msg
Example #17
def task(incoming_msg):
    """
    This is the celery task to expand the decrypted file
    """
    start_time = datetime.datetime.now()

    # Retrieve parameters from the incoming message
    file_to_expand = incoming_msg[mk.FILE_TO_EXPAND]
    guid_batch = incoming_msg[mk.GUID_BATCH]
    tenant_directory_paths = incoming_msg[mk.TENANT_DIRECTORY_PATHS]
    expand_to_dir = tenant_directory_paths[mk.EXPANDED]
    load_type = incoming_msg[mk.LOAD_TYPE]

    logger.info('W_FILE_EXPANDER: expand file <%s> with guid_batch = <%s> to directory <%s>' % (file_to_expand, guid_batch, expand_to_dir))
    file_contents = expand_file(file_to_expand, expand_to_dir)
    logger.info('W_FILE_EXPANDER: expanded files:  <%s>' % (', '.join(file_contents)))

    finish_time = datetime.datetime.now()

    # Benchmark
    benchmark = BatchTableBenchmark(guid_batch, load_type, task.name, start_time, finish_time, task_id=str(task.request.id), tenant=incoming_msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    # Outgoing message to be piped to the next task in the pipeline
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    return outgoing_msg
Example #18
def task(incoming_msg):
    """
    This is the celery task for moving the file from arrivals to work/arrivals zone
    and creating all the folders needed for this batch run under work zone
    """
    start_time = datetime.datetime.now()
    # Retrieve parameters from the incoming message
    input_source_file = incoming_msg[mk.INPUT_FILE_PATH]
    guid_batch = incoming_msg[mk.GUID_BATCH]
    load_type = incoming_msg[mk.LOAD_TYPE]
    tenant_name = get_tenant_name(input_source_file)
    logger.info('W_FILE_ARRIVED: received file <%s> with guid_batch = <%s>' % (input_source_file, guid_batch))

    # Insert into udl stats
    udl_stats = {
        UdlStatsConstants.BATCH_GUID: guid_batch,
        UdlStatsConstants.LOAD_TYPE: load_type,
        UdlStatsConstants.FILE_ARRIVED: start_time,
        UdlStatsConstants.TENANT: tenant_name,
        UdlStatsConstants.LOAD_STATUS: UdlStatsConstants.UDL_STATUS_RECEIVED
    }
    insert_udl_stats(udl_stats)

    if not tenant_name:
        raise InvalidTenantNameException

    if not os.path.exists(input_source_file):
        raise FileNotFoundError
    input_file_size = os.path.getsize(input_source_file)

    # move the files to work and history zone
    # create all the folders needed for the current run inside work zone
    tenant_directory_paths = move_file_from_arrivals(input_source_file, guid_batch, tenant_name)
    finish_time = datetime.datetime.now()

    # Benchmark
    benchmark = BatchTableBenchmark(guid_batch, load_type, task.name, start_time, finish_time,
                                    task_id=str(task.request.id), tenant=tenant_name, input_file=input_source_file)
    benchmark.record_benchmark()

    # Outgoing message to be piped to the file decrypter
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    loc = input_source_file.rfind(Const.PROCESSING_FILE_EXT)
    org_file = input_source_file[:loc] if loc != -1 else input_source_file
    outgoing_msg.update({
        mk.INPUT_FILE_PATH: org_file,
        mk.INPUT_FILE_SIZE: input_file_size,
        mk.FILE_TO_DECRYPT: os.path.join(tenant_directory_paths[mk.ARRIVED],
                                         os.path.basename(org_file)),
        mk.TENANT_DIRECTORY_PATHS: tenant_directory_paths,
        mk.TENANT_NAME: tenant_name})

    return outgoing_msg
def task(incoming_msg):
    '''
    This is the celery task for splitting file
    '''
    start_time = datetime.datetime.now()
    guid_batch = incoming_msg[mk.GUID_BATCH]
    parts = incoming_msg[mk.PARTS]
    load_type = incoming_msg[mk.LOAD_TYPE]
    tenant_directory_paths = incoming_msg[mk.TENANT_DIRECTORY_PATHS]
    expanded_dir = tenant_directory_paths[mk.EXPANDED]
    csv_file = get_file_type_from_dir('.csv', expanded_dir)

    subfiles_dir = tenant_directory_paths[mk.SUBFILES]

    # do actual work of splitting file
    split_file_tuple_list, header_file_path, \
        totalrows, filesize = split_file(csv_file, parts=parts, output_dir=subfiles_dir)

    finish_time = datetime.datetime.now()
    spend_time = finish_time - start_time

    logger.info(task.name)
    logger.info("FILE_SPLITTER: Split <%s> into %i sub-files in %s" %
                (csv_file, parts, spend_time))

    # Benchmark
    benchmark = BatchTableBenchmark(
        guid_batch,
        load_type,
        task.name,
        start_time,
        finish_time,
        size_records=totalrows,
        size_units=filesize,
        udl_phase_step_status=NotificationConstants.SUCCESS,
        task_id=str(task.request.id),
        tenant=incoming_msg[mk.TENANT_NAME])
    benchmark.record_benchmark()
    # Outgoing message to be piped to the parallel file loader
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    outgoing_msg.update({
        mk.SPLIT_FILE_LIST: split_file_tuple_list,
        mk.HEADER_FILE_PATH: header_file_path,
        mk.SIZE_RECORDS: totalrows
    })
    return outgoing_msg
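The splitter's outgoing message carries mk.SPLIT_FILE_LIST so that one csv-to-staging loader (like the task that follows) can run per sub-file. A hedged sketch of that fan-out, assuming each list entry holds the sub-file path, its row count and its starting row (the tuple layout and the loader task name are assumptions):

from celery import group


def build_loader_messages(outgoing_msg):
    # Assumed layout of each SPLIT_FILE_LIST entry: (sub_file_path, row_count, row_start).
    for sub_file, row_count, row_start in outgoing_msg[mk.SPLIT_FILE_LIST]:
        loader_msg = dict(outgoing_msg)
        loader_msg.update({mk.FILE_TO_LOAD: sub_file,
                           mk.ROW_START: row_start,
                           mk.SIZE_RECORDS: row_count})
        yield loader_msg


# One loader task per sub-file, executed in parallel.
group(load_csv_to_staging.s(m) for m in build_loader_messages(outgoing_msg)).delay()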
def task(msg):
    start_time = datetime.datetime.now()
    logger.info(task.name)
    logger.info('LOAD_CSV_TO_STAGING: Loading file <%s> ' % (msg[mk.FILE_TO_LOAD]))
    guid_batch = msg[mk.GUID_BATCH]
    conf = generate_conf_for_loading(msg[mk.FILE_TO_LOAD], msg[mk.ROW_START], msg[mk.LOAD_TYPE],
                                     msg[mk.HEADERS], guid_batch, msg[mk.TENANT_NAME])
    load_file(conf)
    end_time = datetime.datetime.now()

    # Record benchmark
    benchmark = BatchTableBenchmark(msg[mk.GUID_BATCH], msg[mk.LOAD_TYPE], task.name, start_time, end_time,
                                    task_id=str(task.request.id), working_schema=conf[mk.TARGET_DB_SCHEMA],
                                    udl_leaf=True, size_records=msg[mk.SIZE_RECORDS], tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    return msg
def prepare_target_schema(msg):
    """
    Task to create target star schema
    """
    start_time = datetime.datetime.now()
    conf = _get_conf(msg)
    tenant = conf[mk.TENANT_NAME]
    schema = conf[mk.TARGET_DB_SCHEMA]

    create_target_schema_for_batch(tenant, schema)

    end_time = datetime.datetime.now()

    # Create benchmark object and record benchmark
    benchmark = BatchTableBenchmark(msg[mk.GUID_BATCH], msg[mk.LOAD_TYPE], prepare_target_schema.name, start_time, end_time,
                                    task_id=str(prepare_target_schema.request.id), working_schema=schema, tenant=tenant)
    benchmark.record_benchmark()
    return msg
def handle_record_upsert(msg):
    '''
    Match records in the current batch against the production database, so that
    existing records are updated rather than duplicated.
    '''
    logger.info('LOAD_FROM_INT_TO_STAR: detect duplications in target tables.')
    start_time = datetime.datetime.now()
    conf = _get_conf(msg)
    affected_rows = handle_duplicates_in_dimensions(conf[mk.TENANT_NAME], conf[mk.GUID_BATCH])
    finish_time = datetime.datetime.now()

    # Create benchmark object and record benchmark
    udl_phase_step = 'Delete duplicate record in dim tables'
    benchmark = BatchTableBenchmark(msg[mk.GUID_BATCH], msg[mk.LOAD_TYPE], handle_record_upsert.name, start_time, finish_time,
                                    udl_phase_step=udl_phase_step, size_records=affected_rows, task_id=str(handle_record_upsert.request.id),
                                    working_schema=conf[mk.TARGET_DB_SCHEMA], tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    return msg
Example #23
def task(msg):
    start_time = datetime.datetime.now()
    logger.info(
        "LOAD_SR_INTEGRATION_TO_TARGET: Migrating data from SR integration tables to target tables."
    )
    guid_batch = msg[mk.GUID_BATCH]
    load_type = msg[mk.LOAD_TYPE]

    source_tables = [
        Constants.UDL2_INTEGRATION_TABLE(load_type),
        Constants.UDL2_JSON_INTEGRATION_TABLE(load_type)
    ]
    target_table = Constants.SR_TARGET_TABLE

    target_schema = msg[mk.TARGET_DB_SCHEMA] if mk.TARGET_DB_SCHEMA in msg else msg[mk.GUID_BATCH]
    conf = generate_conf(guid_batch, msg[mk.PHASE], load_type,
                         msg[mk.TENANT_NAME], target_schema)
    affected_rows = move_data_from_int_tables_to_target_table(
        conf, task.name, source_tables, target_table)

    end_time = datetime.datetime.now()

    # benchmark
    benchmark = BatchTableBenchmark(guid_batch,
                                    load_type,
                                    task.name,
                                    start_time,
                                    end_time,
                                    task_id=str(task.request.id),
                                    working_schema="",
                                    size_records=affected_rows[0],
                                    tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    notification_data = {mk.TOTAL_ROWS_LOADED: affected_rows[0]}
    merge_to_udl2stat_notification(guid_batch, notification_data)
    outgoing_msg = {}
    outgoing_msg.update(msg)
    outgoing_msg.update(notification_data)
    return outgoing_msg
def task(msg):
    start_time = datetime.datetime.now()
    guid_batch = msg.get(mk.GUID_BATCH)
    load_type = msg.get(mk.LOAD_TYPE)
    tenant_name = msg.get(mk.TENANT_NAME)
    tenant_directory_paths = msg.get(mk.TENANT_DIRECTORY_PATHS)
    expanded_dir = tenant_directory_paths.get(mk.EXPANDED)
    json_file = file_util.get_file_type_from_dir('.json', expanded_dir)
    logger.info('LOAD_JSON_TO_INTEGRATION: Loading json file <%s>' % json_file)
    conf = generate_conf_for_loading(json_file, guid_batch, load_type, tenant_name)
    affected_rows = load_json(conf)
    end_time = datetime.datetime.now()

    # record benchmark
    benchmark = BatchTableBenchmark(guid_batch, load_type, task.name, start_time, end_time,
                                    task_id=str(task.request.id),
                                    working_schema=conf[mk.TARGET_DB_SCHEMA],
                                    size_records=affected_rows, tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()
    return msg
Example #25
def task(incoming_msg):
    """
    This is the celery task to decrypt the source file
    """
    start_time = datetime.datetime.now()
    file_to_decrypt = incoming_msg[mk.FILE_TO_DECRYPT]
    passphrase = udl2_conf['passphrase']
    guid_batch = incoming_msg[mk.GUID_BATCH]
    gpghome = udl2_conf['gpg_home']
    tenant_directory_paths = incoming_msg[mk.TENANT_DIRECTORY_PATHS]
    decrypt_to_dir = tenant_directory_paths[mk.DECRYPTED]
    load_type = incoming_msg[mk.LOAD_TYPE]

    logger.info(
        'W_FILE_DECRYPTER: received file {file} with guid_batch {guid_batch}'.
        format(file=file_to_decrypt, guid_batch=guid_batch))
    logger.info(
        'W_FILE_DECRYPTER: Decrypt to {dir}'.format(dir=decrypt_to_dir))

    status, decrypted_file = decrypt_file(file_to_decrypt, decrypt_to_dir,
                                          passphrase, gpghome)
    logger.info('Decrypted file: {file}'.format(file=decrypted_file))
    logger.info('Decryption status: {status}'.format(status=str(status)))

    finish_time = datetime.datetime.now()

    # Benchmark
    benchmark = BatchTableBenchmark(guid_batch,
                                    load_type,
                                    task.name,
                                    start_time,
                                    finish_time,
                                    task_id=str(task.request.id),
                                    tenant=incoming_msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    # Outgoing message to be piped to the file expander
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    outgoing_msg.update({mk.FILE_TO_EXPAND: decrypted_file})
    return outgoing_msg
Example #26
def task(msg):
    start_time = datetime.datetime.now()
    guid_batch = msg[mk.GUID_BATCH]

    tenant_directory_paths = msg[mk.TENANT_DIRECTORY_PATHS]
    expanded_dir = tenant_directory_paths[mk.EXPANDED]

    notification = {}
    academic_year = get_academic_year_param(expanded_dir)
    outgoing_msg = {}
    outgoing_msg.update(msg)
    if msg[mk.LOAD_TYPE] == Constants.LOAD_TYPE_STUDENT_REGISTRATION:
        student_reg_guid, reg_system_id, callback_url, emailnotification = get_callback_params_for_studentregistration(expanded_dir)
        notification.update({NotificationConstants.STUDENT_REG_GUID: student_reg_guid})
        notification.update({NotificationConstants.REG_SYSTEM_ID: reg_system_id})
        notification.update({NotificationConstants.CALLBACK_URL: callback_url})
        notification.update({NotificationConstants.ACADEMIC_YEAR: academic_year})
        notification.update({NotificationConstants.EMAILNOTIFICATION: emailnotification})
        outgoing_msg.update({NotificationConstants.STUDENT_REG_GUID: student_reg_guid})
        outgoing_msg.update({NotificationConstants.REG_SYSTEM_ID: reg_system_id})
        outgoing_msg.update({NotificationConstants.CALLBACK_URL: callback_url})
        outgoing_msg.update({NotificationConstants.ACADEMIC_YEAR: academic_year})
    elif msg[mk.LOAD_TYPE] == Constants.LOAD_TYPE_ASSESSMENT:
        callback_url, emailnotification = get_callback_params_for_assessment(expanded_dir)
        notification.update({NotificationConstants.CALLBACK_URL: callback_url})
        notification.update({NotificationConstants.EMAILNOTIFICATION: emailnotification})

    logger.info('W_GET_CALLBACK_URL: Callback URL is <%s>' % callback_url)
    end_time = datetime.datetime.now()

    benchmark = BatchTableBenchmark(guid_batch, msg[mk.LOAD_TYPE], task.name, start_time, end_time, task_id=str(task.request.id), tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    # update udl_stat
    if notification:
        notification.update({NotificationConstants.NOTIFICATION_MAX_ATTEMPTS: udl2_conf[NotificationConstants.NOTIFICATION_MAX_ATTEMPTS]})
        notification.update({NotificationConstants.NOTIFICATION_RETRY_INTERVAL: udl2_conf[NotificationConstants.NOTIFICATION_RETRY_INTERVAL]})
        notification.update({NotificationConstants.NOTIFICATION_TIMEOUT_INTERVAL: udl2_conf[NotificationConstants.NOTIFICATION_TIMEOUT_INTERVAL]})
        update_udl_stats_by_batch_guid(guid_batch, {UdlStatsConstants.NOTIFICATION: json.dumps(notification)})

    return outgoing_msg
def explode_data_to_fact_table_task(msg, conf, source_table, fact_table, column_mapping, column_types):
    """
    This is the celery task to move data from one integration table to one fact table.
    :param msg: pipeline message for the current batch
    :param conf: configuration dict for the batch
    :param source_table: name of the source integration table
    :param fact_table: name of the target fact table
    :param column_mapping: mapping of fact table columns to integration table columns
    :param column_types: column type definitions for the fact table
    :return: the incoming msg, unchanged
    """
    logger.info('LOAD_FROM_INT_TO_STAR: migrating source table <%s> to <%s>' % (source_table, fact_table))
    start_time = datetime.datetime.now()
    affected_rows = explode_data_to_fact_table(conf, source_table, fact_table, column_mapping, column_types)
    finish_time = datetime.datetime.now()

    benchmark = BatchTableBenchmark(conf[mk.GUID_BATCH], conf[mk.LOAD_TYPE], explode_data_to_fact_table_task.name,
                                    start_time, finish_time, udl_phase_step='INT --> FACT:' + fact_table, size_records=affected_rows,
                                    task_id=str(explode_data_to_fact_table_task.request.id), tenant=msg[mk.TENANT_NAME],
                                    working_schema=conf[mk.TARGET_DB_SCHEMA], udl_leaf=True)
    benchmark.record_benchmark()
    return msg
def task(msg):
    start_time = datetime.datetime.now()
    guid_batch = msg.get(mk.GUID_BATCH)
    load_type = msg.get(mk.LOAD_TYPE)
    logger.info('FILE_CONTENT_VALIDATOR: Running Content validations for '
                'batch {guid_batch}'.format(guid_batch=guid_batch))
    errors = ContentValidator().execute(conf=get_content_validator_conf(guid_batch, load_type))
    end_time = datetime.datetime.now()

    if len(errors) == 0:
        logger.info('FILE_CONTENT_VALIDATOR: Validated batch {guid_batch} '
                    'and found no content errors.'.format(guid_batch=guid_batch))
    else:
        raise UDL2InvalidJSONCSVPairException('Assessment guid mismatch between Json/Csv pair for '
                                              'batch {guid_batch}'.format(guid_batch=guid_batch))

    benchmark = BatchTableBenchmark(guid_batch, msg.get(mk.LOAD_TYPE),
                                    task.name, start_time, end_time,
                                    task_id=str(task.request.id),
                                    tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()
    return msg
Example #29
def task(msg):
    start_time = msg.get(mk.START_TIMESTAMP)
    end_time = datetime.datetime.now()
    load_type = msg.get(mk.LOAD_TYPE)
    guid_batch = msg.get(mk.GUID_BATCH)

    # infer overall pipeline_status based on previous pipeline_state
    pipeline_status = NotificationConstants.FAILURE if mk.PIPELINE_STATE in msg and msg.get(mk.PIPELINE_STATE) == 'error' else NotificationConstants.SUCCESS
    benchmark = BatchTableBenchmark(guid_batch, load_type, 'UDL_COMPLETE',
                                    start_time, end_time, udl_phase_step_status=pipeline_status,
                                    tenant=msg.get(mk.TENANT_NAME))
    benchmark.record_benchmark()

    # record batch stats to udl stats table
    # this will be used by migration script to move the data from pre-prod to prod
    report_batch_to_udl_stats(msg, end_time, pipeline_status)
    # report the batch metrics in Human readable format to the UDL log
    report_udl_batch_metrics_to_log(msg, end_time, pipeline_status)
    # update udl_stat for notification
    if pipeline_status == NotificationConstants.SUCCESS:
        merge_to_udl2stat_notification(guid_batch, {NotificationConstants.UDL_PHASE_STEP_STATUS: pipeline_status, NotificationConstants.UDL_PHASE: 'UDL_COMPLETE'})
    return msg
def explode_data_to_dim_table_task(msg, conf, source_table, dim_table, column_mapping, column_types):
    """
    This is the celery task to move data from one integration table to one dim table.
    :param msg: pipeline message for the current batch
    :param conf: configuration dict for the batch
    :param source_table: name of the source integration table
    :param dim_table: name of the target dim table
    :param column_mapping: mapping of dim table columns to integration table columns
    :param column_types: column type definitions for the dim table
    :return: the incoming msg, unchanged
    """
    logger.info('LOAD_FROM_INT_TO_STAR: migrating source table <%s> to <%s>' % (source_table, dim_table))
    start_time = datetime.datetime.now()
    affected_rows = explode_data_to_dim_table(conf, source_table, dim_table, column_mapping, column_types)
    finish_time = datetime.datetime.now()
    _time_as_seconds = calculate_spend_time_as_second(start_time, finish_time)

    # Create benchmark object and record benchmark
    udl_phase_step = 'INT --> DIM:' + dim_table
    benchmark = BatchTableBenchmark(conf[mk.GUID_BATCH], conf[mk.LOAD_TYPE], explode_data_to_dim_table_task.name, start_time, finish_time,
                                    udl_phase_step=udl_phase_step, size_records=affected_rows[0], task_id=str(explode_data_to_dim_table_task.request.id),
                                    working_schema=conf[mk.TARGET_DB_SCHEMA], udl_leaf=True, tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()
    return msg
def task(msg):
    start_time = datetime.datetime.now()
    logger.info("LOAD_FROM_STAGING_TO_INT: Migrating data from staging to integration.")
    guid_batch = msg[mk.GUID_BATCH]
    conf = generate_conf(guid_batch, msg[mk.LOAD_TYPE])
    success_rows, fail_rows = move_data_from_staging_to_integration(conf)
    end_time = datetime.datetime.now()

    notification_data = {mk.TOTAL_ROWS_LOADED: success_rows, mk.TOTAL_ROWS_NOT_LOADED: fail_rows}
    merge_to_udl2stat_notification(guid_batch, notification_data)
    if success_rows == 0:
        raise UDL2DataLoadingException()

    # benchmark
    benchmark = BatchTableBenchmark(guid_batch, msg[mk.LOAD_TYPE], task.name, start_time, end_time, size_records=success_rows,
                                    task_id=str(task.request.id), working_schema=conf[mk.TARGET_DB_SCHEMA], tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    # Outgoing message to be piped to the next task in the pipeline
    outgoing_msg = {}
    outgoing_msg.update(msg)
    outgoing_msg.update({mk.PHASE: 4, mk.TOTAL_ROWS_LOADED: success_rows, mk.TOTAL_ROWS_NOT_LOADED: fail_rows})
    return outgoing_msg
Example #32
def task(incoming_msg):
    '''
    This is the celery task for splitting file
    '''
    start_time = datetime.datetime.now()
    guid_batch = incoming_msg[mk.GUID_BATCH]
    parts = incoming_msg[mk.PARTS]
    load_type = incoming_msg[mk.LOAD_TYPE]
    tenant_directory_paths = incoming_msg[mk.TENANT_DIRECTORY_PATHS]
    expanded_dir = tenant_directory_paths[mk.EXPANDED]
    csv_file = get_file_type_from_dir('.csv', expanded_dir)

    subfiles_dir = tenant_directory_paths[mk.SUBFILES]

    # do actual work of splitting file
    split_file_tuple_list, header_file_path, \
        totalrows, filesize = split_file(csv_file, parts=parts, output_dir=subfiles_dir)

    finish_time = datetime.datetime.now()
    spend_time = finish_time - start_time

    logger.info(task.name)
    logger.info("FILE_SPLITTER: Split <%s> into %i sub-files in %s" % (csv_file, parts, spend_time))

    # Benchmark
    benchmark = BatchTableBenchmark(guid_batch, load_type, task.name, start_time, finish_time,
                                    size_records=totalrows, size_units=filesize, udl_phase_step_status=NotificationConstants.SUCCESS,
                                    task_id=str(task.request.id), tenant=incoming_msg[mk.TENANT_NAME])
    benchmark.record_benchmark()
    # Outgoing message to be piped to the parallel file loader
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    outgoing_msg.update({mk.SPLIT_FILE_LIST: split_file_tuple_list,
                         mk.HEADER_FILE_PATH: header_file_path,
                         mk.SIZE_RECORDS: totalrows
                         })
    return outgoing_msg
Example #33
def task(incoming_msg):
    """
    This is the celery task to expand the decrypted file
    """
    start_time = datetime.datetime.now()

    # Retrieve parameters from the incoming message
    file_to_expand = incoming_msg[mk.FILE_TO_EXPAND]
    guid_batch = incoming_msg[mk.GUID_BATCH]
    tenant_directory_paths = incoming_msg[mk.TENANT_DIRECTORY_PATHS]
    expand_to_dir = tenant_directory_paths[mk.EXPANDED]
    load_type = incoming_msg[mk.LOAD_TYPE]

    logger.info(
        'W_FILE_EXPANDER: expand file <%s> with guid_batch = <%s> to directory <%s>'
        % (file_to_expand, guid_batch, expand_to_dir))
    file_contents = expand_file(file_to_expand, expand_to_dir)
    logger.info('W_FILE_EXPANDER: expanded files:  <%s>' %
                (', '.join(file_contents)))

    finish_time = datetime.datetime.now()

    # Benchmark
    benchmark = BatchTableBenchmark(guid_batch,
                                    load_type,
                                    task.name,
                                    start_time,
                                    finish_time,
                                    task_id=str(task.request.id),
                                    tenant=incoming_msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    # Outgoing message to be piped to the next task in the pipeline
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    return outgoing_msg
Example #34
def task(msg):
    start_time = datetime.datetime.now()
    guid_batch = msg[mk.GUID_BATCH]

    tenant_directory_paths = msg[mk.TENANT_DIRECTORY_PATHS]
    expanded_dir = tenant_directory_paths[mk.EXPANDED]

    notification = {}
    academic_year = get_academic_year_param(expanded_dir)
    outgoing_msg = {}
    outgoing_msg.update(msg)
    if msg[mk.LOAD_TYPE] == Constants.LOAD_TYPE_STUDENT_REGISTRATION:
        student_reg_guid, reg_system_id, callback_url, emailnotification = get_callback_params_for_studentregistration(
            expanded_dir)
        notification.update(
            {NotificationConstants.STUDENT_REG_GUID: student_reg_guid})
        notification.update(
            {NotificationConstants.REG_SYSTEM_ID: reg_system_id})
        notification.update({NotificationConstants.CALLBACK_URL: callback_url})
        notification.update(
            {NotificationConstants.ACADEMIC_YEAR: academic_year})
        notification.update(
            {NotificationConstants.EMAILNOTIFICATION: emailnotification})
        outgoing_msg.update(
            {NotificationConstants.STUDENT_REG_GUID: student_reg_guid})
        outgoing_msg.update(
            {NotificationConstants.REG_SYSTEM_ID: reg_system_id})
        outgoing_msg.update({NotificationConstants.CALLBACK_URL: callback_url})
        outgoing_msg.update(
            {NotificationConstants.ACADEMIC_YEAR: academic_year})
    elif msg[mk.LOAD_TYPE] == Constants.LOAD_TYPE_ASSESSMENT:
        callback_url, emailnotification = get_callback_params_for_assessment(
            expanded_dir)
        notification.update({NotificationConstants.CALLBACK_URL: callback_url})
        notification.update(
            {NotificationConstants.EMAILNOTIFICATION: emailnotification})

    logger.info('W_GET_CALLBACK_URL: Callback URL is <%s>' % callback_url)
    end_time = datetime.datetime.now()

    benchmark = BatchTableBenchmark(guid_batch,
                                    msg[mk.LOAD_TYPE],
                                    task.name,
                                    start_time,
                                    end_time,
                                    task_id=str(task.request.id),
                                    tenant=msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    # update udl_stat
    if notification:
        notification.update({
            NotificationConstants.NOTIFICATION_MAX_ATTEMPTS:
            udl2_conf[NotificationConstants.NOTIFICATION_MAX_ATTEMPTS]
        })
        notification.update({
            NotificationConstants.NOTIFICATION_RETRY_INTERVAL:
            udl2_conf[NotificationConstants.NOTIFICATION_RETRY_INTERVAL]
        })
        notification.update({
            NotificationConstants.NOTIFICATION_TIMEOUT_INTERVAL:
            udl2_conf[NotificationConstants.NOTIFICATION_TIMEOUT_INTERVAL]
        })
        update_udl_stats_by_batch_guid(
            guid_batch,
            {UdlStatsConstants.NOTIFICATION: json.dumps(notification)})

    return outgoing_msg
Example #35
def task(incoming_msg):
    """
    This is the celery task for moving the file from arrivals to work/arrivals zone
    and creating all the folders needed for this batch run under work zone
    """
    start_time = datetime.datetime.now()
    # Retrieve parameters from the incoming message
    input_source_file = incoming_msg[mk.INPUT_FILE_PATH]
    guid_batch = incoming_msg[mk.GUID_BATCH]
    load_type = incoming_msg[mk.LOAD_TYPE]
    tenant_name = get_tenant_name(input_source_file)
    logger.info('W_FILE_ARRIVED: received file <%s> with guid_batch = <%s>' %
                (input_source_file, guid_batch))

    # Insert into udl stats
    udl_stats = {
        UdlStatsConstants.BATCH_GUID: guid_batch,
        UdlStatsConstants.LOAD_TYPE: load_type,
        UdlStatsConstants.FILE_ARRIVED: start_time,
        UdlStatsConstants.TENANT: tenant_name,
        UdlStatsConstants.LOAD_STATUS: UdlStatsConstants.UDL_STATUS_RECEIVED
    }
    insert_udl_stats(udl_stats)

    if not tenant_name:
        raise InvalidTenantNameException

    if not os.path.exists(input_source_file):
        raise FileNotFoundError
    input_file_size = os.path.getsize(input_source_file)

    # move the files to work and history zone
    # create all the folders needed for the current run inside work zone
    tenant_directory_paths = move_file_from_arrivals(input_source_file,
                                                     guid_batch, tenant_name)
    finish_time = datetime.datetime.now()

    # Benchmark
    benchmark = BatchTableBenchmark(guid_batch,
                                    load_type,
                                    task.name,
                                    start_time,
                                    finish_time,
                                    task_id=str(task.request.id),
                                    tenant=tenant_name,
                                    input_file=input_source_file)
    benchmark.record_benchmark()

    # Outgoing message to be piped to the file decrypter
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    loc = input_source_file.rfind(Const.PROCESSING_FILE_EXT)
    org_file = input_source_file[:loc] if loc != -1 else input_source_file
    outgoing_msg.update({
        mk.INPUT_FILE_PATH: org_file,
        mk.INPUT_FILE_SIZE: input_file_size,
        mk.FILE_TO_DECRYPT: os.path.join(tenant_directory_paths[mk.ARRIVED],
                                         os.path.basename(org_file)),
        mk.TENANT_DIRECTORY_PATHS: tenant_directory_paths,
        mk.TENANT_NAME: tenant_name
    })

    return outgoing_msg