Example #1
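A PySpark ingestion routine for a single HDFS file: it rejects duplicate files, handles empty files, validates column counts and data types against KV-file metadata, writes valid rows as ORC, and logs the outcome to registry/error tables. Names such as `config`, `spark`, `sc`, `logger`, `hdfsutil`, `getFileConfig`, and the `Metadata`/`Reader`/`Validator`/`Transformer`/`Writer` classes are project-specific and assumed to be defined elsewhere in the module.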
from datetime import datetime
import getpass
import os

from pyspark.sql.functions import lit, to_timestamp

# config, spark, sc, logger, hdfsutil, getFileConfig and the helper classes
# (Metadata, Reader, Validator, Transformer, Writer) are assumed to be
# provided by the surrounding module.


def ingest(hdfsfile, file_no, datafolders):
    """Ingest a single HDFS file and write the validated data as ORC.

    :type hdfsfile: str
    :type file_no: int
    :type datafolders: set
    """
    try:
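        # Collect basic run metadata: start timestamp, user, file name and file number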
        process_info = dict()
        process_info['process_start_timestamp'] = datetime.utcnow().strftime(
            '%Y-%m-%d %H:%M:%S')
        process_info['user_name'] = getpass.getuser()
        process_info['file_name'] = os.path.basename(hdfsfile)
        process_info['file_no'] = file_no
        # Look up the file-level configuration entry that matches this file
        file_config = getFileConfig(hdfsfile, config)

        if not file_config:
            err_msg = "can not find matched file configuration for file %s!" % hdfsfile
            logger.error(err_msg)
            return 'Error - ' + err_msg

        metadata = Metadata(sc)

        if hdfsutil.checkDuplicate(config, file_config, hdfsfile):
            hdfsutil.move_to_error_archive(config, file_config, hdfsfile,
                                           process_info)
            metadata.log_error_table(spark, file_config, config, process_info,
                                     "duplicate file")
            return 'Error - a file with the same name already exists in the archive folder!' + \
                   ' Moved to error_archive "' + process_info['error_archive_path'] + '".'

        metadata.loadKVFile(spark, config, file_config.get('kv_file'),
                            process_info)

        reader = Reader(spark)
        df = reader.read(hdfsfile, file_config, metadata, process_info)

        process_info['row_count'] = df.count()

        logger.info("row count in file %s is %d" %
                    (hdfsfile, process_info['row_count']))

        if process_info['row_count'] == 0:
            if file_config.get('empty_check', "no").lower() == "yes":
                hdfsutil.move_to_error_archive(config, file_config, hdfsfile,
                                               process_info)
                metadata.log_error_table(spark, file_config, config,
                                         process_info, "file is empty")
                return 'Error - Empty File!' + \
                    ' Moved to error_archive "' + process_info['error_archive_path'] + '".'
            else:
                hdfsutil.move_to_archive(config, file_config, hdfsfile)
                logger.warning('%s is empty!' % hdfsfile)
                return 'Success'

        validator = Validator(sc)
        validator.val_column_num(df.columns, metadata.data_types, process_info)
        val_error = validator.get_error()
        if val_error:
            hdfsutil.move_to_error_archive(config, file_config, hdfsfile,
                                           process_info)
            metadata.log_error_table(spark, file_config, config, process_info,
                                     val_error)
            return 'Error - ' + val_error + \
                ' Moved to error_archive "' + process_info['error_archive_path'] + '".'

        transformer = Transformer(sc)
        df = transformer.trans_data_types(spark, df, file_config, metadata,
                                          process_info)
        df = validator.val_data_types(spark, df, file_config, metadata,
                                      process_info)

        transformedColumns = [col for col in df.columns if col.startswith('__')]

        writer = Writer(sc)
        val_error = validator.get_error()
        if val_error:
            error_df = df.where('length(_error_message) > 0').drop(
                *transformedColumns)
            writer.write_errorfile(error_df, config, file_config, process_info)

            df.unpersist()
            logger.error('file %s failed at data type validation' % hdfsfile)
            hdfsutil.move_to_error_archive(config, file_config, hdfsfile,
                                           process_info)
            metadata.log_error_table(spark, file_config, config, process_info,
                                     val_error)
            return 'Error - ' + val_error + \
                ' Moved to error_archive "' + process_info['error_archive_path'] + '".' + \
                ' Error file path "' + process_info['error_file_path'] + '".'
        else:
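            # Success path: drop the helper columns, add lineage columns, and write the data as ORC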
            orig_columns = [col[1:] for col in transformedColumns]
            data_df = df.drop(*orig_columns) \
                .drop('_error_message') \
                .withColumn('source_filename', lit(os.path.basename(hdfsfile))) \
                .withColumn('process_timestamp',
                            to_timestamp(lit(process_info['process_start_timestamp']),
                                         'yyyy-MM-dd HH:mm:ss'))

            writer.write_orc(data_df, spark, config, metadata, process_info)
            datafolders.add(process_info['hdfs_datafile_path'])

            df.unpersist()
            hdfsutil.move_to_archive(config, file_config, hdfsfile)
            process_info['process_end_timestamp'] = datetime.utcnow().strftime(
                '%Y-%m-%d %H:%M:%S')
            metadata.log_registry_table(spark, file_config, config,
                                        process_info)
            logger.info('file %s has been successfully ingested' % hdfsfile)
            return 'Success'
    except Exception as e:
        logger.error('file %s ingestion failed! exception is %s' %
                     (hdfsfile, str(e)))
        return 'Failed - an exception occurred! Please see the YARN log for details'