from pyspark.sql import SparkSession

from metadata import Metadata
# NOTE: Reader is used below but was not imported in the original snippet;
# the import path is assumed to follow the project's module naming.
from reader import Reader
import config

spark = SparkSession \
    .builder \
    .enableHiveSupport() \
    .appName("ingestion") \
    .master("local[2]") \
    .getOrCreate()
sc = spark.sparkContext

config = config.Config(sc, "../conf/config_ta_asia_test.json")
metadata = Metadata(sc)
metadata.loadKVFile(spark, config, "metadata_etf_transactions.kv")

process_info = {}
reader = Reader(spark)
file = '/test/IIC_MPower_EntriesJul22-19.xls'
file_config = config.get('files')[3]
print(config.get('source'))
print(file_config.get('topic'))

df = reader.read(file, file_config, metadata, process_info)
print(df.count())
df.show(1000)  # show() prints the rows itself and returns None, so it is not wrapped in print()
from datetime import datetime
import getpass
import os

from pyspark.sql.functions import lit, to_timestamp

# spark, sc, config, logger, hdfsutil, getFileConfig, Metadata, Reader,
# Validator, Transformer and Writer are assumed to be available at module scope.


def ingest(hdfsfile, file_no, datafolders):
    """
    :type hdfsfile: str
    :type datafolders: set
    """
    try:
        process_info = dict()
        process_info['process_start_timestamp'] = datetime.utcnow().strftime(
            '%Y-%m-%d %H:%M:%S')
        process_info['user_name'] = getpass.getuser()
        process_info['file_name'] = os.path.basename(hdfsfile)
        process_info['file_no'] = file_no
        # spark = HiveContext(spark.sparkContext)

        # Resolve the per-file configuration; abort if no pattern matches.
        file_config = getFileConfig(hdfsfile, config)
        if not file_config:
            err_msg = "cannot find a matching file configuration for file %s!" % hdfsfile
            logger.error(err_msg)
            return 'Error - ' + err_msg

        metadata = Metadata(sc)

        # Reject files whose name already exists in the archive folder.
        if hdfsutil.checkDuplicate(config, file_config, hdfsfile):
            hdfsutil.move_to_error_archive(config, file_config, hdfsfile, process_info)
            metadata.log_error_table(spark, file_config, config, process_info,
                                     "duplicate file")
            return 'Error - a file with the same name already exists in the archive folder!' + \
                   ' Moved to error_archive "' + process_info['error_archive_path'] + '".'

        metadata.loadKVFile(spark, config, file_config.get('kv_file'), process_info)

        reader = Reader(spark)
        df = reader.read(hdfsfile, file_config, metadata, process_info)
        process_info['row_count'] = df.count()
        logger.info("row count in file %s is %d" % (hdfsfile, process_info['row_count']))

        # Empty files are either rejected or archived, depending on 'empty_check'.
        if process_info['row_count'] == 0:
            if file_config.get('empty_check', "no").lower() == "yes":
                hdfsutil.move_to_error_archive(config, file_config, hdfsfile, process_info)
                metadata.log_error_table(spark, file_config, config, process_info,
                                         "file is empty")
                return 'Error - Empty File!' + \
                       ' Moved to error_archive "' + process_info['error_archive_path'] + '".'
            else:
                hdfsutil.move_to_archive(config, file_config, hdfsfile)
                logger.warn('%s is empty!' % hdfsfile)
                return 'Success'

        # Structural validation: the file must have the expected number of columns.
        validator = Validator(sc)
        validator.val_column_num(df.columns, metadata.data_types, process_info)
        val_error = validator.get_error()
        if val_error:
            hdfsutil.move_to_error_archive(config, file_config, hdfsfile, process_info)
            metadata.log_error_table(spark, file_config, config, process_info, val_error)
            return 'Error - ' + val_error + \
                   ' Moved to error_archive "' + process_info['error_archive_path'] + '".'

        # Cast columns to their target data types and validate the result;
        # transformed columns are prefixed with '__'.
        transformer = Transformer(sc)
        df = transformer.trans_data_types(spark, df, file_config, metadata, process_info)
        df = validator.val_data_types(spark, df, file_config, metadata, process_info)
        transformedColumns = [col for col in df.columns if col[:2] == '__']

        writer = Writer(sc)
        val_error = validator.get_error()
        if val_error:
            # Write the rejected rows to an error file, then move the source
            # file to the error archive.
            error_df = df.where('length(_error_message) > 0').drop(
                *transformedColumns)
            writer.write_errorfile(error_df, config, file_config, process_info)
            df.unpersist()
            logger.error('file %s failed at data type validation' % hdfsfile)
            hdfsutil.move_to_error_archive(config, file_config, hdfsfile, process_info)
            metadata.log_error_table(spark, file_config, config, process_info, val_error)
            return 'Error - ' + val_error + \
                   ' Moved to error_archive "' + process_info['error_archive_path'] + '".' + \
                   ' Error file path "' + process_info['error_file_path'] + '".'
        else:
            # Drop the raw (untyped) columns and the error-message column,
            # add lineage columns, then persist the data as ORC.
            orig_columns = [col[1:] for col in transformedColumns]
            data_df = df.drop(*orig_columns) \
                .drop('_error_message') \
                .withColumn('source_filename', lit(os.path.basename(hdfsfile))) \
                .withColumn('process_timestamp',
                            to_timestamp(lit(process_info['process_start_timestamp']),
                                         'yyyy-MM-dd HH:mm:ss'))
            writer.write_orc(data_df, spark, config, metadata, process_info)
            datafolders.add(process_info['hdfs_datafile_path'])
            df.unpersist()
            hdfsutil.move_to_archive(config, file_config, hdfsfile)
            process_info['process_end_timestamp'] = datetime.utcnow().strftime(
                '%Y-%m-%d %H:%M:%S')
            metadata.log_registry_table(spark, file_config, config, process_info)
            logger.info('file %s has been successfully ingested' % hdfsfile)
            return 'Success'
    except Exception as e:
        logger.error('file %s ingestion failed! exception is %s' % (hdfsfile, str(e)))
        return 'Failed - Exception happened! Please see the YARN log for details'
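

# -----------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): drive
# ingest() over a list of landing-zone paths and collect the HDFS data folders
# recorded by each run. The paths below are hypothetical examples.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    landing_files = [
        '/landing/IIC_MPower_EntriesJul22-19.xls',
        '/landing/IIC_MPower_EntriesJul23-19.xls',
    ]

    datafolders = set()
    for file_no, hdfsfile in enumerate(landing_files, start=1):
        status = ingest(hdfsfile, file_no, datafolders)
        logger.info('ingest result for %s: %s' % (hdfsfile, status))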