def _compute_md5_hash(self):
    """Hash the input file with MD5 and record the digest as job metadata.

    The file at ``self._filename`` is opened in binary mode and handed to
    ``compute_md5_hash``. The resulting value is then stored through
    ``metadata.add_optional_job_metadata`` as ``input_file_md5``, using
    ``self._metadata_conn``, the literal ``'dirbs-import'`` and
    ``self.import_id``.
    """
    log = self._logger

    log.info('Computing MD5 hash of the input file...')
    with open(self._filename, 'rb') as input_file:
        digest = compute_md5_hash(input_file)
    log.info('Computed MD5 hash of the input file')

    metadata.add_optional_job_metadata(
        self._metadata_conn,
        'dirbs-import',
        self.import_id,
        input_file_md5=digest,
    )
def _populate_file_properties(config, file_list, run_id, perform_prevalidation, logger): """Determine the attributes associated with the file.""" uncataloged_files = [] for file_name in file_list: file_properties = file_name['file_properties'] file_path = file_name['file_path'] is_valid_zip = None is_valid_format = None files_to_delete = [] num_records = None uncompressed_size_bytes = None try: # Validate zip file extracted_file = extract_csv_from_zip(file_path) is_valid_zip = True if perform_prevalidation: is_valid_format = _prevalidate_file(config, extracted_file, file_path, file_properties.file_type, run_id, file_name['schema'], files_to_delete, logger) num_records = sum(1 for _ in extracted_file) with zipfile.ZipFile(file_path) as file_test: uncompressed_size_bytes = file_test.getinfo( extracted_file.name).file_size except BadZipFile as err: is_valid_zip = False logger.warn('The zip file is invalid: {0}'.format( file_properties.filename)) logger.warn('Zip check error: {0}'.format(str(err))) except exceptions.PrevalidationCheckRawException as err: is_valid_format = False logger.warn( 'Pre-validation failed for file: {0} with error: {1}'.format( file_path, str(err))) finally: logger.debug('Cleanup: deleting intermediate data files...') for fn in files_to_delete: logger.debug('Deleted intermediate file {0}'.format(fn)) remove(fn) logger.debug('Cleanup: deleted intermediate data files') # Compute MD5 hash logger.info('Computing MD5 hash of the input file...') with open(file_path, 'rb') as f: md5 = compute_md5_hash(f) logger.info('Computed MD5 hash') # Fetch extra attributes (if any) extra_attributes = _get_extra_attributes(file_path, file_properties.file_type, logger) file_attributes = CatalogAttributes( file_properties.filename, file_properties.file_type, file_properties.modified_time, file_properties.compressed_size_bytes, is_valid_zip, is_valid_format, md5, extra_attributes, uncompressed_size_bytes, num_records) uncataloged_files.append(file_attributes) return 
uncataloged_files