Exemple #1
0
    def _file_to_split(self):
        """Return an open file handle for the file that should be split.

        When ``self._extract`` is truthy, ``self._filename`` is handed to
        ``extract_csv_from_zip`` and the filename checks run against the
        ``name`` of the handle it returns. Otherwise the checks run against
        ``self._filename`` itself, which is then opened in binary read mode.

        Returns:
            The handle produced by ``extract_csv_from_zip``, or a binary file
            object for ``self._filename``. The caller owns the handle and is
            responsible for closing it.

        Raises:
            exceptions.ZipFileCheckException: if ``zipfile.BadZipFile`` is
                raised while extracting or checking the archive. The original
                error is attached as ``__cause__``.
            Exception: anything raised by ``self._perform_filename_checks``
                (other than ``BadZipFile``) propagates unchanged.
        """
        if not self._extract:
            # Validate first so a rejected name never leaves a handle open.
            self._perform_filename_checks(self._filename)
            return open(self._filename, 'rb')

        try:
            file_to_split = extract_csv_from_zip(self._filename)
            try:
                self._perform_filename_checks(file_to_split.name)
            except Exception:
                # The handle is never returned on this path, so release it
                # here instead of leaking it.
                file_to_split.close()
                raise
        except zipfile.BadZipFile as e:
            # Chain the cause so the underlying zip error stays visible in
            # tracebacks.
            raise exceptions.ZipFileCheckException(
                str(e),
                statsd=self._statsd,
                metrics_failures_root=self._metrics_failures_root) from e

        return file_to_split
Exemple #2
0
def _populate_file_properties(config, file_list, run_id, perform_prevalidation,
                              logger):
    """Build the catalog attributes for every entry in ``file_list``.

    For each entry the CSV is extracted from the zip at ``file_path``,
    optionally pre-validated, its remaining lines are counted and the
    uncompressed size of the extracted member is read from the archive.
    The MD5 hash of the zip file and any extra attributes are then collected
    regardless of whether the zip or pre-validation checks succeeded.

    Args:
        config: Passed through to ``_prevalidate_file``.
        file_list: Iterable of mappings providing ``'file_properties'`` and
            ``'file_path'``, plus ``'schema'`` when pre-validation is enabled.
        run_id: Passed through to ``_prevalidate_file``.
        perform_prevalidation: When truthy, ``_prevalidate_file`` is run on
            the extracted file and its result is recorded as the format
            validity.
        logger: Logger used for progress, warning and cleanup messages.

    Returns:
        A list with one ``CatalogAttributes`` per entry, in input order.
        ``is_valid_zip`` is ``False`` when ``BadZipFile`` was raised and
        ``is_valid_format`` is ``False`` when pre-validation raised
        ``PrevalidationCheckRawException``; fields that were not computed
        before a failure stay ``None``.
    """
    uncataloged_files = []
    for file_entry in file_list:
        file_properties = file_entry['file_properties']
        file_path = file_entry['file_path']
        is_valid_zip = None
        is_valid_format = None
        files_to_delete = []
        num_records = None
        uncompressed_size_bytes = None
        extracted_file = None
        try:
            # Validate zip file
            extracted_file = extract_csv_from_zip(file_path)
            is_valid_zip = True
            if perform_prevalidation:
                is_valid_format = _prevalidate_file(
                    config, extracted_file, file_path,
                    file_properties.file_type, run_id, file_entry['schema'],
                    files_to_delete, logger)
            # NOTE(review): this counts whatever is still left to iterate in
            # the handle; confirm _prevalidate_file does not consume it.
            num_records = sum(1 for _ in extracted_file)
            with zipfile.ZipFile(file_path) as file_test:
                uncompressed_size_bytes = file_test.getinfo(
                    extracted_file.name).file_size

        except BadZipFile as err:
            is_valid_zip = False
            logger.warning('The zip file is invalid: {0}'.format(
                file_properties.filename))
            logger.warning('Zip check error: {0}'.format(str(err)))
        except exceptions.PrevalidationCheckRawException as err:
            is_valid_format = False
            logger.warning(
                'Pre-validation failed for file: {0} with error: {1}'.format(
                    file_path, str(err)))
        finally:
            # Release the extracted handle before moving on; it is not needed
            # beyond this point and would otherwise leak once per file.
            if extracted_file is not None:
                extracted_file.close()
            logger.debug('Cleanup: deleting intermediate data files...')
            for fn in files_to_delete:
                # Log only after the removal actually succeeded.
                remove(fn)
                logger.debug('Deleted intermediate file {0}'.format(fn))
            logger.debug('Cleanup: deleted intermediate data files')

        # Compute MD5 hash
        logger.info('Computing MD5 hash of the input file...')
        with open(file_path, 'rb') as f:
            md5 = compute_md5_hash(f)
        logger.info('Computed MD5 hash')

        # Fetch extra attributes (if any)
        extra_attributes = _get_extra_attributes(file_path,
                                                 file_properties.file_type,
                                                 logger)
        file_attributes = CatalogAttributes(
            file_properties.filename, file_properties.file_type,
            file_properties.modified_time,
            file_properties.compressed_size_bytes, is_valid_zip,
            is_valid_format, md5, extra_attributes, uncompressed_size_bytes,
            num_records)
        uncataloged_files.append(file_attributes)
    return uncataloged_files