# Example 1
def get_files_to_sample(config, s3_files, max_files):
    """
    Returns the list of files for sampling; checks each entry of s3_files and,
    for zip archives, extracts the members and appends the supported ones.

    Args:
        config dict(): Configuration
        s3_files list(): List of S3 Bucket files
        max_files int(): Maximum number of files to collect for sampling
    Returns:
        list(dict()) : List of Files for sampling
             |_ s3_path str(): S3 Bucket File path
             |_ file_handle StreamingBody(): file object
             |_ type str(): Type of file which is used for extracted file
             |_ extension str(): extension of file (for normal files only)
    """
    global skipped_files_count
    sampled_files = []

    OTHER_FILES = ["csv", "gz", "jsonl", "txt"]

    for s3_file in s3_files:
        # Stop as soon as we have collected enough files.
        if len(sampled_files) >= max_files:
            break

        file_key = s3_file.get('key')
        if not file_key:
            continue

        file_name = file_key.split("/").pop()
        extension = file_name.split(".").pop().lower()

        # Check whether file is without extension or not
        if not extension or file_name.lower() == extension:
            LOGGER.warning(
                '"%s" without extension will not be sampled.', file_key)
            skipped_files_count += 1
        elif file_key.endswith(".tar.gz"):
            LOGGER.warning(
                'Skipping "%s" file as .tar.gz extension is not supported', file_key)
            skipped_files_count += 1
        elif extension == "zip":
            # Fetch the file handle only when the file will actually be read,
            # avoiding a wasted S3 request for files that end up skipped.
            file_handle = get_file_handle(config, file_key)
            files = compression.infer(
                io.BytesIO(file_handle.read()), file_name)

            # Add only those extracted files which are supported by tap
            # Prepare dictionary contains the zip file name, type i.e. unzipped and file object of extracted file
            sampled_files.extend([
                {"type": "unzipped", "s3_path": file_key, "file_handle": de_file}
                for de_file in files
                if de_file.name.split(".")[-1].lower() in OTHER_FILES
                and not de_file.name.endswith(".tar.gz")
            ])
        elif extension in OTHER_FILES:
            # Prepare dictionary contains the s3 file path, extension of file and file object
            file_handle = get_file_handle(config, file_key)
            sampled_files.append(
                {"s3_path": file_key, "file_handle": file_handle, "extension": extension})
        else:
            LOGGER.warning(
                '"%s" having the ".%s" extension will not be sampled.', file_key, extension)
            skipped_files_count += 1

    return sampled_files
# Example 2
def sync_compressed_file(config, s3_path, table_spec, stream):
    """Extract a compressed S3 file and stream records from each supported member."""
    LOGGER.info('Syncing Compressed file "%s".', s3_path)

    records_streamed = 0
    s3_file_handle = s3.get_file_handle(config, s3_path)

    decompressed_files = compression.infer(io.BytesIO(s3_file_handle.read()), s3_path)

    supported_extensions = ("csv", "jsonl", "gz", "txt")
    for inner_file in decompressed_files:
        inner_extension = inner_file.name.split(".")[-1].lower()
        if inner_extension not in supported_extensions:
            continue

        # Append the extracted file name with zip file.
        inner_path = "/".join((s3_path, inner_file.name))
        records_streamed += handle_file(
            config, inner_path, table_spec, stream, inner_extension,
            file_handler=inner_file)

    return records_streamed