Example no. 1
        def print_download_progress_report(num_downloaded):
            self.verbose_print()
            byte_total = sum(trailing_50_file_bytes)
            download_progress_message = 'Download Progress Report [{}]: \n    {}/{} queued files downloaded so far. ' \
                .format(datetime.datetime.now().strftime('%b %d %Y %H:%M:%S'), num_downloaded, download_request_count)
            download_progress_message += '\n    Last 50 files contained ~ {} bytes and finished in {} (Hours:Minutes:Seconds). ' \
                .format(Utils.human_size(byte_total), str(datetime.datetime.now() - trailing_50_timestamp[0]).split('.')[0])

            seconds_last_50_files = (datetime.datetime.now() - trailing_50_timestamp[0]).seconds
            if seconds_last_50_files == 0:
                seconds_last_50_files = 1  # avoid a 'division by 0' error

            # convert download speed to bits per second
            avg_speed_bps = Utils.human_size((8 * byte_total) // seconds_last_50_files)
            if avg_speed_bps[-1:] == 'B':
                avg_speed_bps = avg_speed_bps.replace('B','bps')
            else:
                avg_speed_bps = avg_speed_bps.replace('bytes','bps')

            download_progress_message += '\n    Avg download rate (in bits per second) for the last 50 files is ~ {}.' \
                .format(avg_speed_bps)

            download_progress_message += '\n    Download has been in progress for {} (Hours:Minutes:Seconds).\n' \
                .format(str(datetime.datetime.now() - download_start_date).split('.')[0])

            self.verbose_print(download_progress_message)
            trailing_50_file_bytes.clear()
            trailing_50_timestamp[0] = datetime.datetime.now()
            self.verbose_print()
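
The report above converts the trailing byte count into a human-readable bit rate by multiplying by 8, dividing by the elapsed seconds, and then rewriting the unit suffix produced by Utils.human_size. A minimal sketch of that rewrite, assuming Utils.human_size returns strings such as '512 bytes' or '2.5GB' (the real helper is not shown in these examples):

def human_size(num_bytes):
    # hypothetical stand-in for Utils.human_size: '512 bytes', '1.2KB', '2.5GB', ...
    for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num_bytes < 1024 or unit == 'TB':
            return '{} bytes'.format(num_bytes) if unit == 'bytes' else '{:.1f}{}'.format(num_bytes, unit)
        num_bytes /= 1024

def to_bps_label(byte_total, seconds):
    # 8 bits per byte, then rewrite the size suffix into a rate suffix
    label = human_size((8 * byte_total) // max(seconds, 1))
    if label.endswith('B'):
        return label.replace('B', 'bps')        # '2.5GB'     -> '2.5Gbps'
    return label.replace('bytes', 'bps')        # '512 bytes' -> '512 bps'

# to_bps_label(50 * 10**6, 4) -> roughly '95.4Mbps'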
Example no. 2
 def test_match(key):
     # None gets converted to an empty string. Convert empty strings back to None
     # so that None values can be compared using the == operator.
     val1 = possible_match[key] or None
     # values from download_job_manifest_column_defs will never be None, instead they will be an empty string ''
     val2 = self.download_job_manifest_column_defs[key]
     if key == 'download_directory':
         val2 = Utils.convert_to_abs_path(val2)
         val1 = Utils.convert_to_abs_path(val1)
     elif key == 's3_links_file':
         # only convert to basename if values are specified for both
         if val2 and val1:
             val2 = os.path.basename(val2)
             val1 = os.path.basename(val1)
     return val1 == val2
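
test_match compares a single manifest column from a candidate record against the current job's manifest definition, normalizing paths and basenames first. A plausible way to combine it (an assumption; the enclosing loop is not part of this snippet) is to treat a previous download job as a match only when every column agrees:

 # hypothetical usage: possible_match is one row read back from a previous job's manifest
 is_same_job = all(test_match(key) for key in self.download_job_manifest_column_defs)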
Example no. 3
 def get_temp_creds_for_file(self, package_file_id, custom_user_s3_endpoint=None):
     url = self.package_url + '/{}/files/{}/download_token'.format(self.package_id, package_file_id)
     if custom_user_s3_endpoint:
         s3_dest_bucket, s3_dest_prefix = Utils.deconstruct_s3_url(custom_user_s3_endpoint)
         url += '?s3SourceBucket={}'.format(s3_dest_bucket)
         if s3_dest_prefix:
             url += '&s3SourcePrefix={}'.format(s3_dest_prefix)
     tmp = get_request(url, headers=self.request_header(), auth=self.auth)
     return json.loads(tmp.text)
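
The optional s3SourceBucket / s3SourcePrefix query parameters are built from Utils.deconstruct_s3_url, which these examples assume splits an 's3://bucket/prefix' URL into a (bucket, key) pair. A minimal, assumption-based sketch of such a splitter (not the actual implementation):

from urllib.parse import urlparse

def deconstruct_s3_url(s3_url):
    parsed = urlparse(s3_url)               # scheme='s3', netloc=bucket, path='/key'
    if parsed.scheme != 's3':
        raise ValueError('not an s3 url: {}'.format(s3_url))
    return parsed.netloc, parsed.path.lstrip('/')

# deconstruct_s3_url('s3://my-bucket/some/prefix') -> ('my-bucket', 'some/prefix')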
Example no. 4
    def write_to_failed_download_link_file(self, failed_s3_links_file, s3_link, source_uri):
        src_bucket, src_path = Utils.deconstruct_s3_url(s3_link if s3_link else source_uri)
        s3_address = 's3://' + src_bucket + '/' + src_path

        with self.package_file_download_errors_lock:
            self.package_file_download_errors.add(s3_address)
            if failed_s3_links_file:
                failed_s3_links_file.write(s3_address + "\n")
                failed_s3_links_file.flush()
Example no. 5
        def add_files_to_report(download_progress_report_path, verification_report_path, probably_missing_files_list):
            copyfile(download_progress_report_path, verification_report_path)

            all_records = []
            for file_id in probably_missing_files_list:
                file_info = self.local_file_names[file_id]
                record = copy.deepcopy(self.download_job_progress_report_column_defs)
                # TODO - consider making these the same names
                record['package_file_expected_location'] = file_info['download_alias']
                record['expected_file_size'] = file_info['file_size']
                record['package_file_id'] = file_id
                download_path = os.path.join(self.package_download_directory,
                                             self.local_file_names[file_id]['download_alias'])
                if os.path.exists(download_path):
                    record['exists'] = True
                    stat = os.stat(download_path)
                    record['actual_file_size'] = stat.st_size
                    if record['actual_file_size'] == record['expected_file_size']:
                        record['download_complete_time'] = time.strftime("%Y%m%dT%H%M%S", time.localtime(stat.st_ctime))

                all_records.append(record)

            # batch requests to reduce overhead
            # TODO - add s3 location to file resource in order to eliminate this step?
            print('Retrieving s3 url information for {} nda file records'.format(len(all_records)))
            batch_size = 1000
            batches = [all_records[i:i + batch_size] for i in range(0, len(all_records), batch_size)]
            for batch in batches:
                result = self.get_presigned_urls([record['package_file_id'] for record in batch])
                # result is a dictionary of package-file-id to presignedUrl
                for record in batch:
                    ps_url = result[record['package_file_id']]
                    dest_bucket, dest_path = Utils.deconstruct_s3_url(ps_url)
                    record['nda_s3_url'] = 's3://{}/{}'.format(dest_bucket, dest_path)

            with open(verification_report_path, 'a', newline='') as verification_report:
                download_progress_report_writer = csv.DictWriter(verification_report,
                                                                 fieldnames=self.download_job_progress_report_column_defs)
                download_progress_report_writer.writerows(all_records)

            return [record['nda_s3_url'] for record in all_records if
                    record['actual_file_size'] < record['expected_file_size']]
Example no. 6
    def __init__(self, download_config, args):

        # Instance variables from config
        self.config = download_config
        self.url = self.config.datamanager_api
        self.package_url = self.config.package_api
        self.datadictionary_url = self.config.datadictionary_api
        self.username = download_config.username
        self.password = download_config.password
        self.auth = requests.auth.HTTPBasicAuth(self.config.username, self.config.password)

        # Instance Variables from 'args'
        if args.directory:
            download_directory = args.directory[0]
        elif args.resume:
            download_directory = args.resume[0]
        else:
            download_directory = os.path.join(NDATools.NDA_TOOLS_DOWNLOADS_FOLDER, str(args.package))
        self.downloadcmd_package_metadata_directory = os.path.join(NDATools.NDA_TOOLS_DOWNLOADS_FOLDER,
                                                                   str(args.package))
        self.package_download_directory = Utils.convert_to_abs_path(download_directory)
        self.s3_links_file = args.txt
        self.inline_s3_links = args.paths
        self.package_id = args.package
        self.data_structure = args.datastructure
        self.quiet = args.quiet
        self.thread_num = args.workerThreads if args.workerThreads else max([1, multiprocessing.cpu_count() - 1])
        self.regex_file_filter = args.file_regex
        if self.s3_links_file:
            self.download_mode = 'text'
        elif self.data_structure:
            self.download_mode = 'datastructure'
        elif self.inline_s3_links:
            self.download_mode = 'paths'
        else:
            self.download_mode = 'package'
        self.verify_flg = args.verify

        if not self.verify_flg and not args.workerThreads:
            self.verbose_print()
            self.verbose_print('No value specified for --workerThreads. Using the default option of {}'.format(
                self.thread_num))
            self.verbose_print(
                'Important - You can configure the thread count setting using the --workerThreads argument to maximize your download speed.')
            self.verbose_print()
        # for copying files directly to another s3 bucket
        self.custom_user_s3_endpoint = args.s3_destination

        # non-configurable default instance variables
        self.download_queue = Queue()
        self.local_file_names = {}  # map of package-file-id to the file record (including download_alias)
        self.package_file_download_errors = set()
        # self.package_file_download_errors needs a lock if multiple threads will be adding to it simultaneously
        self.package_file_download_errors_lock = threading.Lock()

        self.download_job_uuid = None

        self.download_job_manifest_column_defs = {
            'uuid': self.download_job_uuid,
            'run_date': time.strftime("%Y%m%dT%H%M%S"),
            'package_id': self.package_id,
            'download_directory': self.package_download_directory,
            's3_destination': self.custom_user_s3_endpoint,
            'data_structure': self.data_structure,
            's3_links_file': self.s3_links_file,
            'regex': self.regex_file_filter
        }
        self.download_job_progress_report_column_defs = {
            'package_file_id': None,
            'package_file_expected_location': None,
            'nda_s3_url': None,
            'exists': False,
            'expected_file_size': None,
            'actual_file_size': 0,
            'e_tag': None,
            'download_complete_time': None
        }
        self.download_progress_report_file_path = self.initialize_verification_files()
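
The constructor reads everything it needs from an argparse-style args object. Purely for illustration (the values below are made up; the attribute names are taken from the accesses in __init__ above), a namespace driving a plain package download might look like:

from argparse import Namespace

args = Namespace(
    directory=['/data/nda-downloads'],  # target directory; None to fall back to the package default
    resume=None,
    package=1234567,
    txt=None,                           # s3-links text file   -> 'text' download mode
    paths=None,                         # inline s3 links      -> 'paths' download mode
    datastructure=None,                 # structure short name -> 'datastructure' mode
    quiet=False,
    workerThreads=None,                 # defaults to max(1, cpu_count() - 1)
    file_regex=None,
    verify=False,
    s3_destination=None,                # e.g. 's3://my-bucket/prefix' for bucket-to-bucket copy
)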
Example no. 7
    def verify_download(self):

        if self.custom_user_s3_endpoint:
            raise Exception(
                'The --verify command does not yet support checking for files in s3 endpoints. This feature will be added in a future iteration...')

        verification_report_path = os.path.join(self.downloadcmd_package_metadata_directory, 'download-verification-report.csv')
        err_mess_template = 'Cannot start verification process - {} already exists \nYou must move or rename the file in order to continue'
        if os.path.exists(verification_report_path):
            print()
            print(err_mess_template.format(verification_report_path))
            exit_client()

        fpath = os.path.join(self.downloadcmd_package_metadata_directory, 'download-verification-retry-s3-links.csv')
        if os.path.exists(fpath):
            print(err_mess_template.format(fpath))
            exit_client()

        def get_download_progress_report_path():
            return os.path.join(self.downloadcmd_package_metadata_directory, '.download-progress',
                                self.download_job_uuid, 'download-progress-report.csv')

        def parse_download_progress_report_for_files(download_progress_report_path):
            files = []
            if os.path.exists(download_progress_report_path):
                with open(download_progress_report_path, newline='') as csvfile:
                    file_reader = csv.DictReader(csvfile)
                    files = [f for f in file_reader]
            return files

        def get_complete_file_list():
            if self.download_mode in ['text', 'datastructure']:
                return set(self.local_file_names)
            elif self.download_mode == 'package':
                print('Getting list of all files in package. If your package is large, this may take some time')
                page = 1
                batch_size = 1000
                all_results = []
                while True:
                    results = self.get_package_files_by_page(page, batch_size)
                    aliases = {r['package_file_id']: r for r in results}
                    self.local_file_names.update(aliases)
                    if not results:
                        break
                    else:
                        all_results.append(results)
                        print('Retrieved {} 1000 files. At file #{}'.format('first' if page==1 else 'next', ((page - 1) * batch_size) + 1))
                    page += 1

                return set(self.local_file_names)
            else:
                raise Exception('Unsupported download mode: {}'.format(self.download_mode))

        def create_download_verification_retry_links_file(s3_links):
            fpath = os.path.join(self.downloadcmd_package_metadata_directory, 'download-verification-retry-s3-links.csv')
            with open(fpath, 'w') as retry_file:
                for link in s3_links:
                    retry_file.write(link+'\n')

        def add_files_to_report(download_progress_report_path, verification_report_path, probably_missing_files_list):
            copyfile(download_progress_report_path, verification_report_path)

            all_records = []
            for file_id in probably_missing_files_list:
                file_info = self.local_file_names[file_id]
                record = copy.deepcopy(self.download_job_progress_report_column_defs)
                # TODO - consider making these the same names
                record['package_file_expected_location'] = file_info['download_alias']
                record['expected_file_size'] = file_info['file_size']
                record['package_file_id'] = file_id
                download_path = os.path.join(self.package_download_directory,
                                             self.local_file_names[file_id]['download_alias'])
                if os.path.exists(download_path):
                    record['exists'] = True
                    stat = os.stat(download_path)
                    record['actual_file_size'] = stat.st_size
                    if record['actual_file_size'] == record['expected_file_size']:
                        record['download_complete_time'] = time.strftime("%Y%m%dT%H%M%S", time.localtime(stat.st_ctime))

                all_records.append(record)

            # batch requests to reduce overhead
            # TODO - add s3 location to file resource in order to eliminate this step?
            print('Retrieving s3 url information for {} nda file records'.format(len(all_records)))
            batch_size = 1000
            batches = [all_records[i:i + batch_size] for i in range(0, len(all_records), batch_size)]
            for batch in batches:
                result = self.get_presigned_urls([record['package_file_id'] for record in batch])
                # result is a dictionary of package-file-id to presignedUrl
                for record in batch:
                    ps_url = result[record['package_file_id']]
                    dest_bucket, dest_path = Utils.deconstruct_s3_url(ps_url)
                    record['nda_s3_url'] = 's3://{}/{}'.format(dest_bucket, dest_path)

            with open(verification_report_path, 'a', newline='') as verification_report:
                download_progress_report_writer = csv.DictWriter(verification_report,
                                                                 fieldnames=self.download_job_progress_report_column_defs)
                download_progress_report_writer.writerows(all_records)

            return [record['nda_s3_url'] for record in all_records if
                    record['actual_file_size'] < record['expected_file_size']]

        print()
        print('Running verification process. This process will check whether all of the files from the following downloadcmd were successfully downloaded to the computer:')

        verification_report_path = os.path.join(self.downloadcmd_package_metadata_directory, 'download-verification-report.csv')

        print('{}'.format(self.build_rerun_download_cmd(['--verify'])))
        print()
        pr_path = get_download_progress_report_path()
        print('Getting expected file list for download...')
        complete_file_set = get_complete_file_list()

        # Sometimes there are duplicates in the qft table. Eliminate them to get an accurate file count
        accurate_file_ct = len(set(map(lambda x: x['download_alias'],self.local_file_names.values())))
        file_sz = Utils.human_size(sum(map(lambda x: x['file_size'], self.local_file_names.values())))

        print('{} files are expected to have been downloaded from the command above, totaling {}'.format(accurate_file_ct, file_sz))
        print()
        print('Parsing program system logs for history of completed downloads...')
        print('Important - if you think files may have been deleted from your system after the download was run, you should remove the system log at {}'
              ' and re-run the --verify command. This will force the program to check for these files instead of assuming they exist based on system log entries. This will cause the --verify step to take longer'
              ' to finish but will be necessary for accurate results.'.format(pr_path))
        downloaded_file_records = parse_download_progress_report_for_files(pr_path)
        # There shouldn't be duplicates in the system logs, but check anyway
        downloaded_file_records_count = len({f['package_file_expected_location'] for f in downloaded_file_records})
        downloaded_file_set = {int(f['package_file_id']) for f in downloaded_file_records}
        print()
        print('Found {} complete file downloads according to log file {}'.format(downloaded_file_records_count, pr_path))
        print()
        probably_missing_files = complete_file_set - downloaded_file_set
        print('Checking {} for all files which were not found in the program system logs. Detailed report will be created at {}...'
              .format( self.package_download_directory, verification_report_path))
        undownloaded_s3_links = add_files_to_report(pr_path, verification_report_path, probably_missing_files)
        print()
        if undownloaded_s3_links:
            print('Finished verification process and file check. Found {} files that were missing or whose size on disk were less than expected'.format(len(undownloaded_s3_links)))
            print()
            print('Generating list of s3 links for all missing/incomplete files...')
            create_download_verification_retry_links_file(undownloaded_s3_links)
            incomplete_s3_fp = os.path.join(self.downloadcmd_package_metadata_directory, 'download-verification-retry-s3-links.csv')
            print(
                'Finished creating {} file. \nThis file contains s3-links for all files that were found to be missing or incomplete. You may '
                'download these files by running:\n'
                '   {} -t {}'.format(incomplete_s3_fp, self.build_rerun_download_cmd(['--verify','--text', '--datastructure']), incomplete_s3_fp ))
        else:
            print('Finished verification process and file check. No missing files found. All files match expected size. Download 100% complete.')

        print()
        print('Details about status of files in download can be found at {} (This file can be opened with Excel or Google Spreadsheets)'.format(verification_report_path))
        exit_client()
Example no. 8
 def print_upload_part_info(bytes):
     print('Transferred {} for {}'.format(Utils.human_size(bytes), return_value['nda_s3_url']))
Example no. 9
    def download_from_s3link(self, package_file_id, err_if_exists=False, failed_s3_links_file=None):

        # use this instead of exist_ok so that the code also works with Python 2
        def mk_dir_ignore_err(dir):
            try:
                os.makedirs(os.path.normpath(dir))
            except FileExistsError as e:
                pass
            except OSError as e:
                # Raise exception for any errors other than FileExists error
                # Using OSError for version compatibility
                if e.errno != 17:
                    raise
                pass

        # declare up front to avoid a 'referenced before assignment' error in the except block
        s3_link = source_uri = None
        bytes_written = 0
        return_value = copy.deepcopy(self.download_job_progress_report_column_defs)
        return_value['package_file_id'] = str(package_file_id)
        try:
            alias = self.local_file_names[package_file_id]['download_alias']
            return_value['expected_file_size'] = self.local_file_names[package_file_id]['file_size']
            return_value['package_file_expected_location'] = alias
            completed_download = os.path.normpath(os.path.join(self.package_download_directory, alias))
            partial_download = os.path.normpath(
                os.path.join(self.package_download_directory, alias + '.partial'))
            downloaded = False
            resume_header = None

            if not self.custom_user_s3_endpoint:
                if os.path.isfile(completed_download):
                    if err_if_exists:
                        msg = "File {} already exists. Move or rename the file before re-running the command to continue".format(
                            completed_download)
                        print(msg)
                        print('Exiting...')
                        sys.exit(1)

                    self.verbose_print('Skipping download (already exists): {}'.format(completed_download))
                    return return_value

                if os.path.isfile(partial_download):
                    downloaded = True
                    downloaded_size = os.path.getsize(partial_download)
                    resume_header = {'Range': 'bytes={}-'.format(downloaded_size)}
                    self.verbose_print('Resuming download: {}'.
                                       format(partial_download))
                else:
                    mk_dir_ignore_err(os.path.dirname(partial_download))
                    self.verbose_print('Starting download: {}'.format(partial_download))

            if self.custom_user_s3_endpoint:
                # downloading directly to s3 bucket
                # get cred for file
                response = self.get_temp_creds_for_file(package_file_id, self.custom_user_s3_endpoint)
                ak = response['access_key']
                sk = response['secret_key']
                sess_token = response['session_token']
                source_uri = response['source_uri']
                dest_uri = response['destination_uri']

                dest_bucket, dest_path = Utils.deconstruct_s3_url(dest_uri)
                src_bucket, src_path = Utils.deconstruct_s3_url(source_uri)

                self.verbose_print('Starting download: s3://{}/{}'.format(dest_bucket, dest_path))

                # boto3 copy
                sess = boto3.session.Session(aws_access_key_id=ak,
                                             aws_secret_access_key=sk,
                                             aws_session_token=sess_token,
                                             region_name='us-east-1')

                s3_client = sess.client('s3')
                response = s3_client.head_object(Bucket=src_bucket, Key=src_path)
                return_value['actual_file_size'] = response['ContentLength']
                return_value['e_tag'] = response['ETag'].replace('"', '')
                return_value['nda_s3_url'] = 's3://{}/{}'.format(src_bucket, src_path)

                s3 = sess.resource('s3')
                copy_source = {
                    'Bucket': src_bucket,
                    'Key': src_path
                }

                def print_upload_part_info(bytes):
                    print('Transferred {} for {}'.format(Utils.human_size(bytes), return_value['nda_s3_url']))

                KB = 1024
                MB = KB * KB
                GB = KB**3
                LARGE_OBJECT_THRESHOLD = 5 * GB
                args = {
                    'ExtraArgs' : {'ACL': 'bucket-owner-full-control'}
                }

                if int(return_value['actual_file_size']) >= LARGE_OBJECT_THRESHOLD:
                    print('Transferring large object {} ({}) in multiple parts'
                          .format(return_value['nda_s3_url'], Utils.human_size(int(return_value['actual_file_size']))))
                    config = TransferConfig(multipart_threshold=LARGE_OBJECT_THRESHOLD, multipart_chunksize=1 * GB)
                    args['Config'] = config
                    args['Callback'] = print_upload_part_info

                s3.meta.client.copy(copy_source,
                                    dest_bucket,
                                    dest_path,
                                    **args)

            else:
                # downloading to local machine
                # get_presigned_urls returns a dict of package-file-id to presigned url
                s3_link = self.get_presigned_urls([package_file_id])[package_file_id]
                with requests.session() as s:
                    s.mount(s3_link,HTTPAdapter(max_retries=10))
                    if resume_header:
                        s.headers.update(resume_header)
                    with open(partial_download, "ab" if downloaded else "wb") as download_file:
                        with s.get(s3_link, stream=True) as response:
                            response.raise_for_status()
                            for chunk in response.iter_content(chunk_size=1024 * 1024 * 5): # iterate 5MB chunks
                                if chunk:
                                    bytes_written += download_file.write(chunk)
                os.rename(partial_download, completed_download)
                self.verbose_print('Completed download {}'.format(completed_download))
                return_value['actual_file_size'] = bytes_written
                bucket, key = Utils.deconstruct_s3_url(s3_link)
                return_value['nda_s3_url'] = 's3://{}/{}'.format(bucket, key)
            return_value['exists'] = True
            return_value['download_complete_time'] = time.strftime("%Y%m%dT%H%M%S")
            return return_value

        except Exception as e:
            if not s3_link and not source_uri:
                # we couldn't get credentials, which means the service has become unresponsive.
                # Instruct the user to retry at another time
                print()
                print(
                    'Unexpected Error During File Download - Service Unresponsive. Unable to obtain credentials for file-id {}'.format(
                        package_file_id))
                print('Please re-try downloading files at a later time. ')
                print('You may contact [email protected] for assistance in resolving this error.')
                # use os._exit to kill the whole program. This works even if this is called in a child thread, unlike sys.exit()
                os._exit(1)

            self.write_to_failed_download_link_file(failed_s3_links_file, s3_link=s3_link, source_uri=source_uri)

            error_code = -1 if not isinstance(e, HTTPError) else int(e.response.status_code)
            if error_code == 404:
                message = 'This path is incorrect: {}. Please try again.'.format(s3_link)
                self.verbose_print(message)
            elif error_code == 403:
                message = '\nThis is a private bucket. Please contact NDAR for help: {}'.format(s3_link)
                self.verbose_print(message)
            else:
                self.verbose_print(str(e))
                self.verbose_print(get_traceback())
                if 'operation: Access Denied' in str(e):
                    print()
                    print(
                        'This error is likely caused by a misconfiguration on the target s3 bucket')
                    print(
                        "For more information about how to correctly configure the target bucket, run 'downloadcmd -h' and read the description of the s3 argument")
                    print()
                    time.sleep(2)

            # if source_uri is set, it means they're downloading to s3 bucket and there will not be any partial file
            if bytes_written == 0 and not source_uri:
                try:
                    os.remove(partial_download)
                except:
                    self.verbose_print('error removing partial file {}'.format(partial_download))
            return return_value
Example no. 10
    def start(self):
        print()
        print('Getting Package Information...')
        package_resource = self.get_package_info()
        print()
        print('Package-id: {}'.format(self.package_id))
        print('Name: {}'.format(package_resource['description']))
        print('Has associated files?: {}'.format('Yes' if package_resource['has_associated_files'] else 'No'))
        # Note: at the moment, the file count coming back from the service may include duplicates.
        print('Number of files in package: {}'.format(package_resource['file_count']))
        print('Total Package Size: {}'.format(Utils.human_size(package_resource['total_package_size'])))
        print()

        files = []
        if self.download_mode == 'datastructure':
            if not self.verify_flg:
                self.verbose_print('Downloading S3 links from data structure: {}'.format(self.data_structure))
            if not package_resource['has_associated_files']:
                print('''No associated files were detected in this package. In order to download associated files, you must create a new package
        on the NDA website and make sure that you check the option to "Include associated files"''')
                exit_client()
            files = self.use_data_structure()
        elif self.download_mode == 'text':
            if not self.verify_flg:
                self.verbose_print('Downloading S3 links from text file: {}'.format(self.s3_links_file))
            files = self.use_s3_links_file()
        elif self.download_mode == 'package':
            if not self.verify_flg:
                if self.regex_file_filter:
                    self.verbose_print('Downloading files from package {} matching regex {}'.format(self.package_id, self.regex_file_filter))
                else:
                    self.verbose_print('Downloading all files from package with id: {}'.format(self.package_id))
        else:
            files = self.query_files_by_s3_path(self.inline_s3_links)

        if files:
            self.local_file_names = {int(f['package_file_id']): f for f in files}
        print()

        success_files = set()
        download_request_count = 0
        download_start_date = datetime.datetime.now()

        download_progress_report = open(self.download_progress_report_file_path, 'a', newline='')
        download_progress_report_writer = csv.DictWriter(download_progress_report,
                                                         fieldnames=self.download_job_progress_report_column_defs)

        failed_s3_links_file = open(os.path.join(NDATools.NDA_TOOLS_DOWNLOADCMD_LOGS_FOLDER,
                                                 'failed_s3_links_file_{}.txt'.format(time.strftime("%Y%m%dT%H%M%S"))),
                                    'a')

        message = 'S3 links for files that failed to download will be written out to {}. You can attempt to download these files later by running: ' \
            .format(failed_s3_links_file.name)
        message += '\n\t{} -t "{}"'\
            .format(self.build_rerun_download_cmd(['--text','--datastructure']), failed_s3_links_file.name)
        print(message)
        print()
        time.sleep(1.5)

        if self.download_mode == 'package':
            file_ct = package_resource['file_count']
            file_sz = Utils.human_size(int(package_resource['total_package_size']))
        else:
            file_ct = len(self.local_file_names.keys())
            file_sz = Utils.human_size(sum(map(lambda x: x['file_size'], self.local_file_names.values())))

        if self.download_mode == 'package' and self.regex_file_filter:
            # can't display the file count because it isn't known when a regex filter is applied
            message = 'Beginning download of files from package matching {} using {} threads'.format(
                self.regex_file_filter,
                self.thread_num)
        else:
            message = 'Beginning download of {} files ({}) to {} using {} threads'.format(
                file_ct,
                file_sz,
                self.custom_user_s3_endpoint or self.package_download_directory,
                self.thread_num)

        print()
        print(message)
        time.sleep(5)

        # These are lists (mutable) so that the print_download_progress_report closure can update the values inside them
        trailing_50_file_bytes = []
        trailing_50_timestamp = [datetime.datetime.now()]

        def write_to_download_progress_report_file(download_record):
            # if the file size is 0, there could have been an error; don't add the record to the report
            if download_record['actual_file_size'] > 0:
                download_progress_report_writer.writerow(download_record)

        def print_download_progress_report(num_downloaded):
            self.verbose_print()
            byte_total = sum(trailing_50_file_bytes)
            download_progress_message = 'Download Progress Report [{}]: \n    {}/{} queued files downloaded so far. ' \
                .format(datetime.datetime.now().strftime('%b %d %Y %H:%M:%S'), num_downloaded, download_request_count)
            download_progress_message += '\n    Last 50 files contained ~ {} bytes and finished in {} (Hours:Minutes:Seconds). ' \
                .format(Utils.human_size(byte_total), str(datetime.datetime.now() - trailing_50_timestamp[0]).split('.')[0])

            seconds_last_50_files = (datetime.datetime.now() - trailing_50_timestamp[0]).seconds
            if seconds_last_50_files == 0:
                seconds_last_50_files = 1  # avoid a 'division by 0' error

            # convert download speed to bits per second
            avg_speed_bps = Utils.human_size((8 * byte_total) // seconds_last_50_files)
            if avg_speed_bps[-1:] == 'B':
                avg_speed_bps = avg_speed_bps.replace('B','bps')
            else:
                avg_speed_bps = avg_speed_bps.replace('bytes','bps')

            download_progress_message += '\n    Avg download rate (in bits per second) for the last 50 files is ~ {}.' \
                .format(avg_speed_bps)

            download_progress_message += '\n    Download has been in progress for {} (Hours:Minutes:Seconds).\n' \
                .format(str(datetime.datetime.now() - download_start_date).split('.')[0])

            self.verbose_print(download_progress_message)
            trailing_50_file_bytes.clear()
            trailing_50_timestamp[0] = datetime.datetime.now()
            self.verbose_print()

        def download(package_file_id):
            # check if these exist, and if not, get and set:
            download_record = self.download_from_s3link(package_file_id,
                                                        failed_s3_links_file=failed_s3_links_file)
            trailing_50_file_bytes.append(download_record['actual_file_size'])
            success_files.add(package_file_id)
            num_downloaded = len(success_files)

            if num_downloaded % 50 == 0:
                print_download_progress_report(num_downloaded)

            download_progress_file_writer_pool.add_task(write_to_download_progress_report_file, download_record)

        download_pool = ThreadPool(self.thread_num)
        download_progress_file_writer_pool = ThreadPool(1, 1000)

        for package_file_id_list in self.generate_download_batch_file_ids():
            additional_file_ct = len(package_file_id_list)
            download_request_count += additional_file_ct
            self.verbose_print('Adding {} files to download queue. Queue contains {} files\n'.format(additional_file_ct,
                                                                                                   download_request_count))
            download_pool.map(download, package_file_id_list)

        download_pool.wait_completion()
        failed_s3_links_file.close()
        download_progress_report.close()

        # don't generate a file if there were no failures
        if not self.package_file_download_errors:
            print('No failures detected. Removing file {}'.format(failed_s3_links_file.name))
            os.remove(failed_s3_links_file.name)

        print()

        print('Finished processing all download requests @ {}.'.format(datetime.datetime.now()))
        print('     Total download requests {}'
              .format(download_request_count))
        print('     Total errors encountered: {}'.format(len(self.package_file_download_errors)))

        print()
        print(' Exiting Program...')
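
The ThreadPool used in this example (with add_task, map, and wait_completion) is not included in these snippets. A minimal, assumption-based sketch of a worker-queue pool exposing that interface; the real class may differ:

import threading
from queue import Queue

class ThreadPool:
    def __init__(self, num_threads, queue_size=0):
        self.tasks = Queue(queue_size)
        for _ in range(num_threads):
            threading.Thread(target=self._worker, daemon=True).start()

    def _worker(self):
        while True:
            func, args, kwargs = self.tasks.get()
            try:
                func(*args, **kwargs)
            except Exception:
                pass  # errors are expected to be handled inside the task itself
            finally:
                self.tasks.task_done()

    def add_task(self, func, *args, **kwargs):
        self.tasks.put((func, args, kwargs))

    def map(self, func, iterable):
        for item in iterable:
            self.add_task(func, item)

    def wait_completion(self):
        self.tasks.join()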