def test_download_glob_single_file(tempdir, azure):
    with setup_tree(azure):
        print("")
        remote_path = test_dir / 'data/single/single' / '*.txt'
        down = ADLDownloader(azure, remote_path, tempdir, run=False,
                             overwrite=True)
        file_pair_dict = dict(down._file_pairs)
        assert len(file_pair_dict) == 1
        lfiles = [os.path.relpath(f, tempdir) for f in file_pair_dict.keys()]
        assert sorted(lfiles) == sorted(
            [os.path.join('single.txt.inprogress')])

        remote_path = test_dir / 'data/*/single' / 'single.txt'
        down = ADLDownloader(azure, remote_path, tempdir, run=False,
                             overwrite=True)
        file_pair_dict = dict(down._file_pairs)
        assert len(file_pair_dict) == 1
        lfiles = [os.path.relpath(f, tempdir) for f in file_pair_dict.keys()]
        assert sorted(lfiles) == sorted(
            [os.path.join('single', 'single', 'single.txt.inprogress')])
def test_download_single_file(tempdir, azure):
    with azure_teardown(azure):
        name = posix(test_dir, 'remote.csv')
        lines = 100
        fname = os.path.join(tempdir, 'local.csv')
        size, checksum = create_remote_csv(azure, name, 10, 5, lines)

        try:
            # single chunk
            down = ADLDownloader(azure, name, fname, 1, size + 10,
                                 overwrite=True)
            assert md5sum(fname) == checksum
            assert os.stat(fname).st_size == size
            assert linecount(fname) == lines
        finally:
            if os.path.isfile(fname):
                os.remove(fname)

        try:
            # multiple chunks, one thread
            down = ADLDownloader(azure, name, fname, 1, size // 5,
                                 overwrite=True)
            assert md5sum(fname) == checksum
            assert os.stat(fname).st_size == size
            assert linecount(fname) == lines
        finally:
            if os.path.isfile(fname):
                os.remove(fname)
def test_download_glob(tempdir, azure):
    with setup_tree(azure):
        remote_path = test_dir / 'data' / 'a' / '*.csv'
        down = ADLDownloader(azure, remote_path, tempdir, run=False,
                             overwrite=True)
        file_pair_dict = dict(down._file_pairs)
        assert len(file_pair_dict.keys()) == 2
        lfiles = [os.path.relpath(f, tempdir) for f in file_pair_dict.keys()]
        assert sorted(lfiles) == sorted(
            ['x.csv.inprogress', 'y.csv.inprogress'])

        remote_path = test_dir / 'data' / '*' / '*.csv'
        down = ADLDownloader(azure, remote_path, tempdir, run=False,
                             overwrite=True)
        file_pair_dict = dict(down._file_pairs)
        assert len(file_pair_dict.keys()) == 4
        lfiles = [os.path.relpath(f, tempdir) for f in file_pair_dict.keys()]
        assert sorted(lfiles) == sorted([
            os.path.join('a', 'x.csv.inprogress'),
            os.path.join('a', 'y.csv.inprogress'),
            os.path.join('b', 'x.csv.inprogress'),
            os.path.join('b', 'y.csv.inprogress')])

        remote_path = test_dir / 'data' / '*' / 'z.txt'
        down = ADLDownloader(azure, remote_path, tempdir, run=False,
                             overwrite=True)
        file_pair_dict = dict(down._file_pairs)
        assert len(file_pair_dict.keys()) == 2
        lfiles = [os.path.relpath(f, tempdir) for f in file_pair_dict.keys()]
        assert sorted(lfiles) == sorted([
            os.path.join('a', 'z.txt.inprogress'),
            os.path.join('b', 'z.txt.inprogress')])
def test_download_many(tempdir, azure):
    with setup_tree(azure):
        down = ADLDownloader(azure, test_dir, tempdir, 1, 2**24,
                             overwrite=True)
        nfiles = 0
        for dirpath, dirnames, filenames in os.walk(tempdir):
            nfiles += len(filenames)
        assert nfiles > 1
def test_download_overwrite(tempdir, azure):
    with setup_tree(azure):
        with open(os.path.join(tempdir, 'x.csv'), 'w') as f:
            f.write('12345')

        with pytest.raises(OSError) as e:
            ADLDownloader(azure, test_dir, tempdir, 1, 2**24, run=False)
        assert tempdir in str(e)
def test_save_down(tempdir, azure):
    with setup_tree(azure):
        down = ADLDownloader(azure, test_dir, tempdir, 1, 2**24, run=False,
                             overwrite=True)
        down.save()

        alldownloads = ADLDownloader.load()
        assert down.hash in alldownloads

        down.save(keep=False)
        alldownloads = ADLDownloader.load()
        assert down.hash not in alldownloads
def test_download_empty_directory(tempdir, azure):
    with setup_tree(azure):
        down = ADLDownloader(azure, test_dir, tempdir, 1, 2**24,
                             overwrite=True)
        dirname = os.path.join(tempdir, 'data/empty')
        assert os.path.isdir(dirname)
def test_download_path(azure):
    with setup_tree(azure):
        down = ADLDownloader(azure, lpath="/lpath/test/testfolder",
                             rpath='/' + test_dir.name, run=False)
        for lfile, rfile in down._file_pairs:
            if 'data' in lfile:
                lfile = AzureDLPath(lfile)
                assert lfile.as_posix().startswith(
                    '/lpath/test/testfolder/data')
def download_from_adls(account_name, source_path, destination_path,
                       thread_count=None, overwrite=False):
    client = cf_dls_filesystem(account_name)
    ADLDownloader(client, source_path, destination_path, thread_count,
                  overwrite=overwrite)
def test_download_single_file_in_directory(tempdir, azure):
    with setup_tree(azure):
        down = ADLDownloader(azure, test_dir, tempdir, 1, 2**24,
                             overwrite=True)
        dirname = os.path.join(tempdir, 'data/single/single')
        assert os.path.isdir(dirname)
        assert os.path.isfile(os.path.join(dirname, 'single.txt'))
def do_get(self, line):
    parser = argparse.ArgumentParser(prog="get", add_help=False)
    parser.add_argument('remote_path', type=str)
    parser.add_argument('local_path', type=str, nargs='?', default='.')
    parser.add_argument('-b', '--chunksize', type=int, default=2**28)
    parser.add_argument('-c', '--threads', type=int, default=None)
    parser.add_argument('-f', '--force', action='store_true')
    try:
        args = parser.parse_args(line.split())
    except:
        # argparse raises SystemExit on invalid input; bail out here so
        # 'args' is never used uninitialised below
        return
    ADLDownloader(self._fs, args.remote_path, args.local_path,
                  nthreads=args.threads, chunksize=args.chunksize,
                  overwrite=args.force)
def download_from_adls(cmd, account_name, source_path, destination_path, chunk_size,
                       buffer_size, block_size, thread_count=None, overwrite=False,
                       progress_callback=None):
    client = cf_dls_filesystem(cmd.cli_ctx, account_name)
    ADLDownloader(
        client, source_path, destination_path, thread_count,
        chunksize=chunk_size, buffersize=buffer_size, blocksize=block_size,
        overwrite=overwrite,
        progress_callback=progress_callback or get_update_progress(cmd.cli_ctx))
def test_download_single_to_dir(tempdir, azure):
    with azure_teardown(azure):
        name = posix(test_dir, 'remote.csv')
        lines = 100
        size, checksum = create_remote_csv(azure, name, 10, 5, lines)
        fname = os.path.join(tempdir, 'remote.csv')
        try:
            down = ADLDownloader(azure, name, tempdir, 1, 2**24,
                                 overwrite=True)
            assert md5sum(fname) == checksum
            assert os.stat(fname).st_size == size
            assert linecount(fname) == lines
        finally:
            if os.path.isfile(fname):
                os.remove(fname)
def test_download_single_empty_file(tempdir, azure):
    with azure_teardown(azure):
        name = posix(test_dir, 'remote.csv')
        lines = 0  # the file should have no bytes in it
        size, checksum = create_remote_csv(azure, name, 10, 5, lines)
        fname = os.path.join(tempdir, 'local.csv')

        # single chunk
        try:
            down = ADLDownloader(azure, name, fname, 1, size + 10,
                                 overwrite=True)
            assert md5sum(fname) == checksum
            assert os.stat(fname).st_size == size
            assert linecount(fname) == lines
        finally:
            if os.path.isfile(fname):
                os.remove(fname)
def download_from_adls(account_name, source_path, destination_path, chunk_size,
                       buffer_size, block_size, thread_count=None, overwrite=False):
    client = cf_dls_filesystem(account_name)
    ADLDownloader(
        client, source_path, destination_path, thread_count,
        chunksize=chunk_size, buffersize=buffer_size, blocksize=block_size,
        overwrite=overwrite)
def do_list_downloads(self, line):
    print(ADLDownloader.load())
def test_download_root_folder(azure, tempdir):
    with setup_tree(azure):
        rpath = AzureDLPath('/' / test_dir / 'data/single/single' / 'single.txt')
        ADLDownloader(azure, rpath=rpath, lpath=tempdir)
        assert os.path.isfile(os.path.join(tempdir, 'single.txt'))
def do_resume_download(self, line):
    try:
        up = ADLDownloader.load()[line]
        up.run()
    except KeyError:
        print("No such download")
def test_download_overwrite(tempdir, azure):
    with setup_tree(azure):
        with pytest.raises(OSError) as e:
            ADLDownloader(azure, test_dir, tempdir, 1, 2**24, run=False)
        assert tempdir in str(e)
def do_clear_downloads(self, line):
    ADLDownloader.clear_saved()
# coding=utf-8
from azure.datalake.store import core, lib
from azure.datalake.store.multithread import ADLDownloader
import logging

if __name__ == '__main__':
    # Please fill in the values between the quotes, up to the ### separator below
    TENANT_ID = ""
    CLIENT_ID = ""
    CLIENT_SECRET = ""
    ACCOUNT_NAME = "targetadlssandbox"
    PATH_TO_DOWNLOAD = "/data_files/TGT_NSN_201904241016.zip"
    LOG_FILE_NAME = "adls.log"
    #####################################################################################

    # Authenticate and initialize the ADLS filesystem object
    token = lib.auth(tenant_id=TENANT_ID, client_id=CLIENT_ID,
                     client_secret=CLIENT_SECRET)
    adlfs = core.AzureDLFileSystem(token, store_name=ACCOUNT_NAME)  # name of the account

    # Set logging to debug
    adls_log_handler = logging.FileHandler(filename=LOG_FILE_NAME)
    adls_logger = logging.getLogger('azure.datalake.store')
    adls_logger.setLevel(logging.DEBUG)
    adls_logger.addHandler(adls_log_handler)

    # Print file info to check file size etc.
    print(adlfs.info(path=PATH_TO_DOWNLOAD))

    # Download the file to the current directory
    ADLDownloader(adlfs, PATH_TO_DOWNLOAD, "./")
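    # Optional follow-up: a minimal sketch of a post-download sanity check.
    # It assumes the download keeps the remote base name in the current
    # directory and that info() reports the remote size under the 'length'
    # key; adjust both assumptions to your environment before relying on it.
    import os

    remote_info = adlfs.info(path=PATH_TO_DOWNLOAD)
    local_name = os.path.basename(PATH_TO_DOWNLOAD)
    if os.path.isfile(local_name) and os.path.getsize(local_name) == remote_info['length']:
        print("Local file size matches the remote file.")
    else:
        print("Size mismatch or missing file - consider re-running the download.")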