def test_recursive_rsync(self, tmpdir, volume_ids, pairtree_volume_paths):
    """With keep_dirs=True the pairtree directory layout is preserved on disk."""
    dest = tmpdir.dirname
    utils.download_file(htids=volume_ids, outdir=dest,
                        keep_dirs=True, format='pairtree')
    # Each expected relative pairtree path must exist under the destination.
    for rel_path in pairtree_volume_paths:
        assert os.path.exists(os.path.join(dest, rel_path))
def test_rsync_single_file(self, tmpdir, volume_ids, pairtree_volume_paths):
    """A single htid download lands as a flat file in the output directory."""
    dest = tmpdir.dirname
    # Only the filename is expected, not the pairtree directory structure.
    wanted = os.path.basename(pairtree_volume_paths[0])
    utils.download_file(htids=volume_ids[0], outdir=dest, format='pairtree')
    assert os.path.exists(os.path.join(dest, wanted))
def test_rsync_multi_file(self, tmpdir, volume_ids, pairtree_volume_paths):
    """Multiple htids download as flat files (pairtree dirs stripped)."""
    dest = tmpdir.dirname
    utils.download_file(htids=volume_ids, outdir=dest, format='pairtree')
    for rel_path in pairtree_volume_paths:
        # Only the basename should appear in the output directory.
        wanted = os.path.basename(rel_path)
        assert os.path.exists(os.path.join(dest, wanted))
def download(df_path, out_dir):
    """Download HTRC features vols.

    Reads volume ids from the `htid` column of the JSON file at *df_path*
    and downloads them into *out_dir* in batches of 100. Failures are
    printed and the batch loop continues (best-effort).
    """
    volumes = pd.read_json(df_path)
    batches = chunked(list(volumes.htid), 100)
    for batch in tqdm(batches):
        try:
            download_file(batch, outdir=out_dir)
        except Exception as e:
            # Best-effort: report the failed batch and keep going.
            print(e)
def download_vols(ids, output_dir=None):
    """Download extracted-features files for *ids*.

    Returns the paths of the `.json.bz2` files that actually exist on
    disk after the download. A temporary directory is created when no
    *output_dir* is given.
    """
    # Fall back to a throwaway directory when the caller gives none.
    if output_dir is None:
        output_dir = tempfile.mkdtemp()

    # Download extracted features.
    download_file(htids=ids, outdir=output_dir)

    # Keep only the files that made it to disk.
    candidates = ('{}/{}.json.bz2'.format(output_dir, htid) for htid in ids)
    return [path for path in candidates if os.path.exists(path)]
def download_vols(ids, output_dir=None):
    """Download extracted-features files for the given HathiTrust volume ids.

    Parameters
    ----------
    ids : list of str
        Volume ids to download. A stray 'volume_id' entry (a leaked CSV
        header) is silently ignored.
    output_dir : str, optional
        Destination directory; a temporary directory is created when omitted.

    Returns
    -------
    list of str
        Paths of the `.json.bz2` files that exist on disk after the
        download. Ids that failed to download are written to
        `error_missing.log` in the current working directory.
    """
    # If no explicit output directory is specified, create a temporary one.
    if output_dir is None:
        output_dir = tempfile.mkdtemp()

    # Drop leaked header value(s) WITHOUT mutating the caller's list
    # (the original `ids.remove(...)` altered the argument in place).
    ids = [htid for htid in ids if htid != 'volume_id']

    # Expected on-disk path for each requested volume.
    paths = {htid: '{}/{}.json.bz2'.format(output_dir, htid) for htid in ids}

    # Download extracted features; rsync failures surface as CalledProcessError.
    try:
        download_file(htids=ids, outdir=output_dir)
    except subprocess.CalledProcessError:
        missing = [htid for htid, p in paths.items() if not os.path.exists(p)]
        with open('error_missing.log', 'w') as outfile:
            outfile.write('\n'.join(missing))
        print("{} volume{} failed to download. "
              "See `error_missing.log`.".format(
                  len(missing), 's' if len(missing) > 1 else ''))
        print("Continuing with volumes that successfully downloaded...")

    # Return only the files that actually made it to disk.
    return [p for p in paths.values() if os.path.exists(p)]
def download_vols(ids, output_dir=None):
    """Download extracted-features files for the given HathiTrust volume ids.

    Parameters
    ----------
    ids : list of str
        Volume ids to download. A stray 'volume_id' entry (a leaked CSV
        header) is silently ignored.
    output_dir : str, optional
        Destination directory; a temporary directory is created when omitted.

    Returns
    -------
    list of str
        Paths of the `.json.bz2` files that exist on disk after the
        download. Ids that failed to download are written to
        `error_missing.log` in the current working directory.
    """
    # If no explicit output directory is specified, create a temporary one.
    if output_dir is None:
        output_dir = tempfile.mkdtemp()

    # Drop leaked header value(s) WITHOUT mutating the caller's list
    # (the original `ids.remove(...)` altered the argument in place).
    ids = [htid for htid in ids if htid != 'volume_id']

    # Expected on-disk path for each requested volume.
    paths = {htid: '{}/{}.json.bz2'.format(output_dir, htid) for htid in ids}

    # Download extracted features; rsync failures surface as CalledProcessError.
    try:
        download_file(htids=ids, outdir=output_dir)
    except subprocess.CalledProcessError:
        missing = [htid for htid, p in paths.items() if not os.path.exists(p)]
        with open('error_missing.log', 'w') as outfile:
            outfile.write('\n'.join(missing))
        print("{} volume{} failed to download. "
              "See `error_missing.log`.".format(
                  len(missing), 's' if len(missing) > 1 else ''))
        print("Continuing with volumes that successfully downloaded...")

    # Return only the files that actually made it to disk.
    return [p for p in paths.values() if os.path.exists(p)]
def test_rsync_multi_file(self, tmpdir, volume_ids, volume_paths):
    """Multiple htids download as flat files in the output directory."""
    dest = tmpdir.dirname
    utils.download_file(htids=volume_ids, outdir=dest)
    for rel_path in volume_paths:
        # Only the basename should appear in the output directory.
        wanted = os.path.basename(rel_path)
        assert os.path.exists(os.path.join(dest, wanted))
def test_rsync_single_file(self, tmpdir, volume_ids, volume_paths):
    """A single htid download lands as a flat file in the output directory."""
    dest = tmpdir.dirname
    # Only the filename is expected, not the source directory structure.
    wanted = os.path.basename(volume_paths[0])
    utils.download_file(htids=volume_ids[0], outdir=dest)
    assert os.path.exists(os.path.join(dest, wanted))
def test_recursive_rsync(self, tmpdir, volume_ids, volume_paths):
    """With keep_dirs=True the remote directory layout is preserved on disk."""
    dest = tmpdir.dirname
    utils.download_file(htids=volume_ids, outdir=dest, keep_dirs=True)
    # Each expected relative path must exist under the destination.
    for rel_path in volume_paths:
        assert os.path.exists(os.path.join(dest, rel_path))