def ds_sync(args):
    """Sync a local folder into a dataset, optionally creating and closing it.

    When ``args.parents`` is set, or both ``args.project`` and ``args.name``
    are given, a new dataset is created through ``ds_create`` and its id is
    stored on ``args.id`` before syncing. Unless ``args.skip_close`` is set,
    the dataset is then uploaded (when dirty) and finalized. A dataset that
    was created by this call and ends up with no removed/added files is
    deleted instead, and the function returns without calling
    ``clear_state()``.

    :param args: parsed command line namespace
    :return: 0
    """
    created_here = bool(args.parents or (args.project and args.name))
    if created_here:
        args.id = ds_create(args)

    print('Syncing dataset id {} to local folder {}'.format(args.id, args.folder))
    check_null_id(args)
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    removed, added = dataset.sync_folder(
        local_path=args.folder,
        dataset_path=args.dataset_folder or None,
        verbose=args.verbose)
    print('Sync completed: {} files removed, {} added / modified'.format(removed, added))

    if args.skip_close:
        # caller asked to leave the dataset open
        clear_state()
        return 0

    if created_here and not (removed or added):
        # nothing changed in a dataset we just created: roll the creation back
        print('Zero modifications on local copy, reverting dataset creation.')
        Dataset.delete(dataset.id, force=True)
        return 0

    print("Finalizing dataset")
    if dataset.is_dirty():
        # upload the files
        print("Pending uploads, starting dataset upload to {}".format(
            args.storage or dataset.get_default_storage()))
        dataset.upload(
            show_progress=True,
            verbose=args.verbose,
            output_url=args.storage or None)

    dataset.finalize()
    print('Dataset closed and finalized')

    clear_state()
    return 0
def ds_delete(args):
    """Delete the dataset identified by ``args.id``.

    Runs ``check_null_id(args)`` before the deletion and calls
    ``clear_state()`` once the dataset was deleted.

    :param args: parsed command line namespace (reads ``id``)
    :return: 0
    """
    print('Deleting dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)

    Dataset.delete(dataset_id=args.id)

    done_message = 'Dataset {} deleted'.format(args.id)
    print(done_message)
    clear_state()
    return 0
def ds_upload(args):
    """Upload the pending local files of the dataset ``args.id``.

    :param args: parsed command line namespace; reads ``id``, ``verbose`` and
        ``storage`` (an empty ``storage`` is passed on as ``None``)
    :return: 0
    """
    print('uploading local files to dataset id={}'.format(args.id))
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    upload_kwargs = dict(verbose=args.verbose, output_url=args.storage or None)
    dataset.upload(**upload_kwargs)

    print('Dataset upload completed')
    return 0
def ds_close(args):
    """Finalize the dataset identified by ``args.id``.

    :param args: parsed command line namespace (reads ``id``)
    :return: 0
    """
    banner = 'Finalizing dataset id={}'.format(args.id)
    print(banner)
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    dataset.finalize()

    print('Dataset closed and finalized')
    return 0
def ds_list(args):
    """Print a table of the files stored in a dataset.

    The dataset is looked up by ``args.id`` or by ``args.project`` /
    ``args.name``. Each entry of ``args.filter`` is passed as ``dataset_path``
    to ``list_files``; without filters a single unfiltered listing is done.
    When ``args.modified`` is set, the dataset's own id is passed as
    ``dataset_id`` to ``list_files``.

    :param args: parsed command line namespace
    :return: 0
    """
    print('List dataset content: {}'.format(args.id or (args.project, args.name)))
    print_args(args)
    dataset = Dataset.get(
        dataset_id=args.id or None,
        dataset_project=args.project or None,
        dataset_name=args.name or None)
    print('Listing dataset content')

    row_format = '{:64} | {:10,} | {:64}'
    # the header cells are strings, so the thousands separator must be dropped
    print(row_format.replace(',', '').format('file name', 'size', 'hash'))
    print('-' * len(row_format.replace(',', '').format('-', '-', '-')))

    masks = args.filter or [None]
    entries_by_name = dataset.file_entries_dict
    file_count = 0
    byte_count = 0
    for mask in masks:
        matched = dataset.list_files(
            dataset_path=mask,
            dataset_id=dataset.id if args.modified else None)
        file_count += len(matched)
        for file_name in matched:
            entry = entries_by_name[file_name]
            print(row_format.format(entry.relative_path, entry.size, entry.hash))
            byte_count += entry.size

    print('Total {} files, {} bytes'.format(file_count, byte_count))
    return 0
def ds_create(args):
    """Create a new dataset and return its id.

    The dataset is created from ``args.project``, ``args.name`` and
    ``args.parents``. When ``args.tags`` is non-empty the tags are appended to
    the dataset's existing tags. The new id is passed to ``clear_state`` as
    ``{'id': <id>}``.

    :param args: parsed command line namespace
    :return: id of the newly created dataset
    """
    print('Creating a new dataset:')
    print_args(args)

    new_dataset = Dataset.create(
        dataset_project=args.project,
        dataset_name=args.name,
        parent_datasets=args.parents)

    if args.tags:
        # build a new list instead of mutating the one returned by the getter
        new_dataset.tags = new_dataset.tags + args.tags

    print('New dataset created id={}'.format(new_dataset.id))
    clear_state({'id': new_dataset.id})
    return new_dataset.id
def ds_squash(args):
    """Squash the datasets ``args.ids`` into a new dataset named ``args.name``.

    :param args: parsed command line namespace; reads ``ids``, ``name`` and
        ``storage`` (an empty ``storage`` is passed on as ``None``)
    :return: 0
    """
    banner = 'Squashing datasets ids={} into target dataset named \'{}\''.format(
        args.ids, args.name)
    print(banner)
    print_args(args)

    squashed = Dataset.squash(
        dataset_name=args.name,
        dataset_ids=args.ids,
        output_url=args.storage or None)

    print('Squashing completed, new dataset created id={}'.format(squashed.id))
    return 0
def ds_upload(args):
    """Upload the pending local files of the dataset ``args.id``.

    Runs ``check_null_id(args)`` first. An empty ``args.storage`` is passed on
    as ``None`` and an empty ``args.chunk_size`` as ``-1``.

    :param args: parsed command line namespace; reads ``id``, ``verbose``,
        ``storage`` and ``chunk_size``
    :return: 0
    """
    print('uploading local files to dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    dataset.upload(
        verbose=args.verbose,
        output_url=args.storage or None,
        chunk_size=args.chunk_size or -1)

    print('Dataset upload completed')
    return 0
def ds_remove_files(args):
    """Remove files/folders from the dataset ``args.id``.

    Every entry of ``args.files`` is passed to ``remove_files`` as
    ``dataset_path``; the values returned by those calls are summed and
    printed.

    :param args: parsed command line namespace; reads ``id``, ``files``,
        ``non_recursive`` and ``verbose``
    :return: 0
    """
    print('Removing files/folder from dataset id={}'.format(args.id))
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    removed_total = sum(
        dataset.remove_files(
            dataset_path=target,
            recursive=not args.non_recursive,
            verbose=args.verbose)
        for target in args.files
    )

    print('{} files removed'.format(removed_total))
    return 0
def ds_verify(args):
    """Verify the content of the dataset ``args.id`` and print the outcome.

    Runs ``check_null_id(args)`` first, then calls ``verify_dataset_hash`` with
    ``args.folder`` (empty is passed as ``None``), ``args.filesize`` as
    ``skip_hash`` and ``args.verbose``. The number of reported errors, if any,
    is printed.

    :param args: parsed command line namespace; reads ``id``, ``folder``,
        ``filesize`` and ``verbose``
    :return: 0, like every other command handler. Verification errors are
        reported on stdout only; the status code does not reflect them
        (this matches the previous implicit ``None`` return, which is also
        treated as a successful exit status).
    """
    print('Verify dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)

    ds = Dataset.get(dataset_id=args.id)
    files_error = ds.verify_dataset_hash(
        local_copy_path=args.folder or None,
        skip_hash=args.filesize,
        verbose=args.verbose)

    if files_error:
        print('Dataset verification completed, {} errors found!'.format(len(files_error)))
    else:
        print('Dataset verification completed successfully, no errors found.')

    # explicit status so the handler agrees with its sibling commands
    return 0
def ds_add(args):
    """Add files/folders to the dataset ``args.id``.

    Every entry of ``args.files`` is passed to ``add_files`` as ``path``; the
    values returned by those calls are summed and printed.

    :param args: parsed command line namespace; reads ``id``, ``files``,
        ``non_recursive``, ``verbose`` and ``dataset_folder`` (an empty
        ``dataset_folder`` is passed on as ``None``)
    :return: 0
    """
    print('Adding files/folder to dataset id={}'.format(args.id))
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    added_total = sum(
        dataset.add_files(
            path=source,
            recursive=not args.non_recursive,
            verbose=args.verbose,
            dataset_path=args.dataset_folder or None)
        for source in args.files
    )

    print('{} files added'.format(added_total))
    return 0
def ds_sync(args):
    """Sync the local folder ``args.folder`` into the dataset ``args.id``.

    :param args: parsed command line namespace; reads ``id``, ``folder``,
        ``dataset_folder`` (empty is passed on as ``None``) and ``verbose``
    :return: 0
    """
    print('Syncing dataset id={} to local folder {}'.format(args.id, args.folder))
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    sync_result = dataset.sync_folder(
        local_path=args.folder,
        dataset_path=args.dataset_folder or None,
        verbose=args.verbose)
    # sync_folder is unpacked into exactly two counters
    removed_count, added_count = sync_result

    print('Sync completed: {} files removed, {} added / modified'.format(
        removed_count, added_count))
    return 0
def ds_publish(args):
    """Publish the dataset identified by ``args.id``.

    Runs ``check_null_id(args)`` first and calls ``clear_state()`` after a
    successful publish.

    :param args: parsed command line namespace (reads ``id``)
    :return: 0
    :raises ValueError: when the dataset is not finalized yet
    """
    print('Publishing dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    finalized = dataset.is_final()
    if not finalized:
        raise ValueError("Cannot publish dataset. Please finalize it first, run `clearml-data close`")

    dataset.publish()
    print('Dataset published')

    # just to verify the state is clear
    clear_state()
    return 0
def ds_search(args):
    """Search for datasets and print the matches as a table.

    ``args.project``, ``args.name`` (used as a partial name), ``args.tags`` and
    ``args.ids`` are passed to ``Dataset.list_datasets``; empty values are
    passed on as ``None``.

    :param args: parsed command line namespace
    :return: 0
    """
    print('Search datasets')
    print_args(args)

    matches = Dataset.list_datasets(
        dataset_project=args.project or None,
        partial_name=args.name or None,
        tags=args.tags or None,
        ids=args.ids or None
    )

    row_format = '{:16} | {:32} | {:19} | {:19} | {:32}'
    print(row_format.format('project', 'name', 'tags', 'created', 'id'))
    print('-' * len(row_format.format('-', '-', '-', '-', '-')))

    for info in matches:
        project = info['project']
        name = info['name']
        # str() of the tag list without the surrounding brackets
        tags_text = str(info['tags'] or [])[1:-1]
        # keep only the part of the creation time before the first '.'
        created_text = str(info['created']).split('.')[0]
        print(row_format.format(project, name, tags_text, created_text, info['id']))

    return 0
def ds_close(args):
    """Finalize the dataset ``args.id``, uploading pending files first.

    Runs ``check_null_id(args)`` first. When the dataset is dirty, the pending
    files are uploaded unless ``args.disable_upload`` is set, in which case the
    function raises instead. ``clear_state()`` is called after finalizing.

    :param args: parsed command line namespace; reads ``id``,
        ``disable_upload``, ``storage`` and ``verbose``
    :return: 0
    :raises ValueError: when uploads are pending and ``args.disable_upload``
        is set
    """
    print('Finalizing dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    has_pending_uploads = dataset.is_dirty()

    if has_pending_uploads and args.disable_upload:
        raise ValueError("Pending uploads, cannot finalize dataset. run `clearml-data upload`")

    if has_pending_uploads:
        # upload the files
        print("Pending uploads, starting dataset upload to {}".format(
            args.storage or dataset.get_default_storage()))
        dataset.upload(
            show_progress=True,
            verbose=args.verbose,
            output_url=args.storage or None)

    dataset.finalize()
    print('Dataset closed and finalized')

    clear_state()
    return 0
def ds_compare(args):
    """Compare the dataset ``args.target`` with the dataset ``args.source``.

    The removed, modified and added file lists are queried on the target
    dataset with ``dataset_id=args.source``. With ``args.verbose`` the file
    names of each list are printed; a summary line with the three counts is
    always printed.

    :param args: parsed command line namespace; reads ``target``, ``source``
        and ``verbose``
    :return: 0
    """
    print('Comparing target dataset id {} with source dataset id {}'.format(args.target, args.source))
    print_args(args)

    target_dataset = Dataset.get(dataset_id=args.target)
    removed = target_dataset.list_removed_files(dataset_id=args.source)
    modified = target_dataset.list_modified_files(dataset_id=args.source)
    added = target_dataset.list_added_files(dataset_id=args.source)

    if args.verbose:
        sections = (
            ('Removed files:', removed),
            ('\nModified files:', modified),
            ('\nAdded files:', added),
        )
        for title, file_names in sections:
            print(title)
            print('\n'.join(file_names))
        print('')

    print('Comparison summary: {} files removed, {} files modified, {} files added'.format(
        len(removed), len(modified), len(added)))
    return 0
def ds_get(args):
    """Fetch a local copy of the dataset ``args.id``.

    Runs ``check_null_id(args)`` first. Three target modes are handled:

    * ``args.copy``: a mutable copy is written into that folder.
    * ``args.link``: a local copy is obtained and a symlink to it is created
      at ``args.link``.
    * neither: a local copy is obtained and its path is printed.

    With ``args.overwrite`` the copy/link target is removed first (best
    effort, failures are ignored) and the copy folder is re-created.

    :param args: parsed command line namespace; reads ``id``, ``overwrite``,
        ``copy``, ``link``, ``part`` and ``num_parts``
    :return: 0
    :raises ValueError: when the link target can be removed neither as an
        empty directory nor as a file/link
    """
    print('Download dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)
    dataset = Dataset.get(dataset_id=args.id)

    if args.overwrite:
        # the copy target takes precedence over the link target
        stale_target = args.copy or args.link
        if stale_target:
            # noinspection PyBroadException
            try:
                shutil.rmtree(stale_target)
            except Exception:
                pass
            if args.copy:
                Path(args.copy).mkdir(parents=True, exist_ok=True)

    if args.copy:
        local_folder = args.copy
        dataset.get_mutable_local_copy(
            target_folder=local_folder, part=args.part, num_parts=args.num_parts)
    else:
        if args.link:
            link_path = Path(args.link)
            # make sure the parent folders exist, then remove the leaf again
            # so the symlink can be created in its place
            link_path.mkdir(parents=True, exist_ok=True)
            # noinspection PyBroadException
            try:
                link_path.rmdir()
            except Exception:
                try:
                    link_path.unlink()
                except Exception:
                    raise ValueError(
                        "Target directory {} is not empty. Use --overwrite.".format(args.link))

        local_folder = dataset.get_local_copy(part=args.part, num_parts=args.num_parts)
        if args.link:
            os.symlink(local_folder, args.link)
            local_folder = args.link

    print('Dataset local copy available: {}'.format(local_folder))
    return 0
def ds_delete(args):
    """Delete the dataset identified by ``args.id``.

    :param args: parsed command line namespace (reads ``id``)
    :return: 0
    """
    banner = 'Deleting dataset id={}'.format(args.id)
    print(banner)
    print_args(args)

    Dataset.delete(dataset_id=args.id)

    print('Dataset {} deleted'.format(args.id))
    return 0