Exemple #1
0
def ds_sync(args):
    """Sync a local folder into a dataset, optionally creating and closing it.

    When ``args.parents`` is set, or both ``args.project`` and ``args.name``
    are set, a new dataset is created first through ``ds_create`` and its id
    is stored in ``args.id``. The folder ``args.folder`` is then passed to
    ``Dataset.sync_folder``. Unless ``args.skip_close`` is set, pending files
    are uploaded and the dataset is finalized; a dataset created by this call
    that ended up with no removed and no added files is deleted instead.

    :param args: parsed command line namespace; reads ``parents``, ``project``,
        ``name``, ``id``, ``folder``, ``dataset_folder``, ``verbose``,
        ``skip_close`` and ``storage``.
    :return: 0
    """
    dataset_created = False
    if args.parents or (args.project and args.name):
        args.id = ds_create(args)
        dataset_created = True

    print('Syncing dataset id {} to local folder {}'.format(args.id, args.folder))
    check_null_id(args)
    print_args(args)
    ds = Dataset.get(dataset_id=args.id)
    removed, added = ds.sync_folder(
        local_path=args.folder, dataset_path=args.dataset_folder or None, verbose=args.verbose)

    print('Sync completed: {} files removed, {} added / modified'.format(removed, added))

    if not args.skip_close:
        if dataset_created and not removed and not added:
            print('Zero modifications on local copy, reverting dataset creation.')
            Dataset.delete(ds.id, force=True)
            # ds_create() recorded the new id through clear_state({'id': ...});
            # reset it the same way ds_delete() does, so the stored state does
            # not keep pointing at the dataset that was just removed.
            # NOTE(review): assumes clear_state() with no arguments resets that
            # stored id - confirm against its definition.
            clear_state()
            return 0

        print("Finalizing dataset")
        if ds.is_dirty():
            # upload the files
            print("Pending uploads, starting dataset upload to {}".format(args.storage or ds.get_default_storage()))
            ds.upload(show_progress=True, verbose=args.verbose, output_url=args.storage or None)

        ds.finalize()
        print('Dataset closed and finalized')
        clear_state()

    return 0
Exemple #2
0
def ds_delete(args):
    """Delete the dataset identified by ``args.id``, then call ``clear_state()``.

    :param args: parsed command line namespace; reads ``id``.
    :return: 0
    """
    print('Deleting dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)

    target_id = args.id
    Dataset.delete(dataset_id=target_id)
    print('Dataset {} deleted'.format(target_id))

    clear_state()
    return 0
Exemple #3
0
def ds_upload(args):
    """Call ``upload`` on the dataset identified by ``args.id``.

    :param args: parsed command line namespace; reads ``id``, ``verbose`` and
        ``storage`` (an empty ``storage`` is passed on as ``None``).
    :return: 0
    """
    banner = 'uploading local files to dataset id={}'
    print(banner.format(args.id))
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    upload_options = dict(
        verbose=args.verbose,
        output_url=args.storage or None,
    )
    dataset.upload(**upload_options)

    print('Dataset upload completed')
    return 0
Exemple #4
0
def ds_close(args):
    """Finalize the dataset identified by ``args.id``.

    :param args: parsed command line namespace; reads ``id``.
    :return: 0
    """
    banner = 'Finalizing dataset id={}'
    print(banner.format(args.id))
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    dataset.finalize()

    print('Dataset closed and finalized')
    return 0
Exemple #5
0
def ds_list(args):
    """Print a table of the files in a dataset, followed by totals.

    The dataset is looked up by ``args.id`` or by ``args.project`` /
    ``args.name``. Each entry of ``args.filter`` is passed to ``list_files`` as
    the ``dataset_path`` mask; with no filter a single unfiltered listing is
    made. When ``args.modified`` is set, the dataset's own id is passed as
    ``dataset_id`` to ``list_files``.

    :param args: parsed command line namespace; reads ``id``, ``project``,
        ``name``, ``filter`` and ``modified``.
    :return: 0
    """
    print('List dataset content: {}'.format(args.id or (args.project, args.name)))
    print_args(args)
    dataset = Dataset.get(
        dataset_id=args.id or None,
        dataset_project=args.project or None,
        dataset_name=args.name or None,
    )
    print('Listing dataset content')

    row_format = '{:64} | {:10,} | {:64}'
    # The header cells are strings, so the thousands separator must be dropped.
    header_format = row_format.replace(',', '')
    print(header_format.format('file name', 'size', 'hash'))
    print('-' * len(header_format.format('-', '-', '-')))

    masks = args.filter if args.filter else [None]
    entries_by_path = dataset.file_entries_dict
    file_count = 0
    byte_count = 0
    for path_mask in masks:
        listed = dataset.list_files(
            dataset_path=path_mask,
            dataset_id=dataset.id if args.modified else None,
        )
        file_count += len(listed)
        for file_path in listed:
            entry = entries_by_path[file_path]
            print(row_format.format(entry.relative_path, entry.size, entry.hash))
            byte_count += entry.size

    print('Total {} files, {} bytes'.format(file_count, byte_count))
    return 0
Exemple #6
0
def ds_create(args):
    """Create a new dataset and record its id through ``clear_state``.

    :param args: parsed command line namespace; reads ``project``, ``name``,
        ``parents`` and ``tags`` (tags, when given, are appended to the new
        dataset's existing tags).
    :return: id of the newly created dataset
    """
    print('Creating a new dataset:')
    print_args(args)

    new_dataset = Dataset.create(
        dataset_project=args.project,
        dataset_name=args.name,
        parent_datasets=args.parents,
    )
    extra_tags = args.tags
    if extra_tags:
        new_dataset.tags = new_dataset.tags + extra_tags

    created_id = new_dataset.id
    print('New dataset created id={}'.format(created_id))
    clear_state({'id': created_id})
    return created_id
Exemple #7
0
def ds_squash(args):
    """Squash the datasets listed in ``args.ids`` into one dataset named ``args.name``.

    :param args: parsed command line namespace; reads ``ids``, ``name`` and
        ``storage`` (an empty ``storage`` is passed on as ``None``).
    :return: 0
    """
    banner = 'Squashing datasets ids={} into target dataset named \'{}\''
    print(banner.format(args.ids, args.name))
    print_args(args)

    squashed = Dataset.squash(
        dataset_name=args.name,
        dataset_ids=args.ids,
        output_url=args.storage or None,
    )

    print('Squashing completed, new dataset created id={}'.format(squashed.id))
    return 0
Exemple #8
0
def ds_upload(args):
    """Call ``upload`` on the dataset identified by ``args.id``.

    :param args: parsed command line namespace; reads ``id``, ``verbose``,
        ``storage`` (empty is passed on as ``None``) and ``chunk_size``
        (a falsy value is passed on as ``-1``).
    :return: 0
    """
    print('uploading local files to dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    upload_options = dict(
        verbose=args.verbose,
        output_url=args.storage or None,
        chunk_size=args.chunk_size or -1,
    )
    dataset.upload(**upload_options)

    print('Dataset upload completed')
    return 0
Exemple #9
0
def ds_remove_files(args):
    """Remove each path in ``args.files`` from the dataset and print the total.

    :param args: parsed command line namespace; reads ``id``, ``files``,
        ``non_recursive`` and ``verbose``.
    :return: 0
    """
    print('Removing files/folder from dataset id={}'.format(args.id))
    print_args(args)
    dataset = Dataset.get(dataset_id=args.id)

    # Sum the per-path counts returned by remove_files, in argument order.
    removed_total = sum(
        dataset.remove_files(
            dataset_path=target,
            recursive=not args.non_recursive,
            verbose=args.verbose,
        )
        for target in args.files
    )

    print('{} files removed'.format(removed_total))
    return 0
Exemple #10
0
def ds_verify(args):
    """Run ``verify_dataset_hash`` on dataset ``args.id`` and print the outcome.

    :param args: parsed command line namespace; reads ``id``, ``folder``
        (empty is passed on as ``None``), ``filesize`` (passed as
        ``skip_hash``) and ``verbose``.
    :return: 0, like the other command handlers. Verification errors are
        reported through the printed summary only, not through the return
        value.
    """
    print('Verify dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)
    ds = Dataset.get(dataset_id=args.id)
    files_error = ds.verify_dataset_hash(
        local_copy_path=args.folder or None, skip_hash=args.filesize, verbose=args.verbose)
    if files_error:
        print('Dataset verification completed, {} errors found!'.format(len(files_error)))
    else:
        print('Dataset verification completed successfully, no errors found.')
    # Explicit status so this handler does not implicitly return None while
    # every sibling handler returns 0.
    return 0
Exemple #11
0
def ds_add(args):
    """Add each path in ``args.files`` to the dataset and print the total.

    :param args: parsed command line namespace; reads ``id``, ``files``,
        ``non_recursive``, ``verbose`` and ``dataset_folder`` (empty is passed
        on as ``None``).
    :return: 0
    """
    print('Adding files/folder to dataset id={}'.format(args.id))
    print_args(args)
    dataset = Dataset.get(dataset_id=args.id)

    # Sum the per-path counts returned by add_files, in argument order.
    added_total = sum(
        dataset.add_files(
            path=source,
            recursive=not args.non_recursive,
            verbose=args.verbose,
            dataset_path=args.dataset_folder or None,
        )
        for source in args.files
    )

    print('{} files added'.format(added_total))
    return 0
Exemple #12
0
def ds_sync(args):
    """Sync the local folder ``args.folder`` into dataset ``args.id``.

    :param args: parsed command line namespace; reads ``id``, ``folder``,
        ``dataset_folder`` (empty is passed on as ``None``) and ``verbose``.
    :return: 0
    """
    print('Syncing dataset id={} to local folder {}'.format(args.id, args.folder))
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    sync_result = dataset.sync_folder(
        local_path=args.folder,
        dataset_path=args.dataset_folder or None,
        verbose=args.verbose,
    )
    # sync_folder's result unpacks into (removed, added).
    removed_count, added_count = sync_result

    summary = 'Sync completed: {} files removed, {} added / modified'
    print(summary.format(removed_count, added_count))
    return 0
Exemple #13
0
def ds_publish(args):
    """Publish dataset ``args.id``; the dataset must already be final.

    :param args: parsed command line namespace; reads ``id``.
    :return: 0
    :raises ValueError: if ``is_final()`` reports the dataset is not finalized.
    """
    print('Publishing dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    if dataset.is_final():
        dataset.publish()
        print('Dataset published')
        # just to verify the state is clear
        clear_state()
        return 0

    raise ValueError("Cannot publish dataset. Please finalize it first, run `clearml-data close`")
Exemple #14
0
def ds_search(args):
    """Print a table of the datasets returned by ``Dataset.list_datasets``.

    :param args: parsed command line namespace; reads ``project``, ``name``,
        ``tags`` and ``ids`` (each falsy value is passed on as ``None``).
    :return: 0
    """
    print('Search datasets')
    print_args(args)

    matches = Dataset.list_datasets(
        dataset_project=args.project or None,
        partial_name=args.name or None,
        tags=args.tags or None,
        ids=args.ids or None,
    )

    row_format = '{:16} | {:32} | {:19} | {:19} | {:32}'
    print(row_format.format('project', 'name', 'tags', 'created', 'id'))
    print('-' * len(row_format.format('-', '-', '-', '-', '-')))

    for entry in matches:
        columns = (
            entry['project'],
            entry['name'],
            # list repr without the surrounding brackets
            str(entry['tags'] or [])[1:-1],
            # keep only the text before the first '.' of the created value
            str(entry['created']).split('.')[0],
            entry['id'],
        )
        print(row_format.format(*columns))
    return 0
Exemple #15
0
def ds_close(args):
    """Finalize dataset ``args.id``, uploading pending files first if needed.

    :param args: parsed command line namespace; reads ``id``,
        ``disable_upload``, ``storage`` and ``verbose``.
    :return: 0
    :raises ValueError: if the dataset is dirty and ``args.disable_upload`` is set.
    """
    print('Finalizing dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)

    dataset = Dataset.get(dataset_id=args.id)
    has_pending_uploads = dataset.is_dirty()

    if has_pending_uploads and args.disable_upload:
        raise ValueError("Pending uploads, cannot finalize dataset. run `clearml-data upload`")

    if has_pending_uploads:
        # upload the files
        destination = args.storage or dataset.get_default_storage()
        print("Pending uploads, starting dataset upload to {}".format(destination))
        dataset.upload(show_progress=True, verbose=args.verbose, output_url=args.storage or None)

    dataset.finalize()
    print('Dataset closed and finalized')
    clear_state()
    return 0
Exemple #16
0
def ds_compare(args):
    """Print the file differences between a target and a source dataset.

    The removed / modified / added file lists are requested from the target
    dataset (``args.target``) with ``args.source`` as ``dataset_id``. The
    individual file names are printed only when ``args.verbose`` is set; the
    summary line is always printed.

    :param args: parsed command line namespace; reads ``target``, ``source``
        and ``verbose``.
    :return: 0
    """
    print('Comparing target dataset id {} with source dataset id {}'.format(args.target, args.source))
    print_args(args)

    target_dataset = Dataset.get(dataset_id=args.target)
    removed = target_dataset.list_removed_files(dataset_id=args.source)
    modified = target_dataset.list_modified_files(dataset_id=args.source)
    added = target_dataset.list_added_files(dataset_id=args.source)

    if args.verbose:
        sections = (
            ('Removed files:', removed),
            ('\nModified files:', modified),
            ('\nAdded files:', added),
        )
        for title, file_names in sections:
            print(title)
            print('\n'.join(file_names))
        print('')

    summary = 'Comparison summary: {} files removed, {} files modified, {} files added'
    print(summary.format(len(removed), len(modified), len(added)))
    return 0
Exemple #17
0
def ds_get(args):
    """Fetch a local copy of dataset ``args.id`` and print where it is available.

    Three modes, selected by the arguments:

    * ``args.copy`` set: ``get_mutable_local_copy`` is called with
      ``args.copy`` as the target folder.
    * ``args.link`` set (and no ``args.copy``): ``get_local_copy`` is called
      and a symlink to the returned folder is created at ``args.link``.
    * neither set: ``get_local_copy`` is called and its returned folder is
      reported.

    With ``args.overwrite``, any existing ``args.copy`` / ``args.link`` target
    is removed first on a best-effort basis (removal errors are ignored).

    :param args: parsed command line namespace; reads ``id``, ``overwrite``,
        ``copy``, ``link``, ``part`` and ``num_parts``.
    :return: 0
    :raises ValueError: if ``args.link`` exists and can be removed neither
        with ``rmdir`` nor with ``unlink``.
    """
    print('Download dataset id {}'.format(args.id))
    check_null_id(args)
    print_args(args)
    ds = Dataset.get(dataset_id=args.id)
    if args.overwrite:
        if args.copy:
            # Best effort: wipe the existing copy target, ignoring any failure
            # (for example when the folder does not exist yet).
            # noinspection PyBroadException
            try:
                shutil.rmtree(args.copy)
            except Exception:
                pass
            # Make sure the (now empty) target folder exists.
            Path(args.copy).mkdir(parents=True, exist_ok=True)
        elif args.link:
            # Best effort: wipe the existing link target, ignoring any failure.
            # noinspection PyBroadException
            try:
                shutil.rmtree(args.link)
            except Exception:
                pass
    if args.copy:
        ds_folder = args.copy
        ds.get_mutable_local_copy(target_folder=ds_folder,
                                  part=args.part,
                                  num_parts=args.num_parts)
    else:
        if args.link:
            # Create the link path (and any missing parents) as a directory,
            # then remove the leaf again so os.symlink below can create it.
            Path(args.link).mkdir(parents=True, exist_ok=True)
            # noinspection PyBroadException
            try:
                # rmdir only succeeds on an empty directory.
                Path(args.link).rmdir()
            except Exception:
                try:
                    # Fallback when rmdir failed - presumably for a leftover
                    # symlink at this path; verify against callers.
                    Path(args.link).unlink()
                except Exception:
                    raise ValueError(
                        "Target directory {} is not empty. Use --overwrite.".
                        format(args.link))
        ds_folder = ds.get_local_copy(part=args.part, num_parts=args.num_parts)
        if args.link:
            # Point the requested link at the returned folder and report the
            # link path instead of that folder.
            os.symlink(ds_folder, args.link)
            ds_folder = args.link
    print('Dataset local copy available: {}'.format(ds_folder))
    return 0
Exemple #18
0
def ds_delete(args):
    """Delete the dataset identified by ``args.id``.

    :param args: parsed command line namespace; reads ``id``.
    :return: 0
    """
    target_id = args.id
    print('Deleting dataset id={}'.format(target_id))
    print_args(args)

    Dataset.delete(dataset_id=args.id)

    print('Dataset {} deleted'.format(args.id))
    return 0