Example #1
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='/Users/guardi/MSCA/MLOps/ClearML/working_dataset',
    overwrite=True)
print(f"dataset_folder: {dataset_folder}")

df = pd.read_csv(dataset_folder + '/transformed_dataset.csv')

X = df[[
    'GDP per capita', 'Social support', 'Freedom to make life choices',
    'Generosity', 'Perceptions of corruption', 'Social Generosity'
]]
# target
y = df['Healthy life expectancy']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# store the dataset split into a pickle file
with open(dataset_folder + '/transformed_train.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)

# create a new version of the dataset with the pickle file
new_dataset = Dataset.create(dataset_project='assignment1',
                             dataset_name='transformed_data_split',
                             parent_datasets=[dataset])
new_dataset.sync_folder(local_path=dataset_folder)
new_dataset.upload()
new_dataset.finalize()

print('we are done')
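
A later task could consume the split created above. The sketch below assumes the project and dataset names from this example; the choice of a linear model is purely for illustration and is not part of the original snippet.

# sketch: fetch the finalized split and fit a simple model on it
import pickle
from clearml import Dataset
from sklearn.linear_model import LinearRegression

split_ds = Dataset.get(dataset_project='assignment1',
                       dataset_name='transformed_data_split')
split_folder = split_ds.get_local_copy()  # read-only cached copy

with open(split_folder + '/transformed_train.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

model = LinearRegression().fit(X_train, y_train)
print(f'R^2 on the held-out split: {model.score(X_test, y_test):.3f}')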
Example #2
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='/Users/guardi/MSCA/MLOps/ClearML/working_dataset',
    overwrite=True)
print(f"dataset_folder: {dataset_folder}")

df = pd.read_csv(dataset_folder + '/clean_data.csv')

X = df[[
    'GDP per capita', 'Social support', 'Freedom to make life choices',
    'Generosity', 'Perceptions of corruption'
]]
# target
y = df['Healthy life expectancy']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# store the dataset split into a pickle file
with open(dataset_folder + '/clean_train.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)

# create a new version of the dataset with the pickle file
new_dataset = Dataset.create(dataset_project='assignment1',
                             dataset_name='clean_data_split',
                             parent_datasets=[dataset])
new_dataset.sync_folder(local_path=dataset_folder)
new_dataset.upload()
new_dataset.finalize()

print('we are done')
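
To double-check that the new version carries both the original CSV and the pickled split, the file list of the finalized dataset can be inspected. A minimal sketch, reusing the names from this example:

# sketch: inspect the contents of the new dataset version
from clearml import Dataset

ds = Dataset.get(dataset_project='assignment1', dataset_name='clean_data_split')
print(ds.id)
for file_path in ds.list_files():  # expected to include clean_data.csv and clean_train.pkl
    print(file_path)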
Example #3
# create a copy of the dataset that we can change
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='working_dataset', overwrite=True)
print(f"dataset_folder: {dataset_folder}")

# open the dataset pickle file
with open(dataset_folder + '/iris_dataset.pkl', 'rb') as f:
    iris = pickle.load(f)

# "process" data (i.e. we split it into train/test)
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# store the dataset split into a pickle file
with open(dataset_folder + '/iris_dataset.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)

# create a new version of the dataset with the pickle file
new_dataset = Dataset.create(dataset_project='uchicago',
                             dataset_name='dataset2',
                             parent_datasets=[dataset])
new_dataset.sync_folder(local_path=dataset_folder)
new_dataset.upload()
new_dataset.finalize()

print('we are done')
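
As in the earlier examples, a downstream task could fetch 'dataset2' and train on the stored split. A brief sketch; the classifier choice is an assumption, not part of the original code:

# sketch: consume the split stored in dataset2
import pickle
from clearml import Dataset
from sklearn.linear_model import LogisticRegression

ds = Dataset.get(dataset_project='uchicago', dataset_name='dataset2')
folder = ds.get_local_copy()

with open(folder + '/iris_dataset.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print(f'test accuracy: {clf.score(X_test, y_test):.3f}')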
Example #4
parser.add_argument(
    '--clearml-project',
    dest='clearml_project',
    type=str,
    help='The name of the ClearML project that the dataset will be stored in and published to.',
    default='Caltech Birds/Datasets')
parser.add_argument(
    '--clearml-dataset-url',
    dest='clearml_dataset_url',
    type=str,
    help='Location where the dataset files should be stored. Default is Azure Blob Storage. Format is azure://storage_account/container',
    default='azure://clearmllibrary/datasets')
args = parser.parse_args()

for task_type in ['train', 'test']:
    print('[INFO] Versioning and uploading {0} dataset for CUB200 2011'.format(
        task_type))
    dataset = Dataset.create('cub200_2011_{0}_dataset'.format(task_type),
                             dataset_project=args.clearml_project)
    dataset.add_files(path=os.path.join(args.dataset_basedir, task_type),
                      verbose=False)
    dataset.upload(output_url=args.clearml_dataset_url)
    print('[INFO] {0} Dataset finalized....'.format(task_type), end='')
    dataset.finalize()
    print('done.')

    print('[INFO] {0} Dataset published....'.format(task_type), end='')
    dataset.publish()
    print('done.')
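
Once published, each split can be pulled by name from a training script. A short sketch, assuming the default project name from the arguments above:

# sketch: fetch the published train split by name
from clearml import Dataset

train_ds = Dataset.get(dataset_project='Caltech Birds/Datasets',
                       dataset_name='cub200_2011_train_dataset')
local_path = train_ds.get_local_copy()  # cached, read-only copy of the image folders
print(f'CUB200 train data available at: {local_path}')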
Example #5
# Download CIFAR dataset and create a dataset with ClearML's Dataset class
from clearml import StorageManager, Dataset

manager = StorageManager()

dataset_path = manager.get_local_copy(
    remote_url="https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")

dataset = Dataset.create(dataset_name="cifar_dataset",
                         dataset_project="dataset_examples")

# Prepare and clean data here before it is added to the dataset

dataset.add_files(path=dataset_path)

# Dataset is uploaded to the ClearML Server by default
dataset.upload()

dataset.finalize()
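
Once finalized, the CIFAR dataset version can be retrieved from any machine with access to the ClearML Server. A minimal sketch, reusing the names from this example:

# sketch: retrieve the dataset version created above and inspect its contents
from clearml import Dataset

ds = Dataset.get(dataset_project="dataset_examples", dataset_name="cifar_dataset")
local_path = ds.get_local_copy()  # cached, read-only copy of the uploaded files
print(f"local copy at: {local_path}")
print(ds.list_files()[:10])  # first few stored file paths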
Example #6
                    if cfg.delete_earlier_versions:
                        for t in test_if_exists:
                            try:
                                Dataset.delete(t['id'])
                                print(f'Deleted {t}')
                            except ValueError:
                                print(f'Could not delete dataset {t["id"]} - it may still have children')

            except ValueError:
                pass

            print(f'Now with {dataset_name}, creating!')
            if cfg.use_lineage:
                new_dataset = Dataset.create(
                    dataset_name=dataset_name+stage,
                    dataset_project=project_name,
                    parent_datasets=[cfg.input_dataset_id]
                )
                print('...Done')

                # remove other sizes
                for other_folder_rel in all_subfolders_rel:
                    if other_folder_rel != rel_folder:
                        new_dataset.remove_files(str(other_folder_rel)+"/*", verbose=False)
                # remove other stages
                for not_stage in ['train', 'val', 'test']:
                    if not_stage != stage:
                        new_dataset.remove_files(str(rel_folder/not_stage)+"/*", verbose=False)

                # upload should be a no-op in this case
                rmed = new_dataset.list_removed_files(cfg.input_dataset_id)