def get_converted_data(dataset_task, conf_file):
    if dataset_task:
        dataset_upload_task = Dataset.get(dataset_id=dataset_task)
    else:
        dataset_upload_task = Dataset.get(dataset_project="Nvidia TLT examples with ClearML",
                                          dataset_name="Example data")
    image_directory_path = (
        get_field_from_config(conf_file, "image_directory_path")
        .strip()
        .strip('"')
        .rpartition("/")[0]
    )
    # noinspection PyBroadException
    try:
        os.makedirs(image_directory_path)
    except Exception:
        pass
    # download the artifact and open it
    saved_dataset = dataset_upload_task.get_local_copy()
    dataset_name = os.listdir(saved_dataset)[0]
    dataset_path = Path(os.path.join(saved_dataset, dataset_name))
    # Note: Path.suffix only returns the last extension (".gz" for "*.tar.gz"),
    # so match against the full file name instead
    if not dataset_path.is_dir() and dataset_path.name.endswith((".zip", ".tgz", ".tar.gz")):
        if dataset_path.name.endswith(".zip"):
            from zipfile import ZipFile

            ZipFile(dataset_path.as_posix()).extractall(path=image_directory_path)
        elif dataset_path.name.endswith(".tar.gz"):
            import tarfile

            with tarfile.open(dataset_path.as_posix()) as file:
                file.extractall(image_directory_path)
        elif dataset_path.name.endswith(".tgz"):
            import tarfile

            with tarfile.open(dataset_path.as_posix(), mode="r:gz") as file:
                file.extractall(image_directory_path)
        saved_dataset = str(dataset_path)
    else:
        os.system("cp -R {}/* {}".format(saved_dataset, image_directory_path))
    print(saved_dataset)
Example #2
    )

    valid_aug = albumentations.Compose(
        [
            albumentations.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
                max_pixel_value=255.0,
                p=1.0,
            ),
        ],
        p=1.0,
    )

    # download dataset (cached!)
    dataset_folder = Dataset.get(dataset_id=cfg.dataset_id).get_local_copy()

    train_image_paths = glob.glob(
        os.path.join(dataset_folder, f"jpeg-{cfg.image_size}x{cfg.image_size}",
                     "train", "**", "*.jpeg"),
        recursive=True,
    )

    valid_image_paths = glob.glob(
        os.path.join(dataset_folder, f"jpeg-{cfg.image_size}x{cfg.image_size}",
                     "val", "**", "*.jpeg"),
        recursive=True,
    )

    train_targets = [x.split("/")[-2] for x in train_image_paths]
    valid_targets = [x.split("/")[-2] for x in valid_image_paths]
Example #3
task = Task.init(project_name="Image Example",
                 task_name="Image classification CIFAR10")
params = {
    "number_of_epochs": 20,
    "batch_size": 64,
    "dropout": 0.25,
    "base_lr": 0.001,
    "momentum": 0.9,
    "loss_report": 100,
}
params = task.connect(params)  # enabling configuration override by ClearML
print(params)  # printing actual configuration (after override in remote mode)

# The code below gets the dataset and stores it in the cache. To download the dataset regardless of whether it is
# already cached, use Dataset.get(dataset_name, dataset_project).get_mutable_local_copy(path_to_download)
dataset_path = Dataset.get(dataset_name=dataset_name,
                           dataset_project=dataset_project).get_local_copy()
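
# A hedged sketch (not part of the original snippet): to get a fresh, writable copy instead of the shared
# cache, get_mutable_local_copy could be used; the variable and target folder names below are illustrative.
# mutable_dataset_path = Dataset.get(
#     dataset_name=dataset_name,
#     dataset_project=dataset_project).get_mutable_local_copy("./cifar10_mutable", overwrite=True)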

# Dataset and Dataloader initializations
transform = transforms.Compose([transforms.ToTensor()])

trainset = datasets.CIFAR10(root=dataset_path,
                            train=True,
                            download=False,
                            transform=transform)
trainloader = torch.utils.data.DataLoader(trainset,
                                          batch_size=params.get(
                                              "batch_size", 4),
                                          shuffle=True,
                                          num_workers=10)

testset = datasets.CIFAR10(root=dataset_path,
Example #4
def trigger_dataset_func(dataset_id):
    dataset = Dataset.get(dataset_id=dataset_id)
    print('dataset id {} created'.format(dataset.id))
Example #5
def main():
    task = Task.init(project_name="Nvidia Clara examples with ClearML",
                     task_name="Training with Clara")
    task.set_base_docker(
        "nvcr.io/nvidia/clara-train-sdk:v3.1.01 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864"
    )
    parser = argparse.ArgumentParser()
    parser.add_argument("--mmar", "-m", type=str, help="MMAR_ROOT folder")
    parser.add_argument("--train_config",
                        "-c",
                        type=str,
                        help="train config file",
                        required=True)
    parser.add_argument("--env", "-e", type=str, help="environment file")
    parser.add_argument("--log_config", "-l", type=str, help="log config file")
    parser.add_argument("--write_train_stats", action="store_true")
    parser.add_argument("--set", metavar="KEY=VALUE", nargs="*")
    parser.add_argument("--parse_data",
                        action="store_true",
                        help="copy the artifact data")
    parser.add_argument(
        "--images_dir",
        type=str,
        help="Name of the images folder, will be store as a folder in DATA_ROOT."
        "Should be the same to the artifact name in the dataset task")
    parser.add_argument(
        "--labels_dir",
        type=str,
        help="Name of the labels folder, will be store as a folder in DATA_ROOT."
        "Should be the same to the artifact name in the dataset task")
    parser.add_argument(
        "--dataset_task",
        type=str,
        help="The dataset task id; if not provided, a dataset named `Example data` will be used"
    )

    set_env_vars()
    args = parser.parse_args()
    mmar = args.mmar or os.environ["MMAR_ROOT"]
    train_config = args.train_config
    env = args.env
    log_config = args.log_config
    kv = args.set
    images_dir = args.images_dir or ""
    labels_dir = args.labels_dir or ""
    dataset_task = args.dataset_task

    if dataset_task:
        dataset_task = Dataset.get(dataset_id=dataset_task)
    else:
        dataset_task = Dataset.get(
            dataset_project="Nvidia Clara examples with ClearML",
            dataset_name="Example data")
    updated_kv = []
    if dataset_task:
        local_data = dataset_task.get_local_copy()
        for elem in kv:
            if elem.startswith("DATASET_JSON"):
                dataset_name = elem.rpartition("/")[2]
                updated_kv.append("DATASET_JSON={}".format(
                    os.path.join(local_data, dataset_name)))
            else:
                updated_kv.append(elem)

    train_conf = task.connect_configuration(train_config,
                                            name="train",
                                            description="train config file")
    if env:
        env_conf = task.connect_configuration(env,
                                              name="env",
                                              description="environment file")

        with open(env_conf, "r") as env_file:
            import json
            env_dict = json.load(env_file)
            data_root = env_dict.get("DATA_ROOT", "/")
            # noinspection PyBroadException
            try:
                os.makedirs(os.path.join(mmar, data_root))
            except Exception:
                pass
            dataset_json = env_dict.get("DATASET_JSON", "/")
            try:
                dataset_json_file = task.connect_configuration(
                    os.path.join(mmar, dataset_json),
                    name="dataset_json",
                    description="dataset file")
                # noinspection PyBroadException
                try:
                    os.makedirs(dataset_json.rpartition("/")[0])
                except Exception:
                    pass
                os.system("cp -R {} {}".format(
                    dataset_json_file, os.path.join(mmar, dataset_json)))
            except Exception as ex:
                print("Can not connect dataset config file {},\n{}".format(
                    dataset_json, ex))
        for artifact in os.listdir(local_data):
            os.system("cp -R {} {}".format(os.path.join(local_data, artifact),
                                           str(os.path.join(mmar, data_root))))
            if (artifact == images_dir
                    and images_dir) or (artifact == labels_dir and labels_dir):
                os.system("mv {} {}".format(
                    os.path.join(local_data, artifact),
                    os.path.join(mmar, data_root, artifact)))
    else:
        env_conf = env

    log_conf = task.connect_configuration(
        log_config, name="log config",
        description="log config file") if log_config else log_config
    # noinspection PyBroadException
    try:
        os.makedirs(os.path.join(mmar, train_config.rpartition("/")[0]))
    except Exception:
        pass

    os.system("cp -R {} {}".format(train_conf,
                                   os.path.join(mmar, train_config)))
    # noinspection PyBroadException
    try:
        os.makedirs(os.path.join(mmar, env.rpartition("/")[0]))
    except Exception:
        pass
    os.system("cp -R {} {}".format(env_conf, os.path.join(mmar, env)))
    # noinspection PyBroadException
    try:
        os.makedirs(os.path.join(mmar, log_config.rpartition("/")[0]))
    except Exception:
        pass
    os.system("cp -R {} {}".format(log_conf, os.path.join(mmar, log_config)))
    train_mmar()
Example #6
import pandas as pd
import pickle
import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from clearml import Task, Dataset

# Connecting ClearML
task = Task.init(project_name="assignment1", task_name="training_dirty")

# get dataset with split/test
dataset = Dataset.get(dataset_project='assignment1', dataset_name='dirty_data_split')

# get a read only version of the data
dataset_folder = dataset.get_local_copy()

# open the dataset pickle file
with open(dataset_folder + '/dirty_train.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# train the model
rf = RandomForestRegressor(max_depth=2, random_state=0)
rf.fit(X_train,y_train)

# store the trained model
joblib.dump(rf, 'rf_dirty.pkl', compress=True)

# print model prediction results
result = rf.score(X_test, y_test)
print(result)
Example #7
import os
import pickle
import pandas as pd
from clearml import Task, Dataset
from sklearn.model_selection import train_test_split

# Connecting ClearML
task = Task.init(project_name="assignment1", task_name="split_transformed")

# get the original dataset
dataset = Dataset.get(dataset_project='assignment1',
                      dataset_name='transformed_dataset')

# create a copy that we can change
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='/Users/guardi/MSCA/MLOps/ClearML/working_dataset',
    overwrite=True)
print(f"dataset_folder: {dataset_folder}")

df = pd.read_csv(dataset_folder + '/transformed_dataset.csv')

X = df[[
    'GDP per capita', 'Social support', 'Freedom to make life choices',
    'Generosity', 'Perceptions of corruption', 'Social Generosity'
]]
# target
y = df['Healthy life expectancy']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
Example #8
def main():
    task = Task.init(project_name="Nvidia Clara examples with ClearML", task_name="Validate Clara")
    task.set_base_docker(
        "nvcr.io/nvidia/clara-train-sdk:v3.1.01 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864"
    )
    parser = argparse.ArgumentParser()
    parser.add_argument('--mmar', '-m', type=str, help='MMAR_ROOT folder', required=True)
    parser.add_argument('--config', '-c', type=str, help='evaluate config file', required=True)
    parser.add_argument('--env', '-e', type=str, help='environment file')
    parser.add_argument('--log_config', '-l', type=str, help='log config file')
    parser.add_argument('--set', metavar='KEY=VALUE', nargs='*')
    parser.add_argument('--models_task', type=str, help='The training task id')
    parser.add_argument("--dataset_task", type=str,
                        help="The dataset task id, if not provided, a task named `Example data` will be chosen")

    set_env_vars()
    args = parser.parse_args()
    mmar = args.mmar or os.environ["MMAR_ROOT"]
    evaluate_config = args.config
    env = args.env
    log_config = args.log_config
    kv = args.set
    dataset_task = args.dataset_task

    evaluate_conf = task.connect_configuration(evaluate_config, name="evaluate", description="evaluate config file")

    if env:
        env_conf = task.connect_configuration(env, name="env", description="environment file")
        if dataset_task:
            dataset_task = Dataset.get(dataset_id=dataset_task)
        else:
            dataset_task = Dataset.get(dataset_project="Nvidia Clara examples with ClearML",
                                       dataset_name="Example data")

        with open(env_conf, "r") as env_file:
            import json
            env_dict = json.load(env_file)
            data_root = env_dict.get("DATA_ROOT", "/")
            # noinspection PyBroadException
            try:
                os.makedirs(os.path.join(mmar, data_root))
            except Exception:
                pass
            dataset_json = env_dict.get("DATASET_JSON", "/")
            try:
                dataset_json_file = task.connect_configuration(os.path.join(mmar, dataset_json),
                                                               name="dataset_json",
                                                               description="dataset file")
                # noinspection PyBroadException
                try:
                    os.makedirs(os.path.join(mmar, dataset_json.rpartition("/")[0]))
                except Exception:
                    pass
                os.system("cp -R {} {}".format(dataset_json_file, os.path.join(mmar, dataset_json)))
            except Exception as ex:
                print("Can not connect dataset config file {},\n{}".format(dataset_json, ex))
        local_data = dataset_task.get_local_copy()
        for artifact in os.listdir(local_data):
            os.system("cp -R {} {}".format(os.path.join(local_data, artifact), str(os.path.join(mmar, data_root))))
            os.system("mv {} {}".format(os.path.join(local_data, artifact), os.path.join(mmar, data_root, artifact)))
    else:
        env_conf = env

    log_conf = task.connect_configuration(log_config, name="log config", description="log config file") if log_config \
        else log_config

    # noinspection PyBroadException
    try:
        os.makedirs(os.path.join(mmar, evaluate_config.rpartition("/")[0]))
    except Exception:
        pass

    os.system("cp -R {} {}".format(evaluate_conf, os.path.join(mmar, evaluate_config)))
    # noinspection PyBroadException
    try:
        os.makedirs(os.path.join(mmar, env.rpartition("/")[0]))
    except Exception:
        pass

    os.system("cp -R {} {}".format(env_conf, os.path.join(mmar, env)))
    # noinspection PyBroadException
    try:
        os.makedirs(os.path.join(mmar, log_config.rpartition("/")[0]))
    except Exception:
        pass

    os.system("cp -R {} {}".format(log_conf, os.path.join(mmar, log_config)))

    if args.models_task:
        m_task = Task.get_task(task_id=args.models_task)
        output_models = m_task.get_models().get("output")
        script_path = Path(__file__).parent.absolute()
        dest = [elem.partition("=")[2] for elem in kv if elem.startswith("MMAR_CKPT_DIR")][0]
        # noinspection PyBroadException
        try:
            os.makedirs(dest)
        except Exception:
            pass
        for mdl in output_models:
            m_output = mdl.get_weights_package()
            for model in m_output:
                os.system("mv {} {}".format(os.path.join(script_path, model), dest))

    evaluate_mmar()
    # noinspection PyBroadException
    try:
        for f in Path(os.path.join(mmar, env_dict.get("MMAR_EVAL_OUTPUT_PATH", "/"))).rglob('*'):
            task.upload_artifact(f.name, artifact_object=f)
    except Exception:
        pass
Example #9
    )

    valid_aug = albumentations.Compose(
        [
            albumentations.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
                max_pixel_value=255.0,
                p=1.0,
            ),
        ],
        p=1.0,
    )

    # download dataset (cached!) <---
    dataset_folder = Dataset.get(dataset_id=DATASET_ID).get_local_copy()

    train_image_paths = glob.glob(
        os.path.join(dataset_folder, f"jpeg-{IMAGE_SIZE}x{IMAGE_SIZE}",
                     "train", "**", "*.jpeg"),
        recursive=True,
    )

    valid_image_paths = glob.glob(
        os.path.join(dataset_folder, f"jpeg-{IMAGE_SIZE}x{IMAGE_SIZE}", "val",
                     "**", "*.jpeg"),
        recursive=True,
    )

    train_targets = [x.split("/")[-2] for x in train_image_paths]
    valid_targets = [x.split("/")[-2] for x in valid_image_paths]
Example #10
import os
import pickle
from clearml import Task, Dataset
from sklearn.model_selection import train_test_split

# Connecting ClearML
task = Task.init(project_name="uchicago", task_name="process dataset")

# get the original dataset
dataset = Dataset.get(dataset_project='uchicago', dataset_name='dataset1')

# create a copy that we can change
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='working_dataset', overwrite=True)
print(f"dataset_folder: {dataset_folder}")

# open the dataset pickle file
with open(dataset_folder + '/iris_dataset.pkl', 'rb') as f:
    iris = pickle.load(f)

# "process" data (i.e. we split it into train/test)
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# store the dataset split into a pickle file
with open(dataset_folder + '/iris_dataset.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)
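
# A hedged sketch (not part of the original snippet): the updated folder could then be registered as a new
# ClearML dataset version; the dataset name below is an illustrative assumption, not taken from the source.
new_dataset = Dataset.create('processed_iris_dataset',
                             dataset_project='uchicago',
                             parent_datasets=[dataset.id])
new_dataset.add_files(path=dataset_folder)
new_dataset.upload()
new_dataset.finalize()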
Example #11
parser.add_argument(
    '--clearml-project',
    dest='clearml_project',
    type=str,
    help='The name of the ClearML project that the dataset will be stored in and published to.',
    default='Caltech Birds/Datasets')
parser.add_argument(
    '--clearml-dataset-url',
    dest='clearml_dataset_url',
    type=str,
    help='Location where the dataset files should be stored. Default is Azure Blob Storage. '
         'Format is azure://storage_account/container',
    default='azure://clearmllibrary/datasets')
args = parser.parse_args()

for task_type in ['train', 'test']:
    print('[INFO] Versioning and uploading {0} dataset for CUB200 2011'.format(
        task_type))
    dataset = Dataset.create('cub200_2011_{0}_dataset'.format(task_type),
                             dataset_project=args.clearml_project)
    dataset.add_files(path=os.path.join(args.dataset_basedir, task_type),
                      verbose=False)
    dataset.upload(output_url=args.clearml_dataset_url)
    print('[INFO] {0} Dataset finalized....'.format(task_type), end='')
    dataset.finalize()
    print('done.')

    print('[INFO] {0} Dataset published....'.format(task_type), end='')
    dataset.publish()
    print('done.')
Example #12
import pandas as pd
import pickle
import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from clearml import Task, Dataset

# Connecting ClearML
task = Task.init(project_name="assignment1", task_name="training_transformed")

# get dataset with split/test
dataset = Dataset.get(dataset_project='assignment1',
                      dataset_name='transformed_data_split')

# get a read only version of the data
dataset_folder = dataset.get_local_copy()

# open the dataset pickle file
with open(dataset_folder + '/transformed_train.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# train the model
rf = RandomForestRegressor(max_depth=2, random_state=0)
rf.fit(X_train, y_train)

# store the trained model
joblib.dump(rf, 'rf_transformed.pkl', compress=True)

# print model prediction results
Example #13
# Download CIFAR dataset and create a dataset with ClearML's Dataset class
from clearml import StorageManager, Dataset

manager = StorageManager()

dataset_path = manager.get_local_copy(
    remote_url="https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")

dataset = Dataset.create(dataset_name="cifar_dataset",
                         dataset_project="dataset_examples")

# Prepare and clean data here before it is added to the dataset

dataset.add_files(path=dataset_path)

# Dataset is uploaded to the ClearML Server by default
dataset.upload()

dataset.finalize()
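
# A hedged sketch (not part of the original snippet): once finalized, any other task can fetch a cached,
# read-only copy of this dataset by name and project, as the other examples above do.
local_copy = Dataset.get(dataset_name="cifar_dataset",
                         dataset_project="dataset_examples").get_local_copy()
print("dataset stored at: {}".format(local_copy))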
Example #14
    # override numpy version for colab
    Task.add_requirements('numpy', '1.19.5')
    task = Task.init(
        project_name=project_name,
        task_name='Orig dataset split to sizes',
        task_type=Task.TaskTypes.data_processing,
        output_uri=True,  # auto save everything to ClearML Free
    )

    cfg = DataSplitConf()
    task.connect(cfg, 'dataset split config')

    if cfg.cloud_queue is not None and len(cfg.cloud_queue):
        task.execute_remotely(cfg.cloud_queue)

    input_dataset = Dataset.get(dataset_id=cfg.input_dataset_id)
    input_dataset_folder = input_dataset.get_local_copy()

    # going to do some pruning relative to this folder
    all_subfolders = [d for d in Path(input_dataset_folder).iterdir() if d.is_dir()]
    all_subfolders_rel = [d.relative_to(input_dataset_folder) for d in all_subfolders]

    # prepare an artifact for upload
    results = {image_size: {'train': '', 'val': '', 'norm_info': {}}
               for image_size in cfg.image_size_values}

    for image_size in cfg.image_size_values:
        dataset_name = f"{cfg.dataset_name}_{image_size}x{image_size}_"

        train_files, validation_files = \
            extract_relevant_filenames(input_dataset_folder, image_size)
Example #15
        cfg.dataset_metadata_artifact_name]
    metadata = artifact.get()

    for image_size, meta in metadata.items():
        print(f'processing {image_size}...')
        # get augmentations - including mean pixel value
        norm_info = meta['norm_info']
        # get dataset id's
        train_dataset_id = meta.get('train', "")
        valid_dataset_id = meta.get('val', "")
        if not len(train_dataset_id) or not len(valid_dataset_id):
            raise ValueError('Preprocess error: could not find'
                             f' datasets for image size {image_size}')
        # download dataset (cached!)
        try:
            train_dataset_folder = Dataset.get(
                dataset_id=train_dataset_id).get_local_copy()
            valid_dataset_folder = Dataset.get(
                dataset_id=valid_dataset_id).get_local_copy()
        except ValueError as ex:
            raise ValueError(
                f'Preprocess error for datasets for image size {image_size}\n{ex}'
            )

        train_image_paths = [
            f for f in Path(train_dataset_folder).glob('**/*.jp*g')
        ]
        valid_image_paths = [
            f for f in Path(valid_dataset_folder).glob('**/*.jp*g')
        ]

        # show some images
Example #16
import pickle

import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression

from clearml import Task, Dataset

# Connecting ClearML
task = Task.init(project_name="uchicago", task_name="training v1")

# get dataset with split/test
dataset = Dataset.get(dataset_project='uchicago', dataset_name='dataset2')

# get a read only version of the data
dataset_folder = dataset.get_local_copy()

# open the dataset pickle file
with open(dataset_folder + '/iris_dataset.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# train the model
model = LogisticRegression(solver='liblinear', multi_class='auto')
model.fit(X_train, y_train)

# store the trained model
joblib.dump(model, 'model.pkl', compress=True)

# print model prediction results
result = model.score(X_test, y_test)
print(result)
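
# A hedged sketch (not part of the original snippet): the test score could also be reported to the ClearML
# task as a scalar so it appears in the experiment UI.
task.get_logger().report_scalar(title="score", series="test", value=result, iteration=0)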