Example #1
    def prepare_data(self):
        download_dataset(self.hparams.data_name, self.hparams.data_root)
        if self.hparams.data_name == "METABRIC":
            alldata = import_dataset_METABRIC(self.hparams.data_root)
        elif self.hparams.data_name == "SYNTHETIC":
            alldata = import_dataset_SYNTHETIC(self.hparams.data_root)
        full_dataset = SurvivalDataset(alldata)

        # make sure the validation set is balanced
        full_indices = range(len(full_dataset))
        full_targets = full_dataset.label
        train_indices, test_indices = train_test_split(full_indices,
                                                       test_size=0.2,
                                                       stratify=full_targets)
        train_indices, val_indices = train_test_split(
            train_indices, test_size=0.2, stratify=full_targets[train_indices])
        train_dataset, val_dataset, test_dataset = (Subset(
            full_dataset, train_indices), Subset(
                full_dataset, val_indices), Subset(full_dataset, test_indices))

        self.full_dataset = full_dataset
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.train_indices = train_indices
        self.val_indices = val_indices
        self.test_indices = test_indices
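
As a side note on the stratify argument used above: it keeps each split's label distribution close to the full dataset's. The following is a minimal, self-contained sketch (synthetic labels, not taken from the example) showing the same two-stage split producing roughly 64/16/20 train/val/test with a preserved class ratio.

import numpy as np
from sklearn.model_selection import train_test_split

# Synthetic, imbalanced binary labels: about 20% positives.
labels = np.array([1] * 20 + [0] * 80)
indices = np.arange(len(labels))

# Split off 20% for test, then 20% of the remainder for validation.
train_idx, test_idx = train_test_split(indices, test_size=0.2, stratify=labels)
train_idx, val_idx = train_test_split(train_idx, test_size=0.2,
                                      stratify=labels[train_idx])

# Each split keeps the ~20% positive rate of the full label set.
for name, idx in [("train", train_idx), ("val", val_idx), ("test", test_idx)]:
    print(name, len(idx), round(labels[idx].mean(), 2))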
Example #2
def load_twins(dataroot=DATA_FOLDER,
               data_format=NUMPY,
               return_sketchy_ites=False,
               return_sketchy_ate=False,
               observe_sketchy_counterfactuals=False):
    """
    Load the Twins dataset

    :param dataroot: path to folder for data
    :param data_format: NUMPY or PANDAS; selects the container type of the returned values
    :param return_sketchy_ites: if True, return sketchy ITEs
    :param return_sketchy_ate: if True, return sketchy ATE
    :param observe_sketchy_counterfactuals: TODO
    :return: dictionary of results
    """
    if observe_sketchy_counterfactuals:
        raise NotImplementedError('Let Brady know if you need this.')

    download_dataset(TWINS_URL,
                     'Twins',
                     dataroot=dataroot,
                     filename=TWINS_FILENAME)
    full_df = pd.read_csv(os.path.join(dataroot, TWINS_FILENAME), index_col=0)

    if data_format == NUMPY:
        d = {
            'w':
            full_df.drop(['T', 'y0', 'y1', 'yf', 'y_cf', 'Propensity'],
                         axis='columns').to_numpy(),
            't':
            full_df['T'].to_numpy(),
            'y':
            full_df['yf'].to_numpy()
        }
    elif data_format == PANDAS:
        d = {
            'w':
            full_df.drop(['T', 'y0', 'y1', 'yf', 'y_cf', 'Propensity'],
                         axis='columns'),
            't':
            full_df['T'],
            'y':
            full_df['yf']
        }
    else:
        raise ValueError('Unknown data_format: {}'.format(data_format))

    if return_sketchy_ites or return_sketchy_ate:
        ites = full_df['y1'] - full_df['y0']
        ites_np = ites.to_numpy()
        if return_sketchy_ites:
            d['ites'] = ites if data_format == PANDAS else ites_np
        if return_sketchy_ate:
            d['ate'] = ites_np.mean()

    return d
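
A hypothetical usage sketch for the function above (it assumes load_twins and its DATA_FOLDER/NUMPY constants are importable exactly as defined in that module; the key names come from the dictionary it builds):

# Hypothetical usage of load_twins as defined above.
data = load_twins(return_sketchy_ites=True, return_sketchy_ate=True)

w, t, y = data['w'], data['t'], data['y']  # covariates, treatment, observed outcome
print(w.shape, t.shape, y.shape)
print('ITEs:', data['ites'][:5])
print('naive ATE (mean of y1 - y0):', data['ate'])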
Example #3
def main(_):
    if config.mode == "prepare":
        download_dataset(config)
    elif config.mode == "train":
        train(config)
    elif config.mode == "debug":
        config.epochs = 2
        train(config, debug=True)
    elif config.mode == "inpaint":
        config.batch_size = 5
        inpaint(config)
    elif config.mode == "impute":
        pass
Example #4
def main(_):
  config = flags.FLAGS
  if config.mode == "train":
    assert config.dataset in ("mnist", "cifar10")
    config.in_shape = (config.batch_size, 32, 32, 3)
    config.block_list = [eval(x) for x in config.block_list]
    config.stride_list = [eval(x) for x in config.stride_list]
    config.channel_list = [eval(x) for x in config.channel_list]

    train(config)
  elif config.mode == "debug":
    config.train_steps = 1
    config.viz_steps = 1
    config.block_list = [2, 2, 2]
    config.channel_list = [3, 4, 5]
    config.stride_list = [1, 1, 2]
    config.in_shape = (config.batch_size, 28, 28, 1)
    train(config, debug=True)
  elif config.mode == "prepare":
    download_dataset(config)
  elif config.mode == "sn":
    test_spectral_norm()
  elif config.mode == "iresnet":
    test_iresnet()
  elif config.mode == "trace":
    test_trace_approximation()
  elif config.mode == "inverse":
    test_block_inversion()
  elif config.mode == "squeeze":
    test_squeeze()
  elif config.mode == "trace_sn":
    test_trace_sn()
  elif config.mode == "generate":
    generate(config)
  elif config.mode == "reconstruct":
    reconstruct(config)
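
In the second main above, eval(x) converts string-valued flags such as "2" or "[2, 2, 2]" into Python objects. Purely as a sketch (not part of the original code), ast.literal_eval does the same conversion while only accepting literals:

import ast

# Sketch only: literal_eval parses literals without evaluating arbitrary code.
block_list = [ast.literal_eval(x) for x in ["2", "2", "2"]]
stride_list = [ast.literal_eval(x) for x in ["1", "1", "2"]]
print(block_list, stride_list)  # [2, 2, 2] [1, 1, 2]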
Example #5
def main(*args):
  base_model_path = "/tmp/jax2tf/tf_js_quickdraw"
  dataset_path = os.path.join(base_model_path, "data")
  classes = utils.download_dataset(dataset_path, NB_CLASSES)
  assert len(classes) == NB_CLASSES, classes
  print(f"Classes are: {classes}")
  train_ds, test_ds = utils.load_classes(dataset_path, classes)
  flax_params = train(train_ds, test_ds, classes)

  model_dir = os.path.join(base_model_path, "saved_models")
  # the model must be converted with with_gradient set to True to be able to
  # convert the saved model to TF.js, as "PreventGradient" is not supported
  saved_model_lib.convert_and_save_model(predict, flax_params, model_dir,
                             input_signatures=[tf.TensorSpec([1, 28, 28, 1])],
                             with_gradient=True, compile_model=False,
                             enable_xla=False)
  conversion_dir = os.path.join(base_model_path, 'tfjs_models')
  convert_tf_saved_model(model_dir, conversion_dir)
Example #6
def load_ihdp_datasets(split='train', n_realizations=100, dataroot=None):
    """
    Load the IHDP data with the nonlinear response surface ("B") that was used
    by Shalit et al. (2017). Description of variables:
        x: covariates (25: 6 continuous and 19 binary)
        t: treatment (binary)
        yf: "factual" (observed) outcome
        ycf: "counterfactual" outcome (random)
        mu0: noiseless potential outcome under control
        mu1: noiseless potential outcome under treatment
        ate: I guess just what was reported in the Hill (2011) paper...
            Not actually accurate. The actual SATEs for the data are the
            following (using (mu1 - mu0).mean()):
                train100:   4.54328871735309
                test100:    4.269906127209613
                all100:     4.406597422281352

                train1000:  4.402550421661204
                test1000:   4.374712690625632
                all1000:    4.388631556143418
        yadd: ???
        ymul: ???

    :param split: 'train', 'test', or 'all'
    :param n_realizations: 100 or 1000 (the two options that the data source provides)
    :return: NpzFile with all the data ndarrays in the 'f' attribute
    """
    split = split.lower()
    if split not in SPLIT_OPTIONS:
        raise ValueError(
            'Invalid "split" option {} ... valid options: {}'.format(
                split, SPLIT_OPTIONS))
    if isinstance(n_realizations, str):
        n_realizations = int(n_realizations)
    if n_realizations not in N_REALIZATIONS_OPTIONS:
        raise ValueError(
            'Invalid "n_realizations" option {} ... valid options: {}'.format(
                n_realizations, N_REALIZATIONS_OPTIONS))
    if n_realizations == 100:
        if split == 'train' or split == 'all':
            path = download_dataset(IHDP_100_TRAIN_URL,
                                    'IHDP train 100',
                                    dataroot=dataroot)
            train = np.load(path)
        if split == 'test' or split == 'all':
            path = download_dataset(IHDP_100_TEST_URL,
                                    'IHDP test 100',
                                    dataroot=dataroot)
            test = np.load(path)
    elif n_realizations == 1000:
        if split == 'train' or split == 'all':
            path = download_dataset(IHDP_1000_TRAIN_URL,
                                    'IHDP train 1000',
                                    dataroot=dataroot)
            unzip_path = unzip(path)
            train = np.load(unzip_path)
        if split == 'test' or split == 'all':
            path = download_dataset(IHDP_1000_TEST_URL,
                                    'IHDP test 1000',
                                    dataroot=dataroot)
            unzip_path = unzip(path)
            test = np.load(unzip_path)

    if split == 'train':
        return train
    elif split == 'test':
        return test
    elif split == 'all':
        return train, test
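
A hypothetical usage sketch for load_ihdp_datasets (it assumes the URL constants, SPLIT_OPTIONS, and N_REALIZATIONS_OPTIONS referenced above exist in the same module; array names follow the docstring):

# Hypothetical usage of load_ihdp_datasets as defined above.
train, test = load_ihdp_datasets(split='all', n_realizations=100)
print(train.files)  # names of the arrays stored in the NpzFile

# SATE computed as in the docstring: (mu1 - mu0).mean()
print('train SATE:', (train['mu1'] - train['mu0']).mean())  # ~4.543 per the docstring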
Example #7
    return best_acc


if __name__ == "__main__":
    start = time.time()
    if args.model == 'lstmcnn':
        vocab_list = list(
            """abcdefghijklmnopqrstuvwxyzABSCEFGHIJKLMNOPQRSTUVWXYZ0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} """
        )
    else:
        vocab_list = list(
            """abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} """
        )

    print('==> download dataset ' + args.dataset)
    download_dataset(args.data_path)

    print('==> make dataset')
    train_dataset = TextDataset(args.data_path,
                                args.seq_length,
                                vocab_list,
                                is_train=True)
    test_dataset = TextDataset(args.data_path,
                               args.seq_length,
                               vocab_list,
                               is_train=False)
    train_loader = data_utils.DataLoader(train_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         num_workers=args.num_workers)
    test_loader = data_utils.DataLoader(test_dataset,
Example #8
def main():
    download_dataset()
    vectorize_dataset()
    train_model()
    evaluate_model()
Example #9
def main(benchmark, force_download, overwrite):
    config_path = os.path.join('benchmarks', benchmark, 'config.yml')
    config = resolve_config(config_path)
    source_folder = config.sources.root

    videos = scan_videos(source_folder, '**')

    if len(videos) == 0 or force_download:
        download_dataset(source_folder, url='https://winnowpre.s3.amazonaws.com/augmented_dataset.tar.xz')
        videos = scan_videos(source_folder, '**')
        print(f'Videos found after download: {len(videos)}')

    if len(videos) > 0:
        print('Video files found. Checking for existing signatures...')

        signatures_path = os.path.join(config.repr.directory,
                                       'video_signatures', '**',
                                       '**.npy')
        signatures = glob(signatures_path, recursive=True)

        if len(signatures) == 0 or overwrite:
            # Extract signatures and labels
            command = f'python extract_features.py -cp {config_path}'
            command = shlex.split(command)
            subprocess.run(command, check=True)

        # Check if signatures were generated properly
        signatures = glob(signatures_path, recursive=True)
        assert len(signatures) > 0, 'No signature files were found.'

        available_df = pd.read_csv(
            os.path.join('benchmarks', benchmark, 'labels.csv'))
        frame_level = glob(os.path.join(config.repr.directory,
                                        'frame_level', '**',
                                        '**.npy'), recursive=True)

        signatures_permutations = get_frame_sampling_permutations(
            list(range(1, 6)), frame_level)

        scoreboard = dict()
        for fs, sigs in signatures_permutations.items():
            results_analysis = dict()
            for r in np.linspace(0.1, 0.25, num=10):
                results = []
                for i in range(5):
                    mAP, pr_curve = get_result(available_df,
                                               sigs,
                                               ratio=r,
                                               file_index=frame_level)
                    results.append(mAP)
                results_analysis[r] = results
            scoreboard[fs] = results_analysis

        with open('benchmarks/scoreboard.json', 'w') as results_file:
            json.dump(scoreboard, results_file)
        print('Saved scoreboard on {}'.format('benchmarks/scoreboard.json'))

    else:
        print(f'Please review the dataset (@ {source_folder})')
Example #10
def download():
    """Download and prepare the dataset for training, using the kiwi Challenge data structure."""
    ut.download_dataset()
Example #11
def initialize_model():
    model = download_model(model_name)
    # normalize to the mean and standard deviation of the ImageNet dataset
    preprocessing = dict(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225],
                         axis=-3)
    return PyTorchModel(model, bounds=(0, 1), preprocessing=preprocessing)


if __name__ == "__main__":
    model = initialize_model()

    # download the dataset and evaluate the model
    images, labels = download_dataset(base_path=data_path,
                                      examples=examples,
                                      data_format=model.data_format,
                                      bounds=model.bounds,
                                      dimension=(224, 224))

    # accuracy on the unmodified data
    initial_accuracy = accuracy(model, images, labels)

    # run the attack for each epsilon and measure the attack time
    atak = FGSM()
    duration = {}
    start_attack_time = time.time()
    modified_images = []
    for epsilon in epsilons:
        modified_images.append(atak(model, images, labels, epsilon))
    end_attack_time = time.time()
    duration["czas_ataku__fgsm"] = round(end_attack_time - start_attack_time,
Example #12
    weight_prune_perc=tune.grid_search([0, 0.1, 0.2, 0.3, 0.4]),
    grad_prune_perc=tune.grid_search([0, 0.1, 0.2, 0.3, 0.4]),
)

tune_config = dict(
    name="SET_DSNN_GS1",
    num_samples=1,
    local_dir=os.path.expanduser("~/nta/results"),
    config=exp_config,
    checkpoint_freq=0,
    checkpoint_at_end=False,
    stop={"training_iteration": 300},
    resources_per_trial={
        "cpu": 1,
        "gpu": 1
    },
    loggers=DEFAULT_LOGGERS,
    verbose=1,
)

# TODO: automatically count how many GPUs there are

# override when running locally for testing
if not torch.cuda.is_available():
    exp_config["device"] = "cpu"
    tune_config["resources_per_trial"] = {"cpu": 1}

download_dataset(exp_config)
ray.init()
tune.run(Trainable, **tune_config)
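
For orientation, the two tune.grid_search axes in exp_config span a 5 x 5 Cartesian product, so with num_samples=1 this launches 25 trial configurations. A minimal sketch of that cross product, independent of Ray:

from itertools import product

weight_prune_perc = [0, 0.1, 0.2, 0.3, 0.4]
grad_prune_perc = [0, 0.1, 0.2, 0.3, 0.4]

# Ray Tune expands grid_search parameters into their Cartesian product.
trials = list(product(weight_prune_perc, grad_prune_perc))
print(len(trials))  # 25 configurations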
Example #13
# -*- coding: utf-8 -*-

import os
import matplotlib.pyplot as plt
import numpy as np

from utils import download_dataset


def save_org_images(path, filename, images_org):
    images_org = np.transpose(images_org, axes=(0, 2, 3, 1))
    for i, image in enumerate(images_org):
        plt.imsave(path + os.sep + f"{filename}_{i+1}_original.jpg", image.numpy())


data_path = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)), "imagenet"))
destination_path = data_path + os.sep + "preprocessed"

images, _ = download_dataset(base_path = data_path,
                             examples = 1000,
                             data_format = "channels_first",
                             bounds = (0, 1),
                             dimension = (224, 224))

save_org_images(destination_path, "imagenet", images)
Example #14
from train_gan import train, get_arguments
import torch
from utils import download_blob, upload_local_directory_to_gcs, download_dataset
from google.cloud import storage

if __name__ == "__main__":
    args = get_arguments()
    print("on GPU:", torch.cuda.is_available())
    print(args)
    print("downloading dataset...")
    download_dataset()
    print("dataset downloaded")

    print("training dataset...")
    train(args)
    print("training done")

    print("uploading results...")
    storage_client = storage.Client()
    bucket = storage_client.bucket("semantic_inpainting")
    upload_local_directory_to_gcs("./checkpoints", bucket, "checkpoints")
    print("results uploaded")