Example #1
def get_movielens(root, version='ml-20m'):
    """Download the MovieLens data if it doesn't exist."""

    urls = {
        'ml-latest':
        'http://files.grouplens.org/datasets/movielens/ml-latest.zip',
        'ml-100k': 'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
        'ml-1m': 'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
        'ml-10m': 'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
        'ml-20m': 'http://files.grouplens.org/datasets/movielens/ml-20m.zip'
    }

    assert version in urls.keys(), f"version must be one of {set(urls.keys())}"
    raw_folder = os.path.join(root, version, 'raw')
    processed_folder = os.path.join(root, version, 'processed')
    makedir_exist_ok(raw_folder)
    makedir_exist_ok(processed_folder)

    # download files and extract
    filename = urls[version].rpartition('/')[2]
    print('Downloading...')
    download_url(urls[version], root=raw_folder, filename=filename, md5=None)
    print('Extracting...')
    extract_file(os.path.join(raw_folder, filename), processed_folder)
    print('Done!')
    return Path(os.path.join(processed_folder, version))
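A minimal usage sketch for the helper above (the import path movielens_data is hypothetical; adjust it to wherever get_movielens is defined):

from movielens_data import get_movielens  # hypothetical module name

data_dir = get_movielens('./data', version='ml-100k')  # returns a pathlib.Path
print(sorted(p.name for p in data_dir.iterdir()))      # list the extracted files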
Example #2
    def __init__(self,
                 root,
                 transform=None,
                 target_transform=None,
                 download=False):
        super(Caltech256, self).__init__(os.path.join(root, 'caltech256'))
        makedir_exist_ok(self.root)
        self.transform = transform
        self.target_transform = target_transform

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError('Dataset not found or corrupted.' +
                               ' You can use download=True to download it')

        self.categories = sorted(
            os.listdir(os.path.join(self.root, "256_ObjectCategories")))
        self.index = []
        self.y = []
        for (i, c) in enumerate(self.categories):
            n = len(
                os.listdir(os.path.join(self.root, "256_ObjectCategories", c)))
            self.index.extend(range(1, n + 1))
            self.y.extend(n * [i])
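For reference, a __getitem__ that consumes the index/y bookkeeping built above could look like the sketch below (closely following torchvision's Caltech256; it assumes `from PIL import Image` at module level and the standard Caltech-256 file naming):

    def __getitem__(self, index):
        # Caltech-256 images are named like 001_0001.jpg inside each category folder
        img = Image.open(os.path.join(self.root, "256_ObjectCategories",
                                      self.categories[self.y[index]],
                                      "{:03d}_{:04d}.jpg".format(self.y[index] + 1,
                                                                 self.index[index])))
        target = self.y[index]
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return img, target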
Example #3
    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""

        if self._check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        for url in self.urls:
            filename = url.rpartition('/')[2]
            download_and_extract_archive(url, download_root=self.raw_folder, filename=filename)

        # process and save as torch files
        print('Processing...')

        training_set = (
            read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')
Example #4
    def download(self):
        """Downloads the amazon reviews dataset from the internet and preprocess it."""
        if self.check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        filename = 'amazon_cells_labelled.txt'
        download_url(self.resource, self.raw_folder, filename=filename)

        print('Processing...')

        train_data, test_data, train_targets, test_targets = self.read_csv(
            filename)

        # Converts array data into pytorch tensors
        train_tensor = self.vectorizer(train_data)
        train_targets = torch.from_numpy(np.array(train_targets)).long()
        test_tensor = self.vectorizer(test_data)
        test_targets = torch.from_numpy(np.array(test_targets)).long()

        with open(os.path.join(self.processed_folder, self.training_file),
                  'wb') as f:
            torch.save((train_tensor, train_targets, self.vectorizer), f)
        with open(os.path.join(self.processed_folder, self.test_file),
                  'wb') as f:
            torch.save((test_tensor, test_targets, self.vectorizer), f)

        print('Done!')
Example #5
    def download(self):
        """Download 2DShapesStructure data if it doesn't exist in processed_folder already."""

        if self._check_raw_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        for url_title, url in self.urls.items():
            filename = url.rpartition('/')[2]
            file_path = os.path.join(self.raw_folder, filename)
            download_url(url, root=self.raw_folder)
            if filename.endswith('.zip'):
                self.extract_zip(zip_path=file_path, remove_finished=True)
            print(f'Fetched {filename}.')

        # training_set = (
        #     self.read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')),
        #     self.read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte'))
        # )
        # test_set = (
        #     self.read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')),
        #     self.read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte'))
        # )
        # with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f:
        #     torch.save(training_set, f)
        # with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f:
        #     torch.save(test_set, f)

        print('Done!')
Example #6
    def download(self):
        if self.check_exists():
            return True
        else:
            makedir_exist_ok(self.directory)
            for url, md5, filename in self.resources_url:
                download_and_extract_archive(url,
                                             download_root=self.directory,
                                             filename=filename,
                                             md5=md5)
            return False
Example #7
    def download(self):

        if self._check_exists():
            return

        makedir_exist_ok(self.root)

        print('Downloading Synthetic Digits...')
        gdown.download(self.url, os.path.join(self.root, self.filename), quiet=False)

        # untar the compressed files
        unpack_archive(os.path.join(self.root, self.filename), self.root)
Example #8
    def __init__(self,
                 root,
                 target_type="category",
                 train=True,
                 transform=None,
                 target_transform=None,
                 download=False):
        super(Caltech101, self).__init__(os.path.join(root, 'caltech101'))
        self.train = train
        self.dir_name = '101_ObjectCategories_split/train' if self.train else '101_ObjectCategories_split/test'

        makedir_exist_ok(self.root)
        if isinstance(target_type, list):
            self.target_type = target_type
        else:
            self.target_type = [target_type]
        self.transform = transform
        self.target_transform = target_transform

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError('Dataset not found or corrupted.' +
                               ' You can use download=True to download it')

        self.categories = sorted(
            os.listdir(os.path.join(self.root, "101_ObjectCategories")))
        self.categories.remove("BACKGROUND_Google")  # this is not a real class

        # For some reason, the category names in "101_ObjectCategories" and
        # "Annotations" do not always match. This is a manual map between the
        # two. Defaults to using same name, since most names are fine.
        name_map = {
            "Faces": "Faces_2",
            "Faces_easy": "Faces_3",
            "Motorbikes": "Motorbikes_16",
            "airplanes": "Airplanes_Side_2"
        }
        self.annotation_categories = list(
            map(lambda x: name_map[x]
                if x in name_map else x, self.categories))

        self.index = []
        self.y = []
        for (i, c) in enumerate(self.categories):
            file_names = os.listdir(os.path.join(self.root, self.dir_name, c))
            n = len(file_names)
            self.index.extend(file_names)
            self.y.extend(n * [i])

        print(self.train, len(self.index))
Example #9
def download_url(url, root, filename=None, md5=None):
    """Download a file from a url and place it in root.
    Args:
        url (str): URL to download file from
        root (str): Directory to place downloaded file in
        filename (str, optional): Name to save the file under. If None, use the basename of the URL
        md5 (str, optional): MD5 checksum of the download. If None, do not check
    """
    from six.moves import urllib

    root = os.path.expanduser(root)
    if not filename:
        filename = os.path.basename(url)
    fpath = os.path.join(root, filename)

    makedir_exist_ok(root)

    # check if file is already present locally
    if check_integrity(fpath, md5):
        print('Using downloaded and verified file: ' + fpath)
    else:  # download the file
        try:
            if 'dropbox' in url:
                # Handle dropbox links differently
                import requests
                headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
                r = requests.get(url, stream=True, headers=headers)
                with open(fpath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
            elif 'Manual' in url:
                raise urllib.error.URLError(url)
            else:
                print('Downloading ' + url + ' to ' + fpath)
                urllib.request.urlretrieve(url,
                                           fpath,
                                           reporthook=gen_bar_updater())
        except (urllib.error.URLError, IOError) as e:
            if url[:5] == 'https':
                url = url.replace('https:', 'http:')
                print('Failed download. Trying https -> http instead.'
                      ' Downloading ' + url + ' to ' + fpath)
                urllib.request.urlretrieve(url,
                                           fpath,
                                           reporthook=gen_bar_updater())
            else:
                raise e
        # check integrity of downloaded file
        if not check_integrity(fpath, md5):
            raise RuntimeError("File not found or corrupted.")
Example #10
    def download(self):
        """Download the COIL20 data if it doesn't exist already."""
        # download files

        if self._check_exists():
            return

        makedir_exist_ok(self.unprocessed_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        url, filename = self.type_list['processed']
        download_url(url, root=self.processed_folder, filename=filename)

        url, filename = self.type_list['unprocessed']
        download_url(url, root=self.unprocessed_folder, filename=filename)
Example #11
    def download(self):
        """Download the MNIST data if it doesn't exist in processed_folder already."""

        if self._check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        for url in self.resources:
            filename = url.rpartition('/')[2]
            download_and_extract_archive(url,
                                         download_root=self.raw_folder,
                                         filename=filename,
                                         md5=None)
        print('Downloaded!')
Example #12
    def download(self):
        """Download the weather data if it doesn't exist in data folder already."""

        if self._check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        for url in self.urls:
            filename = self.__class__.__name__.lower() + '.csv'
            download_url(url, root=self.raw_folder, filename=filename)

        # process and save as torch files
        print('Processing...')
        filepath = os.path.join(self.raw_folder, filename)
        df = self.process(filepath)

        # Get dataset statistics
        mean_per_day = df.groupby(df.index.dayofyear).mean()
        std_per_day = df.groupby(df.index.dayofyear).std()

        # Split into training and testing
        train, test = {}, {}
        for variable, column in self.variables.items():
            df_variable = df[[column, 'dayofyear', 'year']].dropna()
            train_var = df_variable.groupby(
                'dayofyear', as_index=False).apply(lambda x: x.sample(
                    min(self.num_years_train, len(x)))).droplevel(0)
            test_var = df_variable.drop(train_var.index)
            train[variable] = train_var
            test[variable] = test_var

        # Save data
        training_set = (train, mean_per_day, std_per_day)
        test_set = (test, mean_per_day, std_per_day)
        with open(os.path.join(self.processed_folder, self.training_file),
                  'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.processed_folder, self.test_file),
                  'wb') as f:
            torch.save(test_set, f)

        print('Done!')
Example #13
def get_criteo(root):
    """Download the Criteo data if it doesn't exist."""

    url = 'https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz'

    raw_folder = os.path.join(root, 'criteo', 'raw')
    processed_folder = os.path.join(root, 'criteo', 'processed')
    makedir_exist_ok(raw_folder)
    makedir_exist_ok(processed_folder)

    # download files and extract
    filename = url.rpartition('/')[2]
    print('Downloading...')
    download_url(url, root=raw_folder, filename=filename, md5=None)
    print('Extracting...')
    extract_file(os.path.join(raw_folder, filename), processed_folder)
    print('Done!')
    return Path(processed_folder)
Example #14
    def download(self):
        
        if self._check_exists():
            return
        
        makedir_exist_ok(os.path.join(self.root, self.raw_folder))
        makedir_exist_ok(os.path.join(self.root, self.processed_folder))

        
        print('Downloading ' + self.url)
        
        filename = self.url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        download_and_extract_archive(self.url, download_root=os.path.join(self.root, self.raw_folder), filename=filename)

        # process and save as torch files
        print('Processing...')
        # load MNIST-M images from pkl file
        with open(file_path.replace('.gz', ''), "rb") as f:
            mnist_m_data = pickle.load(f, encoding='bytes')
        mnist_m_train_data = torch.ByteTensor(mnist_m_data[b'train'])
        mnist_m_test_data = torch.ByteTensor(mnist_m_data[b'test'])

        # get MNIST labels
        mnist_train_labels = datasets.MNIST(root=self.mnist_root,
                                            train=True,
                                            download=True).train_labels
        mnist_test_labels = datasets.MNIST(root=self.mnist_root,
                                           train=False,
                                           download=True).test_labels

        # save MNIST-M dataset
        training_set = (mnist_m_train_data, mnist_train_labels)
        test_set = (mnist_m_test_data, mnist_test_labels)
        with open(os.path.join(self.root,
                               self.processed_folder,
                               self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.root,
                               self.processed_folder,
                               self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')
Example #15
    def download(self):
        """Download the FEMNIST data if it doesn't exist in processed_folder already."""
        import shutil

        if self._check_exists():
            return

        utils.makedir_exist_ok(self.raw_folder)
        utils.makedir_exist_ok(self.processed_folder)

        # download files
        for url, md5 in self.resources:
            filename = url.rpartition('/')[2]
            utils.download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5)

        # process and save as torch files
        print('Processing...')
        shutil.move(os.path.join(self.raw_folder, self.training_file), self.processed_folder)
        shutil.move(os.path.join(self.raw_folder, self.test_file), self.processed_folder)
Example #16
    def load_dataset(self):
        img_filename = "train-images-idx3-ubyte.gz" if self.train\
            else "t10k-images-idx3-ubyte.gz"
        lbl_filename = "train-labels-idx1-ubyte.gz" if self.train\
            else "t10k-labels-idx1-ubyte.gz"

        # Download data if not exist
        makedir_exist_ok(self.root)
        img_filepath = self.download(img_filename)
        lbl_filepath = self.download(lbl_filename)

        # Load image data
        with gzip.open(img_filepath, "rb") as f:
            self.img_data = np.frombuffer(f.read(), np.uint8, offset=16)
        self.img_data = self.img_data.reshape(-1, 1, 28, 28)

        # Load label data
        with gzip.open(lbl_filepath, "rb") as f:
            self.lbl_data = np.frombuffer(f.read(), np.uint8, offset=8)
Example #17
    def download_and_process_data(self):
        # if we need to download the all_data.
        if self.download:
            # create the root dir.
            data_utils.makedir_exist_ok(self.root)
            self.all_data_path = os.path.join(self.root, "all_data")
            self.all_data_tgz_file = self.all_data_path + ".tgz"

            if not os.path.exists(self.all_data_tgz_file):
                warnings.warn(
                    "The compresssed file is missing. It will take a while (at least hours) to download, uncompress and process the data."
                )

                # download and uncompress data.
                print("download and extract archive.")
                for name, url in self.download_urls.items():
                    torchvision.datasets.utils.download_and_extract_archive(
                        url=url,
                        download_root=self.root,
                        filename=name,
                        md5=None)

                # process the data.
                self._process_data(self.all_data_path)
            else:
                print("Files already downloaded.")
                if not os.path.exists(self.all_data_path):
                    tar_decompress_folder(self.all_data_tgz_file)

        # perform sampling from the all_data and remove invalid information.
        data_path = self._sample_data()
        # data_path = self._remove_invalid_user(data_path)

        # split the dataset.
        splitted_data_paths = self._split_data(data_path)

        # display the stat of the train/test data.
        (
            self.data_path,
            self.meta_data_path,
            self.meta_data,
        ) = self._load_meta_data_and_display_stat(splitted_data_paths)
Example #18
    def download(self):
        "Download MNIST-M if it does not exists and put into processed folder"

        # import packages
        import gzip
        import pickle
        from torchvision import datasets

        if self._check_exists():
            return

        makedir_exist_ok(self.raw_folder)
        makedir_exist_ok(self.processed_folder)

        # download files
        filename = self.url.rpartition('/')[2]
        file_path = os.path.join(self.raw_folder, filename)
        download_and_extract_archive(self.url, download_root=self.raw_folder)

        # process and save as torch files
        print('Processing...')

        # load MNIST-M images from pkl file
        with open(file_path.replace('.gz', ''), 'rb') as f:
            mnistm_m_data = pickle.load(f, encoding='bytes')

        mnistm_m_train_data = torch.ByteTensor(mnistm_m_data[b"train"])
        mnistm_m_test_data = torch.ByteTensor(mnistm_m_data[b"test"])

        # get MNIST labels
        mnist_train_labels = MNIST(root=self.mnist_root, train=True, download=True).train_labels
        mnist_test_labels = MNIST(root=self.mnist_root, train=False, download=True).test_labels

        # save MNIST-M dataset
        training_set = (mnistm_m_train_data, mnist_train_labels)
        test_set = (mnistm_m_test_data, mnist_test_labels)
        with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')
Example #19
    def __init__(self, root, transform, download):
        self.root = root
        self.transform = transform
        self.download = download

        self.training_file = 'training.pt'
        self.training_dir_path = os.path.join(self.root,
                                              self.__class__.__name__)
        self.training_file_path = os.path.join(self.training_dir_path,
                                               self.training_file)

        if not os.path.exists(self.training_file_path):
            print("generating extended training data...")
            makedir_exist_ok(self.training_dir_path)

            self.data, self.targets = self.generate_extended_data()

            with open(self.training_file_path, 'wb') as f:
                torch.save((self.data, self.targets), f)
        else:
            print("loading extended training data from file...")
            self.data, self.targets = torch.load(self.training_file_path)
Example #20
    def load_dataset(self, train):
        img_filename = "train-images-idx3-ubyte.gz" if train\
            else "t10k-images-idx3-ubyte.gz"
        lbl_filename = "train-labels-idx1-ubyte.gz" if train\
            else "t10k-labels-idx1-ubyte.gz"

        # Download data if not exist
        # root = '/misc/kcgscratch1/ChoGroup/resnick/vidcaps/MovingMNist/'
        makedir_exist_ok(self.root)
        img_filepath = self.download(self.root, img_filename)
        lbl_filepath = self.download(self.root, lbl_filename)

        # Load image data
        with gzip.open(img_filepath, "rb") as f:
            img_data = np.frombuffer(f.read(), np.uint8, offset=16)
        img_data = img_data.reshape(-1, 1, 28, 28)
        img_data = img_data.astype(float) / 255.

        # Load label data
        with gzip.open(lbl_filepath, "rb") as f:
            lbl_data = np.frombuffer(f.read(), np.uint8, offset=8)
        lbl_data = lbl_data.astype(int)
        return img_data, lbl_data
Example #21
def load_dataset(root, training=True):
    img_filename = "train-images-idx3-ubyte.gz" if training\
        else "t10k-images-idx3-ubyte.gz"
    lbl_filename = "train-labels-idx1-ubyte.gz" if training\
        else "t10k-labels-idx1-ubyte.gz"

    # Download data if not exist
    makedir_exist_ok(root)
    img_filepath = download(root, img_filename)
    lbl_filepath = download(root, lbl_filename)

    # Load image data
    with gzip.open(img_filepath, "rb") as f:
        imgs = np.frombuffer(f.read(), np.uint8, offset=16)
    imgs = imgs.reshape(-1, 1, 28, 28)

    # Load label data
    with gzip.open(lbl_filepath, "rb") as f:
        lbls = np.frombuffer(f.read(), np.uint8, offset=8)

    print('Loaded %d labels from %s' % (len(lbls), lbl_filename))
    print('Loaded %d images from %s' % (len(imgs), img_filename))

    return imgs, lbls
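A hedged usage sketch for load_dataset above; it assumes the download(root, filename) helper it calls (referenced but not shown) fetches the MNIST .gz files into root and returns the local file path:

imgs, lbls = load_dataset('./mnist_data', training=True)
assert imgs.shape[1:] == (1, 28, 28)
assert len(imgs) == len(lbls)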
Example #22
    def download(self):
        makedir_exist_ok(self.root)
        for url in self.urls:
            filename = url.rpartition('/')[-1]
            file_path = os.path.join(self.root, filename)
            download_url(url, root=self.root, filename=filename, md5=None)
Example #23
    def prepare_colored_mnist(self):
        colored_mnist_dir = os.path.join(self.root, 'ColoredMNIST')
        if os.path.exists(os.path.join(colored_mnist_dir, 'train1.pt')) \
            and os.path.exists(os.path.join(colored_mnist_dir, 'train2.pt')) \
            and os.path.exists(os.path.join(colored_mnist_dir, 'test.pt')):
            print('Colored MNIST dataset already exists')
            return

        print('Preparing Colored MNIST')
        train_mnist = datasets.mnist.MNIST(self.root,
                                           train=True,
                                           download=True)

        train1_set = []
        train2_set = []
        test_set = []
        for idx, (im, label) in enumerate(train_mnist):
            if idx % 10000 == 0:
                print(f'Converting image {idx}/{len(train_mnist)}')
            im_array = np.array(im)

            # Assign a binary label y to the image based on the digit
            binary_label = 0 if label < 5 else 1

            # Flip label with 25% probability
            if np.random.uniform() < 0.25:
                binary_label = binary_label ^ 1

            # Color the image either red or green according to its possibly flipped label
            color_red = binary_label == 0

            # Flip the color with a probability e that depends on the environment
            if idx < 20000:
                # 20% in the first training environment
                if np.random.uniform() < 0.2:
                    color_red = not color_red
            elif idx < 40000:
                # 10% in the second training environment
                if np.random.uniform() < 0.1:
                    color_red = not color_red
            else:
                # 90% in the test environment
                if np.random.uniform() < 0.9:
                    color_red = not color_red

            colored_arr = color_grayscale_arr(im_array, red=color_red)

            if idx < 20000:
                train1_set.append((Image.fromarray(colored_arr), binary_label))
            elif idx < 40000:
                train2_set.append((Image.fromarray(colored_arr), binary_label))
            else:
                test_set.append((Image.fromarray(colored_arr), binary_label))

            # Debug
            # print('original label', type(label), label)
            # print('binary label', binary_label)
            # print('assigned color', 'red' if color_red else 'green')
            # plt.imshow(colored_arr)
            # plt.show()
            # break

        dataset_utils.makedir_exist_ok(colored_mnist_dir)
        torch.save(train1_set, os.path.join(colored_mnist_dir, 'train1.pt'))
        torch.save(train2_set, os.path.join(colored_mnist_dir, 'train2.pt'))
        torch.save(test_set, os.path.join(colored_mnist_dir, 'test.pt'))
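The helper color_grayscale_arr used above is not shown. A plausible stand-in that follows the usual ColoredMNIST recipe (an assumption, not necessarily the author's version):

import numpy as np

def color_grayscale_arr(arr, red=True):
    """Put a 28x28 grayscale digit into the red or green channel of an RGB array."""
    h, w = arr.shape
    arr = arr.reshape(h, w, 1)
    if red:
        # digit in the red channel, green and blue left black
        return np.concatenate([arr, np.zeros((h, w, 2), dtype=np.uint8)], axis=2)
    # digit in the green channel, red and blue left black
    return np.concatenate([np.zeros((h, w, 1), dtype=np.uint8), arr,
                           np.zeros((h, w, 1), dtype=np.uint8)], axis=2)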
Example #24
    def _split_data(self, data_path):
        print(
            f"split the data for the path={data_path}, split_by_sample={self.split_by_sample}."
        )

        if not self.split_by_sample:  # i.e. we will split by user.
            # 1 pass through all the json files to instantiate arr
            # containing all possible (user, .json file name) tuples

            user_files = []
            for f in os.listdir(data_path):
                file_dir = os.path.join(data_path, f)
                with open(file_dir, "r") as inf:
                    # Load data into an OrderedDict, to prevent ordering changes
                    # and enable reproducibility
                    data = json.load(inf,
                                     object_pairs_hook=collections.OrderedDict)

                include_hierarchy = "hierarchies" in data
                if include_hierarchy:
                    user_files.extend([
                        (u, h, ns, f)
                        for (u, h,
                             ns) in zip(data["users"], data["hierarchies"],
                                        data["num_samples"])
                    ])
                else:
                    user_files.extend([
                        (u, ns, f)
                        for (u, ns) in zip(data["users"], data["num_samples"])
                    ])

            # randomly sample from user_files to pick training set users
            num_users = len(user_files)
            num_train_users = int(self.train_split_ratio * num_users)
            indices = [i for i in range(num_users)]
            train_indices = self.rng_state.sample(indices, num_train_users)
            train_blist = [False for i in range(num_users)]
            for i in train_indices:
                train_blist[i] = True

            train_user_files = []
            test_user_files = []
            for i in range(num_users):
                if train_blist[i]:
                    train_user_files.append(user_files[i])
                else:
                    test_user_files.append(user_files[i])

            # todo....
            assert False, "TODO..."
        else:
            train_data_path = data_path + "_train"
            test_data_path = data_path + "_test"
            train_meta_data_path = train_data_path + "_meta.json"
            test_meta_data_path = test_data_path + "_meta.json"

            is_finished_splitting = (os.path.exists(train_data_path)
                                     and os.path.exists(test_data_path)
                                     and len(os.listdir(train_data_path))
                                     == len(os.listdir(test_data_path))
                                     and len(os.listdir(train_data_path)) > 100
                                     and os.path.exists(train_meta_data_path)
                                     and os.path.exists(test_meta_data_path))
            if is_finished_splitting:
                print(f"exist the splitted files (exit).")
                return train_data_path, test_data_path

            print("\tsplitting the dataset into train/test by sample.")
            data_utils.makedir_exist_ok(train_data_path)
            data_utils.makedir_exist_ok(test_data_path)
            meta_train = collections.defaultdict(list)
            meta_test = collections.defaultdict(list)

            for f in os.listdir(data_path):
                file_dir = os.path.join(data_path, f)
                with open(file_dir, "r") as inf:
                    # Load data into an OrderedDict, to prevent ordering changes
                    # and enable reproducibility
                    data = json.load(inf,
                                     object_pairs_hook=collections.OrderedDict)

                print(f'\twe have {len(data["users"])} users.')
                for i, u in enumerate(data["users"]):
                    curr_num_samples = len(data["user_data"][u]["y"])
                    if curr_num_samples >= 2:
                        # ensures number of train and test samples both >= 1
                        num_train_samples = max(
                            1, int(self.train_split_ratio * curr_num_samples))
                        if curr_num_samples == 2:
                            num_train_samples = 1

                        indices = [j for j in range(curr_num_samples)]
                        train_indices = self.rng_state.sample(
                            indices, num_train_samples)
                        test_indices = [
                            i for i in range(curr_num_samples)
                            if i not in train_indices
                        ]

                        # if we have a valid train/test split.
                        if len(train_indices) >= 1 and len(test_indices) >= 1:
                            user_data_train = {"x": [], "y": []}
                            user_data_test = {"x": [], "y": []}

                            train_blist = [
                                False for j in range(curr_num_samples)
                            ]
                            test_blist = [
                                False for j in range(curr_num_samples)
                            ]
                            for j in train_indices:
                                train_blist[j] = True
                            for j in test_indices:
                                test_blist[j] = True

                            for j in range(curr_num_samples):
                                if train_blist[j]:
                                    user_data_train["x"].append(
                                        data["user_data"][u]["x"][j])
                                    user_data_train["y"].append(
                                        data["user_data"][u]["y"][j])
                                elif test_blist[j]:
                                    user_data_test["x"].append(
                                        data["user_data"][u]["x"][j])
                                    user_data_test["y"].append(
                                        data["user_data"][u]["y"][j])

                            # save the data to disk.
                            all_data_train = {
                                "user_data":
                                user_data_train,
                                "hierarchies":
                                data["hierarchies"][i]
                                if "hierarchies" in data else None,
                            }
                            all_data_test = {
                                "user_data":
                                user_data_test,
                                "hierarchies":
                                data["hierarchies"][i]
                                if "hierarchies" in data else None,
                            }
                            meta_train["users"].append(u)
                            meta_test["users"].append(u)
                            meta_train["num_samples"].append(
                                len(user_data_train["x"]))
                            meta_test["num_samples"].append(
                                len(user_data_test["x"]))

                            # save to path.
                            jump_json(
                                all_data_train,
                                os.path.join(train_data_path, f"{u}.json"),
                            )
                            jump_json(
                                all_data_test,
                                os.path.join(test_data_path, f"{u}.json"))
                print(
                    f"\tsplitted {f}. processed {len(meta_train['users'])} users."
                )
            # save the meta data to the disk.
            jump_json(meta_train, train_meta_data_path)
            jump_json(meta_test, test_meta_data_path)
            return train_data_path, test_data_path
Example #25
    def preprocess(self, fold_list):

        makedir_exist_ok(self.audio_folder)
        makedir_exist_ok(self.processed_folder)

        self._read_metadata(fold_list)

        # pre-process
        file_names = self.metadata['file_names']
        labels = self.metadata['labels']
        folders = self.metadata['folders']

        data = []
        targets = []

        start = time.time()
        for idx, (file_name, label,
                  folder) in enumerate(zip(file_names, labels, folders)):
            wav_file_path = os.path.join(self.audio_folder,
                                         "fold{}".format(folder), file_name)

            sound, sr = librosa.load(wav_file_path,
                                     mono=True,
                                     res_type='kaiser_fast')

            #             # generate temporary 4-second data
            #             tempSound = torch.zeros(4*8000) # 4sec. * 8KHz
            #             if len(sound) < 4*8000:
            #                 tempSound[:len(sound)] = torch.FloatTensor(sound[:])
            #             else:
            #                 tempSound[:] = torch.FloatTensor(sound[:4*8000])

            #             sound = tempSound
            target = label

            #             frame_length = 0.025                        # 25(ms)
            #             frame_stride = 0.010                        # 10(ms)
            #             n_fft = int(round(sr*frame_length))        # 200 (sample)
            #             hop_length = int(round(sr*frame_stride))    # 80 (sample)

            X, sample_rate = sound, sr

            # stft = np.abs(librosa.stft(X))
            # mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            # chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            # mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
            # contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
            # tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)

            melspec = librosa.feature.melspectrogram(y=X,
                                                     sr=sample_rate,
                                                     n_mels=40)

            # logspec = np.log(melspec)
            logspec = librosa.power_to_db(melspec, ref=np.max)
            log_mel = np.mean(logspec.T, axis=0)

            # S = np.hstack([mfccs, chroma, mel, contrast])
            S = np.hstack([log_mel])

            # S = np.mean(librosa.feature.melspectrogram(y=sound, n_mels=40, n_fft=n_fft, hop_length=hop_length).T, axis=0)
            # mfccs = np.mean(librosa.feature.mfcc(y=sound.numpy(), sr=sr, n_mfcc=40).T, axis=0)

            # print(S.shape, S)
            # print(mfccs.shape, mfccs)
            # break

            S = torch.FloatTensor(S)

            data.append(S)
            targets.append(target)

            end = time.time()
            if idx % 100 == 0:
                print("(%s) %04d/%04d processed. (%.4f (sec.))" %
                      ("train" if self.train else "test", idx + 1,
                       len(file_names), (end - start)))

        if self.train:
            training_set = (torch.stack(data), torch.LongTensor(targets))

            with open(os.path.join(self.processed_folder, self.training_file),
                      'wb') as f:
                torch.save(training_set, f)
        else:
            test_set = (torch.stack(data), torch.LongTensor(targets))

            with open(os.path.join(self.processed_folder, self.test_file),
                      'wb') as f:
                torch.save(test_set, f)

        print('Done!')
Example #26
    def _remove_invalid_user(self, data_path):
        if self.min_samples_per_user == 0:
            print("skip the filtering due to min_samples_per_user=0.")
            return data_path

        # build folder and filter user.
        folder_name = os.path.join(self.root, "filtered_" + data_path)
        folder_tgz = folder_name + ".tgz"

        # init.
        print(f"filter sampled data and will save to {folder_name}")
        data_utils.makedir_exist_ok(folder_name)

        is_finished_filtering = len(os.listdir(folder_name)) == len(
            os.listdir(self.all_data_path))
        if os.path.exists(folder_tgz) and is_finished_filtering:
            print("has finished filtering and compressed the sampled data.")
            return folder_name

        # start filtering.
        if not is_finished_filtering:
            for f in [f for f in os.listdir(data_path) if f.endswith(".json")]:
                users = []
                hierarchies = []
                num_samples = []
                user_data = {}

                # load the data.
                file_dir = os.path.join(data_path, f)
                data = load_json(file_dir)

                num_users = len(data["users"])
                for i in range(num_users):
                    curr_user = data["users"][i]
                    curr_hierarchy = None
                    if "hierarchies" in data:
                        curr_hierarchy = data["hierarchies"][i]
                    curr_num_samples = data["num_samples"][i]
                    if curr_num_samples >= self.min_samples_per_user:
                        user_data[curr_user] = data["user_data"][curr_user]
                        users.append(curr_user)
                        if curr_hierarchy is not None:
                            hierarchies.append(curr_hierarchy)
                        num_samples.append(data["num_samples"][i])

                # save the valid data.
                all_data = {}
                all_data["users"] = users
                if len(hierarchies) == len(users):
                    all_data["hierarchies"] = hierarchies
                all_data["num_samples"] = num_samples
                all_data["user_data"] = user_data

                file_path = os.path.join(folder_name, f)
                print(
                    f"\tsave filtered and sampled json file to the path={file_path}."
                )
                jump_json(all_data, file_path=file_path)

        print(f"save data to tgz file.")
        tar_compress_folder(folder_name)
        return folder_name
Example #27
    def _process_data(self, final_data_path):
        # get file dirs.
        # create the intermediate dir.
        intermediate_path = os.path.join(self.root, "intermediate")
        if not os.path.exists(intermediate_path):
            data_utils.makedir_exist_ok(intermediate_path)

        # extract file directories of images (by class).
        class_file_dirs_path = os.path.join(intermediate_path,
                                            "class_file_dirs.pickle")
        if not os.path.exists(class_file_dirs_path):
            print("extract file directories of images by class.")
            class_files = []  # (class, file directory)

            # init dir.
            class_dir = os.path.join(self.root, "by_class")
            classes = [c for c in os.listdir(class_dir) if len(c) == 2]

            # extract files.
            for cl in classes:
                cldir = os.path.join(class_dir, cl)
                subcls = [
                    s for s in os.listdir(cldir)
                    if (("hsf" in s) and ("mit" not in s))
                ]

                for subcl in subcls:
                    subcldir = os.path.join(cldir, subcl)
                    image_dirs = [
                        os.path.join(subcldir, i) for i in os.listdir(subcldir)
                    ]

                    for image_dir in image_dirs:
                        class_files.append((cl, image_dir))
            print(
                f"extract file by class: # of samples={len(class_files)}. saving to path={class_file_dirs_path}."
            )
            save_obj(class_files, class_file_dirs_path)

        # extract file directories of images (by user).
        writer_file_dirs_path = os.path.join(intermediate_path,
                                             "writer_file_dirs.pickle")
        if not os.path.exists(writer_file_dirs_path):
            print("extract file directories of images by writer.")
            writer_files = []  # (writer, file directory)
            writer_dir = os.path.join(self.root, "by_write")
            writer_parts = os.listdir(writer_dir)

            # init dir.
            for writer_part in writer_parts:
                writers_dir = os.path.join(writer_dir, writer_part)
                writers = os.listdir(writers_dir)

                for writer in writers:
                    _writer_dir = os.path.join(writers_dir, writer)
                    wtypes = os.listdir(_writer_dir)

                    for wtype in wtypes:
                        type_dir = os.path.join(_writer_dir, wtype)
                        images = os.listdir(type_dir)
                        image_dirs = [
                            os.path.join(type_dir, i) for i in images
                        ]

                        for image_dir in image_dirs:
                            writer_files.append((writer, image_dir))
            print(
                f"extract file by writer: # of samples={len(writer_files)}. saving to path={writer_file_dirs_path}."
            )
            save_obj(writer_files, writer_file_dirs_path)

        # get the hash.
        # get the hash for class.
        class_file_hashes_path = os.path.join(intermediate_path,
                                              "class_file_hashes.pickle")
        if not os.path.exists(class_file_hashes_path):
            # init.
            count = 0
            class_file_hashes = []
            class_file_dirs = load_obj(class_file_dirs_path)
            print("get the image hashes (by class).")

            # get the hashes.
            for tup in class_file_dirs:
                if count % 100000 == 0:
                    print("\thashed %d class images" % count)

                (cclass, cfile) = tup
                chash = hashlib.md5(open(cfile, "rb").read()).hexdigest()
                class_file_hashes.append((cclass, cfile, chash))
                count += 1
            save_obj(class_file_hashes, class_file_hashes_path)

        # get the hash for writer.
        writer_file_hashes_path = os.path.join(intermediate_path,
                                               "writer_file_hashes.pickle")
        if not os.path.exists(writer_file_hashes_path):
            # init.
            count = 0
            writer_file_hashes = []
            writer_file_dirs = load_obj(writer_file_dirs_path)
            print("get the image hashes (by writer).")

            for tup in writer_file_dirs:
                if count % 100000 == 0:
                    print("hashed %d write images" % count)

                (cclass, cfile) = tup
                chash = hashlib.md5(open(cfile, "rb").read()).hexdigest()
                writer_file_hashes.append((cclass, cfile, chash))
                count += 1
            save_obj(writer_file_hashes, writer_file_hashes_path)

        # check the hash and assign class labels to writer.
        class_file_hashes = load_obj(
            class_file_hashes_path)  # each elem is (class, file dir, hash)
        writer_file_hashes = load_obj(
            writer_file_hashes_path)  # each elem is (writer, file dir, hash)
        writer_with_class_path = os.path.join(intermediate_path,
                                              "writer_with_class.pickle")
        if not os.path.exists(writer_with_class_path):
            print("assigning class labels to writer images.")
            class_hash_dict = {}
            for i in range(len(class_file_hashes)):
                (c, f, h) = class_file_hashes[len(class_file_hashes) - i - 1]
                class_hash_dict[h] = (c, f)
            writer_classes = []
            for tup in writer_file_hashes:
                (w, f, h) = tup
                writer_classes.append((w, f, class_hash_dict[h][0]))
            save_obj(writer_classes, writer_with_class_path)

        # grouping images by writer.
        writer_class = load_obj(writer_with_class_path)
        images_by_writer_path = os.path.join(intermediate_path,
                                             "images_by_writer.pickle")
        if not os.path.exists(images_by_writer_path):
            print("write images_by_writer")
            # each entry is a (writer, [list of (file, class)]) tuple
            writers = []
            cimages = []
            (cw, _, _) = writer_class[0]
            for (w, f, c) in writer_class:
                if w != cw:
                    writers.append((cw, cimages))
                    cw = w
                    cimages = [(f, c)]
                cimages.append((f, c))
            writers.append((cw, cimages))
            # save obj.
            save_obj(writers, images_by_writer_path)

        # create for the final data json.
        # converts a list of (writer, [list of (file,class)]) tuples into a json object
        # of the form:
        #   {users: [bob, etc], num_samples: [124, etc.],
        #   user_data: {bob : {x:[img1,img2,etc], y:[class1,class2,etc]}, etc}}
        # where 'img_' is a vectorized representation of the corresponding image.
        def relabel_class(c):
            """
            maps hexadecimal class value (string) to a decimal number
            returns:
            - 0 through 9 for classes representing respective numbers
            - 10 through 35 for classes representing respective uppercase letters
            - 36 through 61 for classes representing respective lowercase letters
            """
            if c.isdigit() and int(c) < 40:
                return int(c) - 30
            elif int(c, 16) <= 90:  # uppercase
                return int(c, 16) - 55
            else:
                return int(c, 16) - 61

        def write_to_json_file(users, num_samples, user_data, json_index):
            all_data = {}
            all_data["users"] = users
            all_data["num_samples"] = num_samples
            all_data["user_data"] = user_data

            file_name = "all_data_%d.json" % json_index
            file_path = os.path.join(final_data_path, file_name)
            print("writing %s" % file_name)
            jump_json(all_data, file_path)

        def write_to_json_files(all_writers, max_writers):
            writer_count = 0
            json_index = 0
            users = []
            num_samples = []
            user_data = {}

            for (w, l) in all_writers:
                users.append(w)
                num_samples.append(len(l))
                user_data[w] = {"x": [], "y": []}

                size = self.img_size, self.img_size  # original image size is 128, 128
                for (f, c) in l:
                    img = Image.open(f)
                    gray = img.convert("L")
                    gray.thumbnail(size, Image.LANCZOS)  # ANTIALIAS was removed in Pillow 10; LANCZOS is the equivalent filter
                    arr = np.asarray(gray).copy()
                    vec = arr.flatten()
                    vec = vec / 255  # scale all pixel values to between 0 and 1
                    vec = vec.tolist()
                    nc = relabel_class(c)

                    user_data[w]["x"].append(vec)
                    user_data[w]["y"].append(nc)
                writer_count += 1

                # redirect a new json file.
                if writer_count == max_writers:
                    write_to_json_file(users, num_samples, user_data,
                                       json_index)

                    # reinit.
                    writer_count = 0
                    json_index += 1
                    users[:] = []
                    num_samples[:] = []
                    user_data.clear()

            # in case we have something left.
            if writer_count > 0:
                write_to_json_file(users, num_samples, user_data, json_index)

        # start the final processing.
        if not os.path.exists(final_data_path):
            data_utils.makedir_exist_ok(final_data_path)

        MAX_WRITERS = 100  # max number of writers per json file.
        writers = load_obj(images_by_writer_path)
        num_json_files = int(math.ceil(len(writers) / MAX_WRITERS))

        if (len([x for x in os.listdir(final_data_path) if "json" in x]) !=
                num_json_files):
            print(
                f"final step for creating all data 1: save the json files to disks. we have {num_json_files} json files."
            )
            write_to_json_files(writers, MAX_WRITERS)

        if not os.path.exists(self.all_data_tgz_file):
            print(
                f"final step for creating all data 2: save them to tgz file.")
            tar_compress_folder(final_data_path)
Example #28
from torchvision.datasets.utils import download_url, makedir_exist_ok

raw_folder = './emnist_data'

url = 'http://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip'
md5 = "58c8d27c78d21e728a6bc7b3cc06412e"

makedir_exist_ok(raw_folder)

# download files
print('Downloading zip archive')
download_url(url, root=raw_folder, filename="emnist.zip", md5=md5)
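A possible follow-up step (not part of the snippet above): unpack the downloaded archive with torchvision's extract_archive helper, reusing the raw_folder variable defined above.

import os
from torchvision.datasets.utils import extract_archive

extract_archive(os.path.join(raw_folder, 'emnist.zip'), raw_folder)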
Example #29
    def _sample_data(self):
        new_user_count = 0  # for iid case
        folder_name = os.path.join(
            self.root,
            f"sampled_{'iid' if self.is_iid_sample else 'niid'}_data_fraction-{self.data_fraction}{f'_user_fraction-{self.user_fraction}' if self.is_iid_sample else ''}",
        )
        folder_tgz = folder_name + ".tgz"

        # build or extract.
        print(f"sample data and will save to {folder_name}")
        data_utils.makedir_exist_ok(folder_name)

        is_finished_sampling = len(os.listdir(folder_name)) == len(
            os.listdir(self.all_data_path))
        if os.path.exists(folder_tgz) and is_finished_sampling:
            print("has finished sampling and compressed the sampled data.")
            return folder_name
        elif os.path.exists(folder_tgz):
            print(
                "the sampling has not been finished (but we have the tgz file). So let's decompress the tgz file."
            )
            tar_decompress_folder(folder_tgz)
            return folder_name

        # (rough) check the number of files in folder_name.
        if not is_finished_sampling:
            print(
                "the number of sampled json file is incorrect. sample it again."
            )

            for f in os.listdir(self.all_data_path):
                file_dir = os.path.join(self.all_data_path, f)
                with open(file_dir, "r") as inf:
                    # Load data into an OrderedDict, to prevent ordering changes
                    # and enable reproducibility
                    data = json.load(inf,
                                     object_pairs_hook=collections.OrderedDict)

                # get some meta info.
                num_users = len(data["users"])
                tot_num_samples = sum(data["num_samples"])
                num_new_samples = int(self.data_fraction * tot_num_samples)
                hierarchies = None

                # if it is iid sample.
                if self.is_iid_sample:
                    raw_list = list(data["user_data"].values())
                    raw_x = [elem["x"] for elem in raw_list]
                    raw_y = [elem["y"] for elem in raw_list]
                    x_list = [item for sublist in raw_x
                              for item in sublist]  # flatten raw_x
                    y_list = [item for sublist in raw_y
                              for item in sublist]  # flatten raw_y

                    # get new users and new indices.
                    num_new_users = max(
                        int(round(self.user_fraction * num_users)), 1)
                    indices = [i for i in range(tot_num_samples)]
                    new_indices = self.rng_state.sample(
                        indices, num_new_samples)
                    users = [
                        str(i + new_user_count) for i in range(num_new_users)
                    ]

                    # get the new data and divide them (iid).
                    user_data = dict((user, collections.defaultdict(list))
                                     for user in users)
                    all_x_samples = [x_list[i] for i in new_indices]
                    all_y_samples = [y_list[i] for i in new_indices]
                    x_groups = iid_divide(all_x_samples, num_new_users)
                    y_groups = iid_divide(all_y_samples, num_new_users)

                    # assign the info.
                    for i in range(num_new_users):
                        user_data[users[i]]["x"] = x_groups[i]
                        user_data[users[i]]["y"] = y_groups[i]
                    num_samples = [len(user_data[u]["y"]) for u in users]
                    new_user_count += num_new_users
                else:
                    ctot_num_samples = 0

                    users = data["users"]
                    users_and_hiers = None
                    if "hierarchies" in data:
                        users_and_hiers = list(zip(users, data["hierarchies"]))
                        self.rng_state.shuffle(users_and_hiers)
                        hierarchies = []
                    else:
                        self.rng_state.shuffle(users)

                    # init for the sampling (by user).
                    user_i = 0
                    num_samples = []
                    user_data = {}

                    # sample by user.
                    while ctot_num_samples < num_new_samples:
                        if users_and_hiers is not None:
                            user, hier = users_and_hiers[user_i]
                            hierarchies.append(hier)
                        else:
                            user = users[user_i]

                        cdata = data["user_data"][user]
                        cnum_samples = len(data["user_data"][user]["y"])
                        if ctot_num_samples + cnum_samples > num_new_samples:
                            cnum_samples = num_new_samples - ctot_num_samples
                            indices = [i for i in range(cnum_samples)]
                            new_indices = self.rng_state.sample(
                                indices, cnum_samples)
                            x = []
                            y = []
                            for i in new_indices:
                                x.append(data["user_data"][user]["x"][i])
                                y.append(data["user_data"][user]["y"][i])
                            cdata = {"x": x, "y": y}

                        num_samples.append(cnum_samples)
                        user_data[user] = cdata

                        ctot_num_samples += cnum_samples
                        user_i += 1

                    if "hierarchies" in data:
                        users = [u for u, h in users_and_hiers][:user_i]
                    else:
                        users = users[:user_i]

                # create the .json file.
                all_data = {}
                all_data["users"] = users
                if hierarchies is not None:
                    all_data["hierarchies"] = hierarchies
                all_data["num_samples"] = num_samples
                all_data["user_data"] = user_data

                # save to json file.
                file_path = os.path.join(folder_name, f)
                print(f"\tsave sampled json file to the path={file_path}.")
                jump_json(all_data, file_path=file_path)

        print(f"save data to tgz file.")
        tar_compress_folder(folder_name)
        return folder_name