Example #1
    def load_data(self):
        """
        Fetch the MNIST dataset and load it into memory.

        Arguments:
            path (str, optional): Local directory in which to cache the raw
                                  dataset.  Defaults to current directory.

        Returns:
            tuple: Both training and validation sets are returned.
        """
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath, self.size)

        with gzip.open(filepath, 'rb') as f:
            self.train_set, self.valid_set = pickle_load(f)

        self.train_set = {'image': {'data': self.train_set[0].reshape(60000, 28, 28),
                                    'axes': ('N', 'H', 'W')},
                          'label': {'data': self.train_set[1],
                                    'axes': ('N',)}}
        self.valid_set = {'image': {'data': self.valid_set[0].reshape(10000, 28, 28),
                                    'axes': ('N', 'H', 'W')},
                          'label': {'data': self.valid_set[1],
                                    'axes': ('N',)}}

        return self.train_set, self.valid_set
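For comparison, the same download-once-then-unpickle pattern can be sketched with only the standard library; the URL, cache directory, and two-tuple pickle layout below mirror the example but are assumptions, not the loader's actual configuration.

import gzip
import os
import pickle
import urllib.request

def fetch_mnist(url, cache_dir='.'):
    # Illustrative stand-in for fetch_file/pickle_load above (assumed layout):
    # download the gzipped pickle once, then reuse the cached copy.
    filepath = os.path.join(cache_dir, os.path.basename(url))
    if not os.path.exists(filepath):
        urllib.request.urlretrieve(url, filepath)
    with gzip.open(filepath, 'rb') as f:
        train_set, valid_set = pickle.load(f, encoding='latin1')
    return train_set, valid_set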
Example #2
    def download_lsun(self, category, dset, tag='latest', overwrite=False):
        """
        Download the LSUN data and unpack it.

        Arguments:
            category (str): LSUN category (valid selections: lsun_categories)
            dset (str): dataset, "train", "val", or "test"
            tag (str, optional): version tag, defaults to most recent
            overwrite (bool): whether to overwrite existing data
        """
        dfile = 'test_lmdb' if dset == 'test' else '{0}_{1}_lmdb'.format(
            category, dset)
        self.filepath = filepath = valid_path_append(self.path, dfile)
        if not os.path.exists(filepath) or overwrite:
            filepath += '.zip'
            if not os.path.exists(filepath):
                url = LSUN.url + \
                    'download.cgi?tag={0}&category={1}&set={2}'.format(tag, category, dset)
                print('Data download might take a long time.')
                print('Downloading {0} {1} set...'.format(category, dset))
                subprocess.call(['curl', url, '-o', filepath])
                # TODO
                # should change to fetch_file,
                # but fetch_file currently does not get the correct "Content-length" or total_size
                # fetch_file(url, 'bedroom_train_lmdb.zip', filepath)
            print('Extracting {0} {1} set...'.format(category, dset))
            zf = zipfile.ZipFile(filepath, 'r')
            zf.extractall(self.path)
            zf.close()
            print('Deleting {} ...'.format(filepath))
            os.remove(filepath)
        else:
            pass  # data already downloaded
        print("LSUN {0} {1} dataset downloaded and unpacked.".format(
            category, dset))
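Regarding the TODO above: one hedged way to replace the curl call is a streaming download that never relies on the server's Content-Length header. stream_download is a hypothetical helper, not part of the library.

import shutil
import urllib.request

def stream_download(url, filepath, chunk_size=1 << 20):
    # Hypothetical helper: copy the HTTP response to disk in fixed-size
    # chunks, so a missing or wrong Content-Length header does not matter.
    with urllib.request.urlopen(url) as response, open(filepath, 'wb') as out:
        shutil.copyfileobj(response, out, chunk_size)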
Example #3
    def load_data(self):
        self.data_dict = {}
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath)

        tokens = open(filepath).read()
        train_samples = int(self.train_split * len(tokens))
        train = tokens[:train_samples]
        test = tokens[train_samples:]

        return train, test
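Note that the split is taken over raw characters, so the train/test boundary can fall mid-word; a self-contained illustration with a made-up string and ratio:

tokens = 'the quick brown fox jumps over the lazy dog'  # stand-in for the file contents
train_split = 0.9                                       # illustrative ratio
cut = int(train_split * len(tokens))
train, test = tokens[:cut], tokens[cut:]                # test starts mid-word here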
Example #4
    def load_data(self):
        """
        Fetch the CIFAR-100 dataset and load it into memory.

        Arguments:
            path (str, optional): Local directory in which to cache the raw
                                  dataset.  Defaults to current directory.
            normalize (bool, optional): Whether to scale values between 0 and 1.
                                        Defaults to True.

        Returns:
            tuple: Both training and test sets are returned.
        """
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath, self.size)

        batchdir = os.path.join(workdir, 'cifar-100-python')
        if not os.path.exists(os.path.join(batchdir)):
            assert os.path.exists(filepath), "Must have cifar-100-python.tar.gz"
            with tarfile.open(filepath, 'r:gz') as f:
                f.extractall(workdir)

        train_batches = [os.path.join(batchdir, 'train')]
        Xlist, ylist = [], []
        for batch in train_batches:
            with open(batch, 'rb') as f:
                train_dict = pickle_load(f)
                Xlist.append(train_dict['data'])
                ylist.append(train_dict['coarse_labels'])

        X_train = np.vstack(Xlist).reshape(-1, 3, 32, 32)
        y_train = np.vstack(ylist).ravel()

        with open(os.path.join(batchdir, 'test'), 'rb') as f:
            test_dict = pickle_load(f)
            X_test, y_test = test_dict['data'], test_dict['coarse_labels']
            X_test = X_test.reshape(-1, 3, 32, 32)

        self.train_set = {'image': {'data': X_train,
                                    'axes': ('N', 'C', 'H', 'W')},
                          'label': {'data': y_train,
                                    'axes': ('N',)}}
        self.valid_set = {'image': {'data': X_test,
                                    'axes': ('N', 'C', 'H', 'W')},
                          'label': {'data': np.array(y_test),
                                    'axes': ('N',)}}

        return self.train_set, self.valid_set
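The docstring mentions a normalize option, but the body returns raw uint8 pixels; scaling to [0, 1] would be a small post-processing step, sketched here as an assumption about what a caller might do rather than library behavior:

import numpy as np

def normalize_images(image_label_set):
    # Illustrative only: scale uint8 pixels (0-255) to float32 in [0, 1],
    # leaving the labels untouched.
    images = image_label_set['image']['data'].astype(np.float32) / 255.0
    return {'image': {'data': images, 'axes': image_label_set['image']['axes']},
            'label': image_label_set['label']}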
Example #5
    def load_data(self, test_split=0.2):
        self.data_dict = {}
        self.vocab = None
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath, self.filesize)

        with open(filepath, 'rb') as f:
            X, y = pickle_load(f)

        X = preprocess_text(X, self.vocab_size)
        X = pad_sentences(
            X, pad_idx=self.pad_idx, pad_to_len=self.sentence_length, pad_from='left')

        if self.shuffle:
            indices = np.arange(len(y))
            np.random.shuffle(indices)
            X = X[indices]
            y = np.asarray(y)[indices]

        # split the data
        X_train = X[:int(len(X) * (1 - test_split))]
        y_train = y[:int(len(X) * (1 - test_split))]

        X_test = X[int(len(X) * (1 - test_split)):]
        y_test = y[int(len(X) * (1 - test_split)):]

        y_train = np.array(y_train)
        y_test = np.array(y_test)

        self.nclass = 1 + max(np.max(y_train), np.max(y_test))

        self.data_dict['train'] = {'review': {'data': X_train,
                                              'axes': ('N', 'REC')},
                                   'label': {'data': y_train,
                                             'axes': ('N',)}}
        self.data_dict['valid'] = {'review': {'data': X_test,
                                              'axes': ('N', 'REC')},
                                   'label': {'data': y_test,
                                             'axes': ('N',)}}
        return self.data_dict
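The split index int(len(X) * (1 - test_split)) is recomputed four times above; a small helper (hypothetical name) makes the boundary explicit and guarantees X and y are cut at the same point:

import numpy as np

def split_train_test(X, y, test_split=0.2):
    # Compute the cut once so features and labels always share the boundary.
    cut = int(len(X) * (1 - test_split))
    y = np.asarray(y)
    return X[:cut], y[:cut], X[cut:], y[cut:]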
Example #6
    def load_data(self):
        self.data_dict = {}
        for phase in ['train', 'test']:
            filename = self.filemap[phase]['filename']
            workdir, filepath = valid_path_append(self.path, '', filename)
            if not os.path.exists(filepath):
                for file_name, file_id in GOOGLE_DRIVE_IDS.items():
                    destination = './' + file_name
                    print(
                        '\nDownloading and unzipping traveling salesman data {} released '
                        'with Pointer Networks paper\n'.format(file_name))
                    self.download_file_from_google_drive(file_id, destination)
                    with zipfile.ZipFile(destination, 'r') as z:
                        z.extractall('./')

            cities = int(re.search(r'\d+', filename).group())
            print('Loading and preprocessing tsp{} {} data...'.format(
                cities, phase))
            with open(filepath, 'r') as f:
                X, y, y_teacher = [], [], []
                for i, line in tqdm(enumerate(f)):
                    inputs, outputs = line.split('output')
                    X.append(
                        np.array([float(j)
                                  for j in inputs.split()]).reshape([-1, 2]))
                    y.append(
                        np.array([int(j) - 1 for j in outputs.split()
                                  ])[:-1])  # delete last
                    # teacher forcing array as decoder's input while training
                    y_teacher.append([X[i][j - 1] for j in y[i]])

            X = np.array(X)
            y = np.array(y)
            y_teacher = np.array(y_teacher)
            self.data_dict[phase] = {
                'inp_txt': X,
                'tgt_txt': y,
                'teacher_tgt': y_teacher
            }

        return self.data_dict
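Each line pairs flattened city coordinates with a 1-based tour that appears to end back at its starting city, which is why the parser subtracts 1 and drops the final element; the line below uses made-up values just to show the shapes:

line = '0.1 0.2 0.4 0.7 0.9 0.3 output 1 3 2 1'       # illustrative values only
inputs, outputs = line.split('output')
coords = [float(v) for v in inputs.split()]           # -> 3 cities as 6 flat floats
tour = [int(v) - 1 for v in outputs.split()][:-1]     # -> [0, 2, 1], 0-based, last entry dropped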
Example #7
    def load_data(self, data_directory=None, manifest_file=None):
        """
        Create a manifest file for the requested dataset. First downloads the
        dataset and extracts it, if necessary.

        Arguments:
            data_directory (str): Path to data directory. Defaults to <path>/<version>
            manifest_file (str): Path to manifest file. Defaults to <data_directory>/manifest.tsv

        Returns:
            Path to manifest file
        """

        if manifest_file is None:
            if self.manifest_file is not None:
                manifest_file = self.manifest_file
            else:
                manifest_file = os.path.join(self.path, "manifest.tsv")

        if os.path.exists(manifest_file):
            return manifest_file

        # Download the file
        workdir, filepath = valid_path_append(self.path, '', self.source_file)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.source_file, filepath)

        # Untar the file
        if data_directory is None:
            data_directory = os.path.join(self.path, self.version)
        if not os.path.exists(data_directory):
            print("Extracting tar file to {}".format(data_directory))
            with contextlib.closing(tarfile.open(filepath)) as tf:
                tf.extractall(data_directory)

        # Ingest the file
        ingest_librispeech(data_directory, manifest_file)

        return manifest_file
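The manifest produced above is a tab-separated file, so it can be read back with the standard csv module. This is a generic sketch, not the library's own reader, and the column layout written by ingest_librispeech is not assumed.

import csv

def read_manifest(manifest_file):
    # Return each manifest row as a list of tab-separated fields.
    with open(manifest_file, newline='') as f:
        return [row for row in csv.reader(f, delimiter='\t')]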
Example #8
    def load_data(self):
        self.data_dict = {}
        self.vocab = None
        for phase in ['train', 'test', 'valid']:
            filename, filesize = self.filemap[phase]['filename'], self.filemap[
                phase]['size']
            workdir, filepath = valid_path_append(self.path, '', filename)
            if not os.path.exists(filepath):
                fetch_file(self.url, filename, filepath, filesize)

            tokens = open(
                filepath).read()  # add tokenization here if necessary

            if self.use_words:
                tokens = tokens.strip().split()

            self.vocab = sorted(
                set(tokens)) if self.vocab is None else self.vocab

            # vocab dicts
            self.token_to_index = dict(
                (t, i) for i, t in enumerate(self.vocab))
            self.index_to_token = dict(
                (i, t) for i, t in enumerate(self.vocab))

            # map tokens to indices
            X = np.asarray([self.token_to_index[t] for t in tokens],
                           dtype=np.uint32)
            if self.shift_target:
                y = np.concatenate((X[1:], X[:1]))
            else:
                y = X.copy()

            self.data_dict[phase] = {'inp_txt': X, 'tgt_txt': y}

        return self.data_dict
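With shift_target set, the target sequence is the input rotated left by one position, so each token is trained to predict its successor; a tiny illustration with arbitrary indices:

import numpy as np

X = np.array([5, 2, 7, 1], dtype=np.uint32)
y = np.concatenate((X[1:], X[:1]))   # -> array([2, 7, 1, 5], dtype=uint32)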