Example #1
    def load_data(self):
        """
        Fetch the MNIST dataset and load it into memory.

        Arguments:
            path (str, optional): Local directory in which to cache the raw
                                  dataset.  Defaults to current directory.

        Returns:
            tuple: Both training and test sets are returned.
        """
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath, self.size)

        with gzip.open(filepath, 'rb') as f:
            self.train_set, self.valid_set = pickle_load(f)

        self.train_set = {'image': {'data': self.train_set[0].reshape(60000, 28, 28),
                                    'axes': ('batch', 'height', 'width')},
                          'label': {'data': self.train_set[1],
                                    'axes': ('batch',)}}
        self.valid_set = {'image': {'data': self.valid_set[0].reshape(10000, 28, 28),
                                    'axes': ('batch', 'height', 'width')},
                          'label': {'data': self.valid_set[1],
                                    'axes': ('batch',)}}

        return self.train_set, self.valid_set
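
This loader returns nested dictionaries that pair each array with a tuple of axis names. A minimal, self-contained sketch of that structure, using zero-filled stand-in arrays rather than the real MNIST data:

import numpy as np

# Stand-ins shaped like the arrays produced by load_data() above.
train_set = {'image': {'data': np.zeros((60000, 28, 28), dtype=np.float32),
                       'axes': ('batch', 'height', 'width')},
             'label': {'data': np.zeros((60000,), dtype=np.uint8),
                       'axes': ('batch',)}}

for name, tensor in train_set.items():
    # Every entry carries the array plus the names of its axes.
    print(name, tensor['data'].shape, tensor['axes'])
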
Example #2
    def load_data(self):
        self.data_dict = {}
        self.vocab = None
        for phase in ['train', 'test', 'valid']:
            filename, filesize = self.filemap[phase]['filename'], self.filemap[phase]['size']
            workdir, filepath = valid_path_append(self.path, '', filename)
            if not os.path.exists(filepath):
                fetch_file(self.url, filename, filepath, filesize)

            with open(filepath) as f:
                tokens = f.read()  # add tokenization here if necessary

            if self.use_words:
                tokens = tokens.strip().split()

            self.vocab = sorted(set(tokens)) if self.vocab is None else self.vocab

            # vocab dicts
            self.token_to_index = dict((t, i) for i, t in enumerate(self.vocab))
            self.index_to_token = dict((i, t) for i, t in enumerate(self.vocab))

            # map tokens to indices
            X = np.asarray([self.token_to_index[t] for t in tokens], dtype=np.uint32)
            if self.shift_target:
                y = np.concatenate((X[1:], X[:1]))
            else:
                y = X.copy()

            self.data_dict[phase] = {'inp_txt': X, 'tgt_txt': y}

        return self.data_dict
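
The loop above builds a vocabulary once, maps every token to an integer, and, when shift_target is set, uses the input rotated by one position as the target. The same mapping in isolation, on a made-up toy corpus:

import numpy as np

tokens = "the cat sat on the mat".split()   # toy stand-in corpus
vocab = sorted(set(tokens))
token_to_index = {t: i for i, t in enumerate(vocab)}

X = np.asarray([token_to_index[t] for t in tokens], dtype=np.uint32)
y = np.concatenate((X[1:], X[:1]))          # next-token targets, wrapping around

print(X)   # [4 0 3 2 4 1] for this vocabulary
print(y)   # [0 3 2 4 1 4]
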
Example #3
    def load_data(self, test_split=0.2):
        self.data_dict = {}
        self.vocab = None
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath, self.filesize)

        with open(filepath, 'rb') as f:
            X, y = pickle_load(f)

        X = preprocess_text(X, self.vocab_size)
        X = pad_sentences(X,
                          pad_idx=self.pad_idx,
                          pad_to_len=self.sentence_length,
                          pad_from='left')

        if self.shuffle:
            indices = np.arange(len(y))
            np.random.shuffle(indices)
            X = X[indices]
            y = np.asarray(y)[indices]

        # split the data
        X_train = X[:int(len(X) * (1 - test_split))]
        y_train = y[:int(len(X) * (1 - test_split))]

        X_test = X[int(len(X) * (1 - test_split)):]
        y_test = y[int(len(X) * (1 - test_split)):]

        y_train = np.array(y_train)
        y_test = np.array(y_test)

        self.nclass = 1 + max(np.max(y_train), np.max(y_test))

        self.data_dict['train'] = {
            'review': {
                'data': X_train,
                'axes': ('batch', 'REC')
            },
            'label': {
                'data': y_train,
                'axes': ('batch', )
            }
        }
        self.data_dict['valid'] = {
            'review': {
                'data': X_test,
                'axes': ('batch', 'REC')
            },
            'label': {
                'data': y_test,
                'axes': ('batch', )
            }
        }
        return self.data_dict
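
The split above keeps the first (1 - test_split) fraction of the (optionally shuffled) rows for training and the remainder for validation. The same arithmetic on a toy array:

import numpy as np

X = np.arange(10).reshape(10, 1)    # 10 toy "sentences"
y = np.arange(10) % 2               # toy binary labels
test_split = 0.2

indices = np.arange(len(y))
np.random.shuffle(indices)          # optional shuffle, as in the loader
X, y = X[indices], y[indices]

cut = int(len(X) * (1 - test_split))
X_train, X_test = X[:cut], X[cut:]
y_train, y_test = y[:cut], y[cut:]
print(X_train.shape, X_test.shape)  # (8, 1) (2, 1)
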
Example #4
    def load_data(self):
        self.data_dict = {}
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath)

        with open(filepath) as f:
            tokens = f.read()
        train_samples = int(self.train_split * len(tokens))
        train = tokens[:train_samples]
        test = tokens[train_samples:]

        return train, test
Example #5
    def load_data(self):
        """
        Fetch the CIFAR-10 dataset and load it into memory.

        Arguments:
            path (str, optional): Local directory in which to cache the raw
                                  dataset.  Defaults to current directory.
            normalize (bool, optional): Whether to scale values between 0 and 1.
                                        Defaults to True.

        Returns:
            tuple: Both training and test sets are returned.
        """
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath, self.size)

        batchdir = os.path.join(workdir, 'cifar-10-batches-py')
        if not os.path.exists(os.path.join(batchdir, 'data_batch_1')):
            assert os.path.exists(filepath), "Must have cifar-10-python.tar.gz"
            with tarfile.open(filepath, 'r:gz') as f:
                f.extractall(workdir)

        train_batches = [os.path.join(batchdir, 'data_batch_' + str(i)) for i in range(1, 6)]
        Xlist, ylist = [], []
        for batch in train_batches:
            with open(batch, 'rb') as f:
                d = pickle_load(f)
                Xlist.append(d['data'])
                ylist.append(d['labels'])

        X_train = np.vstack(Xlist).reshape(-1, 3, 32, 32)
        y_train = np.vstack(ylist).ravel()

        with open(os.path.join(batchdir, 'test_batch'), 'rb') as f:
            d = pickle_load(f)
            X_test, y_test = d['data'], d['labels']
            X_test = X_test.reshape(-1, 3, 32, 32)

        self.train_set = {'image': {'data': X_train,
                                    'axes': ('batch', 'channel', 'height', 'width')},
                          'label': {'data': y_train,
                                    'axes': ('batch',)}}
        self.valid_set = {'image': {'data': X_test,
                                    'axes': ('batch', 'channel', 'height', 'width')},
                          'label': {'data': np.array(y_test),
                                    'axes': ('batch',)}}

        return self.train_set, self.valid_set
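
Each pickled CIFAR-10 training batch holds 10000 flattened rows of 3 * 32 * 32 = 3072 values; the loader stacks five of them and reshapes to (N, 3, 32, 32). The same stacking on zero-filled stand-ins:

import numpy as np

# Five stand-in batches shaped like the pickled CIFAR-10 files.
Xlist = [np.zeros((10000, 3072), dtype=np.uint8) for _ in range(5)]
ylist = [np.zeros((10000,), dtype=np.int64) for _ in range(5)]

X_train = np.vstack(Xlist).reshape(-1, 3, 32, 32)
y_train = np.vstack(ylist).ravel()

print(X_train.shape)   # (50000, 3, 32, 32)
print(y_train.shape)   # (50000,)
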
Example #6
    def load_data(self, path=".", subset='wiki_entities'):
        """
        Fetch the Facebook WikiMovies dataset and load it to memory.

        Arguments:
            path (str, optional): Local directory in which to cache the raw
                                  dataset.  Defaults to current directory.

        Returns:
            tuple: knowledge base, entity list, training and test files are returned
        """
        self.data_dict = {}
        self.vocab = None
        workdir, filepath = valid_path_append(path, '', self.filename)
        babi_dir_name = self.filename.split('.')[0]

        if subset == 'wiki-entities':
            subset_folder = 'wiki_entities'
        else:
            subset_folder = subset

        file_base = babi_dir_name + '/questions/' + subset_folder + '/' + subset + '_qa_{}.txt'
        train_file = os.path.join(workdir, file_base.format('train'))
        test_file = os.path.join(workdir, file_base.format('test'))

        entity_file_path = babi_dir_name + '/knowledge_source/entities.txt'
        entity_file = os.path.join(workdir, entity_file_path)

        # Check for the existence of the entity file
        # If it isn't there then we know we need to fetch everything
        if not os.path.exists(entity_file):
            if license_prompt('WikiMovies',
                              'https://research.fb.com/downloads/babi/',
                              self.path) is False:
                sys.exit(0)

            fetch_file(self.url, self.filename, filepath, self.size)

        knowledge_file_path = babi_dir_name + '/knowledge_source/' + subset_folder + '/' \
            + subset_folder + '_kb.txt'
        kb_file = os.path.join(workdir, knowledge_file_path)

        return entity_file, kb_file, train_file, test_file
Example #7
    def load_data(self, data_directory=None, manifest_file=None):
        """
        Create a manifest file for the requested dataset. First downloads the
        dataset and extracts it, if necessary.

        Arguments:
            data_directory (str): Path to data directory. Defaults to <path>/<version>
            manifest_file (str): Path to manifest file. Defaults to <data_directory>/manifest.tsv

        Returns:
            Path to manifest file
        """

        if manifest_file is None:
            if self.manifest_file is not None:
                manifest_file = self.manifest_file
            else:
                manifest_file = os.path.join(self.path, "manifest.tsv")

        if os.path.exists(manifest_file):
            return manifest_file

        # Download the file
        workdir, filepath = valid_path_append(self.path, '', self.source_file)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.source_file, filepath)

        # Untar the file
        if data_directory is None:
            data_directory = os.path.join(self.path, self.version)
        if not os.path.exists(data_directory):
            print("Extracting tar file to {}".format(data_directory))
            with contextlib.closing(tarfile.open(filepath)) as tf:
                tf.extractall(data_directory)

        # Ingest the file
        ingest_librispeech(data_directory, manifest_file)

        return manifest_file
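
The method only promises to return the path of a tab-separated manifest; the actual columns are written by ingest_librispeech and are not shown here. A generic, hypothetical helper for peeking at such a file:

import csv

def preview_manifest(manifest_file, limit=5):
    """Print the first few rows of a tab-separated manifest file."""
    with open(manifest_file, newline='') as f:
        for i, row in enumerate(csv.reader(f, delimiter='\t')):
            if i >= limit:
                break
            print(row)

# preview_manifest("manifest.tsv")   # path is a placeholder
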
Example #8
    def load_data(self):
        """
        Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

        Returns:
            tuple: training and test filenames are returned
        """
        if self.task < 5:
            self.candidate_answer_filename = 'dialog-babi-candidates.txt'
            self.kb_filename = 'dialog-babi-kb-all.txt'
            self.cands_mat_filename = 'babi-cands-with-matchtype_{}.npy'
            self.vocab_filename = 'dialog-babi-vocab-task{}.pkl'.format(self.task + 1)
        else:
            self.candidate_answer_filename = 'dialog-babi-task6-dstc2-candidates.txt'
            self.kb_filename = 'dialog-babi-task6-dstc2-kb.txt'
            self.cands_mat_filename = 'dstc2-cands-with-matchtype_{}.npy'
            self.vocab_filename = 'dstc2-vocab-task{}.pkl'.format(self.task + 1)

        self.vectorized_filename = 'vectorized_task{}.pkl'.format(self.task + 1)

        self.data_dict = {}
        self.vocab = None
        self.workdir, filepath = valid_path_append(
            self.path, '', self.filename)
        if not os.path.exists(filepath):
            if license_prompt('bAbI-dialog',
                              'https://research.fb.com/downloads/babi/',
                              self.path) is False:
                sys.exit(0)

            fetch_file(self.url, self.filename, filepath, self.size)

        self.babi_dir_name = self.filename.split('.')[0]

        self.candidate_answer_filename = self.babi_dir_name + \
            '/' + self.candidate_answer_filename
        self.kb_filename = self.babi_dir_name + '/' + self.kb_filename
        self.cands_mat_filename = os.path.join(
            self.workdir, self.babi_dir_name + '/' + self.cands_mat_filename)
        self.vocab_filename = self.babi_dir_name + '/' + self.vocab_filename
        self.vectorized_filename = self.babi_dir_name + '/' + self.vectorized_filename

        task_name = self.babi_dir_name + '/' + self.tasks[self.task] + '{}.txt'

        train_file = os.path.join(self.workdir, task_name.format('trn'))
        dev_file = os.path.join(self.workdir, task_name.format('dev'))
        test_file_postfix = 'tst-OOV' if self.oov else 'tst'
        test_file = os.path.join(
            self.workdir,
            task_name.format(test_file_postfix))

        cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
        kb_file = os.path.join(self.workdir, self.kb_filename)
        vocab_file = os.path.join(self.workdir, self.vocab_filename)
        vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

        if (os.path.exists(train_file) is False
                or os.path.exists(dev_file) is False
                or os.path.exists(test_file) is False
                or os.path.exists(cand_file) is False):
            with tarfile.open(filepath, 'r:gz') as f:
                f.extractall(self.workdir)

        return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
Example #9
    def load_data(self):
        """
        Fetch and extract the Facebook bAbI-dialog dataset if not already downloaded.

        Returns:
            tuple: training and test filenames are returned
        """
        if self.task < 5:
            self.candidate_answer_filename = 'dialog-babi-candidates.txt'
            self.kb_filename = 'dialog-babi-kb-all.txt'
            self.cands_mat_filename = 'babi-cands-with-matchtype_{}.npy'
            self.vocab_filename = 'dialog-babi-vocab-task{}.pkl'.format(
                self.task + 1)
        else:
            self.candidate_answer_filename = 'dialog-babi-task6-dstc2-candidates.txt'
            self.kb_filename = 'dialog-babi-task6-dstc2-kb.txt'
            self.cands_mat_filename = 'dstc2-cands-with-matchtype_{}.npy'
            self.vocab_filename = 'dstc2-vocab-task{}.pkl'.format(self.task +
                                                                  1)

        self.vectorized_filename = 'vectorized_task{}.pkl'.format(self.task +
                                                                  1)

        self.data_dict = {}
        self.vocab = None
        self.workdir, filepath = valid_path_append(self.path, '',
                                                   self.filename)
        if not os.path.exists(filepath):
            if license_prompt('bAbI-dialog',
                              'https://research.fb.com/downloads/babi/',
                              self.path) is False:
                sys.exit(0)

            fetch_file(self.url, self.filename, filepath, self.size)

        self.babi_dir_name = self.filename.split('.')[0]

        self.candidate_answer_filename = self.babi_dir_name + \
            '/' + self.candidate_answer_filename
        self.kb_filename = self.babi_dir_name + '/' + self.kb_filename
        self.cands_mat_filename = os.path.join(
            self.workdir, self.babi_dir_name + '/' + self.cands_mat_filename)
        self.vocab_filename = self.babi_dir_name + '/' + self.vocab_filename
        self.vectorized_filename = self.babi_dir_name + '/' + self.vectorized_filename

        task_name = self.babi_dir_name + '/' + self.tasks[self.task] + '{}.txt'

        train_file = os.path.join(self.workdir, task_name.format('trn'))
        dev_file = os.path.join(self.workdir, task_name.format('dev'))
        test_file_postfix = 'tst-OOV' if self.oov else 'tst'
        test_file = os.path.join(self.workdir,
                                 task_name.format(test_file_postfix))

        cand_file = os.path.join(self.workdir, self.candidate_answer_filename)
        kb_file = os.path.join(self.workdir, self.kb_filename)
        vocab_file = os.path.join(self.workdir, self.vocab_filename)
        vectorized_file = os.path.join(self.workdir, self.vectorized_filename)

        if (os.path.exists(train_file) is False
                or os.path.exists(dev_file) is False
                or os.path.exists(test_file) is False
                or os.path.exists(cand_file) is False):
            with tarfile.open(filepath, 'r:gz') as f:
                f.extractall(self.workdir)

        return train_file, dev_file, test_file, cand_file, kb_file, vocab_file, vectorized_file
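
Both bAbI-dialog variants end with the same guard: if any of the expected files is missing, re-extract the whole archive. The pattern in isolation, with placeholder paths:

import os
import tarfile

def ensure_extracted(archive_path, workdir, required_files):
    """Extract the .tgz archive into workdir unless every required file already exists."""
    if all(os.path.exists(os.path.join(workdir, f)) for f in required_files):
        return
    with tarfile.open(archive_path, 'r:gz') as f:
        f.extractall(workdir)

# Placeholder usage; the real archive and file names come from the loader above.
# ensure_extracted('dialog-bAbI-tasks.tgz', '.', ['dialog-bAbI-tasks/dialog-babi-kb-all.txt'])
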