Example #1
    def __init__(self, begin_line=0, num_lines=6348957, root='.data'):
        """Initiate EnWik9 dataset.

        Arguments:
            begin_line: the number of beginning line. Default: 0
            num_lines: the number of lines to be loaded. Default: 6348957
            root: Directory where the datasets are saved. Default: ".data"
            data: a list of label/tokens tuple. tokens are a tensor after

        Examples:
            >>> from torchtext.datasets import EnWik9
            >>> enwik9 = EnWik9(num_lines=20000)
            >>> vocab = enwik9.get_vocab()
        """

        super().__init__()

        processed_file = os.path.join(root, 'norm_enwik9')
        if not os.path.exists(processed_file):
            url = 'http://mattmahoney.net/dc/enwik9.zip'
            dataset_zip = download_from_url(url,
                                            path=os.path.join(
                                                root, 'enwik9.zip'),
                                            root=root)
            extracted_file = extract_archive(dataset_zip)
            raw_file = extracted_file[0]
            preprocess_raw_enwik9(raw_file, processed_file)

        # Compute per-line byte offsets, then read the requested slice of lines
        # starting at begin_line instead of scanning the whole file.
        offsets = generate_offsets(processed_file)
        read_lines = read_lines_from_iterator(processed_file, offsets,
                                              begin_line, num_lines)

        self._data = []
        # Each item from simple_space_split is a list of tokens for one line;
        # += extends the flat token list rather than appending sublists.
        for item in simple_space_split(read_lines):
            self._data += item

        self._vocab = None
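
Both examples depend on the torchtext helper simple_space_split. As a reference, here is a minimal sketch of the behavior the code above assumes: a generator that yields each input line split on whitespace. This is an assumption drawn from the usage here and the test in Example #2; the actual torchtext implementation may differ in details such as progress logging.

def simple_space_split(lines):
    # Minimal sketch (assumption): yield each input line as a list of
    # whitespace-separated tokens, matching the behavior checked in
    # Example #2 below.
    for line in lines:
        yield line.split()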
Example #2
import unittest

class TestUtils(unittest.TestCase):  # wrapper class added so the snippet runs standalone
    def test_simple_space_split(self):
        test_sample = ['test simple space split function']
        ref_results = ['test', 'simple', 'space', 'split', 'function']
        self.assertEqual(list(simple_space_split(test_sample))[0], ref_results)
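
With the simple_space_split sketch from Example #1 in scope, this test passes when run with python -m unittest.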