Ejemplo n.º 1
0
    def test_download_resource(self):
        """Fetch the ``test`` resource twice and check the cached file and metadata.

        The first request names the ``amazon`` source explicitly; the second
        omits the source, and only its returned path is checked. The cache
        directory is removed in a ``finally`` clause so that a failing
        assertion does not leave cached files behind.
        """
        cache_dir = './tests/_utils/dataset_cache'
        config_dir = './tests/_utils/dummy_coai'
        cache_name = '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'
        try:
            # Return value intentionally discarded: only the second call's
            # result is verified below.
            get_resource_file_path('resources://test@amazon',
                                   cache_dir=cache_dir,
                                   config_dir=config_dir)
            res_path = get_resource_file_path('resources://test',
                                              cache_dir=cache_dir,
                                              config_dir=config_dir)

            assert res_path == os.path.join(cache_dir, cache_name)
            assert os.path.exists(res_path)

            # A JSON sidecar next to the cached file records its local path.
            meta_path = os.path.join(cache_dir, cache_name + '.json')
            assert os.path.exists(meta_path)
            with open(meta_path, 'r') as meta_file:
                meta = json.load(meta_file)
            assert meta['local_path'] == res_path
        finally:
            # Skip removal if the cache directory was never created, so a
            # cleanup error does not pile on top of the real failure.
            if os.path.exists(cache_dir):
                shutil.rmtree(cache_dir)
Ejemplo n.º 2
0
    def test_download_data(self):
        """Fetch a file by direct URL twice and check the cached file and metadata.

        The same URL is requested two times; only the second call's returned
        path is checked. The cache directory is removed in a ``finally``
        clause so that a failing assertion does not leave cached files behind.
        """
        cache_dir = './tests/_utils/dataset_cache'
        url = 'https://cotk-data.s3-ap-northeast-1.amazonaws.com/test.zip'
        cache_name = 'f1043836933af4b8b28973d259c0c77f5049de2dff8d0d1f305c65f3c497b3b1'
        try:
            # Return value intentionally discarded: only the second call's
            # result is verified below.
            get_resource_file_path(url, cache_dir=cache_dir)
            res_path = get_resource_file_path(url, cache_dir=cache_dir)

            assert res_path == os.path.join(cache_dir, cache_name)
            assert os.path.exists(res_path)

            # A JSON sidecar next to the cached file records its local path.
            meta_path = os.path.join(cache_dir, cache_name + '.json')
            assert os.path.exists(meta_path)
            with open(meta_path, 'r') as meta_file:
                meta = json.load(meta_file)
            assert meta['local_path'] == res_path
        finally:
            # Skip removal if the cache directory was never created, so a
            # cleanup error does not pile on top of the real failure.
            if os.path.exists(cache_dir):
                shutil.rmtree(cache_dir)
Ejemplo n.º 3
0
    def test_SwitchboardCorpus_resource(self):
        """Resolve the local SwitchboardCorpus zip and compare it with the reference data.

        The resolved path must be the ``switchboard_corpus`` folder inside the
        ``.zip_unzip`` directory, and its files must match those under
        ``data/switchboard_corpus`` (compared with ``check``). The unzip
        directory is removed in a ``finally`` clause so a failing assertion
        does not leave extracted files behind.
        """
        data_dir = str(pathlib.Path('./tests/_utils/data'))
        unzip_dir = str(
            pathlib.Path('./tests/_utils/data/switchboard_corpus.zip_unzip'))
        try:
            res_path = get_resource_file_path(
                str(
                    pathlib.Path(
                        './tests/_utils/data/switchboard_corpus.zip#SwitchboardCorpus'
                    )))

            filenames = os.listdir(res_path)
            assert res_path == str(
                pathlib.Path(
                    './tests/_utils/data/switchboard_corpus.zip_unzip/switchboard_corpus'
                ))
            assert sorted(filenames) == sorted(
                os.listdir(os.path.join(data_dir, 'switchboard_corpus')))
            for filename in filenames:
                check(os.path.join(res_path, filename),
                      os.path.join(data_dir, 'switchboard_corpus', filename))
        finally:
            # Skip removal if nothing was extracted, so a cleanup error does
            # not pile on top of the real failure.
            if os.path.exists(unzip_dir):
                shutil.rmtree(unzip_dir)
Ejemplo n.º 4
0
    def test_download_resource(self):
        """Resolve ``resources://MSCOCO`` and verify the unpacked cache entry.

        Checks the returned directory path, the sha256 ``dirhash`` of that
        directory, and the JSON metadata file written beside it.
        """
        cache_root = './tests/_utils/dataset_cache'
        coai_config = './tests/_utils/dummy_coai'

        res_path = get_resource_file_path(
            'resources://MSCOCO',
            'MSCOCO',
            cache_dir=cache_root,
            config_dir=coai_config,
        )

        # The resource resolves to the "mscoco" folder of the unzipped archive.
        expected_dir = os.path.join(
            cache_root,
            'f2c79c204e083627ea6c166061b45ba536813058caf178d21ca58daf5abe8a01_unzip/mscoco',
        )
        assert res_path == expected_dir
        assert os.path.exists(res_path)

        # The directory content must hash to the pinned value.
        content_digest = dirhash(res_path, 'sha256')
        assert content_digest == 'f8ece190272864935f1849d784cb67d36b970c54aceadbcd7e845bdeefc23544'

        # The metadata file must contain exactly the local path.
        meta_path = os.path.join(
            cache_root,
            'f2c79c204e083627ea6c166061b45ba536813058caf178d21ca58daf5abe8a01.json',
        )
        assert os.path.exists(meta_path)
        with open(meta_path, 'r') as handle:
            recorded = json.load(handle)
            assert recorded == {'local_path': res_path}
Ejemplo n.º 5
0
    def test_get_resource(self, r_mock):
        """Resolve ``resources://coai`` against a mocked URL and verify the cache.

        ``r_mock`` registers a GET for the coai URL that returns the text
        ``coai``. The test then checks the cached file's path, the sha256 of
        its content, and the JSON metadata stored at ``<res_path>.json``.
        """
        r_mock.get('http://coai.cs.tsinghua.edu.cn/', text='coai')

        cache_root = './tests/_utils/dataset_cache'
        coai_config = './tests/_utils/dummy_coai'
        res_path = get_resource_file_path(
            'resources://coai',
            'Default',
            cache_dir=cache_root,
            config_dir=coai_config,
        )

        expected_path = os.path.join(
            cache_root,
            '6bd9bfb20a5159d1848a203ece33886690b15d785b0c5d632eed63d70442c58b',
        )
        assert res_path == expected_path
        assert os.path.exists(res_path)

        # Hash the cached file in 4096-byte reads until an empty read.
        digest = hashlib.sha256()
        with open(res_path, "rb") as stream:
            while True:
                block = stream.read(4096)
                if block == b"":
                    break
                digest.update(block)
        content_hash = digest.hexdigest()
        assert content_hash == "146ce545f2ed0a8767aadae8f2921f7951df817b39b8f7d0db48bce87e3eaf69"

        # The metadata file must contain exactly the local path.
        meta_path = res_path + '.json'
        assert os.path.exists(meta_path)
        with open(meta_path, 'r') as handle:
            recorded = json.load(handle)
            assert recorded == {'local_path': res_path}
Ejemplo n.º 6
0
 def __init__(self,
              file_id="../data/film",
              min_vocab_times=0,
              max_sent_length=10086,
              invalid_vocab_times=0):
     """Store the loader settings and resolve ``file_id`` to a local path.

     Each argument is kept on the instance under the same name with a
     leading underscore; ``_file_path`` holds the result of
     ``get_resource_file_path(file_id)``. The parent initialiser is called
     last, with no arguments.
     """
     # Resource identifier and the path it resolves to.
     self._file_id = file_id
     self._file_path = get_resource_file_path(file_id)

     # Vocabulary and sentence-length settings, stored as given.
     self._min_vocab_times = min_vocab_times
     self._max_sent_length = max_sent_length
     self._invalid_vocab_times = invalid_vocab_times

     super(MyLM, self).__init__()
Ejemplo n.º 7
0
	def test_MSCOCO_resource(self):
		"""Resolve the local MSCOCO zip and check the expected split files exist.

		The resolved path must be a directory containing ``train.txt``,
		``test.txt`` and ``dev.txt``. The unzip directory is removed in a
		``finally`` clause so a failing assertion does not leave extracted
		files behind.
		"""
		unzip_dir = str(pathlib.Path('./tests/_utils/data/mscoco.zip_unzip'))
		try:
			res_path = get_resource_file_path(str(pathlib.Path('./tests/_utils/data/mscoco.zip#MSCOCO')))
			assert os.path.isdir(res_path)

			for key in ['train', 'test', 'dev']:
				assert os.path.isfile(os.path.join(res_path, key + '.txt'))
		finally:
			# Skip removal if nothing was extracted, so a cleanup error does
			# not pile on top of the real failure.
			if os.path.exists(unzip_dir):
				shutil.rmtree(unzip_dir)
Ejemplo n.º 8
0
 def __init__(self,
              file_id="../data/film",
              min_vocab_times=0,
              max_sent_length=10086,
              invalid_vocab_times=0,
              num_turns=8,
              max_know_length=100):
     """Store the loader settings and resolve ``file_id`` to a local path.

     Each argument is kept on the instance under the same name with a
     leading underscore; ``_file_path`` holds the result of
     ``get_resource_file_path(file_id)``. The parent initialiser is called
     last, with no arguments.
     """
     # Resource identifier and the path it resolves to.
     self._file_id = file_id
     self._file_path = get_resource_file_path(file_id)

     # Vocabulary and sentence-length settings, stored as given.
     self._min_vocab_times = min_vocab_times
     self._max_sent_length = max_sent_length
     self._invalid_vocab_times = invalid_vocab_times

     # Turn-count and knowledge-length settings, stored as given.
     self._num_turns = num_turns
     self._max_know_length = max_know_length

     super(MyMemSeq2Seq, self).__init__()
Ejemplo n.º 9
0
	def test_SwitchboardCorpus_resource(self):
		"""Resolve the local SwitchboardCorpus zip and check the expected files exist.

		The resolved path must be a directory containing ``train.txt``,
		``test.txt``, ``dev.txt`` and ``multi_ref.txt``. The unzip directory
		is removed in a ``finally`` clause so a failing assertion does not
		leave extracted files behind.
		"""
		unzip_dir = str(pathlib.Path('./tests/_utils/data/switchboard_corpus.zip_unzip'))
		try:
			res_path = get_resource_file_path(str(pathlib.Path('./tests/_utils/data/switchboard_corpus.zip#SwitchboardCorpus')))
			assert os.path.isdir(res_path)

			for key in ['train', 'test', 'dev', 'multi_ref']:
				assert os.path.isfile(os.path.join(res_path, key + '.txt'))
		finally:
			# Skip removal if nothing was extracted, so a cleanup error does
			# not pile on top of the real failure.
			if os.path.exists(unzip_dir):
				shutil.rmtree(unzip_dir)
Ejemplo n.º 10
0
	def test_glove50d_resource(self):
		"""Resolve the local Glove50d zip and compare it with the reference data.

		The resolved path must be the ``50d`` folder inside the ``.zip_unzip``
		directory, and its files must match those under ``data/glove/50d``
		(compared with ``check``). The unzip directory is removed in a
		``finally`` clause so a failing assertion does not leave extracted
		files behind.
		"""
		data_dir = str(pathlib.Path('./tests/_utils/data'))
		unzip_dir = str(pathlib.Path('./tests/_utils/data/glove.6B.50d.zip_unzip'))
		try:
			res_path = get_resource_file_path(str(pathlib.Path('./tests/_utils/data/glove.6B.50d.zip#Glove50d')))

			filenames = os.listdir(res_path)
			assert res_path == str(pathlib.Path('./tests/_utils/data/glove.6B.50d.zip_unzip/50d'))
			assert sorted(filenames) == sorted(os.listdir(os.path.join(data_dir, 'glove', '50d')))
			for filename in filenames:
				check(os.path.join(res_path, filename), os.path.join(data_dir, 'glove', '50d', filename))
		finally:
			# Skip removal if nothing was extracted, so a cleanup error does
			# not pile on top of the real failure.
			if os.path.exists(unzip_dir):
				shutil.rmtree(unzip_dir)
Ejemplo n.º 11
0
    def test_MSCOCO_resource(self):
        """Resolve the local MSCOCO zip and compare it with the reference data.

        The resolved path must be the ``mscoco`` folder inside the
        ``.zip_unzip`` directory, and its files must match those under
        ``data/mscoco`` (compared with ``check``). The unzip directory is
        removed in a ``finally`` clause so a failing assertion does not leave
        extracted files behind.
        """
        data_dir = './tests/_utils/data'
        unzip_dir = './tests/_utils/data/mscoco.zip_unzip'
        try:
            res_path = get_resource_file_path(
                './tests/_utils/data/mscoco.zip#MSCOCO')

            filenames = os.listdir(res_path)
            assert res_path == './tests/_utils/data/mscoco.zip_unzip/mscoco'
            assert sorted(filenames) == sorted(
                os.listdir(os.path.join(data_dir, 'mscoco')))
            for filename in filenames:
                check(os.path.join(res_path, filename),
                      os.path.join(data_dir, 'mscoco', filename))
        finally:
            # Skip removal if nothing was extracted, so a cleanup error does
            # not pile on top of the real failure.
            if os.path.exists(unzip_dir):
                shutil.rmtree(unzip_dir)
Ejemplo n.º 12
0
    def test_get_resource(self, r_mock):
        """Exercise error paths and the success path of ``resources://coai``.

        ``r_mock`` registers a GET for the coai URL that returns the text
        ``coai``. Three invalid requests must raise (wrong config directory,
        mismatching resource type, unknown source); a valid request must then
        produce a cached file whose name equals the sha256 of its content,
        plus a JSON metadata file. The cache directory is removed in a
        ``finally`` clause so a failing assertion does not leave cached files
        behind.
        """
        r_mock.get('http://coai.cs.tsinghua.edu.cn/', text='coai')

        cache_dir = './tests/_utils/dataset_cache'
        config_dir = './tests/_utils/dummy_coai'
        # Used both as the expected cache file name and the content digest.
        expected_hash = '146ce545f2ed0a8767aadae8f2921f7951df817b39b8f7d0db48bce87e3eaf69'

        try:
            with pytest.raises(FileNotFoundError) as excinfo:
                get_resource_file_path('resources://coai',
                                       cache_dir=cache_dir,
                                       config_dir='wrongpath')
            assert "not found" in str(excinfo.value)

            with pytest.raises(ValueError) as excinfo:
                get_resource_file_path('resources://coai#wrongtype',
                                       cache_dir=cache_dir,
                                       config_dir=config_dir)
            assert "differs with res_type" in str(excinfo.value)

            with pytest.raises(ValueError) as excinfo:
                get_resource_file_path('resources://coai@wronglink',
                                       cache_dir=cache_dir,
                                       config_dir=config_dir)
            assert "source wronglink wrong" in str(excinfo.value)

            res_path = get_resource_file_path('resources://coai',
                                              cache_dir=cache_dir,
                                              config_dir=config_dir)

            assert res_path == os.path.join(cache_dir, expected_hash)
            assert os.path.exists(res_path)

            # Hash the cached file in 4096-byte chunks.
            hash_sha256 = hashlib.sha256()
            with open(res_path, "rb") as fin:
                for chunk in iter(lambda: fin.read(4096), b""):
                    hash_sha256.update(chunk)
            assert hash_sha256.hexdigest() == expected_hash

            meta_path = res_path + '.json'
            assert os.path.exists(meta_path)
            with open(meta_path, 'r') as meta_file:
                meta = json.load(meta_file)
            assert meta['local_path'] == res_path
        finally:
            # Skip removal if the cache directory was never created, so a
            # cleanup error does not pile on top of the real failure.
            if os.path.exists(cache_dir):
                shutil.rmtree(cache_dir)
Ejemplo n.º 13
0
    def test_OpenSubtitles_resource(self):
        """Resolve the local OpenSubtitles zip and compare it with the reference data.

        The resolved path must be the ``opensubtitles`` folder inside the
        ``.zip_unzip`` directory, and its files must match those under
        ``data/opensubtitles`` (compared with ``check``). The unzip directory
        is removed in a ``finally`` clause so a failing assertion does not
        leave extracted files behind.
        """
        data_dir = './tests/_utils/data'
        unzip_dir = './tests/_utils/data/opensubtitles.zip_unzip'
        try:
            res_path = get_resource_file_path(
                './tests/_utils/data/opensubtitles.zip#OpenSubtitles')

            filenames = os.listdir(res_path)
            assert (res_path ==
                    './tests/_utils/data/opensubtitles.zip_unzip/opensubtitles')
            assert sorted(filenames) == sorted(
                os.listdir(os.path.join(data_dir, 'opensubtitles')))
            for filename in filenames:
                check(os.path.join(res_path, filename),
                      os.path.join(data_dir, 'opensubtitles', filename))
        finally:
            # Skip removal if nothing was extracted, so a cleanup error does
            # not pile on top of the real failure.
            if os.path.exists(unzip_dir):
                shutil.rmtree(unzip_dir)
Ejemplo n.º 14
0
    def test_Ubuntu_resource(self):
        """Resolve the local Ubuntu zip and compare it with the reference data.

        The resolved path must be the ``ubuntu_dataset`` folder inside the
        ``.zip_unzip`` directory, and its files must match those under
        ``data/ubuntu_dataset`` (compared with ``check``). The unzip directory
        is removed in a ``finally`` clause so a failing assertion does not
        leave extracted files behind.
        """
        data_dir = str(pathlib.Path('./tests/_utils/data'))
        unzip_dir = str(
            pathlib.Path('./tests/_utils/data/ubuntu_dataset.zip_unzip'))
        try:
            res_path = get_resource_file_path(
                str(pathlib.Path('./tests/_utils/data/ubuntu_dataset.zip#Ubuntu')))

            filenames = os.listdir(res_path)
            assert res_path == str(
                pathlib.Path(
                    './tests/_utils/data/ubuntu_dataset.zip_unzip/ubuntu_dataset'))
            assert sorted(filenames) == sorted(
                os.listdir(os.path.join(data_dir, 'ubuntu_dataset')))
            for filename in filenames:
                check(os.path.join(res_path, filename),
                      os.path.join(data_dir, 'ubuntu_dataset', filename))
        finally:
            # Skip removal if nothing was extracted, so a cleanup error does
            # not pile on top of the real failure.
            if os.path.exists(unzip_dir):
                shutil.rmtree(unzip_dir)
Ejemplo n.º 15
0
    def __init__(self,
                 file_id,
                 min_vocab_times=10,
                 max_sent_length=50,
                 invalid_vocab_times=0,
                 num_samples=10,
                 raml_file="samples_iwslt14.txt",
                 tau=0.4,
                 raml=True):
        """Store the loader settings, resolve ``file_id`` and load RAML samples.

        Args:
            file_id: Resource identifier; resolved with
                ``get_resource_file_path`` and also forwarded to the parent
                initialiser.
            min_vocab_times: Stored as ``_min_vocab_times``.
            max_sent_length: Stored as ``_max_sent_length``.
            invalid_vocab_times: Stored as ``_invalid_vocab_times``.
            num_samples: Stored as ``n_samples``.
            raml_file: File name joined onto the resolved resource path to
                form ``raml_path``.
            tau: Stored as ``tau``.
            raml: Stored as ``raml_mode``.
        """
        self._file_id = file_id
        self._file_path = get_resource_file_path(file_id)
        self._min_vocab_times = min_vocab_times
        self._max_sent_length = max_sent_length
        self._invalid_vocab_times = invalid_vocab_times

        # RAML specific
        self.raml_mode = raml
        self.n_samples = num_samples
        self.tau = tau
        # Assigned once; the original repeated this identical assignment.
        self.raml_path = os.path.join(self._file_path, raml_file)
        super(IWSLT14, self).__init__(file_id=file_id)
        # Called after the parent initialiser has run, as in the original.
        self.raml_data = self.read_raml_sample_file()