Example #1
0
    def test_download_data(self):
        """Fetch a zip by URL through ``get_resource_file_path`` and check the cache entry.

        Verifies that the returned path is the expected entry inside
        ``cache_dir``, that the file exists, and that the sibling ``.json``
        metadata file records the same path under ``local_path``.

        The cache directory is removed in a ``finally`` clause so that a
        failing assertion does not leave downloaded files behind for later
        test runs.
        """
        cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
        url = 'https://cotk-data.s3-ap-northeast-1.amazonaws.com/test.zip'
        expected_name = 'f1043836933af4b8b28973d259c0c77f5049de2dff8d0d1f305c65f3c497b3b1'

        try:
            # The resource is requested twice with identical arguments;
            # presumably the second call exercises the already-cached path
            # -- TODO confirm against get_resource_file_path.
            get_resource_file_path(url, cache_dir=cache_dir)
            res_path = get_resource_file_path(url, cache_dir=cache_dir)

            assert res_path == os.path.join(cache_dir, expected_name)
            assert os.path.exists(res_path)

            meta_path = os.path.join(cache_dir, expected_name + '.json')
            assert os.path.exists(meta_path)
            with open(meta_path, 'r', encoding='utf-8') as meta_file:
                meta = json.load(meta_file)
            assert meta['local_path'] == res_path
        finally:
            # Guarded: the directory may not exist if the lookup itself failed.
            if os.path.exists(cache_dir):
                shutil.rmtree(cache_dir)
Example #2
0
    def test_download_resource(self):
        """Resolve the ``test`` resource by name and check the resulting cache entry.

        Verifies that the returned path is the expected entry inside
        ``cache_dir``, that the file exists, and that the sibling ``.json``
        metadata file records the same path under ``local_path``.

        The cache directory is removed in a ``finally`` clause so that a
        failing assertion does not leave cached files behind for later
        test runs.
        """
        cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
        config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
        expected_name = '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'

        try:
            # First lookup names a source explicitly via the '@amazon' suffix;
            # the second uses the bare resource id. Only the second result is
            # asserted on. Presumably the second call hits the cache
            # populated by the first -- TODO confirm.
            get_resource_file_path('resources://test@amazon',
                                   cache_dir=cache_dir,
                                   config_dir=config_dir)
            res_path = get_resource_file_path('resources://test',
                                              cache_dir=cache_dir,
                                              config_dir=config_dir)

            assert res_path == os.path.join(cache_dir, expected_name)
            assert os.path.exists(res_path)

            meta_path = os.path.join(cache_dir, expected_name + '.json')
            assert os.path.exists(meta_path)
            with open(meta_path, 'r', encoding='utf-8') as meta_file:
                meta = json.load(meta_file)
            assert meta['local_path'] == res_path
        finally:
            # Guarded: the directory may not exist if the lookup itself failed.
            if os.path.exists(cache_dir):
                shutil.rmtree(cache_dir)
Example #3
0
	def test_MSCOCO_resource(self):
		"""Resolve the local ``mscoco.zip#MSCOCO`` resource and check its layout.

		The returned path must be a directory containing ``train.txt``,
		``test.txt`` and ``dev.txt``.

		Cleanup of the ``mscoco.zip_unzip`` directory and the cache directory
		runs in a ``finally`` clause so that a failing assertion does not
		leave extracted files behind for later test runs.
		"""
		cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
		config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
		unzip_dir = str(pathlib.Path('./tests/file_utils/data/mscoco.zip_unzip'))

		try:
			res_path = get_resource_file_path(str(pathlib.Path('./tests/file_utils/data/mscoco.zip#MSCOCO')), cache_dir, config_dir)
			assert os.path.isdir(res_path)

			for key in ['train', 'test', 'dev']:
				assert os.path.isfile(os.path.join(res_path, key + '.txt'))

			# The extraction directory is expected to exist after a successful
			# lookup; checked here because the cleanup below is guarded.
			assert os.path.isdir(unzip_dir)
		finally:
			# Guarded: either directory may be missing if the lookup failed early.
			for leftover in (unzip_dir, cache_dir):
				if os.path.exists(leftover):
					shutil.rmtree(leftover)
Example #4
0
    def __init__(self):
        """Build a dummy dataloader over ``./tests/dataloader/dummy_languageprocessing``.

        A single ``SentenceDefault`` field named ``'sent'`` is registered for
        each of the ``train``, ``dev`` and ``test`` sets. Inside a
        ``FieldContext`` the fields are filled, data is loaded, vocabularies
        are built, and the hashes, data and batch state are stored on
        ``self``. Finally the default field is set to ``('train', 'sent')``.
        """
        self.file_id = './tests/dataloader/dummy_languageprocessing'
        self.file_path = get_resource_file_path(self.file_id)

        all_vocab_list = ['<pad>', '<unk>', '<go>', '<eos>',
                          'what', 'how', 'here', 'do', 'as', 'can', 'to']
        set_names = ["train", "dev", "test"]
        # NOTE(review): the meaning of the second argument (8) is not visible
        # here -- see GeneralVocab.from_predefined.
        vocab = GeneralVocab.from_predefined(all_vocab_list, 8)
        toker = SimpleTokenizer('space', ['<pad>', '<unk>', '<go>', '<eos>'])
        sent = SentenceDefault(toker, vocab, convert_to_lower_letter=True)
        # Every set shares the same single field instance.
        fields = {set_name: [('sent', sent)] for set_name in set_names}

        # A second, separately constructed vocab is passed to the context;
        # it is intentionally not the same object as ``vocab`` above.
        with FieldContext.set_parameters(
                vocab=GeneralVocab.from_predefined(all_vocab_list, 8),
                weak=True):

            fieldcontents: Dict[str, OrderedDictType[str, _FieldContent]] = {}
            self.fields: "OrderedDict[str, OrderedDictType[str, Field]]" = {}
            # ``fields`` is always the plain dict built above, so it is
            # iterated directly with no type dispatch.
            for set_name, fields_in_one_set in fields.items():
                one_fields, one_fieldcontents = self._fill_field_and_create_content(
                    set_name, fields_in_one_set)
                self.fields[set_name] = one_fields
                fieldcontents[set_name] = one_fieldcontents

            self._load_data(fieldcontents)

            self.vocabs = self._collect_vocabs_from_fields(self.fields)
            # self.default_vocab_id = 0 if len(self.vocabs) == 1 else None
            self.tokenizers = self._collect_tokenizers_from_fields(self.fields)
            # self.default_tokenizer_id = 0 if len(self.tokenizers) == 1 else None
            self.default_field_set_name: Optional[str] = None
            self.default_field_name: Optional[str] = None
            self._build_vocabs()

            self._setting_hash = self._create_setting_hash()
            self._vocab_hash = self._create_vocab_hash()
            self.data = self._get_data(fieldcontents)
            self._raw_data_hash, self._data_hash = self._create_data_hash(
                fieldcontents)
            self.index, self.batch_id, self.batch_size = self._init_batch(
                fieldcontents)

        self.set_default_field("train", "sent")
0
	def test_SwitchboardCorpus_resource(self):
		"""Resolve the local ``switchboard_corpus.zip#SwitchboardCorpus`` resource and check its layout.

		The returned path must be a directory containing ``train.txt``,
		``test.txt``, ``dev.txt`` and ``multi_ref.txt``.

		Cleanup of the ``switchboard_corpus.zip_unzip`` directory and the
		cache directory runs in a ``finally`` clause so that a failing
		assertion does not leave extracted files behind for later test runs.
		"""
		cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
		config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
		unzip_dir = str(pathlib.Path('./tests/file_utils/data/switchboard_corpus.zip_unzip'))

		try:
			res_path = get_resource_file_path(str(pathlib.Path('./tests/file_utils/data/switchboard_corpus.zip#SwitchboardCorpus')), cache_dir, config_dir)
			assert os.path.isdir(res_path)

			for key in ['train', 'test', 'dev', 'multi_ref']:
				assert os.path.isfile(os.path.join(res_path, key + '.txt'))

			# The extraction directory is expected to exist after a successful
			# lookup; checked here because the cleanup below is guarded.
			assert os.path.isdir(unzip_dir)
		finally:
			# Guarded: either directory may be missing if the lookup failed early.
			for leftover in (unzip_dir, cache_dir):
				if os.path.exists(leftover):
					shutil.rmtree(leftover)
Example #6
0
	def test_glove50d_resource(self):
		"""Resolve the local ``glove.6B.50d.zip#Glove50d`` resource and compare it with reference data.

		The returned path must be the ``50d`` directory inside
		``glove.6B.50d.zip_unzip``; its file names must match those under
		``data/glove/50d``, and each file is passed to ``check`` together with
		its reference counterpart.

		Cleanup of the unzip directory and the cache directory runs in a
		``finally`` clause so that a failing assertion does not leave
		extracted files behind for later test runs.
		"""
		cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
		config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
		data_dir = str(pathlib.Path('./tests/file_utils/data'))
		unzip_dir = str(pathlib.Path('./tests/file_utils/data/glove.6B.50d.zip_unzip'))
		reference_dir = os.path.join(data_dir, 'glove', '50d')

		try:
			res_path = get_resource_file_path(str(pathlib.Path('./tests/file_utils/data/glove.6B.50d.zip#Glove50d')), cache_dir, config_dir)

			filenames = os.listdir(res_path)
			assert res_path == str(pathlib.Path('./tests/file_utils/data/glove.6B.50d.zip_unzip/50d'))
			assert sorted(filenames) == sorted(os.listdir(reference_dir))
			for filename in filenames:
				# `check` is defined elsewhere; presumably it compares the two
				# files' contents -- TODO confirm.
				check(os.path.join(res_path, filename), os.path.join(reference_dir, filename))
		finally:
			# Guarded: either directory may be missing if the lookup failed early.
			for leftover in (unzip_dir, cache_dir):
				if os.path.exists(leftover):
					shutil.rmtree(leftover)
Example #7
0
    def test_get_resource(self, r_mock):
        """Exercise error paths and the success path of ``get_resource_file_path`` for ``resources://coai``.

        Args:
            r_mock: HTTP mock object; it is told to answer
                ``http://coai.cs.tsinghua.edu.cn/`` with the text ``'coai'``.

        Checks, in order:
          * a wrong ``config_dir`` raises ``FileNotFoundError`` ("not found");
          * an unknown ``#wrongtype`` suffix raises ``RuntimeError``
            ("No resources type");
          * an unknown ``@wronglink`` suffix raises ``ValueError``
            ("source wronglink wrong");
          * a plain lookup returns the expected cache entry, whose SHA-256
            digest equals its file name and whose ``.json`` metadata records
            the same path under ``local_path``.

        The cache directory is removed in a ``finally`` clause so that a
        failing assertion does not leave cached files behind for later
        test runs.
        """
        r_mock.get('http://coai.cs.tsinghua.edu.cn/', text='coai')

        cache_dir = str(pathlib.Path('./tests/file_utils/dataset_cache'))
        config_dir = str(pathlib.Path('./tests/file_utils/dummy_coai'))
        expected_digest = '146ce545f2ed0a8767aadae8f2921f7951df817b39b8f7d0db48bce87e3eaf69'

        try:
            with pytest.raises(FileNotFoundError) as excinfo:
                get_resource_file_path('resources://coai',
                                       cache_dir=cache_dir,
                                       config_dir='wrongpath')
            assert "not found" in str(excinfo.value)

            with pytest.raises(RuntimeError) as excinfo:
                get_resource_file_path('resources://coai#wrongtype',
                                       cache_dir=cache_dir,
                                       config_dir=config_dir)
            assert "No resources type" in str(excinfo.value)

            with pytest.raises(ValueError) as excinfo:
                get_resource_file_path('resources://coai@wronglink',
                                       cache_dir=cache_dir,
                                       config_dir=config_dir)
            assert "source wronglink wrong" in str(excinfo.value)

            res_path = get_resource_file_path('resources://coai',
                                              cache_dir=cache_dir,
                                              config_dir=config_dir)

            assert res_path == os.path.join(cache_dir, expected_digest)
            assert os.path.exists(res_path)

            # Hash the cached file in fixed-size chunks rather than reading
            # it into memory at once.
            hash_sha256 = hashlib.sha256()
            with open(res_path, "rb") as fin:
                for chunk in iter(lambda: fin.read(4096), b""):
                    hash_sha256.update(chunk)
            assert hash_sha256.hexdigest() == expected_digest

            meta_path = res_path + '.json'
            assert os.path.exists(meta_path)
            with open(meta_path, 'r', encoding='utf-8') as meta_file:
                meta = json.load(meta_file)
            assert meta['local_path'] == res_path
        finally:
            # Guarded: the directory may not exist if an early step failed.
            if os.path.exists(cache_dir):
                shutil.rmtree(cache_dir)