def _download(self, target_path):
    """
    Download and unpack the selected subsets into ``target_path``.

    When ``self.subsets`` is ``None`` or empty, every subset listed in
    ``SUBSETS`` is downloaded. Requested names that are not keys of
    ``SUBSETS`` are skipped silently.

    Args:
        target_path (str): Folder the data is stored in. It is created
                           if it does not exist yet.
    """
    if self.subsets is None or len(self.subsets) == 0:
        to_download = SUBSETS.keys()
    else:
        to_download = self.subsets

    os.makedirs(target_path, exist_ok=True)

    for subset_name in to_download:
        if subset_name not in SUBSETS:
            continue

        tmp_file = os.path.join(target_path, '{}.tar.gz'.format(subset_name))
        download.download_file(
            SUBSETS[subset_name],
            tmp_file,
            num_threads=self.num_threads
        )
        download.extract_tar(tmp_file, target_path)

        # The extracted content is expected below ``<target_path>/LibriSpeech``;
        # lift every entry one level up so it sits directly in ``target_path``.
        extract_sub_path = os.path.join(target_path, 'LibriSpeech')

        for item in os.listdir(extract_sub_path):
            item_path = os.path.join(extract_sub_path, item)
            item_target_path = os.path.join(target_path, item)
            shutil.move(item_path, item_target_path)

        # Remove the now-empty extraction folder. The path variable must be
        # passed here, not the string literal 'extract_sub_path', which
        # would point at an unrelated relative path.
        shutil.rmtree(extract_sub_path, ignore_errors=True)
        os.remove(tmp_file)
def _download(self, target_path):
    """
    Build the meta file and fetch the audio for all usable sentences.

    The sentence list and the audio list are downloaded and extracted
    into a ``temp`` folder below ``target_path``. Only sentence ids that
    occur in both lists are kept. The resulting records are written to
    ``META_FILENAME`` and handed to ``_download_audio_files``. The
    ``temp`` folder is removed at the end.

    Args:
        target_path (str): Folder the corpus is stored in.
    """
    work_dir = os.path.join(target_path, 'temp')
    os.makedirs(work_dir, exist_ok=True)

    sentence_archive = os.path.join(work_dir, 'sentences.tar.bz2')
    sentence_csv = os.path.join(work_dir, 'sentences.csv')
    audio_archive = os.path.join(work_dir, 'sentences_with_audio.tar.bz2')
    audio_csv = os.path.join(work_dir, 'sentences_with_audio.csv')

    download.download_file(SENTENCE_LIST_URL, sentence_archive)
    download.download_file(AUDIO_LIST_URL, audio_archive)

    for archive in (sentence_archive, audio_archive):
        download.extract_tar(archive, work_dir)

    audio_by_id = self._load_audio_list(audio_csv)
    sentence_by_id = self._load_sentence_list(sentence_csv)

    ids_with_audio = set(audio_by_id.keys())
    ids_with_text = set(sentence_by_id.keys())
    shared_ids = ids_with_audio.intersection(ids_with_text)

    # Record layout: sentence-id, username, language, transcript
    records = []
    for sentence_id in shared_ids:
        audio_entry = audio_by_id[sentence_id]
        sentence_entry = sentence_by_id[sentence_id]
        records.append(
            (sentence_id, audio_entry[0], sentence_entry[0], sentence_entry[1])
        )

    textfile.write_separated_lines(
        os.path.join(target_path, META_FILENAME),
        records,
        separator='\t',
        sort_by_column=0
    )

    self._download_audio_files(records, target_path)

    shutil.rmtree(work_dir, ignore_errors=True)
def _download(self, target_path):
    """
    Download the zip archive from ``self.url`` and unpack it into ``target_path``.

    After extraction ``files.move_all_files_from_subfolders_to_top`` is
    called on ``target_path`` and the downloaded archive is deleted.

    Args:
        target_path (str): Folder the data is stored in. It is created
                           if it does not exist yet.
    """
    os.makedirs(target_path, exist_ok=True)

    archive_path = os.path.join(target_path, 'tmp_ark.zip')

    download.download_file(self.url, archive_path)
    download.extract_zip(archive_path, target_path)

    files.move_all_files_from_subfolders_to_top(target_path)

    # The archive is only needed for the extraction.
    os.remove(archive_path)
def _download_audio_files(self, records, target_path):
    """
    Download one mp3 file for every given record.

    For each record, index 2 names the sub-folder below
    ``<target_path>/audio`` and index 0 is used as file name.
    Both values are also used to build the download url.

    Args:
        records (iterable): The records to download the audio for.
        target_path (str): Base folder the audio files are stored below.
    """
    audio_root = os.path.join(target_path, 'audio')

    for entry in logger.progress(records):
        folder_key = entry[2]
        file_key = entry[0]

        destination_dir = os.path.join(audio_root, folder_key)
        destination = os.path.join(destination_dir, '{}.mp3'.format(file_key))

        os.makedirs(destination_dir, exist_ok=True)

        url = 'https://audio.tatoeba.org/sentences/{}/{}.mp3'.format(
            folder_key,
            file_key
        )
        download.download_file(url, destination)
def _download(self, target_path):
    """
    Download and extract the archives of all selected tags.

    Every entry of ``DOWNLOAD_URLS`` is processed if ``self.tags`` is
    ``None`` or contains the tag of the entry. Each archive is removed
    again once it has been extracted into ``target_path``.

    Args:
        target_path (str): Folder the data is stored in. It is created
                           if it does not exist yet.
    """
    os.makedirs(target_path, exist_ok=True)

    for tag, download_url in DOWNLOAD_URLS.items():
        # Skip tags that were not requested (``None`` selects all tags).
        if self.tags is not None and tag not in self.tags:
            continue

        archive_path = os.path.join(target_path, 'tmp_{}.tgz'.format(tag))

        download.download_file(download_url, archive_path)
        download.extract_tar(archive_path, target_path)
        os.remove(archive_path)
def _download(self, target_path):
    """
    Download the archive from ``self.url`` and extract it into ``target_path``.

    The extraction is delegated to ``self._extract_file``. If
    ``self.move_files_up`` is set,
    ``files.move_all_files_from_subfolders_to_top`` is called with
    ``delete_subfolders=True``. The downloaded archive is deleted at the end.

    Args:
        target_path (str): Folder the data is stored in. It is created
                           if it does not exist yet.
    """
    os.makedirs(target_path, exist_ok=True)

    archive_path = os.path.join(target_path, 'tmp_ark')

    download.download_file(self.url, archive_path)
    self._extract_file(archive_path, target_path)

    if self.move_files_up:
        files.move_all_files_from_subfolders_to_top(
            target_path,
            delete_subfolders=True
        )

    os.remove(archive_path)
def _download(self, target_path):
    """
    Download the tar archive from ``self.url`` and unpack it into ``target_path``.

    After extraction ``files.move_all_files_from_subfolders_to_top`` is
    called with ``delete_subfolders=True`` and ``copy=True``, and the
    downloaded archive is deleted.

    Args:
        target_path (str): Folder the data is stored in. It is created
                           if it does not exist yet.
    """
    os.makedirs(target_path, exist_ok=True)

    archive_path = os.path.join(target_path, 'tmp_ark.tar.gz')

    download.download_file(self.url, archive_path)
    download.extract_tar(archive_path, target_path)

    # Copy instead of move: the subfolders in the archive are read-only,
    # so moving files out of them fails with a permission error.
    files.move_all_files_from_subfolders_to_top(
        target_path,
        delete_subfolders=True,
        copy=True
    )

    os.remove(archive_path)
def test_download_file(sample_zip_data, tmpdir):
    """ Check that ``download_file`` stores the served content at the target path. """
    url = 'http://some.url/thezipfile.zip'
    destination = os.path.join(tmpdir.strpath, 'target.zip')

    with requests_mock.Mocker() as mocker:
        mocker.get(url, content=sample_zip_data)
        download.download_file(url, destination)

    assert os.path.isfile(destination)

    with open(destination, 'rb') as stored:
        stored_content = stored.read()

    assert stored_content == sample_zip_data
def test_download_file(sample_zip_data, tmpdir):
    """ Check that ``download_file`` stores the served content at the target path. """
    url = 'http://some.url/thezipfile.zip'
    destination = os.path.join(tmpdir.strpath, 'target.zip')

    with requests_mock.Mocker() as mocker:
        # The reported size is arbitrary (it doesn't matter, it is only used for prints).
        mocker.head(requests_mock.ANY, headers={'Content-Length': '100'})
        mocker.get(url, content=sample_zip_data)

        download.download_file(url, destination)

    assert os.path.isfile(destination)

    with open(destination, 'rb') as stored:
        stored_content = stored.read()

    assert stored_content == sample_zip_data