Ejemplo n.º 1
0
    def test_download_with_filter_license(self, sample_audio_list_tar_bz,
                                          sample_sentence_list_tar_bz,
                                          sample_audio_content, tmpdir):
        """Download with a license filter and check which clips get written.

        All remote resources are served by ``requests_mock``. With
        ``include_licenses=['CC BY-NC-ND 3.0']`` only the Italian clip is
        expected on disk (presumably the only mocked clip carrying that
        license in the sample lists -- confirm against the fixtures).
        """
        target_dir = tmpdir.strpath

        # (mocked url, path below <target>/audio, expected on disk afterwards)
        clips = [
            ('https://audio.tatoeba.org/sentences/eng/141.mp3',
             ('eng', '141.mp3'), False),
            ('https://audio.tatoeba.org/sentences/fra/247.mp3',
             ('fra', '247.mp3'), False),
            ('https://audio.tatoeba.org/sentences/epo/1355.mp3',
             ('epo', '1355.mp3'), False),
            ('https://audio.tatoeba.org/sentences/deu/6286.mp3',
             ('deu', '6286.mp3'), False),
            ('https://audio.tatoeba.org/sentences/ita/6921520.mp3',
             ('ita', '6921520.mp3'), True),
        ]

        loader = io.TatoebaDownloader(include_licenses=['CC BY-NC-ND 3.0'])

        with requests_mock.Mocker() as fake_http:
            # The two list archives first, then every audio clip.
            fake_http.get(tatoeba.AUDIO_LIST_URL,
                          content=sample_audio_list_tar_bz)
            fake_http.get(tatoeba.SENTENCE_LIST_URL,
                          content=sample_sentence_list_tar_bz)
            for url, _, _ in clips:
                fake_http.get(url, content=sample_audio_content)

            loader.download(target_dir)

            assert os.path.isfile(os.path.join(target_dir, 'meta.txt'))

            for _, (lang, file_name), expected in clips:
                clip_path = os.path.join(target_dir, 'audio', lang, file_name)
                if expected:
                    assert os.path.isfile(clip_path)
                else:
                    assert not os.path.isfile(clip_path)
Ejemplo n.º 2
0
    def test_load_audio_list_filter_license(self, sample_audio_list_path):
        """Loading the audio list with a license filter yields exactly the
        three 'CC BY-NC 4.0' records of the sample file."""
        # Each record looks like [user, license, url] -- presumably; the
        # field meaning is not visible here, only the values are.
        expected = {
            '247': ['gretelen', 'CC BY-NC 4.0', None],
            '6286': ['Phoenix', 'CC BY-NC 4.0', None],
            '2952354': ['pencil', 'CC BY-NC 4.0', None],
        }

        loader = io.TatoebaDownloader(include_licenses=['CC BY-NC 4.0'])
        loaded = loader._load_audio_list(sample_audio_list_path)

        assert len(loaded) == 3

        for key, record in expected.items():
            assert loaded[key] == record
Ejemplo n.º 3
0
    def test_load_audio_list(self, sample_audio_list_path):
        """Loading the audio list with default settings yields the five
        records of the sample file that carry a license value."""
        expected = {
            '247': ['gretelen', 'CC BY-NC 4.0', None],
            '1881': ['CK', 'CC BY-NC-ND 3.0',
                     'http://www.manythings.org/tatoeba'],
            '6286': ['Phoenix', 'CC BY-NC 4.0', None],
            '2952354': ['pencil', 'CC BY-NC 4.0', None],
            '6921520': ['CK', 'CC BY-NC-ND 3.0',
                        'http://www.manythings.org/tatoeba'],
        }

        loader = io.TatoebaDownloader()
        loaded = loader._load_audio_list(sample_audio_list_path)

        assert len(loaded) == 5

        for key, record in expected.items():
            assert loaded[key] == record
Ejemplo n.º 4
0
    def test_load_sentence_list_filter_languages(self, sample_sentence_list_path):
        """Loading the sentence list restricted to 'deu' and 'eng' yields
        exactly the five sample sentences in those languages."""
        # Each record is [language code, sentence text].
        expected = {
            '141': ['eng', 'I want you to tell me why you did that.'],
            '511': ['deu', 'Wer will heiße Schokolade?'],
            '524': ['deu', 'Das ist zu teuer!'],
            '6286': ['deu', 'Ich denke, ich habe genug gehört.'],
            '299609': ['eng', 'He washes his car at least once a week.'],
        }

        loader = io.TatoebaDownloader(include_languages=['deu', 'eng'])
        loaded = loader._load_sentence_list(sample_sentence_list_path)

        assert len(loaded) == 5

        for key, record in expected.items():
            assert loaded[key] == record
Ejemplo n.º 5
0
    def test_load_audio_list_all(self, sample_audio_list_path):
        """Loading the audio list with ``include_empty_licence=True`` yields
        all seven sample records, including those whose license is None."""
        expected = {
            '141': ['BraveSentry', None, None],
            '247': ['gretelen', 'CC BY-NC 4.0', None],
            '1355': ['Nero', None, None],
            '1881': ['CK', 'CC BY-NC-ND 3.0',
                     'http://www.manythings.org/tatoeba'],
            '6286': ['Phoenix', 'CC BY-NC 4.0', None],
            '2952354': ['pencil', 'CC BY-NC 4.0', None],
            '6921520': ['CK', 'CC BY-NC-ND 3.0',
                        'http://www.manythings.org/tatoeba'],
        }

        loader = io.TatoebaDownloader(include_empty_licence=True)
        loaded = loader._load_audio_list(sample_audio_list_path)

        assert len(loaded) == 7

        for key, record in expected.items():
            assert loaded[key] == record
Ejemplo n.º 6
0
    def test_load_sentence_list(self, sample_sentence_list_path):
        """Loading the sentence list with default settings yields all eight
        sample sentences, across every language in the file."""
        # Each record is [language code, sentence text].
        expected = {
            '141': ['eng', 'I want you to tell me why you did that.'],
            '247': ['fra', 'Comment ça, je suis trop vieille pour ce poste ?'],
            '511': ['deu', 'Wer will heiße Schokolade?'],
            '524': ['deu', 'Das ist zu teuer!'],
            '1355': ['epo', 'Mi panikis la homojn.'],
            '6286': ['deu', 'Ich denke, ich habe genug gehört.'],
            '299609': ['eng', 'He washes his car at least once a week.'],
            '6921520': ['ita', 'Ho una zia che abita a Osaka.'],
        }

        loader = io.TatoebaDownloader()
        loaded = loader._load_sentence_list(sample_sentence_list_path)

        assert len(loaded) == 8

        for key, record in expected.items():
            assert loaded[key] == record
Ejemplo n.º 7
0
def main(argv=None):
    """Download the corpora selected on the command line.

    Parses a positional ``target_path`` plus one ``store_true`` switch per
    corpus. For every switch that is set, a message is printed and the
    corresponding downloader writes into the sub-directory of
    ``target_path`` that is named after the switch.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. With ``None`` (the default) argparse reads
            ``sys.argv[1:]``, so calling ``main()`` behaves as before.
    """
    # (switch and sub-directory name, label shown to the user, factory).
    # The factories are lambdas on purpose: a downloader class is only
    # looked up and instantiated when its switch was actually given.
    corpora = [
        ("tuda", "tuda",
         lambda: io.TudaDownloader()),
        ("voxforge", "voxforge",
         lambda: io.VoxforgeDownloader(lang="de")),
        ("swc", "swc",
         lambda: io.SWCDownloader(lang="de")),
        ("mailabs", "mailabs",
         lambda: io.MailabsDownloader(tags=["de_DE"])),
        ("common_voice", "common-voice",
         lambda: io.CommonVoiceDownloader(lang="de")),
        ("tatoeba", "tatoeba",
         lambda: io.TatoebaDownloader(include_languages=["deu"])),
        ("zamia_speech", "zamia-speech",
         lambda: io.ZamiaSpeechDownloader(lang="de")),
    ]

    parser = argparse.ArgumentParser(description="Prepare data for training.")
    parser.add_argument("target_path", type=str)
    for name, _, _ in corpora:
        parser.add_argument("--" + name, action="store_true")
    args = parser.parse_args(argv)

    # Corpora are processed in table order, matching the switch order above.
    for name, label, make_downloader in corpora:
        if not getattr(args, name):
            continue

        print("Downloading {} ...".format(label))
        downloader = make_downloader()
        downloader.download(os.path.join(args.target_path, name))
Ejemplo n.º 8
0
    def test_download_with_filter_lang(self, sample_audio_list_tar_bz,
                                       sample_sentence_list_tar_bz,
                                       sample_audio_content, tmpdir):
        """Download with a language filter and check which clips get written.

        All remote resources are served by ``requests_mock``. With
        ``include_languages=['deu', 'eng']`` only the German clip is expected
        on disk. NOTE(review): the English clip 141 is expected to be absent
        too -- presumably because its audio-list entry has no license and
        empty licenses are not included by default; confirm against the
        fixtures.
        """
        target_dir = tmpdir.strpath

        # (mocked url, path below <target>/audio, expected on disk afterwards)
        clips = [
            ('https://audio.tatoeba.org/sentences/eng/141.mp3',
             ('eng', '141.mp3'), False),
            ('https://audio.tatoeba.org/sentences/fra/247.mp3',
             ('fra', '247.mp3'), False),
            ('https://audio.tatoeba.org/sentences/epo/1355.mp3',
             ('epo', '1355.mp3'), False),
            ('https://audio.tatoeba.org/sentences/deu/6286.mp3',
             ('deu', '6286.mp3'), True),
            ('https://audio.tatoeba.org/sentences/ita/6921520.mp3',
             ('ita', '6921520.mp3'), False),
        ]

        loader = io.TatoebaDownloader(include_languages=['deu', 'eng'])

        with requests_mock.Mocker() as fake_http:
            # Answer every HEAD request with an arbitrary size; the value
            # doesn't matter, it is only used for prints.
            fake_http.head(requests_mock.ANY, headers={'Content-Length': '100'})

            # The two list archives first, then every audio clip.
            fake_http.get(tatoeba.AUDIO_LIST_URL,
                          content=sample_audio_list_tar_bz)
            fake_http.get(tatoeba.SENTENCE_LIST_URL,
                          content=sample_sentence_list_tar_bz)
            for url, _, _ in clips:
                fake_http.get(url, content=sample_audio_content)

            loader.download(target_dir)

            assert os.path.isfile(os.path.join(target_dir, 'meta.txt'))

            for _, (lang, file_name), expected in clips:
                clip_path = os.path.join(target_dir, 'audio', lang, file_name)
                if expected:
                    assert os.path.isfile(clip_path)
                else:
                    assert not os.path.isfile(clip_path)