def main():
    target_dir = args.target_dir
    target_dir = '/media/data/Soroush_data/CVSpeech' + target_dir
    output_final_dir = '/media/data/Soroush_data/CVSpeech/data/'
    print(target_dir)
    os.makedirs(target_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")
    os.makedirs(target_unpacked_dir, exist_ok=True)

    if args.tar_path and os.path.exists(args.tar_path):
        print('Find existing file {}'.format(args.tar_path))
        target_file = os.getcwd() + '/PycharmProjects/deepspeech.pytorch/' + args.tar_path
    else:
        print("Could not find downloaded Common Voice archive, Downloading corpus...")
        print(target_dir)
        filename = wget.download(COMMON_VOICE_URL, target_dir)
        target_file = os.path.join(target_dir, os.path.basename(filename))

    print("Unpacking corpus to {} ...".format(target_unpacked_dir))
    tar = tarfile.open(target_file)
    tar.extractall(target_unpacked_dir)
    tar.close()

    for csv_file in args.files_to_process.split(','):
        print(csv_file)
        convert_to_wav(os.path.join(target_unpacked_dir, 'cv_corpus_v1/', csv_file),
                       os.path.join(target_dir, os.path.splitext(csv_file)[0]))

    print('Creating manifests...')
    for csv_file in args.files_to_process.split(','):
        create_manifest(os.path.join(target_dir, os.path.splitext(csv_file)[0]),
                        os.path.join(output_final_dir, os.path.splitext(csv_file)[0]) + '_manifest.csv',
                        args.min_duration,
                        args.max_duration)
Esempio n. 2
0
def main():
    target_dl_dir = args.target_dir
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)
    files_to_dl = args.files_to_use.strip().split(',')
    for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(target_dl_dir, split_type)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)
        split_wav_dir = os.path.join(split_dir, "wav")
        if not os.path.exists(split_wav_dir):
            os.makedirs(split_wav_dir)
        split_txt_dir = os.path.join(split_dir, "txt")
        if not os.path.exists(split_txt_dir):
            os.makedirs(split_txt_dir)
        extracted_dir = os.path.join(split_dir, "LibriSpeech")
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)
        for url in lst_libri_urls:
            # check if we want to dl this file
            dl_flag = False
            for f in files_to_dl:
                if url.find(f) != -1:
                    dl_flag = True
            if not dl_flag:
                print("Skipping url: {}".format(url))
                continue
            filename = url.split("/")[-1]
            target_filename = os.path.join(split_dir, filename)
            if not os.path.exists(target_filename):
                wget.download(url, split_dir)
            print("Unpacking {}...".format(filename))
            tar = tarfile.open(target_filename)
            tar.extractall(split_dir)
            tar.close()
            os.remove(target_filename)
            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(
                extracted_dir
            ), "Archive {} was not properly uncompressed.".format(filename)
            for root, subdirs, files in tqdm(os.walk(extracted_dir)):
                for f in files:
                    if f.find(".flac") != -1:
                        _process_file(wav_dir=split_wav_dir,
                                      txt_dir=split_txt_dir,
                                      base_filename=f,
                                      root_dir=root)

            print("Finished {}".format(url))
            shutil.rmtree(extracted_dir)
        if split_type == 'train':  # Prune to min/max duration
            create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv',
                            args.min_duration, args.max_duration)
        else:
            create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv')
Esempio n. 3
0
def main():
    root_path = 'an4/'
    name = 'an4'
    subprocess.call(['wget http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz'], shell=True)
    subprocess.call(['tar -xzvf an4_raw.bigendian.tar.gz'], stdout=open(os.devnull, 'wb'), shell=True)
    os.makedirs(args.an4_path)
    _format_data(root_path, 'train', name, 'an4_clstk')
    _format_data(root_path, 'test', name, 'an4test_clstk')
    shutil.rmtree(root_path)
    os.remove('an4_raw.bigendian.tar.gz')
    train_path = args.an4_path + '/train/'
    test_path = args.an4_path + '/test/'
    print ('Creating manifests...')
    create_manifest(train_path, 'train')
    create_manifest(test_path, 'val')
Esempio n. 4
0
def main():
    root_path = 'an4/'
    name = 'an4'
    wget.download(
        'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
    tar = tarfile.open('an4_raw.bigendian.tar.gz')
    tar.extractall()
    os.makedirs(args.target_dir)
    _format_data(root_path, 'train', name, 'an4_clstk')
    _format_data(root_path, 'test', name, 'an4test_clstk')
    shutil.rmtree(root_path)
    os.remove('an4_raw.bigendian.tar.gz')
    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'
    print('\n', 'Creating manifests...')
    create_manifest(train_path, 'an4_train_manifest.csv', args.min_duration,
                    args.max_duration)
    create_manifest(test_path, 'an4_val_manifest.csv')
Esempio n. 5
0
def main():
    target_dl_dir = args.target_dir
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)

    target_unpacked_dir = os.path.join(target_dl_dir, "TEDLIUM_release2")
    if args.tar_path and os.path.exists(args.tar_path):
        target_file = args.tar_path
    else:
        print(
            "Could not find downloaded TEDLIUM archive, Downloading corpus...")
        wget.download(TED_LIUM_V2_DL_URL, target_dl_dir)
        target_file = os.path.join(target_dl_dir, "TEDLIUM_release2.tar.gz")

    if not os.path.exists(target_unpacked_dir):
        print("Unpacking corpus...")
        tar = tarfile.open(target_file)
        tar.extractall(target_dl_dir)
        tar.close()
    else:
        print("Found TEDLIUM directory, skipping unpacking of tar files")

    train_ted_dir = os.path.join(target_unpacked_dir, "train")
    val_ted_dir = os.path.join(target_unpacked_dir, "dev")
    test_ted_dir = os.path.join(target_unpacked_dir, "test")

    prepare_dir(train_ted_dir)
    prepare_dir(val_ted_dir)
    prepare_dir(test_ted_dir)
    print('Creating manifests...')

    create_manifest(train_ted_dir, 'ted_train_manifest.csv', args.min_duration,
                    args.max_duration)
    create_manifest(val_ted_dir, 'ted_val_manifest.csv')
    create_manifest(test_ted_dir, 'ted_test_manifest.csv')