def handle_args():
    """Parse the command-line arguments of the LinguaLibre importer.

    The base parser comes from ``get_importers_parser`` (defined outside this
    block; presumably an ``argparse.ArgumentParser`` - confirm there) and is
    extended with the LinguaLibre-specific options registered below.

    Returns:
        The namespace produced by ``parser.parse_args()``. With standard
        argparse naming it carries ``target_dir`` (positional), ``qId``
        (int, required), ``iso639_3`` and ``english_name`` (both required),
        ``filter_alphabet``, ``normalize`` (flag) and ``bogus_records``
        (a file opened for reading when the option is given).
    """
    parser = get_importers_parser(
        description="Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details."
    )
    parser.add_argument(dest="target_dir")
    parser.add_argument("--qId", type=int, required=True, help="LinguaLibre language qId")
    parser.add_argument("--iso639-3", type=str, required=True, help="ISO639-3 language code")
    # Help text previously read "Enligh name"; corrected spelling.
    parser.add_argument("--english-name", type=str, required=True, help="English name of the language")
    parser.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    parser.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    # argparse opens the given path itself, so callers receive a file object.
    parser.add_argument(
        "--bogus-records",
        type=argparse.FileType("r"),
        required=False,
        help="Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items",
    )
    return parser.parse_args()
def main():
    """Run the CommonVoice v2.0 importer from the command line.

    Declares the CLI, parses it, builds the label filter from the parsed
    options and passes everything on to ``_preprocess_data``.
    """
    cli = get_importers_parser(description="Import CommonVoice v2.0 corpora")
    cli.add_argument("tsv_dir", help="Directory containing tsv files")
    cli.add_argument(
        "--audio_dir",
        help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
    )
    cli.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    cli.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    cli.add_argument(
        "--space_after_every_character",
        action="store_true",
        help="To help transcript join by white space",
    )
    args = cli.parse_args()

    validate_label = get_validate_label(args)

    # Without an explicit audio directory, use "clips" inside the tsv directory.
    clips_dir = args.audio_dir or os.path.join(args.tsv_dir, "clips")

    # An alphabet is only constructed when a filter alphabet was supplied.
    if args.filter_alphabet:
        alphabet = Alphabet(args.filter_alphabet)
    else:
        alphabet = None

    label_filter = LabelFilter(args.normalize, alphabet, validate_label)
    _preprocess_data(
        args.tsv_dir,
        clips_dir,
        label_filter,
        args.space_after_every_character,
    )
def handle_args():
    """Parse the command-line arguments of the M-AILABS importer.

    Returns:
        Whatever ``parse_args()`` of the parser obtained from
        ``get_importers_parser`` yields for the current command line.
    """
    cli = get_importers_parser(
        description="Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/."
    )
    cli.add_argument(dest="target_dir")

    # Named options, registered in the same order as listed here.
    option_table = (
        (
            "--filter_alphabet",
            {"help": "Exclude samples with characters not in provided alphabet"},
        ),
        (
            "--normalize",
            {
                "action": "store_true",
                "help": "Converts diacritic characters to their base ones",
            },
        ),
        (
            "--skiplist",
            {
                "type": str,
                "default": "",
                "help": "Directories / books to skip, comma separated",
            },
        ),
        (
            "--language",
            {
                "required": True,
                "type": str,
                "help": "Dataset language to use",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
def handle_args():
    """Parse the command-line arguments of the TrainingSpeech importer.

    The base parser comes from ``get_importers_parser`` (defined outside this
    block) and gains a positional ``target_dir`` plus the
    ``--english-compatible`` switch.

    Returns:
        The namespace produced by ``parser.parse_args()``; the switch is
        stored under ``english_compatible`` (explicit ``dest``).
    """
    parser = get_importers_parser(description="Importer for TrainingSpeech dataset.")
    parser.add_argument(dest="target_dir")
    # Help text previously read "diactrics"; corrected spelling.
    parser.add_argument(
        "--english-compatible",
        action="store_true",
        dest="english_compatible",
        help="Remove diacritics and other non-ascii chars.",
    )
    return parser.parse_args()
def handle_args():
    """Parse the command-line arguments of the African Accented French importer.

    Returns:
        The result of ``parse_args()`` on the parser obtained from
        ``get_importers_parser`` after the options below were registered.
    """
    about = "Importer for African Accented French dataset. More information on http://www.openslr.org/57/."
    cli = get_importers_parser(description=about)

    # Single positional argument, declared through its destination name only.
    cli.add_argument(dest="target_dir")

    alphabet_opts = {
        "help": "Exclude samples with characters not in provided alphabet",
    }
    cli.add_argument("--filter_alphabet", **alphabet_opts)

    normalize_opts = {
        "action": "store_true",
        "help": "Converts diacritic characters to their base ones",
    }
    cli.add_argument("--normalize", **normalize_opts)

    return cli.parse_args()
def main():
    """Run the AISHELL importer from the command line.

    Parses the archive path and the optional target directory, then calls
    ``preprocess_data`` with both.
    """
    # http://www.openslr.org/33/
    cli = get_importers_parser(description="Import AISHELL corpus")
    cli.add_argument("aishell_tgz_file", help="Path to data_aishell.tgz")
    cli.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
    )
    args = cli.parse_args()

    # An empty target directory means "use the directory part of the archive path".
    archive_path = args.aishell_tgz_file
    destination = args.target_dir or os.path.dirname(archive_path)
    preprocess_data(archive_path, destination)
def parse_args(args):
    """Turn a list of command-line strings into the GramVaani importer options.

    Args:
        args ([str]): the command-line parameters to parse

    Returns:
        The namespace returned by ``parse_args(args)`` of the parser obtained
        from ``get_importers_parser``.
    """
    cli = get_importers_parser(description="Imports GramVaani data for Deep Speech")

    # Each entry is (option strings, keyword settings); order is preserved.
    option_table = (
        (
            ("--version",),
            {
                "action": "version",
                "version": "GramVaaniImporter {ver}".format(ver=__version__),
            },
        ),
        (
            ("-v", "--verbose"),
            {
                "action": "store_const",
                "required": False,
                "help": "set loglevel to INFO",
                "dest": "loglevel",
                "const": logging.INFO,
            },
        ),
        (
            ("-vv", "--very-verbose"),
            {
                "action": "store_const",
                "required": False,
                "help": "set loglevel to DEBUG",
                "dest": "loglevel",
                "const": logging.DEBUG,
            },
        ),
        (
            ("-c", "--csv_filename"),
            {
                "required": True,
                "help": "Path to the GramVaani csv",
                "dest": "csv_filename",
            },
        ),
        (
            ("-t", "--target_dir"),
            {
                "required": True,
                "help": "Directory in which to save the importer GramVaani data",
                "dest": "target_dir",
            },
        ),
    )
    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)

    return cli.parse_args(args)
def main():
    """Run the Primewords Chinese corpus (set 1) importer from the command line.

    Parses the archive path and the optional target directory, then calls
    ``preprocess_data`` with both.
    """
    # https://www.openslr.org/47/
    cli = get_importers_parser(description="Import Primewords Chinese corpus set 1")
    cli.add_argument("tgz_file", help="Path to primewords_md_2018_set1.tar.gz")
    cli.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
    )
    args = cli.parse_args()

    # No target directory given: fall back to the directory part of the archive path.
    archive_path = args.tgz_file
    destination = args.target_dir or os.path.dirname(archive_path)
    preprocess_data(archive_path, destination)
def main():
    """Run the Free ST Chinese Mandarin corpus importer from the command line.

    Parses the archive path and the optional target directory, then calls
    ``preprocess_data`` with both.
    """
    # https://www.openslr.org/38/
    cli = get_importers_parser(description="Import Free ST Chinese Mandarin corpus")
    cli.add_argument("tgz_file", help="Path to ST-CMDS-20170001_1-OS.tar.gz")
    cli.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
    )
    args = cli.parse_args()

    # No target directory given: fall back to the directory part of the archive path.
    archive_path = args.tgz_file
    destination = args.target_dir or os.path.dirname(archive_path)
    preprocess_data(archive_path, destination)
def main():
    """Run the MAGICDATA importer from the command line.

    Parses the folder holding the archives and the optional target directory,
    then calls ``preprocess_data`` with both.
    """
    # https://openslr.org/68/
    cli = get_importers_parser(description="Import MAGICDATA corpus")
    cli.add_argument(
        "folder_with_archives",
        help="Path to folder containing magicdata_{train,dev,test}.tar.gz",
    )
    cli.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to a folder called magicdata next to the archives",
    )
    args = cli.parse_args()

    # No target directory given: use a "magicdata" folder inside the archive folder.
    archives_dir = args.folder_with_archives
    destination = args.target_dir or os.path.join(archives_dir, "magicdata")
    preprocess_data(archives_dir, destination)
def parse_args():
    """Parse the command-line arguments of the CommonVoice v2.0 importer.

    Returns:
        The result of ``parse_args()`` on the parser obtained from
        ``get_importers_parser`` after the arguments below were registered.
    """
    cli = get_importers_parser(description="Import CommonVoice v2.0 corpora")
    cli.add_argument("tsv_dir", help="Directory containing tsv files")

    # Named options, registered in the same order as listed here.
    option_table = (
        (
            "--audio_dir",
            {"help": 'Directory containing the audio clips - defaults to "<tsv_dir>/clips"'},
        ),
        (
            "--filter_alphabet",
            {"help": "Exclude samples with characters not in provided alphabet"},
        ),
        (
            "--normalize",
            {
                "action": "store_true",
                "help": "Converts diacritic characters to their base ones",
            },
        ),
        (
            "--space_after_every_character",
            {
                "action": "store_true",
                "help": "To help transcript join by white space",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
writer = train_writer writer.writerow({ "wav_filename": item[0], "wav_filesize": item[1], "transcript": item[2] }) print("") print("~~~~ FINAL STATISTICS ~~~~") print_import_report(_counter, SAMPLE_RATE, MAX_SECS) print("~~~~ (FINAL STATISTICS) ~~~~") print("") if __name__ == "__main__": PARSER = get_importers_parser( description="Import XML from Conference Centre for Economics, France") PARSER.add_argument("target_dir", help="Destination directory") PARSER.add_argument( "--filter_alphabet", help="Exclude samples with characters not in provided alphabet") PARSER.add_argument( "--normalize", action="store_true", help="Converts diacritic characters to their base ones") PARAMS = PARSER.parse_args() validate_label = get_validate_label(PARAMS) ALPHABET = Alphabet( PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None def label_filter_fun(label):
print_import_report(counter, SAMPLE_RATE, MAX_SECS) def _maybe_convert_wav(mp3_filename, wav_filename): if not os.path.exists(wav_filename): transformer = sox.Transformer() transformer.convert(samplerate=SAMPLE_RATE) try: transformer.build(mp3_filename, wav_filename) except sox.core.SoxError: pass if __name__ == "__main__": PARSER = get_importers_parser( description="Import CommonVoice v2.0 corpora") PARSER.add_argument("tsv_dir", help="Directory containing tsv files") PARSER.add_argument( "--audio_dir", help= 'Directory containing the audio clips - defaults to "<tsv_dir>/clips"', ) PARSER.add_argument( "--filter_alphabet", help="Exclude samples with characters not in provided alphabet", ) PARSER.add_argument( "--normalize", action="store_true", help="Converts diacritic characters to their base ones", )