def handle_args():
    """Declare and parse the command line of the LinguaLibre importer.

    Declared arguments, in order:
        target_dir: positional destination.
        --qId: required, converted with ``int``.
        --iso639-3: required string.
        --english-name: required string.
        --filter_alphabet: optional value.
        --normalize: boolean flag (``store_true``).
        --bogus-records: optional path, opened for reading by
            ``argparse.FileType('r')``.

    Returns:
        Whatever ``parse_args()`` returns on the parser produced by
        ``get_importers_parser`` (presumably an ``argparse.Namespace``; the
        factory is defined outside this block).
    """
    parser = get_importers_parser(
        description='Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.'
    )
    parser.add_argument(dest='target_dir')
    parser.add_argument('--qId', type=int, required=True,
                        help='LinguaLibre language qId')
    parser.add_argument('--iso639-3', type=str, required=True,
                        help='ISO639-3 language code')
    parser.add_argument('--english-name', type=str, required=True,
                        help='English name of the language')
    parser.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet')
    parser.add_argument(
        '--normalize', action='store_true',
        help='Converts diacritic characters to their base ones')
    parser.add_argument(
        '--bogus-records', type=argparse.FileType('r'), required=False,
        help='Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items'
    )
    return parser.parse_args()
def handle_args():
    """Declare and parse the command line of the M-AILABS importer.

    One positional ``target_dir`` is followed by the options
    ``--filter_alphabet``, ``--normalize`` (flag), ``--skiplist`` (string,
    default ``''``) and the required ``--language``.

    Returns:
        Whatever ``parse_args()`` returns on the parser produced by
        ``get_importers_parser`` (defined outside this block).
    """
    cli = get_importers_parser(
        description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.')
    cli.add_argument(dest='target_dir')

    # Options are registered in table order, which is the declaration order
    # the parser sees.
    option_table = (
        ('--filter_alphabet',
         {'help': 'Exclude samples with characters not in provided alphabet'}),
        ('--normalize',
         {'action': 'store_true',
          'help': 'Converts diacritic characters to their base ones'}),
        ('--skiplist',
         {'type': str, 'default': '',
          'help': 'Directories / books to skip, comma separated'}),
        ('--language',
         {'required': True, 'type': str,
          'help': 'Dataset language to use'}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
def handle_args():
    """Declare and parse the command line of the TrainingSpeech importer.

    Declared arguments:
        target_dir: positional destination.
        --english-compatible: boolean flag (``store_true``) stored as
            ``english_compatible``.

    Returns:
        Whatever ``parse_args()`` returns on the parser produced by
        ``get_importers_parser`` (defined outside this block).
    """
    parser = get_importers_parser(
        description='Importer for TrainingSpeech dataset.')
    parser.add_argument(dest='target_dir')
    parser.add_argument(
        '--english-compatible',
        action='store_true',
        dest='english_compatible',
        help='Remove diacritics and other non-ascii chars.')
    return parser.parse_args()
def handle_args():
    """Declare and parse the command line of the African Accented French importer.

    Accepts a positional ``target_dir`` plus the ``--filter_alphabet`` option
    and the ``--normalize`` flag.

    Returns:
        Whatever ``parse_args()`` returns on the parser produced by
        ``get_importers_parser`` (defined outside this block).
    """
    about = 'Importer for African Accented French dataset. More information on http://www.openslr.org/57/.'
    cli = get_importers_parser(description=about)

    cli.add_argument(dest='target_dir')
    cli.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet',
    )
    cli.add_argument(
        '--normalize',
        action='store_true',
        help='Converts diacritic characters to their base ones',
    )

    parsed = cli.parse_args()
    return parsed
def main():
    """Parse the aidatatang_200zh importer arguments and call ``preprocess_data``.

    When ``--target_dir`` is empty or omitted, the directory part of
    ``tgz_file`` is used instead.
    """
    # https://www.openslr.org/62/
    cli = get_importers_parser(description='Import aidatatang_200zh corpus')
    cli.add_argument('tgz_file', help='Path to aidatatang_200zh.tgz')
    cli.add_argument(
        '--target_dir',
        default='',
        help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.',
    )
    options = cli.parse_args()

    archive = options.tgz_file
    # Falsy target (the '' default) falls back to the archive's own folder.
    options.target_dir = options.target_dir or os.path.dirname(archive)
    preprocess_data(archive, options.target_dir)
def parse_args(args):
    """Build the GramVaani importer command line and parse ``args``.

    Args:
        args ([str]): command line parameters as a list of strings.

    Returns:
        Whatever ``parse_args(args)`` returns on the parser produced by
        ``get_importers_parser`` (the original docstring describes it as an
        :obj:`argparse.Namespace`).
    """
    cli = get_importers_parser(
        description="Imports GramVaani data for Deep Speech")
    version_banner = "GramVaaniImporter {ver}".format(ver=__version__)

    # (flags, keyword settings) pairs, registered in this order.
    # -v and -vv both write to ``loglevel``.
    option_table = (
        (("--version",),
         dict(action="version", version=version_banner)),
        (("-v", "--verbose"),
         dict(action="store_const", required=False,
              help="set loglevel to INFO", dest="loglevel",
              const=logging.INFO)),
        (("-vv", "--very-verbose"),
         dict(action="store_const", required=False,
              help="set loglevel to DEBUG", dest="loglevel",
              const=logging.DEBUG)),
        (("-c", "--csv_filename"),
         dict(required=True, help="Path to the GramVaani csv",
              dest="csv_filename")),
        (("-t", "--target_dir"),
         dict(required=True,
              help="Directory in which to save the importer GramVaani data",
              dest="target_dir")),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    return cli.parse_args(args)
def main():
    """Parse the Primewords importer arguments and call ``preprocess_data``.

    An empty or omitted ``--target_dir`` is replaced by the directory part
    of ``tgz_file`` before the hand-off.
    """
    # https://www.openslr.org/47/
    cli = get_importers_parser(
        description='Import Primewords Chinese corpus set 1')
    cli.add_argument('tgz_file',
                     help='Path to primewords_md_2018_set1.tar.gz')
    target_help = 'Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.'
    cli.add_argument('--target_dir', default='', help=target_help)

    opts = cli.parse_args()
    if opts.target_dir:
        destination = opts.target_dir
    else:
        # Fall back to the folder that contains the archive.
        destination = os.path.dirname(opts.tgz_file)
        opts.target_dir = destination
    preprocess_data(opts.tgz_file, destination)
def main():
    """Parse the Free ST Chinese Mandarin importer arguments and call ``preprocess_data``.

    If ``--target_dir`` is empty or omitted, the directory part of
    ``tgz_file`` is used as the target.
    """
    # https://www.openslr.org/38/
    arg_parser = get_importers_parser(
        description='Import Free ST Chinese Mandarin corpus')
    arg_parser.add_argument(
        'tgz_file',
        help='Path to ST-CMDS-20170001_1-OS.tar.gz',
    )
    arg_parser.add_argument(
        '--target_dir',
        default='',
        help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.',
    )
    cli_args = arg_parser.parse_args()

    # Falsy target (the '' default) falls back to the archive's own folder.
    cli_args.target_dir = (
        cli_args.target_dir or os.path.dirname(cli_args.tgz_file)
    )
    preprocess_data(cli_args.tgz_file, cli_args.target_dir)
def main():
    """Parse the MAGICDATA importer arguments and call ``preprocess_data``.

    If ``--target_dir`` is empty or omitted, a ``magicdata`` sub-folder of
    ``folder_with_archives`` is used as the target.
    """
    # https://openslr.org/68/
    cli = get_importers_parser(description='Import MAGICDATA corpus')
    cli.add_argument(
        'folder_with_archives',
        help='Path to folder containing magicdata_{train,dev,test}.tar.gz',
    )
    cli.add_argument(
        '--target_dir',
        default='',
        help='Target folder to extract files into and put the resulting CSVs. Defaults to a folder called magicdata next to the archives',
    )
    options = cli.parse_args()

    archives_dir = options.folder_with_archives
    # Falsy target (the '' default) becomes <folder_with_archives>/magicdata.
    options.target_dir = (
        options.target_dir or os.path.join(archives_dir, 'magicdata')
    )
    preprocess_data(archives_dir, options.target_dir)
# NOTE(review): this call looks like the tail of a definition that begins
# before the visible chunk; it is kept exactly as found.
print_import_report(counter, SAMPLE_RATE, MAX_SECS)


def _maybe_convert_wav(mp3_filename, wav_filename):
    """Convert ``mp3_filename`` to ``wav_filename`` with sox if the target is missing.

    Nothing happens when ``wav_filename`` already exists. Otherwise a
    ``sox.Transformer`` is configured with ``samplerate=SAMPLE_RATE`` and
    asked to build the output file. A ``sox.core.SoxError`` raised by the
    build is swallowed, so a failed conversion is not reported to the caller.
    """
    if not path.exists(wav_filename):
        transformer = sox.Transformer()
        transformer.convert(samplerate=SAMPLE_RATE)
        try:
            transformer.build(mp3_filename, wav_filename)
        except sox.core.SoxError:
            # Conversion errors are ignored here; presumably the missing
            # file is handled later by the caller - TODO confirm.
            pass


if __name__ == "__main__":
    # Command-line definition for the CommonVoice v2.0 importer.
    # NOTE(review): this block appears to continue past the end of the
    # visible chunk (no parse_args() call is visible here).
    PARSER = get_importers_parser(
        description='Import CommonVoice v2.0 corpora')
    PARSER.add_argument('tsv_dir', help='Directory containing tsv files')
    PARSER.add_argument(
        '--audio_dir',
        help=
        'Directory containing the audio clips - defaults to "<tsv_dir>/clips"')
    PARSER.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet')
    PARSER.add_argument(
        '--normalize',
        action='store_true',
        help='Converts diacritic characters to their base ones')
    PARSER.add_argument('--space_after_every_character',
                        action='store_true',
                        help='To help transcript join by white space')