def handle_args():
    """Parse the command line of the LinguaLibre importer.

    Registers one positional argument (``target_dir``), the required
    language identifiers (``--qId``, ``--iso639-3``, ``--english-name``)
    and the optional ``--filter_alphabet``, ``--normalize`` and
    ``--bogus-records`` switches on the parser obtained from
    ``get_importers_parser``.

    Returns:
        The namespace produced by ``parse_args()``.
    """
    parser = get_importers_parser(
        description=
        "Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details."
    )
    parser.add_argument(dest="target_dir")
    parser.add_argument("--qId",
                        type=int,
                        required=True,
                        help="LinguaLibre language qId")
    parser.add_argument("--iso639-3",
                        type=str,
                        required=True,
                        help="ISO639-3 language code")
    # Help text previously read "Enligh name of the language".
    parser.add_argument("--english-name",
                        type=str,
                        required=True,
                        help="English name of the language")
    parser.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    parser.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    # The file is opened for reading by argparse while parsing the arguments.
    parser.add_argument(
        "--bogus-records",
        type=argparse.FileType("r"),
        required=False,
        help=
        "Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items",
    )
    return parser.parse_args()
Ejemplo n.º 2
0
def main():
    """Run the CommonVoice v2.0 importer from the command line.

    Parses the arguments, builds the label filter from them and hands
    everything over to ``_preprocess_data``.
    """
    cli = get_importers_parser(description="Import CommonVoice v2.0 corpora")
    cli.add_argument("tsv_dir", help="Directory containing tsv files")
    cli.add_argument(
        "--audio_dir",
        help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
    )
    cli.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    cli.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    cli.add_argument(
        "--space_after_every_character",
        action="store_true",
        help="To help transcript join by white space",
    )

    params = cli.parse_args()
    validate_label = get_validate_label(params)

    # Fall back to the "clips" sub-directory when no audio dir was given.
    if params.audio_dir:
        audio_dir = params.audio_dir
    else:
        audio_dir = os.path.join(params.tsv_dir, "clips")

    # An alphabet is only loaded when filtering was requested.
    alphabet = None
    if params.filter_alphabet:
        alphabet = Alphabet(params.filter_alphabet)

    filter_obj = LabelFilter(params.normalize, alphabet, validate_label)
    _preprocess_data(
        params.tsv_dir,
        audio_dir,
        filter_obj,
        params.space_after_every_character,
    )
Ejemplo n.º 3
0
def handle_args():
    """Parse the command line of the M-AILABS importer.

    Returns:
        The namespace produced by ``parse_args()``, carrying
        ``target_dir``, ``filter_alphabet``, ``normalize``, ``skiplist``
        and ``language``.
    """
    description = "Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/."
    cli = get_importers_parser(description=description)

    cli.add_argument(dest="target_dir")
    cli.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    cli.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    cli.add_argument(
        "--skiplist",
        type=str,
        default="",
        help="Directories / books to skip, comma separated",
    )
    cli.add_argument(
        "--language",
        required=True,
        type=str,
        help="Dataset language to use",
    )

    return cli.parse_args()
Ejemplo n.º 4
0
def handle_args():
    """Parse the command line of the TrainingSpeech importer.

    Registers the positional ``target_dir`` argument and the
    ``--english-compatible`` flag (stored as ``english_compatible``).

    Returns:
        The namespace produced by ``parse_args()``.
    """
    parser = get_importers_parser(description="Importer for TrainingSpeech dataset.")
    parser.add_argument(dest="target_dir")
    # Help text previously misspelled "diacritics" as "diactrics".
    parser.add_argument(
        "--english-compatible",
        action="store_true",
        dest="english_compatible",
        help="Remove diacritics and other non-ascii chars.",
    )
    return parser.parse_args()
Ejemplo n.º 5
0
def handle_args():
    """Parse the command line of the African Accented French importer.

    Returns:
        The namespace produced by ``parse_args()``, carrying
        ``target_dir``, ``filter_alphabet`` and ``normalize``.
    """
    description = "Importer for African Accented French dataset. More information on http://www.openslr.org/57/."
    cli = get_importers_parser(description=description)

    cli.add_argument(dest="target_dir")
    cli.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    cli.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )

    return cli.parse_args()
Ejemplo n.º 6
0
def main():
    """Run the AISHELL importer from the command line.

    Corpus home page: http://www.openslr.org/33/
    """
    cli = get_importers_parser(description="Import AISHELL corpus")
    cli.add_argument("aishell_tgz_file", help="Path to data_aishell.tgz")
    cli.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
    )
    params = cli.parse_args()

    # An empty target falls back to the directory part of the archive path.
    params.target_dir = params.target_dir or os.path.dirname(
        params.aishell_tgz_file)

    preprocess_data(params.aishell_tgz_file, params.target_dir)
Ejemplo n.º 7
0
def parse_args(args):
    """Turn a list of command line strings into the importer's options.

    Args:
      args ([str]): Command line parameters as list of strings
    Returns:
      :obj:`argparse.Namespace`: command line parameters namespace
    """
    version_banner = "GramVaaniImporter {ver}".format(ver=__version__)

    cli = get_importers_parser(
        description="Imports GramVaani data for Deep Speech")
    cli.add_argument("--version", action="version", version=version_banner)

    # Both verbosity switches write into the same ``loglevel`` destination.
    cli.add_argument(
        "-v",
        "--verbose",
        action="store_const",
        required=False,
        help="set loglevel to INFO",
        dest="loglevel",
        const=logging.INFO,
    )
    cli.add_argument(
        "-vv",
        "--very-verbose",
        action="store_const",
        required=False,
        help="set loglevel to DEBUG",
        dest="loglevel",
        const=logging.DEBUG,
    )

    # Mandatory input and output locations.
    cli.add_argument(
        "-c",
        "--csv_filename",
        required=True,
        help="Path to the GramVaani csv",
        dest="csv_filename",
    )
    cli.add_argument(
        "-t",
        "--target_dir",
        required=True,
        help="Directory in which to save the importer GramVaani data",
        dest="target_dir",
    )

    return cli.parse_args(args)
Ejemplo n.º 8
0
def main():
    """Run the Primewords Chinese corpus (set 1) importer from the command line.

    Corpus home page: https://www.openslr.org/47/
    """
    cli = get_importers_parser(
        description="Import Primewords Chinese corpus set 1")
    cli.add_argument("tgz_file", help="Path to primewords_md_2018_set1.tar.gz")
    cli.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
    )
    params = cli.parse_args()

    # An empty target falls back to the directory part of the archive path.
    params.target_dir = params.target_dir or os.path.dirname(params.tgz_file)

    preprocess_data(params.tgz_file, params.target_dir)
def main():
    """Run the Free ST Chinese Mandarin corpus importer from the command line.

    Corpus home page: https://www.openslr.org/38/
    """
    cli = get_importers_parser(
        description="Import Free ST Chinese Mandarin corpus")
    cli.add_argument("tgz_file", help="Path to ST-CMDS-20170001_1-OS.tar.gz")
    cli.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
    )
    params = cli.parse_args()

    # An empty target falls back to the directory part of the archive path.
    params.target_dir = params.target_dir or os.path.dirname(params.tgz_file)

    preprocess_data(params.tgz_file, params.target_dir)
Ejemplo n.º 10
0
def main():
    """Run the MAGICDATA corpus importer from the command line.

    Corpus home page: https://openslr.org/68/
    """
    cli = get_importers_parser(description="Import MAGICDATA corpus")
    cli.add_argument(
        "folder_with_archives",
        help="Path to folder containing magicdata_{train,dev,test}.tar.gz",
    )
    cli.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to a folder called magicdata next to the archives",
    )
    params = cli.parse_args()

    # An empty target falls back to a "magicdata" folder inside the archive folder.
    params.target_dir = params.target_dir or os.path.join(
        params.folder_with_archives, "magicdata")

    preprocess_data(params.folder_with_archives, params.target_dir)
Ejemplo n.º 11
0
def parse_args():
    """Parse the command line of the CommonVoice v2.0 importer.

    Returns:
        The namespace produced by ``parse_args()``, carrying ``tsv_dir``,
        ``audio_dir``, ``filter_alphabet``, ``normalize`` and
        ``space_after_every_character``.
    """
    cli = get_importers_parser(
        description="Import CommonVoice v2.0 corpora")

    # Input locations.
    cli.add_argument("tsv_dir", help="Directory containing tsv files")
    cli.add_argument(
        "--audio_dir",
        help=
        'Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
    )

    # Transcript filtering and formatting switches.
    cli.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    cli.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    cli.add_argument(
        "--space_after_every_character",
        action="store_true",
        help="To help transcript join by white space",
    )

    return cli.parse_args()
Ejemplo n.º 12
0
                        writer = train_writer
                    writer.writerow({
                        "wav_filename": item[0],
                        "wav_filesize": item[1],
                        "transcript": item[2]
                    })

    print("")
    print("~~~~ FINAL STATISTICS ~~~~")
    print_import_report(_counter, SAMPLE_RATE, MAX_SECS)
    print("~~~~ (FINAL STATISTICS) ~~~~")
    print("")


if __name__ == "__main__":
    PARSER = get_importers_parser(
        description="Import XML from Conference Centre for Economics, France")
    PARSER.add_argument("target_dir", help="Destination directory")
    PARSER.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet")
    PARSER.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones")

    PARAMS = PARSER.parse_args()
    validate_label = get_validate_label(PARAMS)
    ALPHABET = Alphabet(
        PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None

    def label_filter_fun(label):
Ejemplo n.º 13
0
    print_import_report(counter, SAMPLE_RATE, MAX_SECS)


def _maybe_convert_wav(mp3_filename, wav_filename):
    """Convert ``mp3_filename`` to ``wav_filename`` unless the target exists.

    The output is resampled to ``SAMPLE_RATE``. A ``sox.core.SoxError``
    raised during the conversion is swallowed, so the target file may
    still be missing when this function returns.
    """
    # Nothing to do when the converted file is already on disk.
    if os.path.exists(wav_filename):
        return

    converter = sox.Transformer()
    converter.convert(samplerate=SAMPLE_RATE)
    try:
        converter.build(mp3_filename, wav_filename)
    except sox.core.SoxError:
        # Conversion failures are ignored here on purpose.
        pass


if __name__ == "__main__":
    PARSER = get_importers_parser(
        description="Import CommonVoice v2.0 corpora")
    PARSER.add_argument("tsv_dir", help="Directory containing tsv files")
    PARSER.add_argument(
        "--audio_dir",
        help=
        'Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
    )
    PARSER.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    PARSER.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )