Code example #1
0
def main(args):
    """Run the full GramVaani import pipeline.

    Parses the command line, downloads the GramVaani mp3 clips, converts
    them to wav and writes the resulting data sets to the target directory.

    Args:
      args ([str]): command line parameter list
    """
    parsed = parse_args(args)
    # NOTE(review): the returned validator is never used in this function --
    # confirm whether get_validate_label() is called only for its
    # argument-checking side effects.
    validate_label = get_validate_label(parsed)
    setup_logging(parsed.loglevel)

    _logger.info("Starting GramVaani importer...")
    _logger.info("Starting loading GramVaani csv...")
    csv = GramVaaniCSV(parsed.csv_filename)

    _logger.info("Starting downloading GramVaani mp3's...")
    mp3_directory = GramVaaniDownloader(csv, parsed.target_dir).download()

    _logger.info("Starting converting GramVaani mp3's to wav's...")
    wav_directory = GramVaaniConverter(parsed.target_dir, mp3_directory).convert()

    data_sets = GramVaaniDataSets(parsed.target_dir, wav_directory, csv)
    data_sets.create()
    data_sets.save()
    _logger.info("Finished GramVaani importer...")
Code example #2
0
File: import_ts.py  Project: lper1582/DeepSpeech-Indo
# Punctuation characters that are stripped out of transcripts.
PUNCTUATIONS_REG = re.compile(r"[°\-,;!?.()\[\]*…—]")
# Matches any run of two or more whitespace characters.
MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')


def cleanup_transcript(text, english_compatible=False):
    """Normalize a transcript for training.

    Replaces typographic apostrophes and non-breaking spaces, turns
    punctuation into spaces, collapses whitespace runs, optionally
    transliterates to ASCII, and lowercases the stripped result.

    Args:
      text (str): raw transcript text.
      english_compatible (bool): when True, run the text through
        unidecode to drop diacritics and other non-ASCII characters.

    Returns:
      str: the cleaned, lowercased transcript.
    """
    normalized = text.replace('’', "'").replace('\u00A0', ' ')
    normalized = PUNCTUATIONS_REG.sub(' ', normalized)
    normalized = MULTIPLE_SPACES_REG.sub(' ', normalized)
    if english_compatible:
        normalized = unidecode.unidecode(normalized)
    return normalized.strip().lower()


def handle_args():
    """Build and parse the CLI arguments for the TrainingSpeech importer.

    Returns:
        argparse.Namespace with ``target_dir`` (positional) and the
        ``english_compatible`` flag.
    """
    parser = get_importers_parser(
        description='Importer for TrainingSpeech dataset.')
    parser.add_argument(dest='target_dir')
    parser.add_argument('--english-compatible',
                        action='store_true',
                        dest='english_compatible',
                        # Fixed typo in user-facing help: "diactrics" ->
                        # "diacritics" (matches wording used elsewhere).
                        help='Remove diacritics and other non-ascii chars.')
    return parser.parse_args()


if __name__ == "__main__":
    cli_args = handle_args()
    validate_label = get_validate_label(cli_args)
    _download_and_preprocess_data(cli_args.target_dir,
                                  cli_args.english_compatible)
Code example #3
0
        '--audio_dir',
        help=
        'Directory containing the audio clips - defaults to "<tsv_dir>/clips"')
    PARSER.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet')
    PARSER.add_argument(
        '--normalize',
        action='store_true',
        help='Converts diacritic characters to their base ones')
    PARSER.add_argument('--space_after_every_character',
                        action='store_true',
                        help='To help transcript join by white space')

    PARAMS = PARSER.parse_args()
    validate_label = get_validate_label(PARAMS)

    AUDIO_DIR = PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(
        PARAMS.tsv_dir, 'clips')
    ALPHABET = Alphabet(
        PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None

    def label_filter_fun(label):
        if PARAMS.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        label = validate_label(label)
        if ALPHABET and label:
            try:
                ALPHABET.encode(label)
Code example #4
0
        help='Converts diacritic characters to their base ones')
    parser.add_argument(
        '--bogus-records',
        type=argparse.FileType('r'),
        required=False,
        help=
        'Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items'
    )
    return parser.parse_args()


if __name__ == "__main__":
    CLI_ARGS = handle_args()
    ALPHABET = Alphabet(
        CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None
    validate_label = get_validate_label(CLI_ARGS)

    bogus_regexes = []
    if CLI_ARGS.bogus_records:
        for line in CLI_ARGS.bogus_records:
            bogus_regexes.append(re.compile(line.strip()))

    def record_filter(path):
        if any(regex.match(path) for regex in bogus_regexes):
            print('Reject', path)
            return False
        return True

    def label_filter(label):
        if CLI_ARGS.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \