Example #1
0
def main():
    """Parse the command line and start the CommonVoice v2.0 import.

    Builds the argument parser, derives the audio directory (falling back to
    ``<tsv_dir>/clips`` when ``--audio_dir`` is not given), constructs the
    label filter and hands everything to ``_preprocess_data``.
    """
    parser = get_importers_parser(description="Import CommonVoice v2.0 corpora")
    parser.add_argument("tsv_dir", help="Directory containing tsv files")

    # Options that take a value.
    parser.add_argument(
        "--audio_dir",
        help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
    )
    parser.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )

    # Plain boolean switches, registered in the same order as before.
    boolean_flags = (
        ("--normalize", "Converts diacritic characters to their base ones"),
        ("--space_after_every_character",
         "To help transcript join by white space"),
    )
    for flag_name, flag_help in boolean_flags:
        parser.add_argument(flag_name, action="store_true", help=flag_help)

    params = parser.parse_args()
    validate_label = get_validate_label(params)

    audio_dir = params.audio_dir or os.path.join(params.tsv_dir, "clips")

    alphabet = None
    if params.filter_alphabet:
        alphabet = Alphabet(params.filter_alphabet)

    filter_obj = LabelFilter(params.normalize, alphabet, validate_label)
    _preprocess_data(
        params.tsv_dir,
        audio_dir,
        filter_obj,
        params.space_after_every_character,
    )
Example #2
0
def init_worker(params):
    """Populate the module-level ``AUDIO_DIR`` and ``FILTER_OBJ`` from *params*.

    ``AUDIO_DIR`` falls back to ``<tsv_dir>/clips`` when no audio directory was
    supplied; ``FILTER_OBJ`` is a ``LabelFilter`` built from the normalize
    flag, the optional alphabet and the validation callable.
    """
    global FILTER_OBJ  # pylint: disable=global-statement
    global AUDIO_DIR  # pylint: disable=global-statement
    AUDIO_DIR = params.audio_dir or os.path.join(params.tsv_dir, "clips")
    label_validator = get_validate_label(params)
    allowed_chars = None
    if params.filter_alphabet:
        allowed_chars = Alphabet(params.filter_alphabet)
    FILTER_OBJ = LabelFilter(params.normalize, allowed_chars, label_validator)
def main(args):
    """Run the GramVaani import: load the csv, download, convert, save.

    Args:
      args ([str]): command line parameter list
    """
    options = parse_args(args)
    # The returned callable is not used further in this function; the call
    # itself is kept so any checks it performs on the options still run.
    validate_label = get_validate_label(options)
    setup_logging(options.loglevel)

    _logger.info("Starting GramVaani importer...")

    _logger.info("Starting loading GramVaani csv...")
    manifest = GramVaaniCSV(options.csv_filename)

    _logger.info("Starting downloading GramVaani mp3's...")
    mp3_dir = GramVaaniDownloader(manifest, options.target_dir).download()

    _logger.info("Starting converting GramVaani mp3's to wav's...")
    wav_dir = GramVaaniConverter(options.target_dir, mp3_dir).convert()

    corpus = GramVaaniDataSets(options.target_dir, wav_dir, manifest)
    corpus.create()
    corpus.save()

    _logger.info("Finished GramVaani importer...")
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    parser.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    return parser.parse_args()


if __name__ == "__main__":
    # Module-level names below are kept as-is: code elsewhere in the file may
    # read them as globals.
    CLI_ARGS = handle_args()
    ALPHABET = None
    if CLI_ARGS.filter_alphabet:
        ALPHABET = Alphabet(CLI_ARGS.filter_alphabet)
    validate_label = get_validate_label(CLI_ARGS)

    def label_filter(label):
        """Return the cleaned label, or None when it is rejected.

        With ``--normalize`` the label is stripped, NFKD-decomposed and
        reduced to ASCII. It then goes through ``validate_label`` and, when an
        alphabet was given, is dropped if the alphabet cannot encode it.
        """
        if CLI_ARGS.normalize:
            decomposed = unicodedata.normalize("NFKD", label.strip())
            ascii_bytes = decomposed.encode("ascii", "ignore")
            label = ascii_bytes.decode("ascii", "ignore")
        label = validate_label(label)
        if not (ALPHABET and label):
            return label
        try:
            ALPHABET.encode(label)
        except KeyError:
            return None
        return label

    _download_and_preprocess_data(target_dir=CLI_ARGS.target_dir)
Example #5
0

if __name__ == "__main__":
    # Script entry point: build the command-line parser for this importer.
    PARSER = get_importers_parser(
        description="Import XML from Conference Centre for Economics, France")
    PARSER.add_argument("target_dir", help="Destination directory")
    PARSER.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet")
    PARSER.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones")

    PARAMS = PARSER.parse_args()
    validate_label = get_validate_label(PARAMS)
    # No alphabet filtering unless --filter_alphabet was supplied.
    ALPHABET = Alphabet(
        PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None

    def label_filter_fun(label):
        """Normalize and validate a transcript label.

        The label is optionally reduced to ASCII, passed through
        ``maybe_normalize`` and ``validate_label``, and set to None when the
        configured alphabet cannot encode it.

        NOTE(review): no return statement is visible in this excerpt; the
        function appears to be cut off here - confirm against the full file.
        """
        if PARAMS.normalize:
            # NFKD decomposition followed by an ASCII encode with errors
            # ignored drops combining marks and any other non-ASCII character.
            label = unicodedata.normalize("NFKD", label.strip()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        label = maybe_normalize(label)
        label = validate_label(label)
        # Only check the alphabet when one is configured and the label is
        # still truthy after validation.
        if ALPHABET and label:
            try:
                ALPHABET.encode(label)
            except KeyError:
                # The alphabet could not encode the label: reject it.
                label = None
Example #6
0
# Punctuation characters that are replaced by a space during cleanup.
PUNCTUATIONS_REG = re.compile(r"[°\-,;!?.()\[\]*…—]")
# Runs of two or more whitespace characters, collapsed to a single space.
MULTIPLE_SPACES_REG = re.compile(r"\s{2,}")


def cleanup_transcript(text, english_compatible=False):
    """Return a lower-cased, punctuation-free version of *text*.

    Typographic apostrophes become ``'`` and non-breaking spaces become plain
    spaces; punctuation matched by ``PUNCTUATIONS_REG`` is replaced by spaces
    and whitespace runs are collapsed. When *english_compatible* is true the
    result is additionally passed through ``unidecode.unidecode``.
    """
    unified = text.replace("’", "'")
    unified = unified.replace("\u00A0", " ")
    without_punctuation = PUNCTUATIONS_REG.sub(" ", unified)
    cleaned = MULTIPLE_SPACES_REG.sub(" ", without_punctuation)
    if english_compatible:
        cleaned = unidecode.unidecode(cleaned)
    trimmed = cleaned.strip()
    return trimmed.lower()


def handle_args():
    """Build the TrainingSpeech importer CLI and return the parsed arguments.

    The parser takes a positional ``target_dir`` and an optional
    ``--english-compatible`` switch stored as ``english_compatible``.
    """
    arg_parser = get_importers_parser(
        description="Importer for TrainingSpeech dataset."
    )
    arg_parser.add_argument(dest="target_dir")
    english_flag_options = {
        "action": "store_true",
        "dest": "english_compatible",
        "help": "Remove diactrics and other non-ascii chars.",
    }
    arg_parser.add_argument("--english-compatible", **english_flag_options)
    parsed = arg_parser.parse_args()
    return parsed


if __name__ == "__main__":
    # Script entry point: parse the command line, then run the importer.
    cli_args = handle_args()
    # Bound at module level and not passed to the call below.
    # NOTE(review): presumably read as a global by the preprocessing code -
    # confirm against the rest of the file.
    validate_label = get_validate_label(cli_args)
    _download_and_preprocess_data(cli_args.target_dir, cli_args.english_compatible)
Example #7
0
def init_worker(params):
    """Set the module-level ``FILTER_OBJ`` from the parsed *params*.

    The filter is a ``LabelFilter`` built from the normalize flag, an
    ``Alphabet`` (only when ``filter_alphabet`` is set) and the callable
    returned by ``get_validate_label``.
    """
    global FILTER_OBJ  # pylint: disable=global-statement
    label_validator = get_validate_label(params)
    if params.filter_alphabet:
        allowed_chars = Alphabet(params.filter_alphabet)
    else:
        allowed_chars = None
    FILTER_OBJ = LabelFilter(params.normalize, allowed_chars, label_validator)
Example #8
0
 def test_get_validate_label(self):
     """A validator loaded from the 'fra' locale file returns 'toto' unchanged."""
     locale_path = from_here('test_data/validate_locale_fra.py')
     validator = get_validate_label(
         Namespace(validate_label_locale=locale_path))
     validated = validator('toto')
     self.assertEqual(validated, 'toto')
Example #9
0
 def test_get_validate_label_missing(self):
     """get_validate_label yields None for the 'ger' locale path used here."""
     locale_path = from_here('test_data/validate_locale_ger.py')
     namespace = Namespace(validate_label_locale=locale_path)
     validator = get_validate_label(namespace)
     self.assertEqual(validator, None)
Example #10
0
 def test_validate_label_locale_default(self):
     """With no locale given, the default validator keeps 'toto' and rejects the other samples."""
     validator = get_validate_label(Namespace(validate_label_locale=None))
     expectations = (
         ('toto', 'toto'),
         ('toto1234', None),
         ('toto1234[{[{[]', None),
     )
     for raw_label, expected in expectations:
         self.assertEqual(validator(raw_label), expected)