Beispiel #1
0
def test_generator(sick_g2p_model_path, sick_corpus, g2p_sick_output):
    """Run the Pynini dictionary generator on the sick corpus word set.

    The test is skipped when ``G2P_DISABLED`` is set.  Validation of the model
    is expected to fail on the complete word set and to succeed once the words
    flagged by ``check_bracketed`` are left out; afterwards the generator must
    create the output file at ``g2p_sick_output``.
    """
    if G2P_DISABLED:
        pytest.skip('No Pynini found')
    g2p_model = G2PModel(sick_g2p_model_path)

    # The unfiltered word set must be rejected by the model.
    validates_everything = g2p_model.validate(sick_corpus.word_set)
    assert not validates_everything

    # With bracketed entries removed the same model must accept the words.
    plain_words = []
    for word in sick_corpus.word_set:
        if not check_bracketed(word):
            plain_words.append(word)
    assert g2p_model.validate(plain_words)

    generator = PyniniDictionaryGenerator(g2p_model, sick_corpus.word_set)
    generator.output(g2p_sick_output)
    assert os.path.exists(g2p_sick_output)
Beispiel #2
0
def get_word_set(corpus, include_bracketed=False):
    """Collect the word types of a corpus, including transcripts without audio.

    Starts from ``corpus.word_set`` and adds every word that
    ``parse_transcription`` extracts from the files listed in
    ``corpus.transcriptions_without_wavs``.  Paths ending in ``.lab`` are read
    with ``load_text``; every other path is read as a TextGrid, where each
    interval tier except one named "notes" (case-insensitive) contributes the
    lower-cased, stripped marks of its intervals.  Files that cannot be
    decoded or read are skipped and listed in a warning printed to stdout.

    Args:
        corpus: Object exposing ``word_set`` and
            ``transcriptions_without_wavs``.
        include_bracketed: When false, words for which ``check_bracketed``
            returns a true value are removed from the result.

    Returns:
        A set of words when ``include_bracketed`` is true, otherwise a list
        holding only the words that are not bracketed.
    """
    # Work on a copy so the corpus' own word set is never modified in place.
    word_set = set(corpus.word_set)
    decode_error_files = []
    # Maps each unreadable TextGrid path to its formatted traceback; only the
    # paths are reported below.
    textgrid_read_errors = {}
    for file_path in corpus.transcriptions_without_wavs:
        if file_path.endswith('.lab'):
            try:
                text = load_text(file_path)
            except UnicodeDecodeError:
                decode_error_files.append(file_path)
                continue
            word_set.update(parse_transcription(text))
            continue

        tg = TextGrid()
        try:
            tg.read(file_path)
        except Exception:
            # Deliberately broad: any failure to read one TextGrid is recorded
            # and the remaining files are still processed.
            textgrid_read_errors[file_path] = traceback.format_exc()
            continue
        for tier in tg.tiers:
            if tier.name.lower() == 'notes':
                continue
            if not isinstance(tier, IntervalTier):
                continue
            for interval in tier:
                text = interval.mark.lower().strip()
                word_set.update(parse_transcription(text))

    if decode_error_files:
        print(
            'WARNING: The following files were not able to be decoded using utf8:\n\n'
            '{}'.format('\n'.join(decode_error_files)))
    if textgrid_read_errors:
        print(
            'WARNING: The following TextGrid files were not able to be read:\n\n'
            '{}'.format('\n'.join(textgrid_read_errors)))
    # The reported count is taken before bracketed words are filtered out.
    print(
        'Generating transcriptions for the {} word types found in the corpus...'
        .format(len(word_set)))
    if not include_bracketed:
        word_set = [x for x in word_set if not check_bracketed(x)]
    return word_set
Beispiel #3
0
def generate_dictionary(args):
    """Write a pronunciation dictionary for a corpus directory or a word list.

    When ``args.input_path`` is a directory, an ``AlignableCorpus`` is built
    from it and its words are gathered with ``get_word_set``.  Otherwise the
    path is read as a UTF-8 text file of whitespace-separated words; repeated
    words are kept only once, in order of first appearance.

    With a G2P model path the words are passed to ``Generator`` and its output
    is written to ``args.output_path``.  Without a model, each word is written
    followed by its individual characters separated by spaces.

    Args:
        args: Namespace-like object providing ``temp_directory``,
            ``input_path``, ``include_bracketed``, ``g2p_model_path`` and
            ``output_path``.  ``num_jobs`` is read when a corpus directory or
            a model is used, and ``disable_mp`` when a corpus directory is
            used.  The object is not modified.
    """
    print("Generating pronunciations from G2P model")
    if not args.temp_directory:
        temp_dir = os.path.join(TEMP_DIR, 'G2P')
    else:
        temp_dir = os.path.expanduser(args.temp_directory)

    # Expand "~" once so the directory test, the corpus name and the file
    # read all refer to the same location.
    input_path = os.path.expanduser(args.input_path)
    if os.path.isdir(input_path):
        corpus_name = os.path.basename(input_path)
        if corpus_name == '':
            # A trailing separator leaves an empty basename; take the name of
            # the directory in front of it instead.
            corpus_name = os.path.basename(os.path.dirname(input_path))
        data_directory = os.path.join(temp_dir, corpus_name)

        corpus = AlignableCorpus(input_path,
                                 data_directory,
                                 num_jobs=args.num_jobs,
                                 use_mp=(not args.disable_mp))

        word_set = get_word_set(corpus, args.include_bracketed)
    else:
        words = []
        with open(input_path, 'r', encoding='utf8') as f:
            for line in f:
                words.extend(line.split())
        # Drop repeated words while keeping first-seen order, so each word
        # yields a single dictionary entry.
        word_set = list(dict.fromkeys(words))
        if not args.include_bracketed:
            word_set = [x for x in word_set if not check_bracketed(x)]

    if args.g2p_model_path is not None:
        model = G2PModel(args.g2p_model_path,
                         root_directory=os.path.join(temp_dir, 'models'))
        try:
            model.validate(word_set)
            gen = Generator(model,
                            word_set,
                            temp_directory=temp_dir,
                            num_jobs=args.num_jobs)
            gen.output(args.output_path)
        finally:
            # Always release the model's resources, even when validation or
            # generation raises.
            model.clean_up()
    else:
        with open(args.output_path, "w", encoding='utf8') as f:
            for word in word_set:
                # Fallback "pronunciation": the word's characters in order.
                f.write('{} {}\n'.format(word, ' '.join(word)))
Beispiel #4
0
def test_check_bracketed():
    """Verify that bracketed words are filtered out and an empty string is handled."""
    candidates = ['uh', '(the)', 'sick', '<corpus>', '[a]', '{cold}', '']
    # Keep only the entries that check_bracketed does not flag.
    kept = []
    for candidate in candidates:
        if check_bracketed(candidate):
            continue
        kept.append(candidate)
    assert kept == ['uh', 'sick', '']