Beispiel #1
0
def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir):
    """Initialize the stereo corpus and check its reported feature dimension.

    The dictionary is written to ``<temp_dir>/stereo/basic`` while the corpus
    receives ``<temp_dir>/stereo`` as its second argument.  The value returned
    by ``get_feat_dim()`` is compared against the string ``'39'``.
    """
    stereo_root = os.path.join(temp_dir, 'stereo')
    dict_output = os.path.join(stereo_root, 'basic')

    lexicon = Dictionary(basic_dict_path, dict_output)
    lexicon.write()

    corpus = Corpus(stereo_corpus_dir, stereo_root)
    corpus.initialize_corpus(lexicon)

    feat_dim = corpus.get_feat_dim()
    assert feat_dim == '39'
Beispiel #2
0
def test_basic(basic_dict_path, basic_corpus_dir, generated_dir):
    """Initialize the basic corpus and check its reported feature dimension.

    Dictionary and corpus both use ``<generated_dir>/basic`` as their output
    location.  ``get_feat_dim()`` is compared against the string ``'39'``.
    """
    # The same path serves the dictionary and the corpus, so build it once.
    output_directory = os.path.join(generated_dir, 'basic')

    lexicon = Dictionary(basic_dict_path, output_directory)
    lexicon.write()

    corpus = Corpus(basic_corpus_dir, output_directory)
    corpus.initialize_corpus(lexicon)

    feat_dim = corpus.get_feat_dim()
    assert feat_dim == '39'
def test_basic(basic_dict_path, generated_dir):
    """Write the basic dictionary and check its phone inventories.

    Verifies that ``phones`` and ``positional_nonsil_phones`` contain exactly
    the expected symbols, ignoring order and duplicates.
    """
    expected_phones = {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
    expected_positional = {
        'phonea_B', 'phonea_I', 'phonea_E', 'phonea_S',
        'phoneb_B', 'phoneb_I', 'phoneb_E', 'phoneb_S',
        'phonec_B', 'phonec_I', 'phonec_E', 'phonec_S',
    }

    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    lexicon.write()

    assert set(lexicon.phones) == expected_phones
    assert set(lexicon.positional_nonsil_phones) == expected_positional
Beispiel #4
0
def test_basic(basic_dict_path, generated_dir):
    """Write the basic dictionary and check its phone inventories.

    ``phones`` must equal the six expected symbols, and
    ``positional_nonsil_phones`` must equal every combination of the three
    non-silence phones with the ``_B``, ``_I``, ``_E`` and ``_S`` suffixes.
    """
    nonsil_phones = ('phonea', 'phoneb', 'phonec')
    position_tags = ('B', 'I', 'E', 'S')
    expected_positional = {
        phone + '_' + tag for phone in nonsil_phones for tag in position_tags
    }

    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    lexicon.write()

    assert set(lexicon.phones) == {'sil', 'sp', 'spn'} | set(nonsil_phones)
    assert set(lexicon.positional_nonsil_phones) == expected_positional
def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir):
    """Generate features for the stereo corpus and check their dimension.

    The dictionary is written to ``<temp_dir>/stereo/basic``; the corpus gets
    ``<temp_dir>/stereo`` as its second argument.  After a default
    ``FeatureConfig`` generates features, ``get_feat_dim(fc)`` must equal the
    integer ``39``.
    """
    stereo_root = os.path.join(temp_dir, 'stereo')

    lexicon = Dictionary(basic_dict_path, os.path.join(stereo_root, 'basic'))
    lexicon.write()

    corpus = Corpus(stereo_corpus_dir, stereo_root)
    corpus.initialize_corpus(lexicon)

    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)

    feat_dim = corpus.get_feat_dim(feature_config)
    assert feat_dim == 39
def test_basic(basic_dict_path, basic_corpus_dir, generated_dir):
    """Generate features for the basic corpus and check their dimension.

    Dictionary and corpus share ``<generated_dir>/basic``.  After a default
    ``FeatureConfig`` generates features, ``get_feat_dim(fc)`` must equal the
    integer ``39``.
    """
    # One path is used for both the dictionary and the corpus output.
    output_directory = os.path.join(generated_dir, 'basic')

    lexicon = Dictionary(basic_dict_path, output_directory)
    lexicon.write()

    corpus = Corpus(basic_corpus_dir, output_directory)
    corpus.initialize_corpus(lexicon)

    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)

    feat_dim = corpus.get_feat_dim(feature_config)
    assert feat_dim == 39
def test_basic(basic_dict_path, basic_dir, generated_dir):
    """Prepare the basic corpus and check its reported feature dimension.

    The corpus is written, ``create_mfccs()`` is run and the splits are set up
    against the dictionary; ``get_feat_dim()`` is then compared against the
    string ``"39"``.
    """
    # Dictionary and corpus share the same output location.
    output_directory = os.path.join(generated_dir, "basic")

    lexicon = Dictionary(basic_dict_path, output_directory)
    lexicon.write()

    corpus = Corpus(basic_dir, output_directory)
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(lexicon)

    feat_dim = corpus.get_feat_dim()
    assert feat_dim == "39"
Beispiel #8
0
def test_stereo(basic_dict_path, textgrid_directory, generated_dir):
    """Prepare the stereo corpus and check its reported feature dimension.

    The corpus is read from ``<textgrid_directory>/stereo`` and receives
    ``<generated_dir>/stereo`` as its second argument; the dictionary is
    written to the ``basic`` subdirectory of the latter.  ``get_feat_dim()``
    is compared against the string ``'39'``.
    """
    stereo_output = os.path.join(generated_dir, 'stereo')
    stereo_input = os.path.join(textgrid_directory, 'stereo')

    lexicon = Dictionary(basic_dict_path, os.path.join(stereo_output, 'basic'))
    lexicon.write()

    corpus = Corpus(stereo_input, stereo_output)
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(lexicon)

    feat_dim = corpus.get_feat_dim()
    assert feat_dim == '39'
def test_basic(basic_dict_path, basic_corpus_dir, generated_dir):
    """Check the feature dimension of the basic corpus after feature generation.

    Both the dictionary and the corpus are given ``<generated_dir>/basic``.
    Features are generated with a default ``FeatureConfig`` and
    ``get_feat_dim(fc)`` is expected to be the integer ``39``.
    """
    shared_output = os.path.join(generated_dir, 'basic')

    pronunciations = Dictionary(basic_dict_path, shared_output)
    pronunciations.write()

    basic_corpus = Corpus(basic_corpus_dir, shared_output)
    basic_corpus.initialize_corpus(pronunciations)

    config = FeatureConfig()
    config.generate_features(basic_corpus)

    assert basic_corpus.get_feat_dim(config) == 39
def test_stereo(basic_dict_path, textgrid_directory, generated_dir):
    """Check the feature dimension reported for the stereo corpus.

    Input comes from ``<textgrid_directory>/stereo``; ``<generated_dir>/stereo``
    is passed as the corpus' second argument and its ``basic`` subdirectory is
    given to the dictionary.  After writing the corpus, running
    ``create_mfccs()`` and setting up splits, ``get_feat_dim()`` must equal
    the string ``"39"``.
    """
    work_dir = os.path.join(generated_dir, "stereo")
    source_dir = os.path.join(textgrid_directory, "stereo")

    pronunciations = Dictionary(basic_dict_path, os.path.join(work_dir, "basic"))
    pronunciations.write()

    stereo_corpus = Corpus(source_dir, work_dir)
    stereo_corpus.write()
    stereo_corpus.create_mfccs()
    stereo_corpus.setup_splits(pronunciations)

    assert stereo_corpus.get_feat_dim() == "39"
def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir):
    """Check the feature dimension of the stereo corpus after feature generation.

    ``<temp_dir>/stereo`` is passed to the corpus and its ``basic``
    subdirectory to the dictionary.  Features are generated with a default
    ``FeatureConfig`` and ``get_feat_dim(fc)`` is expected to be ``39``.
    """
    work_dir = os.path.join(temp_dir, 'stereo')

    pronunciations = Dictionary(basic_dict_path, os.path.join(work_dir, 'basic'))
    pronunciations.write()

    stereo_corpus = Corpus(stereo_corpus_dir, work_dir)
    stereo_corpus.initialize_corpus(pronunciations)

    config = FeatureConfig()
    config.generate_features(stereo_corpus)

    assert stereo_corpus.get_feat_dim(config) == 39
def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir):
    """Load the basic ``txt`` corpus and check transcriptions and feature size.

    Right after the corpus is constructed, ``no_transcription_files`` must be
    empty.  After initialization and feature generation with a default
    ``FeatureConfig``, ``get_feat_dim(fc)`` must equal the integer ``39``.
    """
    # Dictionary and corpus share the same output location.
    output_directory = os.path.join(generated_dir, 'basic')

    lexicon = Dictionary(basic_dict_path, output_directory)
    lexicon.write()

    corpus = Corpus(basic_corpus_txt_dir, output_directory)
    missing_transcriptions = len(corpus.no_transcription_files)
    assert missing_transcriptions == 0

    corpus.initialize_corpus(lexicon)
    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)

    feat_dim = corpus.get_feat_dim(feature_config)
    assert feat_dim == 39
Beispiel #13
0
def test_basic(basic_dict_path, basic_dir, generated_dir):
    """Check the feature dimension reported for the basic corpus.

    Dictionary and corpus are both given ``<generated_dir>/basic``.  After the
    corpus is written, ``create_mfccs()`` is run and splits are set up,
    ``get_feat_dim()`` must equal the string ``'39'``.
    """
    shared_output = os.path.join(generated_dir, 'basic')

    pronunciations = Dictionary(basic_dict_path, shared_output)
    pronunciations.write()

    basic_corpus = Corpus(basic_dir, shared_output)
    basic_corpus.write()
    basic_corpus.create_mfccs()
    basic_corpus.setup_splits(pronunciations)

    assert basic_corpus.get_feat_dim() == '39'
def align_corpus(corpus_dir, dict_path, output_directory, temp_dir,
                 output_model_path, args):
    """Train an aligner on a corpus, exporting TextGrids after every stage.

    Args:
        corpus_dir: Path of the corpus.  If it ends in a path separator
            (empty basename), its ``dirname`` is used instead.
        dict_path: Path handed to ``Dictionary``.
        output_directory: Directory that receives exported TextGrids and
            copies of the OOV report files, when those exist.
        temp_dir: Working-directory root.  An empty string selects
            ``TEMP_DIR``; any other value is passed through
            ``os.path.expanduser``.
        output_model_path: If not ``None``, the aligner is saved there.
        args: Object providing ``clean``, ``speaker_characters``,
            ``num_jobs``, ``fast`` and ``verbose`` attributes.
    """
    temp_dir = TEMP_DIR if temp_dir == '' else os.path.expanduser(temp_dir)

    corpus_name = os.path.basename(corpus_dir)
    if corpus_name == '':
        # Trailing separator: strip it and take the last real component.
        corpus_dir = os.path.dirname(corpus_dir)
        corpus_name = os.path.basename(corpus_dir)
    data_directory = os.path.join(temp_dir, corpus_name)

    working_dirs = (data_directory, output_directory)
    if args.clean:
        for stale_dir in working_dirs:
            shutil.rmtree(stale_dir, ignore_errors=True)
    for needed_dir in working_dirs:
        os.makedirs(needed_dir, exist_ok=True)

    dictionary = Dictionary(dict_path, data_directory)
    dictionary.write()
    corpus = Corpus(corpus_dir, data_directory, args.speaker_characters,
                    num_jobs=args.num_jobs)
    print(corpus.speaker_utterance_info())
    corpus.write()
    corpus.create_mfccs()
    corpus.setup_splits(dictionary)

    # Copy the OOV reports next to the output, if they are present.
    for report_name in ('utterance_oovs.txt', 'oovs_found.txt'):
        report_path = os.path.join(corpus.split_directory, report_name)
        if os.path.exists(report_path):
            shutil.copy(report_path, output_directory)

    align_often = not args.fast
    aligner = TrainableAligner(
        corpus,
        dictionary,
        output_directory,
        temp_directory=data_directory,
        # Each stage gets its own dict object, as in the original call.
        mono_params={'align_often': align_often},
        tri_params={'align_often': align_often},
        tri_fmllr_params={'align_often': align_often},
        num_jobs=args.num_jobs,
    )
    aligner.verbose = args.verbose

    training_stages = (
        aligner.train_mono,
        aligner.train_tri,
        aligner.train_tri_fmllr,
    )
    for run_stage in training_stages:
        run_stage()
        aligner.export_textgrids()

    if output_model_path is not None:
        aligner.save(output_model_path)
Beispiel #15
0
def test_short_segments(basic_dict_path, shortsegments_corpus_dir, temp_dir):
    """Check the mapping sizes of the short-segments corpus after loading.

    Dictionary and corpus both use ``<temp_dir>/short_segments``.  After
    ``initialize_corpus`` the number of keys in each corpus mapping and the
    length of ``ignored_utterances`` are compared with fixed expected counts.
    """
    work_dir = os.path.join(temp_dir, 'short_segments')

    lexicon = Dictionary(basic_dict_path, work_dir)
    lexicon.write()

    short_corpus = Corpus(shortsegments_corpus_dir, work_dir)
    short_corpus.initialize_corpus(lexicon)

    # (attribute name, expected number of keys), checked in this order.
    expected_key_counts = (
        ('feat_mapping', 2),
        ('utt_speak_mapping', 2),
        ('speak_utt_mapping', 1),
        ('text_mapping', 2),
        ('utt_wav_mapping', 1),
        ('segments', 2),
    )
    for attribute, expected in expected_key_counts:
        assert len(getattr(short_corpus, attribute).keys()) == expected

    assert len(short_corpus.ignored_utterances) == 1
def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    """Check that the split and subset directories of the large corpus exist.

    ``<temp_dir>/large_subset`` is removed first (errors ignored) and then
    used for both dictionary and corpus.  The paths returned by
    ``split_directory()`` and ``subset_directory(10, fc)`` must exist on disk
    once features have been generated.
    """
    output_directory = os.path.join(temp_dir, 'large_subset')
    shutil.rmtree(output_directory, ignore_errors=True)

    lexicon = Dictionary(large_dataset_dictionary, output_directory)
    lexicon.write()

    corpus = Corpus(large_prosodylab_format_directory, output_directory)
    corpus.initialize_corpus(lexicon)
    split_dir = corpus.split_directory()

    feature_config = FeatureConfig()
    feature_config.generate_features(corpus)
    subset_dir = corpus.subset_directory(10, feature_config)

    for produced_path in (split_dir, subset_dir):
        assert os.path.exists(produced_path)
def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    """Verify the large corpus produces existing split and subset directories.

    Starts from a removed ``<temp_dir>/large_subset`` directory (removal errors
    are ignored), writes the dictionary, initializes the corpus, generates
    features with a default ``FeatureConfig`` and asserts that the paths
    returned by ``split_directory()`` and ``subset_directory(10, fc)`` exist.
    """
    work_dir = os.path.join(temp_dir, 'large_subset')
    shutil.rmtree(work_dir, ignore_errors=True)

    pronunciations = Dictionary(large_dataset_dictionary, work_dir)
    pronunciations.write()

    large_corpus = Corpus(large_prosodylab_format_directory, work_dir)
    large_corpus.initialize_corpus(pronunciations)
    split_path = large_corpus.split_directory()

    config = FeatureConfig()
    config.generate_features(large_corpus)
    subset_path = large_corpus.subset_directory(10, config)

    split_exists = os.path.exists(split_path)
    assert split_exists
    subset_exists = os.path.exists(subset_path)
    assert subset_exists
def test_short_segments(basic_dict_path, shortsegments_corpus_dir, temp_dir):
    """Check the mapping sizes of the short-segments corpus after features.

    Dictionary and corpus both use ``<temp_dir>/short_segments``.  After
    initialization and feature generation with a default ``FeatureConfig``,
    the number of keys in each corpus mapping and the length of
    ``ignored_utterances`` are compared with fixed expected counts.
    """
    work_dir = os.path.join(temp_dir, 'short_segments')

    lexicon = Dictionary(basic_dict_path, work_dir)
    lexicon.write()

    short_corpus = Corpus(shortsegments_corpus_dir, work_dir)
    short_corpus.initialize_corpus(lexicon)

    feature_config = FeatureConfig()
    feature_config.generate_features(short_corpus)

    # (attribute name, expected number of keys), checked in this order.
    expected_key_counts = (
        ('feat_mapping', 2),
        ('utt_speak_mapping', 3),
        ('speak_utt_mapping', 1),
        ('text_mapping', 3),
        ('utt_wav_mapping', 1),
        ('segments', 3),
    )
    for attribute, expected in expected_key_counts:
        assert len(getattr(short_corpus, attribute).keys()) == expected

    assert len(short_corpus.ignored_utterances) == 1
Beispiel #19
0
def test_basic_noposition(basic_dict_path, generated_dir):
    """Write the basic dictionary without positional phones and check ``phones``.

    The dictionary is built with ``position_dependent_phones=False`` and
    written to ``<generated_dir>/basic``; ``phones`` must then contain exactly
    the six expected symbols, ignoring order and duplicates.
    """
    d = Dictionary(basic_dict_path,
                   os.path.join(generated_dir, 'basic'),
                   position_dependent_phones=False)
    # The return value of write() was never used, so it is not bound.
    d.write()
    assert set(d.phones) == {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
def align_corpus(corpus_dir, dict_path,  output_directory, temp_dir,
            output_model_path, args):
    """Train an aligner on a corpus, exporting TextGrids after every stage.

    The corpus is loaded before the dictionary because the dictionary is
    constructed with ``word_set=corpus.word_set``.

    Args:
        corpus_dir: Path of the corpus.  If it ends in a path separator
            (empty basename), its ``dirname`` is used instead.
        dict_path: Path handed to ``Dictionary``.
        output_directory: Directory that receives exported TextGrids and
            copies of the OOV report files, when those exist.
        temp_dir: Working-directory root.  An empty string selects
            ``TEMP_DIR``; any other value is passed through
            ``os.path.expanduser``.
        output_model_path: If not ``None``, the aligner is saved there.
        args: Object providing ``clean``, ``speaker_characters``,
            ``num_jobs``, ``fast`` and ``verbose`` attributes.
    """
    temp_dir = TEMP_DIR if temp_dir == '' else os.path.expanduser(temp_dir)

    corpus_name = os.path.basename(corpus_dir)
    if corpus_name == '':
        # Trailing separator: strip it and take the last real component.
        corpus_dir = os.path.dirname(corpus_dir)
        corpus_name = os.path.basename(corpus_dir)
    data_directory = os.path.join(temp_dir, corpus_name)

    working_dirs = (data_directory, output_directory)
    if args.clean:
        for stale_dir in working_dirs:
            shutil.rmtree(stale_dir, ignore_errors=True)
    for needed_dir in working_dirs:
        os.makedirs(needed_dir, exist_ok=True)

    corpus = Corpus(corpus_dir, data_directory, args.speaker_characters,
                    num_jobs=args.num_jobs)
    print(corpus.speaker_utterance_info())
    corpus.write()
    corpus.create_mfccs()

    # Built after the corpus so that its word set can be passed in.
    dictionary = Dictionary(dict_path, data_directory,
                            word_set=corpus.word_set)
    dictionary.write()
    corpus.setup_splits(dictionary)

    # Copy the OOV reports next to the output, if they are present.
    for report_name in ('utterance_oovs.txt', 'oovs_found.txt'):
        report_path = os.path.join(corpus.split_directory, report_name)
        if os.path.exists(report_path):
            shutil.copy(report_path, output_directory)

    align_often = not args.fast
    aligner = TrainableAligner(
        corpus,
        dictionary,
        output_directory,
        temp_directory=data_directory,
        # Each stage gets its own dict object, as in the original call.
        mono_params={'align_often': align_often},
        tri_params={'align_often': align_often},
        tri_fmllr_params={'align_often': align_often},
        num_jobs=args.num_jobs,
    )
    aligner.verbose = args.verbose

    training_stages = (
        aligner.train_mono,
        aligner.train_tri,
        aligner.train_tri_fmllr,
    )
    for run_stage in training_stages:
        run_stage()
        aligner.export_textgrids()

    if output_model_path is not None:
        aligner.save(output_model_path)
Beispiel #21
0
def test_speaker_groupings(large_prosodylab_format_directory, temp_dir,
                           large_dataset_dictionary):
    """Check speaker and file groupings of the large corpus, serial and 2 jobs.

    The corpus is loaded twice into a freshly removed ``<temp_dir>/large``
    directory: once with default arguments and once with ``num_jobs=2``.
    Each time, every entry of the corpus directory listing must occur in some
    element of ``speaker_groups``, and every file stem found by walking the
    corpus directory must occur in some element of ``groups`` and of
    ``feat_mapping``.
    """
    corpus_root = large_prosodylab_format_directory
    output_directory = os.path.join(temp_dir, 'large')

    def file_stems():
        # Yield the extension-less name of every file under the corpus root.
        for _root, _dirs, filenames in os.walk(corpus_root):
            for filename in filenames:
                stem, _ext = os.path.splitext(filename)
                yield stem

    def check_groupings(corpus):
        # Same three checks, in the same order, for either corpus instance.
        for speaker in speakers:
            assert any(speaker in group for group in corpus.speaker_groups)
        for stem in file_stems():
            assert any(stem in group for group in corpus.groups)
        for stem in file_stems():
            assert any(stem in key for key in corpus.feat_mapping)

    # First pass: default number of jobs.
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon = Dictionary(large_dataset_dictionary, output_directory)
    lexicon.write()
    serial_corpus = Corpus(corpus_root, output_directory)
    serial_corpus.initialize_corpus(lexicon)
    feature_config = FeatureConfig()
    feature_config.generate_features(serial_corpus)
    speakers = os.listdir(corpus_root)
    check_groupings(serial_corpus)

    # Second pass: start over from an empty output directory with two jobs.
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon.write()
    parallel_corpus = Corpus(corpus_root, output_directory, num_jobs=2)
    parallel_corpus.initialize_corpus(lexicon)
    feature_config.generate_features(parallel_corpus)
    check_groupings(parallel_corpus)
def test_frclitics(frclitics_dict_path, generated_dir):
    """Check ``separate_clitics`` on the French-clitics dictionary.

    The dictionary is written to ``<generated_dir>/frclitics`` and each input
    word must be split into exactly the expected list of pieces.
    """
    d = Dictionary(frclitics_dict_path, os.path.join(generated_dir, 'frclitics'))
    # The return value of write() was never used, so it is not bound.
    d.write()

    # (input word, expected result of separate_clitics)
    cases = [
        ('aujourd', ['aujourd']),
        ('aujourd\'hui', ['aujourd\'hui']),
        ('vingt-six', ['vingt', 'six']),
        ('m\'appelle', ['m\'', 'appelle']),
        ('c\'est', ['c\'est']),
        ('purple-people-eater', ['purple-people-eater']),
        ('m\'appele', ['m\'', 'appele']),
        ('m\'ving-sic', ["m'", 'ving', 'sic']),
        ('flying\'purple-people-eater', ['flying\'purple-people-eater']),
    ]
    for word, expected in cases:
        # The word is the assertion message so a failure names its case.
        assert d.separate_clitics(word) == expected, word
def test_speaker_groupings(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary):
    """Verify groupings of the large corpus with default jobs and ``num_jobs=2``.

    For each of the two corpus loads (both into a freshly removed
    ``<temp_dir>/large``), every entry listed in the corpus directory must
    occur in some element of ``speaker_groups``, and every file stem found by
    walking that directory must occur in some element of ``groups`` and of
    ``feat_mapping``.
    """
    source_dir = large_prosodylab_format_directory
    output_directory = os.path.join(temp_dir, 'large')

    def walk_stems():
        # Extension-less names of all files below the corpus directory.
        for _root, _dirs, filenames in os.walk(source_dir):
            for filename in filenames:
                yield os.path.splitext(filename)[0]

    def assert_grouped(loaded):
        # Speakers first, then file stems against groups, then feat_mapping.
        for speaker in speakers:
            assert any(speaker in entry for entry in loaded.speaker_groups)
        for stem in walk_stems():
            assert any(stem in entry for entry in loaded.groups)
        for stem in walk_stems():
            assert any(stem in entry for entry in loaded.feat_mapping)

    shutil.rmtree(output_directory, ignore_errors=True)
    pronunciations = Dictionary(large_dataset_dictionary, output_directory)
    pronunciations.write()

    # Load with the default number of jobs.
    first_load = Corpus(source_dir, output_directory)
    first_load.initialize_corpus(pronunciations)
    config = FeatureConfig()
    config.generate_features(first_load)
    speakers = os.listdir(source_dir)
    assert_grouped(first_load)

    # Wipe the output, rewrite the dictionary and load again with two jobs.
    shutil.rmtree(output_directory, ignore_errors=True)
    pronunciations.write()
    second_load = Corpus(source_dir, output_directory, num_jobs=2)
    second_load.initialize_corpus(pronunciations)
    config.generate_features(second_load)
    assert_grouped(second_load)
def test_frclitics(frclitics_dict_path, generated_dir):
    """Check how ``separate_clitics`` splits words for the frclitics dictionary.

    The dictionary is written to ``<generated_dir>/frclitics``; every input
    word below must produce exactly the listed pieces.
    """
    d = Dictionary(frclitics_dict_path, os.path.join(generated_dir,
                                                     'frclitics'))
    # The return value of write() was never used, so it is not bound.
    d.write()

    # Maps each input word to the expected result of separate_clitics.
    expected_splits = {
        'aujourd': ['aujourd'],
        'aujourd\'hui': ['aujourd\'hui'],
        'vingt-six': ['vingt', 'six'],
        'm\'appelle': ['m\'', 'appelle'],
        'c\'est': ['c\'est'],
        'purple-people-eater': ['purple-people-eater'],
        'm\'appele': ['m\'', 'appele'],
        'm\'ving-sic': ["m'", 'ving', 'sic'],
        'flying\'purple-people-eater': ['flying\'purple-people-eater'],
    }
    for word, pieces in expected_splits.items():
        # The word is the assertion message so a failure names its case.
        assert d.separate_clitics(word) == pieces, word
Beispiel #25
0
def test_extra_annotations(extra_annotations_path, generated_dir):
    """Check that ``'{'`` is among the graphemes of the extra-annotations dictionary.

    The membership check runs before the dictionary is written to
    ``<generated_dir>/extra``.
    """
    extra_output = os.path.join(generated_dir, 'extra')
    extra_dict = Dictionary(extra_annotations_path, extra_output)

    graphemes = extra_dict.graphemes
    assert '{' in graphemes

    extra_dict.write()
def sick_dict(sick_dict_path, generated_dir):
    """Build, write and return the dictionary for the ``sickcorpus`` output.

    The dictionary is created from ``sick_dict_path`` with
    ``<generated_dir>/sickcorpus`` as its output directory.
    """
    sick_dictionary = Dictionary(
        sick_dict_path,
        os.path.join(generated_dir, 'sickcorpus'),
    )
    sick_dictionary.write()
    return sick_dictionary
def test_basic_noposition(basic_dict_path, generated_dir):
    """Check ``phones`` of the basic dictionary built without positional phones.

    The dictionary is created with ``position_dependent_phones=False`` and
    written to ``<generated_dir>/basic``; ``phones`` must contain exactly the
    six expected symbols, ignoring order and duplicates.
    """
    d = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'),
                   position_dependent_phones=False)
    # The return value of write() was never used, so it is not bound.
    d.write()
    assert set(d.phones) == {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
Beispiel #28
0
def sick_dict(sick_dict_path, generated_dir):
    """Return a written ``Dictionary`` rooted at ``<generated_dir>/sickcorpus``.

    The dictionary is constructed from ``sick_dict_path`` and ``write()`` is
    called on it before it is returned.
    """
    target_dir = os.path.join(generated_dir, 'sickcorpus')
    written = Dictionary(sick_dict_path, target_dir)
    written.write()
    return written
def test_extra_annotations(extra_annotations_path, generated_dir):
    """Verify the extra-annotations dictionary has ``'{'`` as a grapheme.

    The dictionary targets ``<generated_dir>/extra``; the grapheme check is
    made first and the dictionary is written afterwards.
    """
    annotated = Dictionary(
        extra_annotations_path,
        os.path.join(generated_dir, 'extra'),
    )
    has_open_brace = '{' in annotated.graphemes
    assert has_open_brace
    annotated.write()