Beispiel #1
0
def test_create_corpus_basic(tmpdir, create_sine, make_wav):
    """Check that a Corpus can be built from a minimal data set.

    One wav file and one ``.phonemes`` label file are written for each of
    the names "test", "train" and "valid". No label set is supplied
    (``labels=None``) and the resulting Corpus object must be truthy.
    """
    from pathlib import Path
    from persephone.corpus import Corpus

    wav_dir = tmpdir.mkdir("wav")
    label_dir = tmpdir.mkdir("label")

    # Synthesise all sine wave data before any file is written.
    tone_a = create_sine(note="A")
    tone_b = create_sine(note="B")
    tone_c = create_sine(note="C")

    wav_files = (
        ("test.wav", tone_a),
        ("train.wav", tone_b),
        ("valid.wav", tone_c),
    )
    for wav_name, tone in wav_files:
        make_wav(tone, str(wav_dir.join(wav_name)))

    label_files = (
        ("test.phonemes", "a"),
        ("train.phonemes", "b"),
        ("valid.phonemes", "c"),
    )
    for label_name, transcript in label_files:
        label_dir.join(label_name).write(transcript)

    corpus = Corpus(
        feat_type='fbank',
        label_type='phonemes',
        tgt_dir=Path(str(tmpdir)),
        labels=None,
    )
    assert corpus
Beispiel #2
0
def test_create_corpus_label_mismatch(tmpdir):
    """Check that a Corpus rejects a label set that does not match the data.

    The label files on disk contain "a", "b" and "c" while the Corpus is
    given the labels "1", "2" and "3"; construction is expected to raise
    LabelMismatchException.
    """
    from pathlib import Path
    from persephone.corpus import Corpus
    from persephone.exceptions import LabelMismatchException

    wav_dir = tmpdir.mkdir("wav")
    label_dir = tmpdir.mkdir("label")

    # The wav files are written empty.
    for wav_name in ("test.wav", "train.wav", "valid.wav"):
        wav_dir.join(wav_name).write("")

    label_files = (
        ("test.phonemes", "a"),
        ("train.phonemes", "b"),
        ("valid.phonemes", "c"),
    )
    for label_name, transcript in label_files:
        label_dir.join(label_name).write(transcript)

    # TODO: write prefix files

    target_dir = Path(str(tmpdir))
    mismatched_labels = {"1", "2", "3"}
    with pytest.raises(LabelMismatchException):
        Corpus(
            feat_type='fbank',
            label_type='phonemes',
            tgt_dir=target_dir,
            labels=mismatched_labels,
        )
Beispiel #3
0
def prepared_data(request):
    """Load hypotheses, references and the corpus for one evaluation split.

    ``request.param`` selects the split. For "test" the files are read from
    the experiment's ``test`` directory and the corpus' test prefixes are
    used; for any other value they are read from ``decoded`` and the
    validation prefixes are used.

    Returns:
        A tuple ``(param, corpus, eval_prefixes, hyps, refs)`` where ``hyps``
        and ``refs`` hold one whitespace-split token list per line of the
        respective file.
    """

    def _read_token_lines(path):
        # One list of whitespace-separated tokens per line of the file.
        with path.open() as handle:
            return [line.split() for line in handle]

    split = request.param
    use_test_set = split == "test"

    exp_path = Path("testing/exp/19/")
    data_path = Path("testing/data/bkw")

    # TODO I shouldn't really be using "decoded" for the validation set dir
    # anymore.
    if use_test_set:
        results_dir = exp_path / "test"
        hyps_path = results_dir / "hyps"
    else:
        results_dir = exp_path / "decoded"
        hyps_path = results_dir / "best_hyps"
    refs_path = results_dir / "refs"

    hyps = _read_token_lines(hyps_path)
    refs = _read_token_lines(refs_path)

    corp = Corpus.from_pickle(data_path)
    eval_prefixes = corp.test_prefixes if use_test_set else corp.valid_prefixes

    return split, corp, eval_prefixes, hyps, refs
Beispiel #4
0
    def _create_corpus():
        """Write a small note-sequence data set and return its Corpus.

        Five utterances (two test, two train, one validation) are written
        under the enclosing ``tmpdir`` together with the three prefix files.
        The resulting Corpus is checked for the expected feature type,
        label type, label set and vocabulary size before being returned.
        """
        from persephone.corpus import Corpus

        wav_dir = tmpdir.mkdir("wav")
        label_dir = tmpdir.mkdir("label")

        # Create the note-sequence audio, keyed by its transcription.
        # "A" and "B" are generated but not written to any file.
        audio = {
            "A": create_note_sequence(notes=["A"]),
            "B": create_note_sequence(notes=["B"]),
            "C": create_note_sequence(notes=["C"]),
            "A B": create_note_sequence(notes=["A","B"]),
            "B C": create_note_sequence(notes=["B","C"]),
            "A B C": create_note_sequence(notes=["A","B","C"]),
        }

        # (wav file, label file, transcription) for every utterance.
        utterances = (
            # testing
            ("test1.wav", "test1.phonemes", "A B"),
            ("test2.wav", "test2.phonemes", "C"),
            # training
            ("train1.wav", "train1.phonemes", "B C"),
            ("train2.wav", "train2.phonemes", "A B C"),
            # validation
            ("valid.wav", "valid.phonemes", "C"),
        )
        for wav_name, label_name, transcript in utterances:
            make_wav(audio[transcript], str(wav_dir.join(wav_name)))
            label_dir.join(label_name).write(transcript)

        # Prefixes handling
        prefix_files = (
            ("test_prefixes.txt", "test1\ntest2"),
            ("train_prefixes.txt", "train1\ntrain2"),
            ("valid_prefixes.txt", "valid"),
        )
        for prefix_name, prefixes in prefix_files:
            tmpdir.join(prefix_name).write(prefixes)

        corpus = Corpus(
            feat_type='fbank',
            label_type='phonemes',
            tgt_dir=Path(str(tmpdir)),
            labels={"A","B","C"},
        )
        assert corpus
        assert corpus.feat_type == 'fbank'
        assert corpus.label_type == 'phonemes'
        assert set(corpus.labels) == {"A", "B", "C"}
        assert corpus.vocab_size == 3
        return corpus
Beispiel #5
0
def test_missing_experiment_dir():
    """Check that a Corpus cannot be built on a non-existent directory.

    Constructing a Corpus whose ``tgt_dir`` does not exist is expected to
    raise FileNotFoundError.
    """
    from persephone.corpus import Corpus
    from pathlib import Path

    missing_dir = Path("thisDoesNotExist")
    corpus_kwargs = {
        "feat_type": 'fbank',
        "label_type": 'phonemes',
        "tgt_dir": missing_dir,
        "labels": {"a", "b", "c"},
    }
    with pytest.raises(FileNotFoundError):
        Corpus(**corpus_kwargs)
Beispiel #6
0
def test_missing_wav_dir(tmpdir):
    """Check that a Corpus raises an error when the wav directory is missing.

    No files or directories are created under ``tmpdir`` before the Corpus
    is constructed; PersephoneException is expected.
    """
    from persephone.exceptions import PersephoneException
    from persephone.corpus import Corpus
    from pathlib import Path

    empty_dir = Path(str(tmpdir))
    corpus_kwargs = {
        "feat_type": 'fbank',
        "label_type": 'phonemes',
        "tgt_dir": empty_dir,
        "labels": {"a", "b", "c"},
    }
    with pytest.raises(PersephoneException):
        Corpus(**corpus_kwargs)
Beispiel #7
0
def test_create_corpus_no_data(tmpdir):
    """Check that building a Corpus without any data raises an error.

    The ``wav`` and ``label`` directories exist but are left empty, so
    construction is expected to raise PersephoneException.
    """
    from pathlib import Path
    from persephone.corpus import Corpus
    from persephone.exceptions import PersephoneException

    # Create the expected directory layout, but put nothing in it.
    for dir_name in ("wav", "label"):
        tmpdir.mkdir(dir_name)

    target_dir = Path(str(tmpdir))
    with pytest.raises(PersephoneException):
        Corpus(
            feat_type='fbank',
            label_type='phonemes',
            tgt_dir=target_dir,
            labels={"a", "b", "c"},
        )
Beispiel #8
0
def test_corpus_with_predefined_data_sets(tmpdir, create_sine, make_wav):
    """Check Corpus construction when the data splits are given as files.

    The splits are specified through the file system conventions:
    * `test_prefixes.txt`
    * `train_prefixes.txt`
    * `valid_prefixes.txt`

    After construction the test asserts the Corpus' feature type, label
    type, label set and vocabulary size.
    """
    from pathlib import Path
    from persephone.corpus import Corpus

    wav_dir = tmpdir.mkdir("wav")
    label_dir = tmpdir.mkdir("label")

    # Synthesise all sine wave data before any file is written.
    tone_a = create_sine(note="A")
    tone_b = create_sine(note="B")
    tone_c = create_sine(note="C")

    wav_files = (
        ("test.wav", tone_a),
        ("train.wav", tone_b),
        ("valid.wav", tone_c),
    )
    for wav_name, tone in wav_files:
        make_wav(tone, str(wav_dir.join(wav_name)))

    label_files = (
        ("test.phonemes", "a"),
        ("train.phonemes", "b"),
        ("valid.phonemes", "c"),
    )
    for label_name, transcript in label_files:
        label_dir.join(label_name).write(transcript)

    # One prefix file per split, each naming a single utterance.
    prefix_files = (
        ("test_prefixes.txt", "test"),
        ("train_prefixes.txt", "train"),
        ("valid_prefixes.txt", "valid"),
    )
    for prefix_name, prefix in prefix_files:
        tmpdir.join(prefix_name).write(prefix)

    corpus = Corpus(
        feat_type='fbank',
        label_type='phonemes',
        tgt_dir=Path(str(tmpdir)),
        labels={"a", "b", "c"},
    )
    assert corpus
    assert corpus.feat_type == 'fbank'
    assert corpus.label_type == 'phonemes'
    assert set(corpus.labels) == {"a", "b", "c"}
    assert corpus.vocab_size == 3
Beispiel #9
0
def post(corpusInfo):
    """Create a DBcorpus, its data-set rows and its on-disk persephone corpus.

    Args:
        corpusInfo: Mapping read with the keys ``name``, ``labelType``,
            ``featureType``, ``training``, ``testing`` and ``validation``,
            plus the optional key ``max_samples``.

    Returns:
        ``(result, 201)`` with the serialised corpus on success, or
        ``("Invalid corpus provided", 400)`` when the commit raises an
        ``IntegrityError``.
    """
    # Largest value a signed 64bit integer can hold. This is the default for
    # max_samples because the API will complain if a None is returned, so we
    # get much the same behavior by making the default the integer max value.
    # Note: this must use ``**`` (power); ``2 ^ 63 - 1`` is a bitwise XOR
    # that evaluates to 60.
    INT64_MAX = 2 ** 63 - 1

    max_samples = corpusInfo.get('max_samples', INT64_MAX)
    current_corpus = DBcorpus(name=corpusInfo['name'],
                              labelType=corpusInfo['labelType'],
                              featureType=corpusInfo['featureType'])
    current_corpus.max_samples = max_samples
    db.session.add(current_corpus)
    # Make sure that current_corpus.id exists before using as key
    db.session.flush()

    # One association row per utterance, for each of the three data splits.
    data_sets = (
        ('training', TrainingDataSet),
        ('testing', TestingDataSet),
        ('validation', ValidationDataSet),
    )
    for info_key, data_set_model in data_sets:
        for utterance_id in corpusInfo[info_key]:
            db.session.add(
                data_set_model(corpus_id=current_corpus.id,
                               utterance_id=utterance_id))

    # Saving Corpus as UUIDs to remove name collision issues
    corpus_uuid = uuid.uuid1()
    corpus_path = Path(
        flask.current_app.config['CORPUS_PATH']) / str(corpus_uuid)
    audio_uploads_path = Path(flask.current_app.config['UPLOADED_AUDIO_DEST'])
    transcription_uploads_path = Path(
        flask.current_app.config['UPLOADED_TEXT_DEST'])
    create_corpus_file_structure(audio_uploads_path,
                                 transcription_uploads_path, current_corpus,
                                 corpus_path)
    # see if there's some other way of handling a UUID value directly into
    # SQLAlchemy
    current_corpus.filesystem_path = str(corpus_uuid)
    db.session.add(current_corpus)

    # Creating the corpus object has the side-effect of creating a directory
    # located at the path given to `tgt_dir`
    persephone_corpus = Corpus(
        feat_type=current_corpus.featureType,
        label_type=current_corpus.labelType,
        tgt_dir=corpus_path,
    )

    # NOTE(review): a new Label row is added for every label of the corpus;
    # nothing here checks whether the label already exists in the Label
    # table -- confirm how duplicates are meant to be handled.
    for label_text in persephone_corpus.labels:
        current_label = Label(label=label_text)
        db.session.add(current_label)
        # Make CorpusLabelSet entry
        db.session.add(
            CorpusLabelSet(corpus=current_corpus, label=current_label))

    try:
        db.session.commit()
    except sqlalchemy.exc.IntegrityError:
        # Discard the failed transaction so the session remains usable.
        db.session.rollback()
        return "Invalid corpus provided", 400
    else:
        result = fix_corpus_format(CorpusSchema().dump(current_corpus).data)
        return result, 201