Example #1
def do_baseline_runs(expt):
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)

    all_results = []
    for run_idx in range(expt.num_runs):
        test_data = gen.generate_simulated_data(expt.num_test_frames)

        # There's a problem here if there's only one data point, since
        # then we end up with a variance of 0.  We currently hack
        # around this problem by guaranteeing more than one point.  We
        # could change the models to allow zero variance but this will
        # mean not being able to make samples from the models without
        # some extra work.  Note that we don't care at all about order
        # of training data in these experiments, so we just build the
        # training data in pieces and concatenate them.  If you
        # hit either of these asserts, you're asking for an error rate
        # that's too high and/or a training data size that's too low.
        # We need two correct samples per phoneme.
        num_secondary_frames = expt.num_training_frames - expt.num_phonemes * 2
        num_errorful_frames = expt.num_training_frames * expt.training_error_rate
        assert expt.num_training_frames >= expt.num_phonemes * 2
        assert num_secondary_frames > num_errorful_frames
        errorless_training_data = gen.generate_simulated_data_per_phoneme(2)
        secondary_training_data = gen.generate_simulated_data(
            num_secondary_frames)

        # Slight trickiness to get a correct error rate for this subset of the data
        subset_error_rate = float(num_errorful_frames) / num_secondary_frames
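        # Worked example with illustrative numbers (not from any real
        # config): num_training_frames=1000, num_phonemes=10, and
        # training_error_rate=0.05 give num_secondary_frames=980 and
        # num_errorful_frames=50.0, so subset_error_rate = 50.0 / 980
        # ~= 0.051 -- slightly above 0.05 to compensate for the 20
        # guaranteed-errorless frames.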
        errorful_training_data, num_errors = gen.add_errors_to_data(
            secondary_training_data, subset_error_rate)

        practice_data = gen.generate_simulated_data(expt.num_practice_frames)
        errorful_practice_data, num_errors = gen.add_errors_to_data(
            practice_data, expt.practice_error_rate)

        training_data = errorless_training_data + errorful_training_data + errorful_practice_data

        c = SimpleClassifier(gen.get_labels(), gen.num_features)
        c.train_all(training_data)

        rate, results = measureAccuracy(c, test_data)
        name = "Baseline 0.%d" % (run_idx,)
        summary = make_summary_string(name, rate, results, c, test_data, gen)
        all_results.append((name, rate))

        # print("Classifier:\n")
        # print(c.to_string())
        # print(summary)
    print("\n--------------------------Summary-----------------------")
    print(make_all_runs_summary_string(expt, all_results))
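A minimal driver for do_baseline_runs might look like the sketch below.
Everything about the expt object here is an assumption for illustration:
the field names mirror the attributes the function reads, but the container
type and all values are hypothetical, and the (low, high) tuple format for
the variance intervals is a guess at what DataGenerator expects.

from types import SimpleNamespace

# Hypothetical experiment config (illustrative values only).
expt = SimpleNamespace(
    num_phonemes=10,
    num_features=3,
    var_diag_interval=(0.5, 1.5),      # assumed (low, high) interval
    var_offdiag_interval=(0.0, 0.1),   # assumed (low, high) interval
    num_runs=5,
    num_test_frames=500,
    num_training_frames=1000,          # satisfies both asserts above
    num_practice_frames=200,
    training_error_rate=0.05,
    practice_error_rate=0.10,
)

do_baseline_runs(expt)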
Example #2
def do_ddt_runs(expt):
    gen = DataGenerator(expt.num_phonemes, expt.num_features,
                        expt.var_diag_interval, expt.var_offdiag_interval)

    perfect_practice_data = gen.generate_simulated_data(expt.num_practice_frames)
    practice_data, num_practice_errors = gen.add_errors_to_data(perfect_practice_data, expt.practice_error_rate)
    practice_data_dict = partition_data(practice_data)
    # We got some practice data for every phoneme, right?
    assert len(practice_data_dict) == expt.num_phonemes

    test_data = gen.generate_simulated_data(expt.num_test_frames)

    n = expt.num_training_frames
    assert n * expt.training_error_rate >= 5        # at least 5 errorful points
    assert n * (1 - expt.training_error_rate) > 5   # more than 5 correct points
    error_training_frame_indices = range(5)
    correct_training_frame_indices = range(n - 5, n)

    all_results = {}
    all_results['Error'] = []
    all_results['Correct'] = []
    for run_idx in range(0, expt.num_runs):
        training_data, num_errors = make_training_data(gen, expt)
        c = SimpleClassifier(gen.get_labels(), gen.num_features)
        c.train_all(training_data)

        def run_some_frames(frame_indices):
            frame_results = []
            for i in frame_indices:
                label = training_data[i][0]
                a = SimpleAllele(c, [label])
            
                # subtract (label, frame) from training_data for active phoneme
                alt_data = training_data[:i] + training_data[i+1:]
            
                # train alternate model in allele on alternate data
                a.train_variants(alt_data)
                # print(a.make_details_string())

                # Construct a subset of the practice data with only the points
                # which are labelled with the active label of the allele (see comments below).
                data = [(label, point) for point in practice_data_dict[label]]
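                # e.g. if label were 'ah' and practice_data_dict['ah'] held
                # 20 frames, data would be a 20-element list of ('ah', frame)
                # pairs, all carrying the allele's active label.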
                results = measurePrimaryAndVariantAccuracy(a, data)

                # KJB - here's the original version, in which we just
                # used all the practice data.  This essentially means we
                # aren't using the practice data labels at all, which
                # might be an interesting variation, but isn't the
                # original intention.
                #results = measurePrimaryAndVariantAccuracy(a, practice_data)

                frame_results.append(results)
            return frame_results

        error_results = run_some_frames(error_training_frame_indices)
        all_results['Error'].append(error_results)
        correct_results = run_some_frames(correct_training_frame_indices)
        all_results['Correct'].append(correct_results)
    return all_results
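The returned dictionary nests one list per run under each of the 'Error'
and 'Correct' keys, and each of those lists holds one result per probed
training frame (five apiece with the index ranges above). A sketch of
walking the structure, reusing the same kind of hypothetical expt object
shown after do_baseline_runs:

all_results = do_ddt_runs(expt)
for kind in ('Error', 'Correct'):
    for run_idx, frame_results in enumerate(all_results[kind]):
        # one list of per-frame results for each of expt.num_runs runs
        print(kind, run_idx, len(frame_results))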