Example 1
 def run(self):
     D = training_instances.get_generation_instances(
         filenames=self.filenames)
     splits = None
     if self.test_size:
         splits = ShuffleSplit(n=len(D),
                               n_iter=self.cv,
                               test_size=self.test_size)
     else:
         splits = KFold(n=len(D), n_folds=self.cv, shuffle=True)
     cross_val_results = defaultdict(list)
     for fold_index, (train_indices, test_indices) in enumerate(splits):
         train = [D[i] for i in train_indices]
         test = [D[i] for i in test_indices]
         params = self.set_hyperparameters(train)
         run_results = self.crossval_run(test,
                                         params,
                                         fold_index=fold_index)
         print "======================================================================"
         print params
         print run_results
         for key, val in run_results.items():
             cross_val_results[key].append(val)
     for key, vals in sorted(cross_val_results.items()):
         modelname, metricname = key
         lower, upper = confidence_interval(vals)
         print "%s mean %s: %0.03f (%0.03f-%0.03f)" % (
             modelname, metricname, np.mean(vals), lower, upper)
     pickle.dump(self.log, file(self.logfile, 'w'), 2)
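The examples in this listing only call confidence_interval; its definition is not shown. Based on how Example 1 unpacks the result (lower, upper = confidence_interval(vals)), a minimal sketch of a compatible helper is given below, assuming a 95% interval for the mean based on the t-distribution; the projects' actual implementations may differ. Note also that Examples 2 and 3 unpack the same call as upper, lower, so the ordering of the returned pair is not consistent across the snippets themselves.

import numpy as np
from scipy import stats

def confidence_interval(vals, level=0.95):
    """Sketch only: (lower, upper) bounds of a t-based CI for the mean."""
    vals = np.asarray(vals, dtype=float)
    mean = vals.mean()
    # Standard error of the mean, using the sample standard deviation (ddof=1).
    sem = vals.std(ddof=1) / np.sqrt(len(vals))
    # Two-sided critical value from the t-distribution.
    halfwidth = stats.t.ppf(0.5 + level / 2.0, df=len(vals) - 1) * sem
    return mean - halfwidth, mean + halfwidth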
Example 2
def pooled_experiment(agentname='literal'):
    # Collapse across folds:
    results = defaultdict(lambda: defaultdict(list))
    for dirname in ('furniture', 'people'):
        log = pickle.load(file("logs/log_%s.pickle" % dirname))
        for d in log:
            fold = d['fold_index']
            acc = d[agentname]['evaluations']['instance_accuracy']
            dice = d[agentname]['evaluations']['multiset_dice']
            results[fold]['instance_accuracy'].append(acc)
            results[fold]['multiset_dice'].append(dice)
    # Means for the folds:
    pooled = defaultdict(dict)
    for fold, metric_vals in results.items():
        for metric, vals in metric_vals.items():
            pooled[metric][fold] = np.mean(vals)
    # Stats across the folds:
    runs = {}
    for metric, fold_dict in pooled.items():
        fold_vals = np.array(fold_dict.values())
        mu = np.mean(fold_vals)
        upper, lower = confidence_interval(fold_vals)
        print '%s mean %s: %0.03f (ci %0.03f, %0.03f)' % (agentname, metric,
                                                          mu, upper, lower)
        runs[metric] = fold_vals
    return runs
Example 3
def predicted_vs_actual_length(log, agentname='pragmatic'):
    deltas = []
    for d in log:
        results = d[agentname]
        delta = len(results['prediction']) - len(results['actual'])
        deltas.append(delta)
    upper, lower = confidence_interval(deltas)
    print '%s mean difference: %0.02f (%0.02f, %0.02f 95%% ci)' % (
        agentname, np.mean(deltas), upper, lower)
Example 4
 def evaluation_report(self, all_results, verbose=0, split_info=None):
     errors = np.array([d['error'] for d in all_results])
     iterations = np.array([d['iterations'] for d in all_results])
     print "======================================================================"
     print "Type: %s" % self.typ
     print "Domain: %s" % self.dirname
     print "Features: %s" % self.phi.__name__
     print split_info
     print "Learning rate: %s" % self.eta
     print "L2 coefs:", [r['l2_coeff'] for r in all_results]
     print "Mean iterations to convergence:  %0.3f (+/- %0.3f)" % (
         iterations.mean(), iterations.std() * 2)
     for metric in self.metrics:
         vals = np.array(
             [d['evaluations'][metric.__name__] for d in all_results])
         ci = confidence_interval(vals)
         print "Mean %s: %0.3f (%.3f--%.3f)" % (metric.__name__,
                                                vals.mean(), ci[0], ci[1])
Example 5
 def crossvalidate(self):
     kf = KFold(n=len(self.filenames), n_folds=self.cv, shuffle=True)
     summaries = []
     temps = []
     for train_indices, test_indices in kf:
         train = [self.filenames[i] for i in train_indices]
         temp, nullcost = self.set_hyperparameters(train)
         test = [self.filenames[i] for i in test_indices]
         all_reports = self.run(test, temperature=temp, nullcost=nullcost)
         summary = self.summarize(all_reports)
         summaries.append(summary)
         temps.append(temp)
         print 'Temp: %s; nullcost: %s; %s' % (temp, nullcost, str(summary))
     for name in ('Literal', 'Pragmatic', 'Speaker'):
         vals = np.array([s[name] for s in summaries])
         ci = confidence_interval(vals)
         print "%s mean accuracy: %0.2f (%0.2f-%0.2f)" % (name, vals.mean(),
                                                          ci[0], ci[1])
Example 6
def triple_errors(output_folder, triple):
    """
    Plot accumulated errors for estimators against pair triple ratios.
    Ratios are binned in the range 0.0 to 1.0.
    """
    import math
    import os
    from collections import Counter

    from parsers import CVOutputParser
    from utils import interpolate, avg, confidence_interval
    # xlabel, ylabel and hist used below are assumed to come from pylab (matplotlib).
    from pylab import hist, xlabel, ylabel

    if not output_folder.endswith("/"):
        output_folder += "/"

    iteration = -1
    max_ent_errors = []
    ext_errors = []
    max_ent_abs_errors = []
    ext_abs_errors = []
    samples_ignored = 0
    while True:
        iteration += 1
        max_ent_est_file = output_folder + str(iteration) + "_data.tsv"
        ext_est_file = output_folder + str(iteration) + "_data_extrapolation.tsv"
        # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv'
        # read baseline also?
        # Read until we do not find an output file
        if not os.path.exists(max_ent_est_file):
            break

        # Read the maxent estimate
        found = False
        for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(
            max_ent_est_file
        ):
            (s1, s2, s3, s12, s13, s23, s123) = triangle

            if sample_triple == triple:
                # if s123 == 0:
                #     break
                found = True
                max_ent_errors.append(est - obs)
                max_ent_abs_errors.append(abs(obs - est))
                break

        if not found:
            samples_ignored += 1
            continue

        for sample_triple, (est, obs, ratio, triangle) in CVOutputParser.read_est_obs_file_disc_version_2(ext_est_file):
            (s1, s2, s3, s12, s13, s23, s123) = triangle

            if sample_triple == triple:
                ext_errors.append(est - obs)
                ext_abs_errors.append(abs(obs - est))
                break

    # maxent confidence interval
    maxent_ci = confidence_interval(max_ent_errors)
    # extrapolation confidence interval
    ext_ci = confidence_interval(ext_errors)

    print "samples ignored: ", samples_ignored
    print "maxent avg error: ", round(avg(max_ent_errors), 1)
    print "maxent 95% confidence interval: ", (round(maxent_ci[0], 1), round(maxent_ci[1], 2))
    print "extrapolation avg error: ", round(avg(ext_errors), 1)
    print "extrapolation 95% confidence interval: ", (round(ext_ci[0], 1), round(ext_ci[1], 2))

    # round
    max_ent_errors_rounded = [round(x, 1) for x in max_ent_errors]
    ext_errors_rounded = [round(x, 1) for x in ext_errors]

    # plot
    xlabel("Estimate error")
    ylabel("Bucket size")
    # text(0.1, 0.8, 'Maxent')
    # text(0.1, 0.7, 'avg. error: ' + str(avg(max_ent_errors)))
    # text(0.1, 0.6, '95% conf. interval: ' + str(maxent_ci))

    # text(0.5, 0.8, 'Extrapolation')
    # text(0.5, 0.7, 'avg. error: ' + str(avg(ext_errors)))
    # text(0.5, 0.6, '95% conf. interval: ' + str(ext_ci))

    hist([max_ent_errors_rounded, ext_errors_rounded], color=("b", "r"))

    return max_ent_errors, max_ent_abs_errors, ext_errors, ext_abs_errors
Example 7
    def test_confidence_interval(self):
        assert utils.confidence_interval([2, 2, 2, 2]) == 0
        nose.tools.assert_almost_equal(utils.confidence_interval([1, 2, 3, 4]), 1.096, 3)
        assert utils.confidence_interval([2, 2, 4, 4]) == 0.98
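The assertions in Example 7 imply a different convention for utils.confidence_interval in that project: it returns a single number, the half-width of a 95% interval rather than a (lower, upper) pair. The asserted values 1.096 (to three decimals) and 0.98 are reproduced by 1.96 * std / sqrt(n) with the population standard deviation (ddof=0), e.g. 1.96 * 1.0 / 2 == 0.98 for [2, 2, 4, 4]. A sketch consistent with those tests, with the formula inferred from the asserted values rather than taken from that project's source:

import numpy as np

def confidence_interval(vals):
    """Sketch only: half-width of a 95% normal-approximation CI for the mean."""
    vals = np.asarray(vals, dtype=float)
    # Population standard deviation (ddof=0): gives exactly 0 for [2, 2, 2, 2],
    # exactly 0.98 for [2, 2, 4, 4], and 1.096 to three decimals for [1, 2, 3, 4].
    return 1.96 * vals.std(ddof=0) / np.sqrt(len(vals))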