def cross_validate(num_iters, algo_name, corpus_dir, algo_dir,
                   morph_analysis_dir, N_func, error_dir):
    """Run ``num_iters``-fold cross-validation and print a precision summary.

    The files returned by ``get_corpus_files(corpus_dir)`` are shuffled and
    partitioned with ``split_seq`` into ``num_iters`` parts.  Every fold is
    evaluated by ``cross_validate_inner`` inside a ``multiprocessing.Pool``
    worker.  The per-fold ratios are then aggregated into a mean and a
    population standard deviation (both in percent) and printed on lines
    prefixed with ``RESULT:``.

    Args:
        num_iters: Number of folds; also the number of tasks sent to the pool.
        algo_name: Algorithm identifier, forwarded to the workers and echoed
            in the final report.
        corpus_dir: Directory the corpus file list is read from; forwarded to
            the workers.
        algo_dir: Forwarded to the workers (output directory of the algorithm).
        morph_analysis_dir: Forwarded to the workers.
        N_func: Forwarded to the workers and passed to
            ``get_tag_set_by_func`` for the final report.
        error_dir: Forwarded to the workers.

    Returns:
        None.  All results are printed.

    Raises:
        ZeroDivisionError: If any fold reports zero known or zero unknown
            tokens, or if no fold results were produced.
    """
    global _NAIVE_CV_GLOBALS

    corpus_files = get_corpus_files(corpus_dir)
    shuffle(corpus_files)
    splits = split_seq(corpus_files, num_iters)

    # The workers read their configuration from this module-level list, so it
    # is assigned before the pool is created.
    # NOTE(review): presumably relies on fork-style worker start-up so that
    # children inherit the global -- confirm on platforms that spawn.
    _NAIVE_CV_GLOBALS = [
        corpus_dir, algo_dir, morph_analysis_dir, N_func, error_dir,
        num_iters, corpus_files, splits, algo_name,
    ]

    pool = multiprocessing.Pool()
    try:
        results = pool.map(cross_validate_inner, range(num_iters))
    except Exception:
        # Do not leave worker processes behind when a fold fails.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    def summary(seq):
        """Return (mean, population standard deviation) of seq, in percent."""
        values = [float(x) for x in seq]
        n = float(len(values))
        avg = sum(values) / n
        # Two-pass variance is never negative, unlike E[x^2] - E[x]^2, which
        # can round slightly below zero and make math.sqrt raise ValueError.
        variance = sum((x - avg) ** 2 for x in values) / n
        return avg * 100, math.sqrt(variance) * 100

    prec = []
    known_prec = []
    unknown_prec = []
    ub_known = []
    ub_unknown = []
    for tck, tcu, tk, tu, ubk, ubu in results:
        # Coerce to float so the ratios are true divisions even when the
        # counts are ints and the module does not enable true division.
        known_total = float(tk)
        unknown_total = float(tu)
        prec.append((tck + tcu) / (known_total + unknown_total))
        known_prec.append(tck / known_total)
        unknown_prec.append(tcu / unknown_total)
        ub_known.append(ubk / known_total)
        ub_unknown.append(ubu / unknown_total)

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("RESULT: total precision: {0:.4f}% +- {1:.4f}%".format(*summary(prec)))
    print("RESULT: by-known precision: {0:.4f}% +- {1:.4f}%".format(*summary(known_prec)))
    print("RESULT: by-unknown precision: {0:.4f}% +- {1:.4f}%".format(*summary(unknown_prec)))
    print("RESULT: upper bound by knowns: {0:.4f}% +- {1:.4f}%".format(*summary(ub_known)))
    print("RESULT: upper bound by unknowns: {0:.4f}% +- {1:.4f}%".format(*summary(ub_unknown)))
    print("RESULT: ")  # Just a separator.
    print("RESULT: Finished {0} algorithm with {1} tagset".format(
        algo_name, get_tag_set_by_func(N_func)))
    print("RESULT: Raw: " + repr(results))

def cross_validate_inner(i):
    """Train and evaluate one cross-validation fold; runs in a pool worker.

    The configuration is read from the module-level ``_NAIVE_CV_GLOBALS``
    list.  Split ``i`` is the test set and all other splits form the training
    set.  The selected algorithm is trained, applied to the morph-analysis
    files that exist for the test files, and the output is scored with
    ``calculate_dir_precision``.

    Folds are executed concurrently, so each fold writes its output to its
    own ``fold_<i>`` subdirectory of ``algo_dir``.  Sharing ``algo_dir``
    directly would let one fold delete or score another fold's output.

    Args:
        i: Zero-based index of the fold to use as the test split.

    Returns:
        The 6-tuple produced by ``calculate_dir_precision``:
        ``(total_correct_known, total_correct_unknown, total_known,
        total_unknown, upper_bound_known, upper_bound_unknown)``.

    Raises:
        ValueError: If ``algo_name`` is not one of the supported algorithms.
    """
    (corpus_dir, algo_dir, morph_analysis_dir, N_func, error_dir,
     num_iters, corpus_files, splits, algo_name) = _NAIVE_CV_GLOBALS

    # Reject an unknown algorithm before touching any directory.
    supported = (ALGONAMES.BASELINE, ALGONAMES.HMM, ALGONAMES.MEMM)
    if algo_name not in supported:
        raise ValueError("Not supported algorithm {0}".format(algo_name))

    # Private output directory for this fold (see docstring).
    fold_algo_dir = os.path.join(algo_dir, "fold_{0}".format(i))
    if os.path.isdir(fold_algo_dir):
        remove_directory_content(fold_algo_dir)
    else:
        try:
            os.makedirs(fold_algo_dir)
        except OSError:
            # Another worker may have created a missing parent concurrently;
            # only fail if the directory really is not there.
            if not os.path.isdir(fold_algo_dir):
                raise

    print("Starting {0} fold".format(i))

    train_fold_corpus_files = flatten(
        splits[j] for j in range(num_iters) if i != j)
    test_corpus_files = flatten(
        splits[j] for j in range(num_iters) if i == j)

    # Keep only test files that have a morph-analysis counterpart on disk.
    morph_analysis_files = []
    for test_file in test_corpus_files:
        candidate = os.path.join(
            morph_analysis_dir, os.path.basename(test_file))
        if os.path.exists(candidate):
            morph_analysis_files.append(candidate)

    if algo_name == ALGONAMES.BASELINE:
        algo = NaiveAlgorithm(N_func=N_func)
        algo.train_from_filelist(train_fold_corpus_files)
    elif algo_name == ALGONAMES.HMM:
        algo = HMMAlgorithm(N_filter_func=N_func)
        algo.train_model_from_filelist(corpus_files=train_fold_corpus_files)
    else:  # ALGONAMES.MEMM, guaranteed by the validation above.
        algo = MMEMAlgorithm(N_filter_func=N_func)
        algo.train_model_file_list(
            corpus_filelist=train_fold_corpus_files,
            ambiguity_dir=morph_analysis_dir)

    print("Finished training. Starting testing phase!")
    remove_ambiguity_file_list(
        ambig_filelist=morph_analysis_files,
        output_dir=fold_algo_dir,
        algo=algo)

    print("Finished working of algo. Starting measuring phase")
    errors_context_filename = os.path.join(
        error_dir,
        "{1}_errors_context_{0}_{2}.txt".format(
            i, algo_name, get_tag_set_by_func(N_func)))
    (total_correct_known, total_correct_unknown,
     total_known, total_unknown,
     upper_bound_known, upper_bound_unknown) = calculate_dir_precision(
        algo_dir=fold_algo_dir,
        ambi_dir=morph_analysis_dir,
        gold_dir=corpus_dir,
        M=M_strict_mathcher,
        N=N_func,
        P=P_no_garbage,
        errors_context_filename=errors_context_filename)

    return (total_correct_known, total_correct_unknown,
            total_known, total_unknown,
            upper_bound_known, upper_bound_unknown)