def train_B_corpus(corpus_dir, N_filter_func = N_default): corpus_files = get_corpus_files( corpus_dir ) B = defaultdict(default_float) for file in corpus_files: print "Train B matrix on {0} file".format( file ) calculate_B(B,file, N_filter_func ) B = normalize_B(B) return B
def train_A_corpus(corpus_dir, N_filter_func = N_default): corpus_files = get_corpus_files( corpus_dir ) A = defaultdict(default_float) p = defaultdict(float) for file in corpus_files: print "Train A matrix on {0} file".format( file ) calculate_A(A,p,file, N_filter_func ) A,p = normalize_A_matrix(A,p) return A,p
def calculate_dir_precision(algo_dir, gold_dir, ambi_dir, M , N, P, errors_context_filename): algo_files = get_corpus_files(algo_dir) num = 0 total_unknown = 0 total_known = 0 total_correct_unknown = 0 total_correct_known = 0 total_upperbound_known = 0 total_upperbound_unknown = 0 total_error_stats = defaultdict(float) for algo_file in algo_files: print "Evaluating file {0}".format( algo_file ) ambi_file = os.path.join( ambi_dir, os.path.basename( algo_file ) ) gold_file = os.path.join( gold_dir, os.path.basename( algo_file ) ) if os.path.exists( algo_file ) and os.path.exists( gold_file ): cur_correct_unknown, cur_correct_known, cur_total_unknown, cur_total_known,errors, cur_upper_bound_unknown,cur_upper_bound_known = calculate_precision( file_algo_name= algo_file, file_gold_standart_name= gold_file, file_ambi_name = ambi_file, M=M , N=N, P=P, errors_context_filename = errors_context_filename) total_unknown += cur_total_unknown total_known += cur_total_known total = total_unknown + total_known total_correct_unknown += cur_correct_unknown total_correct_known += cur_correct_known total_correct = total_correct_unknown + total_correct_known total_upperbound_known += cur_upper_bound_known total_upperbound_unknown += cur_upper_bound_unknown for k,v in errors.iteritems(): total_error_stats[k] += v print "percent correct (total)", int(float(total_correct)/total*100) print "percent correct (known words)", int(float(total_correct_known)/total_known*100) print "percent (upper bound words)", int(float(total_upperbound_unknown + total_upperbound_known)/total*100) if total_unknown: print "percent correct (unknown words)", int(float(total_correct_unknown)/total_unknown*100) total_errors = sum([v for v in total_error_stats.values() ]) for k,v in total_error_stats.iteritems(): print "error count in {0} is {1}".format( k,v*100.0/total_errors ) num+=1 print "{0} file processed. {1}%".format(gold_file, num/(len(algo_files)+0.0)*100 ) return total_correct_known, total_correct_unknown, total_known, total_unknown, total_upperbound_known, total_upperbound_unknown
def process_ruscorpora(ruscorpora_dir, processed_dir, extension = "xml"): corpus_files = get_corpus_files(ruscorpora_dir) num = 0 for rnc_file_name in corpus_files: print "Starting file", rnc_file_name out_file = os.path.join( processed_dir, os.path.basename( rnc_file_name ).replace('.{0}'.format(extension), '.txt' ) ) if os.path.exists( out_file ): num +=1 continue process_ruscorpora_file( rnc_file_name, out_file ) num+=1 print "{0} file processed. {1}%".format(rnc_file_name, num/(len(corpus_files)+0.0)*100 )
def cross_validate(num_iters, algo_name, corpus_dir, algo_dir, morph_analysis_dir, N_func, error_dir): global _NAIVE_CV_GLOBALS corpus_files = get_corpus_files(corpus_dir) shuffle(corpus_files) splits = split_seq(corpus_files, num_iters) _NAIVE_CV_GLOBALS = [ corpus_dir, algo_dir, morph_analysis_dir, N_func, error_dir, num_iters, corpus_files, splits, algo_name ] pool = multiprocessing.Pool() results = pool.map(cross_validate_inner, range(num_iters)) pool.close() pool.join() def summary(seq): q, s, n = 0.0, 0.0, 0.0 for x in seq: q += x * x s += x n += 1.0 avg = s / n dev = math.sqrt( q / n - avg ** 2 ) return avg *100 , dev * 100 prec = list( (tck + tcu) / (tk + tu) for tck, tcu, tk, tu, _, _ in results ) known_prec = list( tck / tk for tck, tcu, tk, tu, _, _ in results ) unknown_prec = list( tcu / tu for tck, tcu, tk, tu, _, _ in results ) ub_known = list( ubk / tk for tck, tcu, tk, tu, ubk, ubu in results ) ub_unknown = list( ubu / tu for tck, tcu, tk, tu, ubk, ubu in results ) print "RESULT: total precision: {0:.4f}% +- {1:.4f}%".format(*summary(prec)) print "RESULT: by-known precision: {0:.4f}% +- {1:.4f}%".format(*summary(known_prec)) print "RESULT: by-unknown precision: {0:.4f}% +- {1:.4f}%".format(*summary(unknown_prec)) print "RESULT: upper bound by knowns: {0:.4f}% +- {1:.4f}%".format(*summary(ub_known)) print "RESULT: upper bound by unknowns: {0:.4f}% +- {1:.4f}%".format(*summary(ub_unknown)) print "RESULT: " # Just a separator. print "RESULT: Finished {0} algorithm with {1} tagset".format( algo_name, get_tag_set_by_func(N_func ) ) print "RESULT: Raw: " + repr(results)