def train_B_corpus(corpus_dir, N_filter_func = N_default):
    corpus_files = get_corpus_files( corpus_dir )
    B = defaultdict(default_float)
    for file in corpus_files:
        print "Train B matrix on {0} file".format( file )
        calculate_B(B,file, N_filter_func )
    B = normalize_B(B)
    return B
def train_A_corpus(corpus_dir, N_filter_func = N_default):
    corpus_files = get_corpus_files( corpus_dir )
    A = defaultdict(default_float)
    p = defaultdict(float)
    for file in corpus_files:
        print "Train A matrix on {0} file".format( file )
        calculate_A(A,p,file, N_filter_func )
    A,p = normalize_A_matrix(A,p)
    return A,p
# Example #3
# 0
def calculate_dir_precision(algo_dir, gold_dir, ambi_dir, M , N, P, errors_context_filename):

    algo_files = get_corpus_files(algo_dir)

    num = 0
    total_unknown = 0
    total_known = 0
    total_correct_unknown = 0
    total_correct_known = 0
    total_upperbound_known = 0
    total_upperbound_unknown = 0

    total_error_stats = defaultdict(float)
    for algo_file in algo_files:
        print "Evaluating file {0}".format( algo_file )
        ambi_file = os.path.join( ambi_dir, os.path.basename( algo_file ) )
        gold_file = os.path.join( gold_dir, os.path.basename( algo_file ) )

        if os.path.exists( algo_file ) and os.path.exists( gold_file ):
            cur_correct_unknown, cur_correct_known, cur_total_unknown, cur_total_known,errors, cur_upper_bound_unknown,cur_upper_bound_known  = calculate_precision( file_algo_name= algo_file, file_gold_standart_name= gold_file,
                file_ambi_name = ambi_file, M=M , N=N, P=P,
                errors_context_filename = errors_context_filename)
            total_unknown += cur_total_unknown
            total_known += cur_total_known
            total = total_unknown + total_known
            total_correct_unknown += cur_correct_unknown
            total_correct_known += cur_correct_known
            total_correct = total_correct_unknown + total_correct_known
            total_upperbound_known += cur_upper_bound_known
            total_upperbound_unknown += cur_upper_bound_unknown

            for k,v in errors.iteritems():
                total_error_stats[k] += v

            print "percent correct (total)", int(float(total_correct)/total*100)
            print "percent correct (known words)", int(float(total_correct_known)/total_known*100)

            print "percent (upper bound words)", int(float(total_upperbound_unknown + total_upperbound_known)/total*100)


            if total_unknown:
                print "percent correct (unknown words)", int(float(total_correct_unknown)/total_unknown*100)

            total_errors = sum([v for v in total_error_stats.values() ])
            for k,v in total_error_stats.iteritems():
                print "error count in {0} is {1}".format( k,v*100.0/total_errors )

            num+=1
            print "{0} file processed. {1}%".format(gold_file, num/(len(algo_files)+0.0)*100 )
    return total_correct_known, total_correct_unknown, total_known, total_unknown, total_upperbound_known, total_upperbound_unknown
def process_ruscorpora(ruscorpora_dir, processed_dir, extension = "xml"):

    corpus_files = get_corpus_files(ruscorpora_dir)

    num = 0
    for rnc_file_name in corpus_files:
        print "Starting file", rnc_file_name


        out_file = os.path.join( processed_dir, os.path.basename( rnc_file_name ).replace('.{0}'.format(extension), '.txt' ) )

        if os.path.exists( out_file ):
            num +=1
            continue


        process_ruscorpora_file( rnc_file_name, out_file )
        num+=1
        print "{0} file processed. {1}%".format(rnc_file_name, num/(len(corpus_files)+0.0)*100 )
# Example #5
# 0
def cross_validate(num_iters, algo_name, corpus_dir, algo_dir, morph_analysis_dir, N_func, error_dir):
    """Run num_iters-fold cross-validation of *algo_name* over *corpus_dir*,
    one fold per worker process, and print mean +- stddev precision figures.

    Each fold's result is the 6-tuple produced by cross_validate_inner:
    (total_correct_known, total_correct_unknown, total_known, total_unknown,
     upperbound_known, upperbound_unknown).
    """
    global _NAIVE_CV_GLOBALS

    corpus_files = get_corpus_files(corpus_dir)
    shuffle(corpus_files)
    splits = split_seq(corpus_files, num_iters)

    # Workers receive only their fold index via pool.map; everything else is
    # handed over through this module-level list, which cross_validate_inner
    # is expected to read (hence the "naive" globals handshake).
    _NAIVE_CV_GLOBALS = [ corpus_dir, algo_dir, morph_analysis_dir, N_func, error_dir, num_iters, corpus_files, splits, algo_name ]

    pool = multiprocessing.Pool()
    results = pool.map(cross_validate_inner, range(num_iters))
    pool.close()
    pool.join()

    def summary(seq):
        # Mean and population standard deviation of seq, both scaled to
        # percent, computed in one pass (sum and sum of squares).
        q, s, n = 0.0, 0.0, 0.0
        for x in seq:
            q += x * x
            s += x
            n += 1.0
        avg = s / n
        dev = math.sqrt( q / n - avg ** 2 )
        return avg *100 , dev * 100

    # NOTE(review): these divisions assume true division (e.g. a
    # `from __future__ import division` at file top, not visible here) —
    # with Python 2 integer counts `tck / tk` would truncate to 0; confirm.
    # NOTE(review): a fold with tu == 0 (no unknown words) would raise
    # ZeroDivisionError in unknown_prec / ub_unknown; confirm folds always
    # contain unknown words.
    prec         = list( (tck + tcu) / (tk + tu) for tck, tcu, tk, tu, _, _ in results )
    known_prec   = list( tck / tk                for tck, tcu, tk, tu, _, _ in results )
    unknown_prec = list( tcu / tu                for tck, tcu, tk, tu, _, _ in results )

    ub_known     = list( ubk / tk for tck, tcu, tk, tu, ubk, ubu in results )
    ub_unknown   = list( ubu / tu for tck, tcu, tk, tu, ubk, ubu in results )

    print "RESULT:         total precision: {0:.4f}% +- {1:.4f}%".format(*summary(prec))
    print "RESULT:      by-known precision: {0:.4f}% +- {1:.4f}%".format(*summary(known_prec))
    print "RESULT:    by-unknown precision: {0:.4f}% +- {1:.4f}%".format(*summary(unknown_prec))
    print "RESULT:   upper bound by knowns: {0:.4f}% +- {1:.4f}%".format(*summary(ub_known))
    print "RESULT: upper bound by unknowns: {0:.4f}% +- {1:.4f}%".format(*summary(ub_unknown))
    print "RESULT: " # Just a separator.
    print "RESULT: Finished {0} algorithm with {1} tagset".format( algo_name, get_tag_set_by_func(N_func ) )
    print "RESULT: Raw: " + repr(results)