def run(sentence, tagset, preterms, start, hmm_prob, pcfg, pcfg_u):
    max_iterations = 20
    step_size = 1.0

    n = len(sentence)

    u = defaultdict() # dual decomposition parameter
    init_dd_param(u, n, tagset)

    k = 0 # number of iterations
    while k < max_iterations:
          
        tags = viterbi.run(sentence, tagset, hmm_prob, u)

        parse = fast_cky.run(sentence, preterms, start, pcfg, pcfg_u, u)
        parse_list = utils.make_parse_list(parse)
        terms, parse_tags = utils.get_terminals_tags(parse_list)

        print k
        print tags, "tagger"
        print parse_tags, "parser"

        if agree(parse_tags, tags):
            return k, tags, parse  # converges in the kth iteration
        
        y = compute_indicators(tags, tagset)
        z = compute_indicators(parse_tags, tagset)
        k += 1
        step_size = 1.0/k
        update(y, z, u, step_size)

    return -1, tags, parse # does not converge
def get_sentences(dev_file):
    """Read a file with one parse per line; return the word sequences.

    Each line is turned into a parse list and only the terminal words are
    kept -- the gold tags are discarded.
    """
    sentences = []
    # 'with' guarantees the handle is closed (the original leaked it), and
    # iterating the file replaces the manual while/readline loop.
    with open(dev_file, 'r') as dev:
        for line in dev:
            line = line.strip()
            if not line:
                continue  # skip blank lines instead of parsing ''
            parse = utils.make_parse_list(line)
            sentence, _tags = utils.get_terminals_tags(parse)
            sentences.append(sentence)
    return sentences
def extract_tagging_data(treebank, st_filename, wt_filename):
    """Pull (sentence, tag-sequence) pairs out of a treebank and save them.

    Writes the collected data via write_tagging_data and also returns the
    (sentences, tagseqs) pair to the caller.
    """
    sentences = []
    tagseqs = []

    for parse in read_parses_no_indent(treebank):
        tree = utils.make_parse_list(parse)
        words, gold_tags = utils.get_terminals_tags(tree)
        sentences.append(words)
        tagseqs.append(gold_tags)

    write_tagging_data(sentences, tagseqs, st_filename, wt_filename)
    return sentences, tagseqs
def learn(parses):
    """Estimate HMM parameters (emission/transition counts) from parses."""
    emission_counts = defaultdict()
    transition_counts = defaultdict()
    tag_counts = defaultdict()

    for parse in parses:
        tree = utils.make_parse_list(parse)
        update_counts(tree, emission_counts, transition_counts, tag_counts)

    # Smoothing is deliberately skipped: with smoothing every -RARE- word
    # ends up tagged FW, and then all following tags are FW, because
    # FW->-RARE- and FW->FW get high probabilities.
    # emission_counts = smooth_emission(emission_counts, tag_counts)

    set_hmm_params(emission_counts, transition_counts, tag_counts)
    check_if_prob_dist(emission_counts)
    check_if_prob_dist(transition_counts)
    write_hmm_params(emission_counts, transition_counts, tag_counts)
    #write_for_java(emission_counts, transition_counts, tag_counts)
    return emission_counts, transition_counts
def execute(treebank, dev):
    print "reading treebank..."
    parses = utils.read_parses_no_indent(treebank)
    parse_lists = []
    for parse in parses:
        parse_lists.append(utils.make_parse_list(parse))
      
    print "learning pcfg..."  
    nonterms, terms, start, prob = grammar.learn(parse_lists)
    
    print "learning hmm..."
    emission, transition = sequnece_labeler.learn(parse_lists)

    print "reading dev data..."
    dev_sentences = utils.get_sentences(dev)
    print dev_sentences[100] 
    for sentence in dev_sentences:
        parse = cky.run(sentence, nonterms, start, prob)
        sequnece = viterbi.run(sentence, emission, transition)
def quick_execute(dev):
    print "loading learnt parameters..."
    pcfg_prob, nonterms, start = cky.get_pcfg()
    hmm, tagset = viterbi.get_hmm_tagset()

    print "reading dev data..."
    parses = utils.read_parses_no_indent(dev)

    i = 0
    for parse in parses:
        if len(parse) > 100:
            parse_list = utils.make_parse_list(parse)
            sentence, truetags = utils.get_terminals_tags(parse_list)
            print '\n', sentence, '\n'
            #print dev_sentences.index(sentence)
            print "running dual decomposition..."
            num_iterations = dd_parser_tagger.run(sentence, pcfg_prob, nonterms, start, tagset, hmm)
            print "\n", truetags, " :true tags"
            if num_iterations != -1:
                print "converges in ", num_iterations ," iterations \n"
            else:
                print "does not converge :(\n"
# --- Beispiel #7 (pasted-example separator; marker garbled in extraction) ---
    # NOTE(review): the enclosing 'def' header for this block was lost in
    # extraction. Judging by the body and the __main__ caller below, this is
    # a grammar-learning function taking parse_lists and returning
    # (nonterms, terms, start, prob) -- restore the header before running.
    terms = set([])
    nonterms = set([])
    start = '**'  # artificial start/root symbol for the grammar

    for parse_list in parse_lists:
        # progress indicator every 1000 trees
        # NOTE(review): list.index() is an O(n) scan per iteration and is
        # wrong for duplicate parses -- enumerate() would be safer/faster.
        if parse_lists.index(parse_list)% 1000 == 0:
           print parse_lists.index(parse_list), '...'

        # extract CFG rules, convert to Chomsky normal form, tally rule counts
        tree, terms, nonterms = extract_cfg(parse_list, terms, nonterms, start)
        cnf_tree = cnf(tree, start, nonterms)
        # NOTE(review): rule_counts/parent_counts are passed as arguments here
        # before any visible initialization -- unless they are module globals
        # this raises NameError on the first iteration; confirm and initialize.
        rule_counts, parent_counts = update_counts(cnf_tree, start, rule_counts, parent_counts) # python globals???

    # normalize counts into rule probabilities and persist everything
    prob = write_pcfg_probabilities(rule_counts, parent_counts)
    write_terms_nonterms(terms, nonterms, start)
    #sequence_labeler.update_counts(parse_list, emission_counts, transition_counts)

    return nonterms, terms, start, prob

if __name__ == "__main__":
    treebank = sys.argv[1]
    
    print "reading treebank..."
    parses = utils.read_parses_no_indent(treebank)
    parse_lists = []
    for parse in parses:
        parse_lists.append(utils.make_parse_list(parse))

    print "learning pcfg..."
    nonterms, terms, start, prob = learn(parse_lists)