def update_counts(parse_list, emission_counts, transition_counts, tag_counts):
    """Add the counts contributed by one parse to the three count tables.

    The terminals and their tags are read from ``parse_list`` through
    ``utils.get_terminals_tags``.  Emission keys have the form
    ``'<tag>~><terminal>'`` and transition keys ``'<previous tag>~><tag>'``.

    Before transitions and tags are counted, the tag list is padded in place
    with ``'*'`` at the front and ``'STOP'`` at the end, so both markers are
    counted in ``transition_counts`` and ``tag_counts``.

    The three mappings are updated in place; nothing is returned.
    """
    terminals, tags = utils.get_terminals_tags(parse_list)

    # Emissions: one count per position, pairing each terminal with its tag.
    for position, terminal in enumerate(terminals):
        emission = tags[position] + '~>' + terminal
        emission_counts[emission] = emission_counts.get(emission, 0) + 1

    # Pad the sequence with the start and stop markers.
    tags.insert(0, '*')
    tags.append('STOP')

    # Transitions: one count per adjacent pair of the padded sequence.
    for previous_tag, current_tag in zip(tags, tags[1:]):
        transition = previous_tag + '~>' + current_tag
        transition_counts[transition] = transition_counts.get(transition, 0) + 1

    # Tags: every entry of the padded sequence, markers included.
    for tag in tags:
        tag_counts[tag] = tag_counts.get(tag, 0) + 1
def run(sentence, tagset, preterms, start, hmm_prob, pcfg, pcfg_u): max_iterations = 20 step_size = 1.0 n = len(sentence) u = defaultdict() # dual decomposition parameter init_dd_param(u, n, tagset) k = 0 # number of iterations while k < max_iterations: tags = viterbi.run(sentence, tagset, hmm_prob, u) parse = fast_cky.run(sentence, preterms, start, pcfg, pcfg_u, u) parse_list = utils.make_parse_list(parse) terms, parse_tags = utils.get_terminals_tags(parse_list) print k print tags, "tagger" print parse_tags, "parser" if agree(parse_tags, tags): return k, tags, parse # converges in the kth iteration y = compute_indicators(tags, tagset) z = compute_indicators(parse_tags, tagset) k += 1 step_size = 1.0/k update(y, z, u, step_size) return -1, tags, parse # does not converge
def get_sentences(dev_file):
    """Read one parse per line from ``dev_file`` and return the sentences.

    Each line is stripped of surrounding whitespace, converted with
    ``utils.make_parse_list`` and reduced to its terminals with
    ``utils.get_terminals_tags``; the tags are discarded.

    Blank lines are not skipped: they reach ``utils.make_parse_list`` as
    empty strings.

    Returns a list with one entry per line of the file, in file order.
    """
    sentences = []
    # The context manager closes the file even when a helper raises on a line.
    with open(dev_file, 'r') as dev:
        for line in dev:
            parse = utils.make_parse_list(line.strip())
            sentence, _tags = utils.get_terminals_tags(parse)
            sentences.append(sentence)
    return sentences
def extract_tagging_data(treebank, st_filename, wt_filename):
    """Collect the sentence and tag sequence of every parse in ``treebank``.

    The parses come from ``read_parses_no_indent``.  Each one is converted
    with ``utils.make_parse_list`` and split into terminals and tags with
    ``utils.get_terminals_tags``.  The collected lists are handed to
    ``write_tagging_data`` together with ``st_filename`` and ``wt_filename``.

    Returns a ``(sentences, tagseqs)`` pair of parallel lists, one entry per
    parse.
    """
    all_sentences = []
    all_tag_sequences = []

    for tree in read_parses_no_indent(treebank):
        words, true_tags = utils.get_terminals_tags(utils.make_parse_list(tree))
        all_sentences.append(words)
        all_tag_sequences.append(true_tags)

    write_tagging_data(all_sentences, all_tag_sequences, st_filename, wt_filename)
    return all_sentences, all_tag_sequences
def quick_execute(dev): print "loading learnt parameters..." pcfg_prob, nonterms, start = cky.get_pcfg() hmm, tagset = viterbi.get_hmm_tagset() print "reading dev data..." parses = utils.read_parses_no_indent(dev) i = 0 for parse in parses: if len(parse) > 100: parse_list = utils.make_parse_list(parse) sentence, truetags = utils.get_terminals_tags(parse_list) print '\n', sentence, '\n' #print dev_sentences.index(sentence) print "running dual decomposition..." num_iterations = dd_parser_tagger.run(sentence, pcfg_prob, nonterms, start, tagset, hmm) print "\n", truetags, " :true tags" if num_iterations != -1: print "converges in ", num_iterations ," iterations \n" else: print "does not converge :(\n"