def run(sentence, tagset, preterms, start, hmm_prob, pcfg, pcfg_u): max_iterations = 20 step_size = 1.0 n = len(sentence) u = defaultdict() # dual decomposition parameter init_dd_param(u, n, tagset) k = 0 # number of iterations while k < max_iterations: tags = viterbi.run(sentence, tagset, hmm_prob, u) parse = fast_cky.run(sentence, preterms, start, pcfg, pcfg_u, u) parse_list = utils.make_parse_list(parse) terms, parse_tags = utils.get_terminals_tags(parse_list) print k print tags, "tagger" print parse_tags, "parser" if agree(parse_tags, tags): return k, tags, parse # converges in the kth iteration y = compute_indicators(tags, tagset) z = compute_indicators(parse_tags, tagset) k += 1 step_size = 1.0/k update(y, z, u, step_size) return -1, tags, parse # does not converge
def get_sentences(dev_file):
    """Read one parse per line from dev_file; return the word lists.

    Each line is treated as a bracketed parse string; its terminals
    (the words) are extracted and collected, one list per sentence.
    """
    sentences = []
    # 'with' guarantees the handle is closed; the original leaked it.
    # Iterating the file replaces the manual while/readline/break loop
    # (readline() returns '' only at EOF, so the traversal is identical).
    with open(dev_file, 'r') as dev:
        for line in dev:
            line = line.strip()
            parse = utils.make_parse_list(line)
            sentence, tags = utils.get_terminals_tags(parse)
            sentences.append(sentence)
    return sentences
def extract_tagging_data(treebank, st_filename, wt_filename):
    """Extract (sentence, tag-sequence) pairs from a treebank.

    Reads every parse in `treebank`, pulls out the terminals and their
    POS tags, writes both to the given output files, and returns the
    two parallel lists.
    """
    sentences, tagseqs = [], []
    for raw_parse in read_parses_no_indent(treebank):
        parse_list = utils.make_parse_list(raw_parse)
        words, gold_tags = utils.get_terminals_tags(parse_list)
        sentences.append(words)
        tagseqs.append(gold_tags)
    write_tagging_data(sentences, tagseqs, st_filename, wt_filename)
    return sentences, tagseqs
def learn(parses):
    """Estimate HMM parameters from a list of parse strings.

    Accumulates emission, transition and tag counts over every parse,
    normalizes them into probability distributions, sanity-checks the
    distributions, and persists them to disk.

    Returns (emission_counts, transition_counts).
    """
    # Bare defaultdict() (no factory) behaves exactly like a plain dict;
    # kept as-is since the unseen count helpers may expect this type.
    emission_counts = defaultdict()
    transition_counts = defaultdict()
    tag_counts = defaultdict()
    for parse in parses:
        update_counts(utils.make_parse_list(parse),
                      emission_counts, transition_counts, tag_counts)
    # Smoothing is deliberately disabled: with smoothing every -RARE- word
    # gets tagged FW, and FW->-RARE- / FW->FW probabilities then dominate,
    # dragging whole sequences to FW.
    # emission_counts = smooth_emission(emission_counts, tag_counts)
    set_hmm_params(emission_counts, transition_counts, tag_counts)
    check_if_prob_dist(emission_counts)
    check_if_prob_dist(transition_counts)
    write_hmm_params(emission_counts, transition_counts, tag_counts)
    #write_for_java(emission_counts, transition_counts, tag_counts)
    return emission_counts, transition_counts
def execute(treebank, dev): print "reading treebank..." parses = utils.read_parses_no_indent(treebank) parse_lists = [] for parse in parses: parse_lists.append(utils.make_parse_list(parse)) print "learning pcfg..." nonterms, terms, start, prob = grammar.learn(parse_lists) print "learning hmm..." emission, transition = sequnece_labeler.learn(parse_lists) print "reading dev data..." dev_sentences = utils.get_sentences(dev) print dev_sentences[100] for sentence in dev_sentences: parse = cky.run(sentence, nonterms, start, prob) sequnece = viterbi.run(sentence, emission, transition)
def quick_execute(dev): print "loading learnt parameters..." pcfg_prob, nonterms, start = cky.get_pcfg() hmm, tagset = viterbi.get_hmm_tagset() print "reading dev data..." parses = utils.read_parses_no_indent(dev) i = 0 for parse in parses: if len(parse) > 100: parse_list = utils.make_parse_list(parse) sentence, truetags = utils.get_terminals_tags(parse_list) print '\n', sentence, '\n' #print dev_sentences.index(sentence) print "running dual decomposition..." num_iterations = dd_parser_tagger.run(sentence, pcfg_prob, nonterms, start, tagset, hmm) print "\n", truetags, " :true tags" if num_iterations != -1: print "converges in ", num_iterations ," iterations \n" else: print "does not converge :(\n"
# NOTE(review): the statements below are the tail of a function whose `def`
# line lies above this chunk; it is called as `learn(parse_lists)` in the
# __main__ block and returns (nonterms, terms, start, prob), so it is the
# PCFG grammar learner.  `rule_counts` and `parent_counts` are referenced
# before any visible assignment -- presumably initialized in the unseen part
# of the function (the inline "python globals???" comment suggests the
# author was unsure too).  Indentation reconstructed; verify against the
# original file.
    terms = set([])
    nonterms = set([])
    start = '**'  # synthetic start symbol for the grammar
    for parse_list in parse_lists:
        # Progress indicator every 1000 parses.  NOTE(review): list.index()
        # is an O(n) scan per iteration -- correct, but slow on big corpora.
        if parse_lists.index(parse_list) % 1000 == 0:
            print parse_lists.index(parse_list), '...'
        tree, terms, nonterms = extract_cfg(parse_list, terms, nonterms, start)
        cnf_tree = cnf(tree, start, nonterms)  # convert to Chomsky normal form
        rule_counts, parent_counts = update_counts(cnf_tree, start, rule_counts, parent_counts) # python globals???
    prob = write_pcfg_probabilities(rule_counts, parent_counts)
    write_terms_nonterms(terms, nonterms, start)
    #sequence_labeler.update_counts(parse_list, emission_counts, transition_counts)
    return nonterms, terms, start, prob

# Script entry point: learn a PCFG from the treebank named on the command line.
if __name__ == "__main__":
    treebank = sys.argv[1]
    print "reading treebank..."
    parses = utils.read_parses_no_indent(treebank)
    parse_lists = []
    for parse in parses:
        parse_lists.append(utils.make_parse_list(parse))
    print "learning pcfg..."
    nonterms, terms, start, prob = learn(parse_lists)