class InsideOutsideTest(TestCase):
    """Exercise inside/outside and best-path routines on a small forest.

    The fixture is a packed forest over the sentence
    "John saw a girl with a telescope".  The span [1,7] has two competing
    VP derivations, one attaching the PP to the noun phrase and one to the
    verb phrase.  The tests make no assertions; they run the algorithms and
    write the results through ``logger``.
    """

    def setUp(self):
        # Leaf nodes, one per word, created left to right.
        (john, saw, a_first, girl,
         with_, a_second, telescope) = [
            ForestNode(token)
            for token in ('John', 'saw', 'a', 'girl',
                          'with', 'a', 'telescope')
        ]

        # Pre-terminal nodes, named <label>_<start>_<end>.
        nn_0_1 = ForestNode('NN')
        vb_1_2 = ForestNode('VB')
        nn_1_2 = ForestNode('NN')
        dt_2_3 = ForestNode('DT')
        nn_3_4 = ForestNode('NN')
        prep_4_5 = ForestNode('IN')
        dt_5_6 = ForestNode('DT')
        nn_6_7 = ForestNode('NN')

        # Phrasal nodes and the goal node.
        np_2_4 = ForestNode('NP')
        np_5_7 = ForestNode('NP')
        vp_1_4 = ForestNode('VP')
        pp_4_7 = ForestNode('PP')
        np_2_7 = ForestNode('NP')
        vp_1_7 = ForestNode('VP')
        goal = ForestNode('S')

        # (head, tails, probability) for every hyperedge, in creation order.
        rules = (
            (nn_0_1, (john,), 0.02),              # [NN,0,1] -> John
            (vb_1_2, (saw,), 0.01),               # [VB,1,2] -> saw
            (nn_1_2, (saw,), 0.01),               # [NN,1,2] -> saw
            (dt_2_3, (a_first,), 0.5),            # [DT,2,3] -> a
            (nn_3_4, (girl,), 0.05),              # [NN,3,4] -> girl
            (prep_4_5, (with_,), 0.25),           # [IN,4,5] -> with
            (dt_5_6, (a_second,), 0.5),           # [DT,5,6] -> a
            (nn_6_7, (telescope,), 0.001),        # [NN,6,7] -> telescope
            (np_2_4, (dt_2_3, nn_3_4), 0.7),      # [NP,2,4] -> [DT,2,3] [NN,3,4]
            (np_5_7, (dt_5_6, nn_6_7), 0.7),      # [NP,5,7] -> [DT,5,6] [NN,6,7]
            (vp_1_4, (vb_1_2, np_2_4), 0.9),      # [VP,1,4] -> [VB,1,2] [NP,2,4]
            (pp_4_7, (prep_4_5, np_5_7), 1.0),    # [PP,4,7] -> [IN,4,5] [NP,5,7]
            (np_2_7, (np_2_4, pp_4_7), 0.3),      # [NP,2,7] -> [NP,2,4] [PP,4,7]
            (vp_1_7, (vb_1_2, np_2_7), 0.5),      # [VP,1,7] -> [VB,1,2] [NP,2,7]
            (vp_1_7, (vp_1_4, pp_4_7), 0.5),      # [VP,1,7] -> [VP,1,4] [PP,4,7]
            (goal, (nn_0_1, vp_1_7), 0.9),        # [S,0,7] -> [NN,0,1] [VP,1,7]
        )
        for head, tails, probability in rules:
            edge = ForestEdge()
            for tail in tails:
                edge.add_tail(tail)
            edge.prob = probability
            head.add_incoming(edge)

        self.hp = Hypergraph(goal)

    def _configure(self, semiring, edge_weight):
        """Select the semiring and install the weight functions on the forest."""
        self.hp.set_semiring(semiring)
        self.hp.set_functions(edge_weight, lambda x: 1, None)

    def _run_inside_outside(self, semiring, edge_weight, expectations=False):
        """Run inside/outside (optionally the *_exp passes too) and log dot()."""
        self._configure(semiring, edge_weight)
        self.hp.inside()
        self.hp.outside()
        if expectations:
            self.hp.inside_exp()
            self.hp.outside_exp()
        logger.writeln(self.hp.dot())

    def test_inside_outside(self):
        self._run_inside_outside(INSIDE, lambda x: x.prob)

    def test_inside_exp_outside_exp(self):
        self._run_inside_outside(INSIDE, lambda x: x.prob, expectations=True)

    def test_inside_outside_log(self):
        self._run_inside_outside(LOGPROB, lambda x: log(x.prob))

    def test_inside_exp_outside_exp_log(self):
        self._run_inside_outside(LOGPROB, lambda x: log(x.prob),
                                 expectations=True)

    def test_best_paths(self):
        self._configure(INSIDE, lambda x: x.prob)
        self.hp.assert_done('topo_sort')
        # best_paths() is queried afresh for every logged value.
        for rank in (0, 1):
            logger.writeln(self.hp.root.best_paths()[rank].tree_str())
            logger.writeln(self.hp.root.best_paths()[rank].weight)
def run(self):
    """Decode ``self.line`` and write the log, output and optional k-best.

    Side effects visible in this method: opens (and always closes) the
    per-job log file ``self.flog``; sets ``self.time`` and ``self.out``;
    on success also sets ``self.kbest`` and ``self.hg``; when
    ``FLAGS.preprocess`` is on, sets ``self.fidx2replacement``.

    Raises:
        AssertionError: if ``FLAGS.decoding_method`` is not one of
            'agenda', 'cyk' or 'earley'.
    """
    # update per-sentence grammars, if there's any
    for g in self.grammars:
        g.update(self.id)

    self.flog = open('%s/%s_%s' % (FLAGS.run_dir, 'log', self.suffix), 'w')
    # Everything below runs under try/finally so the log file is closed
    # even when decoding or output writing raises.
    try:
        if FLAGS.show_time:
            self.flog.write('running on %s\n\n' % socket.gethostname())
            self.flog.flush()

        fwords = self.line.split()
        if FLAGS.preprocess:
            # Map the source position of each placeholder token to the
            # corresponding entry of self.special, in order of appearance.
            self.fidx2replacement = {}
            j = 0
            for i, token in enumerate(fwords):
                if token in ('$number', '$date'):
                    self.fidx2replacement[i] = self.special[j][1]
                    j += 1

        self.flog.write('[%s][%s words] %s\n' %
                        (self.id, len(fwords), self.line))

        decoder = Decoder(fwords, self.grammars, self.features)

        begin_time = time()
        if FLAGS.decoding_method == 'agenda':
            item = decoder.decode()
        elif FLAGS.decoding_method == 'cyk':
            item = decoder.decode_cyk()
        elif FLAGS.decoding_method == 'earley':
            item = decoder.decode_earley()
        else:
            # Raised explicitly (not via ``assert``) so the check survives
            # ``python -O``; the exception type is unchanged.
            raise AssertionError('"%s" not valid decoding option'
                                 % FLAGS.decoding_method)
        self.time = time() - begin_time

        if item is None:
            self.out = '[decoder failed to build a goal item]'
        else:
            hg = Hypergraph(item)
            hg.set_semiring(hypergraph.SHORTEST_PATH)
            hg.set_functions(lambda x: x.cost, None, None)
            hg.topo_sort()
            self.kbest = hg.root.best_paths()
            best = self.kbest[0]
            output_tokens = best.translation[:]

            if FLAGS.preprocess:
                # Put the original source-side values back in place of
                # placeholder tokens that are aligned to a source position.
                for i, token in enumerate(output_tokens):
                    if token in ('$number', '$date'):
                        fidx = best.composed_rule.we2f[i]
                        if fidx is not None:
                            output_tokens[i] = self.fidx2replacement[fidx]

            # Drop lm_order - 1 tokens from each end.
            # NOTE(review): presumably these are LM boundary symbols - confirm.
            # With lm_order == 1 the original slice was [0:0], which always
            # produced an empty output; only slice when there is something
            # to strip.
            pad = FLAGS.lm_order - 1
            if pad > 0:
                output_tokens = output_tokens[pad:-pad]
            self.out = ' '.join(output_tokens)
            self.hg = hg
            if FLAGS.output_hypergraph:
                self.write_hypergraph()

        self.flog.write('%s\n' % self.out)
        self.flog.write('\n')
        if item is not None:
            self.flog.write(self.kbest[0].tree_str())
            self.flog.write('\n')
            self.flog.write(hg.stats())
            self.flog.write('\n')
        self.flog.write(decoder.agenda_stats())
        self.flog.write('\n')
        self.flog.write(decoder.chart.stats())
        self.flog.write('\n')
        for dotchart in decoder.dotcharts:
            self.flog.write(dotchart.stats())
            self.flog.write('\n')

        if FLAGS.show_time:
            timeline = '{:<35}{:>15.2f}\n'.format('[time]:', self.time)
            self.flog.write(timeline)
        self.write_output_file()
        if FLAGS.output_kbest:
            self.write_kbest_to_file()
    finally:
        self.flog.close()
def run(self):
    """Decode ``self.line`` into an AMR string and write log/output files.

    Before decoding, a sentence-local grammar is built from
    ``self.oov_idx`` (one rule per listed word position) and
    ``self.ner_items`` (one rule per entry) and passed to the decoder.

    Side effects visible in this method: opens (and always closes) the
    per-job log file ``self.flog``; sets ``self.time`` and ``self.out``;
    on success also sets ``self.kbest`` and ``self.hg``; when
    ``FLAGS.preprocess`` is on, sets ``self.fidx2replacement``.

    Raises:
        AssertionError: if ``FLAGS.decoding_method`` is not one of
            'agenda', 'cyk' or 'earley'.
    """
    # update per-sentence grammars, if there's any
    for g in self.grammars:
        g.update(self.id)

    self.flog = open('%s/%s_%s' % (FLAGS.run_dir, 'log', self.suffix), 'w')
    # Everything below runs under try/finally so the log file is closed
    # even when grammar building, decoding or output writing raises.
    try:
        if FLAGS.show_time:
            self.flog.write('running on %s\n\n' % socket.gethostname())
            self.flog.flush()

        fwords = self.line.strip().split()

        # added by freesunshine, build the local grammar for oov words for
        # each sentence
        rules = []
        if self.oov_idx is not None:
            oov_weight = 0.0001
            for idx in self.oov_idx:
                rule_str = "[A0-0] ||| %s ||| %s ||| %lf %lf %lf" % (
                    fwords[idx], ".", oov_weight, oov_weight, oov_weight)
                rr = Rule()
                rr.fromstr(rule_str)
                rules.append(rr)

        if self.ner_items is not None:
            concept_weight = 10.0
            for ner_item in self.ner_items:
                # Only element [1] (the rule text) is used; element [0],
                # the span, is not needed to build the rule.
                ew = ner_item[1]
                # The converted value is not used; the conversion is kept
                # because it raises when the third character of the rule
                # text is not a digit.
                # NOTE(review): confirm whether this check is still wanted.
                int(ew[2])
                # The weight used to be written by two branches on the
                # source phrase, but both produced exactly this string.
                rule_str = "%s ||| %lf %lf %lf" % (
                    ew, concept_weight, concept_weight, concept_weight)
                rr = Rule()
                rr.fromstr(rule_str)
                rules.append(rr)

        local_gr = None
        if len(rules) > 0:
            local_gr = Grammar(FLAGS.rule_bin_size)
            local_gr.build(rules, self.grammars[0].features)

        if FLAGS.preprocess:
            # Map the source position of each placeholder token to the
            # corresponding entry of self.special, in order of appearance.
            self.fidx2replacement = {}
            j = 0
            for i, token in enumerate(fwords):
                if token in ('$number', '$date'):
                    self.fidx2replacement[i] = self.special[j][1]
                    j += 1

        self.flog.write('[%s][%s words] %s\n' %
                        (self.id, len(fwords), self.line))

        decoder = Decoder(fwords, self.grammars, self.features, local_gr)

        begin_time = time()
        if FLAGS.decoding_method == 'agenda':
            item = decoder.decode()
        elif FLAGS.decoding_method == 'cyk':
            item = decoder.decode_cyk()
        elif FLAGS.decoding_method == 'earley':
            item = decoder.decode_earley()
        else:
            # Raised explicitly (not via ``assert``) so the check survives
            # ``python -O``; the exception type is unchanged.
            raise AssertionError('"%s" not valid decoding option'
                                 % FLAGS.decoding_method)
        self.time = time() - begin_time

        if item is None:
            self.out = '[decoder failed to build a goal item]'
        else:
            # The decoder result is a pair; only its first element (the
            # goal item) is used here.
            goal_item, _succ = item
            hg = Hypergraph(goal_item)
            hg.set_semiring(hypergraph.SHORTEST_PATH)
            hg.set_functions(lambda x: x.cost, None, None)
            hg.topo_sort()
            self.kbest = hg.root.best_paths()

            self.flog.write('Decuction Tree:\n%s\n' % self.kbest[0].tree_str())

            # Take the first element of to_amr_format() and flatten it to
            # a single line by stripping each line and concatenating.
            amr_text = self.kbest[0].translation.to_amr_format()[0]
            self.out = "".join(x.strip() for x in amr_text.split('\n'))

            self.hg = hg
            if FLAGS.output_hypergraph:
                self.write_hypergraph()

        self.flog.write('%s\n' % self.out)
        self.flog.write('\n')
        self.flog.write(decoder.agenda_stats())
        self.flog.write('\n')
        self.flog.write(decoder.chart.stats())
        self.flog.write('\n')
        for dotchart in decoder.dotcharts:
            self.flog.write(dotchart.stats())
            self.flog.write('\n')

        if FLAGS.show_time:
            timeline = '{:<35}{:>15.2f}\n'.format('[time]:', self.time)
            self.flog.write(timeline)
        self.write_output_file()
        if FLAGS.output_kbest:
            self.write_kbest_to_file()
    finally:
        self.flog.close()