# NOTE(review): this physical line is a whitespace-mangled fragment — the original
# newlines/indentation were lost, collapsing several statements onto one line:
#   1. the tail of a neighbor-tagging function (appends '*ST*'/'*EN*' sentinels and
#      returns `neighbor`; its `def` header is not visible in this chunk),
#   2. a module-level `ed = EditDistance(None)` singleton,
#   3. the start of an annotation-parsing script entry point that reads
#      '../web/annotation.txt', splits it into sentences on '===', and builds
#      Sentence/Graph/Node objects from bracketed `[...]` matches per line.
# The trailing `Node(id=len(g.nodes), s=w,` call is cut off mid-argument-list, so
# this chunk cannot be reformatted or executed as-is — TODO: recover the original
# line structure from version control before editing. Kept byte-identical below.
# Presumably `Sentence`, `Graph`, `Node`, and `EditDistance` are project classes
# defined elsewhere in this file — verify against the full source.
neighbor.append('*ST*') else: neighbor.append('*EN*') return neighbor ed = EditDistance(None) if __name__ == '__main__': # annotation_file = sys.argv[1] annotations = codecs.open('../web/annotation.txt', 'r', 'utf-8').read().strip() sentence_obj_list = [] for s_idx, sentence in enumerate(annotations.split('===')): if sentence.strip() != '': S = Sentence(s_idx, '', '', '') prev_matches = None for sent in sentence.split('\n'): if sent.strip() != '': action = sent.split(':')[1].strip() matches = re.findall(r'\[.*?\]', sent.strip()) matches = [m[1:-1] for m in matches] if action.strip() == '': S.graphs = [] nodes_in_visible_order = [] for m_idx, m in enumerate(matches): g = Graph(m_idx) S.graphs.append(g) for w_idx, w in enumerate(m.split()): n = Node(id=len(g.nodes), s=w,
# NOTE(review): this physical line is a second whitespace-mangled fragment from a
# different region of the file (not contiguous with the line above). It is the
# interior of a loop that parses word-alignment output: it begins MID-comprehension
# (the opening `output_meta = [` is missing from this chunk) and ends MID-call
# (`wa_no_null = insert_epsilon_edge(` has no closing paren here). Visible logic:
#   - builds `input_spans` as (start, end) int pairs parsed from 'a-b' strings,
#     and `wa_per_span` as lists of (i, j) alignment pairs per span,
#   - logs the input/output token sequences to stderr,
#   - constructs a Sentence(sent_idx, input_text, output_text, None) and tags it
#     with `initial_order_by = VIS_LANG`, then bumps `sent_idx`,
#   - asserts the three per-span lists are parallel before iterating
#     zip(output_spans, input_spans, wa_per_span) to slice out phrase pairs
#     (spans appear inclusive: `[span[0]:span[1] + 1]` — TODO confirm).
# Names `output_items`, `input_sent`, `output_sent`, `output_spans`, `sent_idx`,
# `Sentence`, `VIS_LANG`, and `insert_epsilon_edge` are all defined outside this
# chunk — verify against the full source before restructuring. Kept byte-identical.
tuple(om.split(',wa=')) for idx, om in enumerate(output_items) if idx % 2 != 0 ] input_spans = [ tuple([int(i) for i in om[0].split('-')]) for om in output_meta ] wa_per_span = [[ tuple([int(i) for i in a.split('-')]) for a in om[1].split() ] for om in output_meta] input_tok_group = [-1] * len(input_sent) output_tok_group = [-1] * len(output_sent) sys.stderr.write('input sent:' + ' '.join(input_sent) + '\n') sys.stderr.write('output sent:' + ' '.join(output_sent) + '\n') coe_sentence = Sentence(sent_idx, ' '.join(input_sent), ' '.join(output_sent), None) coe_sentence.initial_order_by = VIS_LANG sent_idx += 1 assert len(wa_per_span) == len(input_spans) == len(output_spans) phrase_dict = {} input_coverage = [0] * len(input_sent) group_idx = 0 for idx, (out_span, inp_span, wa) in enumerate(zip(output_spans, input_spans, wa_per_span)): out_phrase = output_sent[out_span[0]:out_span[1] + 1] inp_phrase = input_sent[inp_span[0]:inp_span[1] + 1] # print '\t phrases:', input_sent[inp_span[0]:inp_span[1] + 1], '-', output_sent[out_span[0]:out_span[1] + 1] # print '\t phrase spans:', inp_span, '-', out_span # print '\twa:', wa wa_no_null = insert_epsilon_edge(