# Turn one (input_line, output_line) pair into tokens, phrase spans and
# per-phrase word alignments, then walk the phrases one by one.
#
# output_line is cut on '|': even-numbered pieces hold phrase text and
# odd-numbered pieces hold metadata shaped like
# "<start>-<end>,wa=<i>-<j> <i>-<j> ...".
# NOTE(review): this looks like decoder trace output -- confirm with the caller.
input_sent = input_line.strip().split()
output_items = output_line.strip().split('|')

# Even positions: phrase text. Pieces that are blank once stripped are dropped.
stripped_pieces = [piece.strip() for piece in output_items[0::2]]
output_phrases = [phrase for phrase in stripped_pieces if phrase != '']
output_sent = ' '.join(output_phrases).split()
output_spans = get_output_phrase_as_spans(output_phrases)

# Odd positions: metadata, split into (input span text, alignment text).
output_meta = [tuple(meta.split(',wa=')) for meta in output_items[1::2]]
input_spans = [tuple(map(int, om[0].split('-'))) for om in output_meta]
wa_per_span = [
    [tuple(map(int, link.split('-'))) for link in om[1].split()]
    for om in output_meta
]

# Every token starts out with group -1.
input_tok_group = [-1] * len(input_sent)
output_tok_group = [-1] * len(output_sent)

# Echo both whitespace-normalised sentences on stderr.
input_text = ' '.join(input_sent)
output_text = ' '.join(output_sent)
sys.stderr.write('input sent:' + input_text + '\n')
sys.stderr.write('output sent:' + output_text + '\n')

coe_sentence = Sentence(sent_idx, input_text, output_text, None)
coe_sentence.initial_order_by = VIS_LANG
sent_idx += 1

# One alignment list and one input span are required per output phrase span.
assert (len(wa_per_span) == len(input_spans)
        and len(input_spans) == len(output_spans))

phrase_dict = {}
input_coverage = [0] * len(input_sent)
group_idx = 0
for idx, (out_span, inp_span, wa) in enumerate(
        zip(output_spans, input_spans, wa_per_span)):
    # Span ends are inclusive, hence the "+ 1" on the upper bound.
    out_window = slice(out_span[0], out_span[1] + 1)
    inp_window = slice(inp_span[0], inp_span[1] + 1)
    out_phrase = output_sent[out_window]
    inp_phrase = input_sent[inp_window]
    # Debug aid: dump the phrase pair, both spans and the raw alignment `wa`
    # here when tracing a single sentence.
    # The helper is handed freshly sliced lists, not out_phrase/inp_phrase.
    wa_no_null = insert_epsilon_edge(
        wa, input_sent[inp_window], output_sent[out_window])
    sym_coverage, sym_wa = make_symmetric(wa_no_null)
    assert sym_coverage == 0
] input_spans = [ tuple([int(i) for i in om[0].split('-')]) for om in output_meta ] wa_per_span = [[ tuple([int(i) for i in a.split('-')]) for a in om[1].split() ] for om in output_meta] input_tok_group = [-1] * len(input_sent) output_tok_group = [-1] * len(output_sent) sys.stderr.write('input sent:' + ' '.join(input_sent) + '\n') sys.stderr.write('output sent:' + ' '.join(output_sent) + '\n') coe_sentence = Sentence(sent_idx, ' '.join(input_sent), ' '.join(output_sent), None) coe_sentence.initial_order_by = VIS_LANG sent_idx += 1 assert len(wa_per_span) == len(input_spans) == len(output_spans) phrase_dict = {} input_coverage = [0] * len(input_sent) group_idx = 0 for idx, (out_span, inp_span, wa) in enumerate(zip(output_spans, input_spans, wa_per_span)): out_phrase = output_sent[out_span[0]:out_span[1] + 1] inp_phrase = input_sent[inp_span[0]:inp_span[1] + 1] # print '\t phrases:', input_sent[inp_span[0]:inp_span[1] + 1], '-', output_sent[out_span[0]:out_span[1] + 1] # print '\t phrase spans:', inp_span, '-', out_span # print '\twa:', wa wa_no_null = insert_epsilon_edge( wa, input_sent[inp_span[0]:inp_span[1] + 1],