Example #1
0
        # Tokenise the input line on whitespace and split the output line into
        # '|'-separated fields.
        # NOTE(review): the "phrase |s-e,wa=i-j ...|" layout looks like a
        # phrase-based decoder trace with word alignments -- confirm the producer.
        input_sent = input_line.strip().split()
        output_items = output_line.strip().split('|')
        # Even-indexed fields are kept as output phrases (empty ones dropped) ...
        output_phrases = [oi.strip() for idx, oi in enumerate(output_items) if idx % 2 == 0 and oi.strip() != '']
        # ... and re-joined/split to get the flat list of output tokens.
        output_sent = ' '.join(output_phrases).split()
        # presumably one (start, end) token span per output phrase, end inclusive
        # (the slices below add 1 to the end index) -- verify against the helper.
        output_spans = get_output_phrase_as_spans(output_phrases)
        # Odd-indexed fields are metadata, split on ',wa=' into two parts:
        #   om[0] -> 'start-end' span, parsed into a tuple of ints (input_spans)
        #   om[1] -> space-separated 'i-j' pairs, each parsed into a tuple of ints
        output_meta = [tuple(om.split(',wa=')) for idx, om in enumerate(output_items) if idx % 2 != 0]
        input_spans = [tuple([int(i) for i in om[0].split('-')]) for om in output_meta]
        wa_per_span = [[tuple([int(i) for i in a.split('-')]) for a in om[1].split()] for om in output_meta]
        # One slot per token on each side, initialised to -1
        # (presumably "not yet assigned to a group" -- TODO confirm).
        input_tok_group = [-1] * len(input_sent)
        output_tok_group = [-1] * len(output_sent)

        # Echo both tokenised sentences to stderr.
        sys.stderr.write('input sent:' + ' '.join(input_sent) + '\n')
        sys.stderr.write('output sent:' + ' '.join(output_sent) + '\n')

        # Build the Sentence record for this pair from the running index and the
        # two space-joined sentences (fourth argument is None), then advance the
        # running index.
        coe_sentence = Sentence(sent_idx, ' '.join(input_sent), ' '.join(output_sent), None)
        coe_sentence.initial_order_by = VIS_LANG
        sent_idx += 1
        # The three per-phrase lists are zipped below, so they must be the same
        # length.
        assert len(wa_per_span) == len(input_spans) == len(output_spans)
        phrase_dict = {}
        # One counter per input token, starting at 0.
        input_coverage = [0] * len(input_sent)
        group_idx = 0
        # Walk the phrase pairs in lockstep: output span, input span, and that
        # pair's list of alignment tuples.
        for idx, (out_span, inp_span, wa) in enumerate(zip(output_spans, input_spans, wa_per_span)):
            # Spans are used as inclusive [start, end] token ranges.
            out_phrase = output_sent[out_span[0]:out_span[1] + 1]
            inp_phrase = input_sent[inp_span[0]:inp_span[1] + 1]
            # print '\t phrases:', input_sent[inp_span[0]:inp_span[1] + 1], '-', output_sent[out_span[0]:out_span[1] + 1]
            # print '\t phrase spans:', inp_span, '-', out_span
            # print '\twa:', wa
            # The helper receives the alignment list plus the same input/output
            # token slices as inp_phrase / out_phrase above.
            # NOTE(review): judging by the names, it adds explicit epsilon (null)
            # edges for unaligned tokens -- confirm against its definition.
            wa_no_null = insert_epsilon_edge(wa, input_sent[inp_span[0]:inp_span[1] + 1],
                                             output_sent[out_span[0]:out_span[1] + 1])
            # make_symmetric returns a pair; the first element must be 0 here.
            # NOTE(review): the meaning of sym_coverage is defined by the helper
            # -- confirm what a non-zero value would indicate.
            sym_coverage, sym_wa = make_symmetric(wa_no_null)
            assert sym_coverage == 0
Example #2
0
        ]
        input_spans = [
            tuple([int(i) for i in om[0].split('-')]) for om in output_meta
        ]
        wa_per_span = [[
            tuple([int(i) for i in a.split('-')]) for a in om[1].split()
        ] for om in output_meta]
        input_tok_group = [-1] * len(input_sent)
        output_tok_group = [-1] * len(output_sent)

        sys.stderr.write('input sent:' + ' '.join(input_sent) + '\n')
        sys.stderr.write('output sent:' + ' '.join(output_sent) + '\n')

        coe_sentence = Sentence(sent_idx, ' '.join(input_sent),
                                ' '.join(output_sent), None)
        coe_sentence.initial_order_by = VIS_LANG
        sent_idx += 1
        assert len(wa_per_span) == len(input_spans) == len(output_spans)
        phrase_dict = {}
        input_coverage = [0] * len(input_sent)
        group_idx = 0
        for idx, (out_span, inp_span,
                  wa) in enumerate(zip(output_spans, input_spans,
                                       wa_per_span)):
            out_phrase = output_sent[out_span[0]:out_span[1] + 1]
            inp_phrase = input_sent[inp_span[0]:inp_span[1] + 1]
            # print '\t phrases:', input_sent[inp_span[0]:inp_span[1] + 1], '-', output_sent[out_span[0]:out_span[1] + 1]
            # print '\t phrase spans:', inp_span, '-', out_span
            # print '\twa:', wa
            wa_no_null = insert_epsilon_edge(
                wa, input_sent[inp_span[0]:inp_span[1] + 1],