Esempio n. 1
0
def write_parsed(input_dir, output_dir, lps):
    """Concatenate parsed system outputs and references into two files.

    Creates ``tgt.parse`` and ``ref.parse`` under ``output_dir`` (each path
    is passed through ``usr`` before being opened). Every entry of
    ``<input_dir>/references`` is treated as a dataset, and every entry of
    ``<input_dir>/system-outputs/<dataset>`` that is contained in ``lps`` as
    a language pair. Entries are visited in sorted order and names starting
    with ``.`` are skipped.

    For each system file, its lines are appended to ``tgt.parse`` and the
    reference lines of the current language pair are appended to
    ``ref.parse``; the reference block is therefore repeated once per system
    file. Lines starting with ``'Sentence #'`` are rewritten through
    ``wmt.substitute_line_number`` using a counter that keeps running across
    all files (one counter per output file); all other lines are copied
    unchanged.

    :param input_dir: directory holding ``references`` and ``system-outputs``.
    :param output_dir: directory receiving ``tgt.parse`` and ``ref.parse``.
    :param lps: container of language-pair names to include (tested with
        ``in``).
    """

    def _renumber_into(sink, lines, count):
        # Copy `lines` to `sink`, renumbering each 'Sentence #' header with
        # the running `count`; hand the updated count back to the caller.
        for text in lines:
            if text.startswith('Sentence #'):
                count += 1
                text = wmt.substitute_line_number(text, count)
            sink.write(text)
        return count

    print("Copying parsed files to " + output_dir + ' ...')

    tgt_path = usr(output_dir + '/tgt.parse')
    ref_path = usr(output_dir + '/ref.parse')
    systems_root = '/'.join((input_dir, 'system-outputs'))

    with codecs.open(tgt_path, 'w', 'utf8') as tgt_out, \
            codecs.open(ref_path, 'w', 'utf8') as ref_out:

        n_tgt = 0
        n_ref = 0

        for dataset in sorted(os.listdir('/'.join((input_dir, 'references')))):
            if dataset.startswith('.'):
                continue
            dataset_dir = '/'.join((systems_root, dataset))

            for lp in sorted(os.listdir(dataset_dir)):
                if lp.startswith('.') or lp not in lps:
                    continue
                lp_dir = '/'.join((dataset_dir, lp))

                # The parsed reference is stored next to the plain one, with
                # an extra '.out' suffix.
                parsed_ref = wmt.reference_path(input_dir, dataset, lp) + '.out'
                with codecs.open(parsed_ref, 'r', 'utf8') as ref_in:
                    ref_lines = ref_in.readlines()

                for sys_file_name in sorted(os.listdir(lp_dir)):
                    if sys_file_name.startswith('.'):
                        continue

                    sys_path = '/'.join((lp_dir, sys_file_name))
                    with codecs.open(sys_path, 'r', 'utf8') as sys_in:
                        sys_lines = sys_in.readlines()

                    n_tgt = _renumber_into(tgt_out, sys_lines, n_tgt)
                    # The reference is written again for every system file.
                    n_ref = _renumber_into(ref_out, ref_lines, n_ref)
Esempio n. 2
0
    def write_dataset(self, parsed=False, verbose=False):
        """Concatenate all system outputs and references into two files.

        Writes ``tgt.txt`` and ``ref.txt`` into the directory configured as
        ``working_dir`` in the ``Data`` section of ``self.cfg`` (each path is
        passed through ``usr`` before being opened). For every dataset in
        ``self.datasets``, every language pair (sorted) and every system
        listed in ``dataset.system_names[lp]``, the system file is appended
        to ``tgt.txt`` and the reference lines of that language pair are
        appended to ``ref.txt``; the reference block is therefore repeated
        once per system.

        :param parsed: when true, lines starting with ``'Sentence #'`` are
            rewritten through ``wmt.substitute_line_number`` with a running
            sentence counter (one counter per output file). The counters
            advance only on those header lines, so headers are numbered
            consecutively rather than by overall line position.
        :param verbose: when true, every system line that is not rewritten
            as a sentence header is prefixed with tab-separated dataset
            name, language pair, system name and its 1-based line number
            within the system file.
        """
        working_dir = self.cfg.get('Data', 'working_dir')

        print("Copying dataset to " + working_dir + ' ...')

        path_tgt = usr(working_dir + '/' + 'tgt.txt')
        path_ref = usr(working_dir + '/' + 'ref.txt')

        # Running sentence-header counts; only consulted in parsed mode.
        counter_tgt = 0
        counter_ref = 0

        with codecs.open(path_tgt, 'w', 'utf8') as output_tgt, \
                codecs.open(path_ref, 'w', 'utf8') as output_ref:

            for dataset in self.datasets:
                for lp in sorted(dataset.system_names.keys()):

                    with codecs.open(wmt.reference_path(self.dir, dataset.name, lp), 'r', 'utf8') as input_ref:
                        ref_lines = input_ref.readlines()

                    for sys_name in dataset.system_names[lp]:
                        with codecs.open(wmt.system_path(self.dir, dataset.name, lp, sys_name), 'r', 'utf8') as input_sys:
                            # counter_sys is the 1-based line number inside
                            # this system file (reported in verbose mode).
                            for counter_sys, line in enumerate(input_sys.readlines(), 1):
                                if parsed and line.startswith('Sentence #'):
                                    # Count sentence headers, not raw lines.
                                    counter_tgt += 1
                                    output_tgt.write(wmt.substitute_line_number(line, counter_tgt))
                                elif verbose:
                                    output_tgt.write('{}\t{}\t{}\t{}\t{}'.format(dataset.name,
                                                                                 lp,
                                                                                 sys_name,
                                                                                 counter_sys,
                                                                                 line))
                                else:
                                    output_tgt.write(line)

                        # Repeat the reference once per system.
                        for line in ref_lines:
                            if parsed and line.startswith('Sentence #'):
                                counter_ref += 1
                                output_ref.write(wmt.substitute_line_number(line, counter_ref))
                            else:
                                output_ref.write(line)