def write_parsed(input_dir, output_dir, lps):
    """Merge per-system parse files and their references into two files.

    Walks ``<input_dir>/references`` for dataset names and
    ``<input_dir>/system-outputs/<dataset>/<lp>/`` for system files, and
    writes ``tgt.parse`` and ``ref.parse`` under ``output_dir``.  Entries
    whose name starts with ``'.'`` are skipped, as are language pairs not
    contained in ``lps``.  Directory listings are processed in sorted order.

    Every line starting with ``'Sentence #'`` is passed through
    ``wmt.substitute_line_number`` together with a running count kept
    separately for each output file; all other lines are copied verbatim.

    :param input_dir: root directory holding ``references`` and
        ``system-outputs``.
    :param output_dir: directory that receives ``tgt.parse`` / ``ref.parse``.
    :param lps: container of language-pair names to include.
    """

    def copy_renumbered(lines, sink, sentence_no):
        # Copy ``lines`` into ``sink``, renumbering 'Sentence #' lines.
        # Returns the updated running count so the caller can carry it on.
        for text in lines:
            if text.startswith('Sentence #'):
                sentence_no += 1
                text = wmt.substitute_line_number(text, sentence_no)
            sink.write(text)
        return sentence_no

    def visible_entries(directory):
        # Sorted directory listing without dot-prefixed entries.
        return [name for name in sorted(os.listdir(directory))
                if not name.startswith('.')]

    print("Copying parsed files to " + output_dir + ' ...')

    tgt_path = usr(output_dir + '/' + 'tgt' + '.parse')
    ref_path = usr(output_dir + '/' + 'ref' + '.parse')

    with codecs.open(tgt_path, 'w', 'utf8') as output_tgt, \
            codecs.open(ref_path, 'w', 'utf8') as output_ref:
        sentences_tgt = 0
        sentences_ref = 0
        systems_root = input_dir + '/' + 'system-outputs'

        for dataset in visible_entries(input_dir + '/' + 'references'):
            dataset_dir = systems_root + '/' + dataset

            for lp in visible_entries(dataset_dir):
                if lp not in lps:
                    continue

                # The parsed reference lives next to the plain one, with
                # an extra '.out' suffix.
                ref_file = wmt.reference_path(input_dir, dataset, lp) + '.out'
                with codecs.open(ref_file, 'r', 'utf8') as input_ref:
                    ref_lines = input_ref.readlines()

                lp_dir = dataset_dir + '/' + lp
                for sys_file_name in visible_entries(lp_dir):
                    sys_file = lp_dir + '/' + sys_file_name
                    with codecs.open(sys_file, 'r', 'utf8') as input_sys:
                        sentences_tgt = copy_renumbered(
                            input_sys.readlines(), output_tgt, sentences_tgt)

                    # The reference is written once per system file
                    # (presumably to keep both outputs parallel -- confirm).
                    sentences_ref = copy_renumbered(
                        ref_lines, output_ref, sentences_ref)
def write_dataset(self, parsed=False, verbose=False):
    """Concatenate all system outputs and references into two files.

    Writes ``tgt.txt`` and ``ref.txt`` into the directory configured as
    ``working_dir`` in the ``Data`` section of ``self.cfg``.  For every
    dataset in ``self.datasets`` and every language pair (sorted), each
    system file is appended to ``tgt.txt`` and the matching reference
    lines are appended to ``ref.txt`` once per system.

    :param parsed: when true, lines starting with ``'Sentence #'`` are
        passed through ``wmt.substitute_line_number`` with the running
        line count of the respective output file.
    :param verbose: when true, every target line not handled by the
        ``parsed`` branch is prefixed with tab-separated dataset name,
        language pair, system name and the line's 1-based position
        within its system file.
    """
    working_dir = self.cfg.get('Data', 'working_dir')
    print("Copying dataset to " + working_dir + ' ...')

    tgt_path = usr(working_dir + '/' + 'tgt.txt')
    ref_path = usr(working_dir + '/' + 'ref.txt')

    # Running line counts across the whole run, one per output file.
    # NOTE(review): these advance on *every* line, so in parsed mode the
    # number handed to substitute_line_number is a line count rather than
    # a count of 'Sentence #' lines -- confirm this is intended.
    lines_tgt = 0
    lines_ref = 0

    with codecs.open(tgt_path, 'w', 'utf8') as output_tgt, \
            codecs.open(ref_path, 'w', 'utf8') as output_ref:
        for dataset in self.datasets:
            for lp in sorted(dataset.system_names.keys()):
                ref_file = wmt.reference_path(self.dir, dataset.name, lp)
                with codecs.open(ref_file, 'r', 'utf8') as input_ref:
                    ref_lines = input_ref.readlines()

                for sys_name in dataset.system_names[lp]:
                    sys_file = wmt.system_path(
                        self.dir, dataset.name, lp, sys_name)
                    with codecs.open(sys_file, 'r', 'utf8') as input_sys:
                        # position_in_sys restarts at 1 for each system file.
                        numbered = enumerate(input_sys.readlines(), 1)
                        for position_in_sys, line in numbered:
                            lines_tgt += 1
                            if parsed and line.startswith('Sentence #'):
                                output_tgt.write(
                                    wmt.substitute_line_number(line, lines_tgt))
                            elif verbose:
                                output_tgt.write('{}\t{}\t{}\t{}\t{}'.format(
                                    dataset.name, lp, sys_name,
                                    position_in_sys, line))
                            else:
                                output_tgt.write(line)

                    # The reference is repeated once per system
                    # (presumably to keep both outputs parallel -- confirm).
                    for line in ref_lines:
                        lines_ref += 1
                        if parsed and line.startswith('Sentence #'):
                            line = wmt.substitute_line_number(line, lines_ref)
                        output_ref.write(line)