def read_dataset(self):
    """Scan the WMT data directory and register every sentence of every system output.

    For each dataset under ``self.dir`` and each of its language pairs
    (optionally filtered by the ``language_pairs`` config setting), records the
    system names and reference sentence count on a ``Dataset`` object, appends
    one ``SentenceTuple(dataset, lp, system, sentence_num)`` per sentence of
    every system output to ``self.plain`` (sentence numbers are 1-based), and
    appends each ``Dataset`` to ``self.datasets``.
    """
    sentence_tuple = namedtuple("SentenceTuple", ["dataset", "lp", "system", "sentence_num"])
    # Parse the language-pair filter once, instead of calling loads() on the
    # config string twice per language pair inside the loop. An empty list
    # means "no filter".
    lp_filter = loads(self.cfg.get('Settings', 'language_pairs'))
    for dataset_name in wmt.get_datasets(self.dir):
        dataset = Dataset(dataset_name)
        for lp in wmt.get_lang_pairs(self.dir, dataset_name):
            if lp_filter and lp not in lp_filter:
                continue
            # Explicit 'system_names' in the config overrides directory discovery.
            if self.cfg.has_option('Settings', 'system_names'):
                system_names = loads(self.cfg.get('Settings', 'system_names'))
            else:
                system_names = wmt.get_system_names(self.dir, dataset_name, lp)
            # Count reference sentences once and reuse the value below —
            # previously the reference file was re-read for every system.
            number_sentences = wmt.sentences(wmt.reference_path(self.dir, dataset.name, lp))
            dataset.system_names[lp] = system_names
            dataset.number_sentences[lp] = number_sentences
            for system_name in system_names:
                for sentence in range(number_sentences):
                    self.plain.append(sentence_tuple(dataset=dataset.name,
                                                     lp=lp,
                                                     system=system_name,
                                                     sentence_num=sentence + 1))
        self.datasets.append(dataset)
def write_parsed(input_dir, output_dir, lps):
    """Concatenate parsed system outputs and references into ``tgt.parse`` / ``ref.parse``.

    Walks ``input_dir/references`` for dataset names and
    ``input_dir/system-outputs/<dataset>/<lp>`` for system files (skipping
    hidden entries and language pairs not in ``lps``). Lines beginning with
    ``'Sentence #'`` get their sentence number rewritten to a globally
    increasing counter via ``wmt.substitute_line_number`` so the concatenated
    files stay consistently numbered. The reference block is re-emitted once
    per system so ``ref.parse`` stays aligned with ``tgt.parse``.

    :param input_dir: root of the WMT-style data directory
    :param output_dir: directory receiving ``tgt.parse`` and ``ref.parse``
    :param lps: collection of language pairs to include
    """
    print("Copying parsed files to " + output_dir + ' ...')
    path_tgt = usr(os.path.join(output_dir, 'tgt.parse'))
    path_ref = usr(os.path.join(output_dir, 'ref.parse'))
    # Hoist the repeatedly re-built 'system-outputs' root path.
    sys_out_root = os.path.join(input_dir, 'system-outputs')
    with codecs.open(path_tgt, 'w', 'utf8') as output_tgt, \
            codecs.open(path_ref, 'w', 'utf8') as output_ref:
        counter_tgt = 0
        counter_ref = 0
        for dataset in sorted(os.listdir(os.path.join(input_dir, 'references'))):
            if dataset.startswith('.'):
                continue
            for lp in sorted(os.listdir(os.path.join(sys_out_root, dataset))):
                if lp.startswith('.') or lp not in lps:
                    continue
                with codecs.open(wmt.reference_path(input_dir, dataset, lp) + '.out',
                                 'r', 'utf8') as input_ref:
                    ref_lines = input_ref.readlines()
                lp_dir = os.path.join(sys_out_root, dataset, lp)
                for sys_file_name in sorted(os.listdir(lp_dir)):
                    if sys_file_name.startswith('.'):
                        continue
                    with codecs.open(os.path.join(lp_dir, sys_file_name),
                                     'r', 'utf8') as input_sys:
                        # Stream the file instead of materializing readlines().
                        for line in input_sys:
                            if line.startswith('Sentence #'):
                                counter_tgt += 1
                                output_tgt.write(wmt.substitute_line_number(line, counter_tgt))
                            else:
                                output_tgt.write(line)
                    # Repeat the reference for every system so both outputs
                    # advance in lockstep.
                    for line in ref_lines:
                        if line.startswith('Sentence #'):
                            counter_ref += 1
                            output_ref.write(wmt.substitute_line_number(line, counter_ref))
                        else:
                            output_ref.write(line)
def write_dataset(self, parsed=False, verbose=False):
    """Concatenate all system outputs and references into ``tgt.txt`` / ``ref.txt``.

    Iterates the datasets previously loaded into ``self.datasets``, copying
    every system-output file and (once per system, to keep the two files
    aligned) the matching reference file into the working directory given by
    the ``Data/working_dir`` config option.

    :param parsed: if True, rewrite lines starting with ``'Sentence #'`` with
        globally increasing counters via ``wmt.substitute_line_number``.
    :param verbose: if True, prefix each target line with dataset, language
        pair, system name and the per-system line index (tab-separated).
    """
    # Read the working directory from the config once instead of three times.
    working_dir = self.cfg.get('Data', 'working_dir')
    print("Copying dataset to " + working_dir + ' ...')
    path_tgt = usr(working_dir + '/' + 'tgt.txt')
    path_ref = usr(working_dir + '/' + 'ref.txt')
    counter_tgt = 0
    counter_ref = 0
    with codecs.open(path_tgt, 'w', 'utf8') as output_tgt, \
            codecs.open(path_ref, 'w', 'utf8') as output_ref:
        for dataset in self.datasets:
            for lp in sorted(dataset.system_names.keys()):
                with codecs.open(wmt.reference_path(self.dir, dataset.name, lp),
                                 'r', 'utf8') as input_ref:
                    ref_lines = input_ref.readlines()
                for sys_name in dataset.system_names[lp]:
                    counter_sys = 0
                    with codecs.open(wmt.system_path(self.dir, dataset.name, lp, sys_name),
                                     'r', 'utf8') as input_sys:
                        # NOTE(review): here counter_tgt counts EVERY line,
                        # whereas write_parsed counts only 'Sentence #'
                        # headers — confirm this difference is intentional
                        # when parsed=True.
                        for line in input_sys:
                            counter_tgt += 1
                            counter_sys += 1
                            if parsed and line.startswith('Sentence #'):
                                output_tgt.write(wmt.substitute_line_number(line, counter_tgt))
                            elif verbose:
                                output_tgt.write('{}\t{}\t{}\t{}\t{}'.format(
                                    dataset.name, lp, sys_name, counter_sys, line))
                            else:
                                output_tgt.write(line)
                    # Repeat the reference for every system so ref.txt stays
                    # aligned with tgt.txt.
                    for line in ref_lines:
                        counter_ref += 1
                        if parsed and line.startswith('Sentence #'):
                            output_ref.write(wmt.substitute_line_number(line, counter_ref))
                        else:
                            output_ref.write(line)