def read_dataset(self):
        """Index every sentence of every dataset under ``self.dir``.

        For each dataset / language pair / system combination, appends one
        SentenceTuple per sentence (1-based ``sentence_num``) to
        ``self.plain``, records the per-LP system names and sentence counts
        on the Dataset object, and appends it to ``self.datasets``.

        Language pairs are filtered by the ``Settings/language_pairs``
        config option (an empty list means keep everything); system names
        come from ``Settings/system_names`` when present, otherwise from
        the directory layout via ``wmt.get_system_names``.
        """
        sentence_tuple = namedtuple("SentenceTuple", ["dataset", "lp", "system", "sentence_num"])

        # Parse the configured LP filter once, instead of twice per language pair.
        wanted_lps = loads(self.cfg.get('Settings', 'language_pairs'))

        dataset_names = wmt.get_datasets(self.dir)
        for dataset_name in dataset_names:
            dataset = Dataset(dataset_name)
            lang_pairs = wmt.get_lang_pairs(self.dir, dataset_name)

            for lp in lang_pairs:
                # An empty filter list means "keep every language pair".
                if len(wanted_lps) > 0 and lp not in wanted_lps:
                    continue

                if self.cfg.has_option('Settings', 'system_names'):
                    system_names = loads(self.cfg.get('Settings', 'system_names'))
                else:
                    system_names = wmt.get_system_names(self.dir, dataset_name, lp)

                # Count the reference sentences once and reuse the value below,
                # instead of re-reading the reference file for every system.
                number_sentences = wmt.sentences(wmt.reference_path(self.dir, dataset.name, lp))

                dataset.system_names[lp] = system_names
                dataset.number_sentences[lp] = number_sentences

                for system_name in system_names:
                    for sentence in range(number_sentences):
                        self.plain.append(sentence_tuple(dataset=dataset.name,
                                                         lp=lp,
                                                         system=system_name,
                                                         sentence_num=sentence + 1))

            self.datasets.append(dataset)
def write_parsed(input_dir, output_dir, lps):
    """Concatenate parsed system outputs and references into two files.

    Walks the ``references`` / ``system-outputs`` trees under *input_dir*,
    keeps only the language pairs listed in *lps*, and appends every parsed
    system-output file to ``<output_dir>/tgt.parse`` while the matching
    parsed reference (the ``.out`` file) is appended to
    ``<output_dir>/ref.parse`` once per system, so the two outputs stay
    aligned.  'Sentence #' header lines are renumbered so that sentence
    numbers run consecutively across all concatenated files.
    """
    print("Copying parsed files to " + output_dir + ' ...')

    tgt_out_path = usr(output_dir + '/tgt.parse')
    ref_out_path = usr(output_dir + '/ref.parse')

    def visible(entries):
        # Deterministic order; hidden (dot-prefixed) entries are skipped
        # everywhere in the walk.
        return [entry for entry in sorted(entries) if not entry.startswith('.')]

    with codecs.open(tgt_out_path, 'w', 'utf8') as tgt_out, \
            codecs.open(ref_out_path, 'w', 'utf8') as ref_out:

        tgt_sentence_count = 0
        ref_sentence_count = 0

        for dataset in visible(os.listdir(input_dir + '/references')):
            lp_root = input_dir + '/system-outputs/' + dataset

            for lp in visible(os.listdir(lp_root)):
                if lp not in lps:
                    continue

                with codecs.open(wmt.reference_path(input_dir, dataset, lp) + '.out', 'r', 'utf8') as ref_in:
                    ref_lines = ref_in.readlines()

                for sys_file_name in visible(os.listdir(lp_root + '/' + lp)):
                    with codecs.open(lp_root + '/' + lp + '/' + sys_file_name, 'r', 'utf8') as sys_in:
                        sys_lines = sys_in.readlines()

                    for line in sys_lines:
                        if line.startswith('Sentence #'):
                            tgt_sentence_count += 1
                            tgt_out.write(wmt.substitute_line_number(line, tgt_sentence_count))
                        else:
                            tgt_out.write(line)

                    # Repeat the reference block for every system so tgt and
                    # ref outputs remain row-aligned.
                    for line in ref_lines:
                        if line.startswith('Sentence #'):
                            ref_sentence_count += 1
                            ref_out.write(wmt.substitute_line_number(line, ref_sentence_count))
                        else:
                            ref_out.write(line)
    def write_dataset(self, parsed=False, verbose=False):
        """Copy the collected dataset into working_dir as tgt.txt / ref.txt.

        Concatenates every system output into ``tgt.txt`` and, in parallel,
        writes the reference translations into ``ref.txt`` — once per system,
        so the two files stay row-aligned.

        :param parsed: if True, treat input files as parser output and
            renumber their 'Sentence #' header lines with the running counter.
        :param verbose: if True (and a line is not a renumbered header),
            prefix each tgt line with dataset name, language pair, system
            name and the per-system line counter.
        """

        print("Copying dataset to " + self.cfg.get('Data', 'working_dir') + ' ...')

        path_tgt = usr(self.cfg.get('Data', 'working_dir') + '/' + 'tgt.txt')
        path_ref = usr(self.cfg.get('Data', 'working_dir') + '/' + 'ref.txt')

        # Running counters across ALL datasets/LPs/systems.
        # NOTE(review): these advance on every physical line, not only on
        # 'Sentence #' header lines — unlike write_parsed, which counts only
        # headers. In parsed mode the substituted sentence numbers therefore
        # include parse-body lines; confirm this is intended.
        counter_tgt = 0
        counter_ref = 0

        with codecs.open(path_tgt, 'w', 'utf8') as output_tgt:
            with codecs.open(path_ref, 'w', 'utf8') as output_ref:

                for dataset in self.datasets:
                    # Sorted for a deterministic output order.
                    for lp in sorted(dataset.system_names.keys()):

                        with codecs.open(wmt.reference_path(self.dir, dataset.name, lp), 'r', 'utf8') as input_ref:
                            ref_lines = input_ref.readlines()

                        for sys_name in dataset.system_names[lp]:
                            # Per-system line counter, used only by verbose mode.
                            counter_sys = 0
                            with codecs.open(wmt.system_path(self.dir, dataset.name, lp, sys_name), 'r', 'utf8') as input_sys:
                                for line in input_sys.readlines():
                                    counter_tgt += 1
                                    counter_sys += 1
                                    if parsed and line.startswith('Sentence #'):
                                        output_tgt.write(wmt.substitute_line_number(line, counter_tgt))
                                    else:
                                        if verbose:
                                            output_tgt.write('{}\t{}\t{}\t{}\t{}'.format(dataset.name,
                                                                                         lp,
                                                                                         sys_name,
                                                                                         counter_sys,
                                                                                         line))
                                        else:
                                            output_tgt.write(line)

                                # Reference block is re-written for every system
                                # so ref.txt lines up with tgt.txt.
                                for line in ref_lines:
                                    counter_ref += 1
                                    if parsed and line.startswith('Sentence #'):
                                        output_ref.write(wmt.substitute_line_number(line, counter_ref))
                                    else:
                                        output_ref.write(line)