Esempio n. 1
0
    def __call__(self):
        """Yield batches of translation units sampled from a parallel file.

        The source, target and (optional) annotation files listed in
        ``self._file.files`` are read in lockstep for
        ``self._file.lines_count`` lines. Line ``i`` is emitted
        ``self._file.random_sample.get(i, 0)`` times; lines whose count is 0
        are still read (to keep the files aligned) but produce nothing.

        Yields:
            ``(tu_list, batch_meta)`` tuples, where ``tu_list`` holds at most
            ``self._batch_size`` translation units (everything in one batch
            when the batch size is ``None``) and ``batch_meta`` is a fresh
            copy of the per-file metadata dict.
        """
        import contextlib

        # Every handle is registered for closing as soon as it is opened, so a
        # failure while opening a later file no longer leaks the earlier ones,
        # and one failing close() does not prevent the others from running.
        with contextlib.ExitStack() as stack:

            def _open(path):
                handle = utils.open_file(path)
                stack.callback(handle.close)
                return handle

            src_file = _open(self._file.files["src"])
            tgt_file = _open(self._file.files["tgt"])
            annotation_paths = self._file.files.get("annotations", {})
            annotations = {
                key: _open(path) for key, path in annotation_paths.items()
            }

            def _get_samples():
                for i in range(self._file.lines_count):
                    # Always consume one line from every file, even when the
                    # line is not sampled, so that all files stay in sync.
                    src_line = src_file.readline()
                    tgt_line = tgt_file.readline()
                    annot_lines = {
                        key: annot_file.readline()
                        for key, annot_file in annotations.items()
                    }

                    num_samples = self._file.random_sample.get(i, 0)
                    if num_samples == 0:
                        continue

                    src_line = src_line.strip()
                    tgt_line = tgt_line.strip()
                    annot_lines = {
                        key: line.strip() for key, line in annot_lines.items()
                    }

                    # NOTE(review): every repetition of a line receives the
                    # same ``annot_lines`` dict object (as in the original).
                    while num_samples > 0:
                        yield tu.TranslationUnit(source=src_line,
                                                 target=tgt_line,
                                                 annotations=annot_lines)
                        num_samples -= 1

            batch_meta = {
                "base_name": self._file.base_name,
                "label": self._file.label,
                "no_preprocess": self._file.no_preprocess,
                "pattern": self._file.pattern,
                "root": self._file.root,
                "weight": self._file.weight,
            }

            if self._oversample_as_weights:
                batch_meta["example_weights"] = self._file.oversample

            tu_list = []

            for sample_tu in _get_samples():
                tu_list.append(sample_tu)
                if (self._batch_size is not None
                        and len(tu_list) == self._batch_size):
                    yield tu_list, batch_meta.copy()
                    tu_list = []

            # Flush the last, possibly incomplete, batch.
            if tu_list:
                yield tu_list, batch_meta.copy()
Esempio n. 2
0
    def __call__(self):
        """Yield batches of translation units read from ``self._files``.

        With more than one file, the first two files are read as source and
        target: for each entry of ``self._metadata``, ``len(meta)`` lines are
        taken from each file, stripped and whitespace-split, and combined into
        one translation unit. With a single file, each raw line becomes the
        source of one translation unit.

        Yields:
            ``(tu_list, {})`` tuples, where ``tu_list`` holds at most
            ``self._batch_size`` translation units.
        """
        import contextlib

        # Register each handle for closing right after it is opened, so that a
        # failure while opening a later file does not leak the earlier ones.
        with contextlib.ExitStack() as stack:
            files = []
            for path in self._files:
                handle = utils.open_file(path)
                stack.callback(handle.close)
                files.append(handle)

            tu_list = []

            # Postprocess.
            if len(self._files) > 1:
                for meta in self._metadata:

                    # TODO : prefix, features
                    num_parts = len(meta)
                    # NOTE: if a file has fewer lines than the metadata
                    # requires, next() raises StopIteration, which Python 3.7+
                    # reports as a RuntimeError from this generator (PEP 479).
                    src_lines = [
                        next(files[0]).strip().split()
                        for _ in range(num_parts)
                    ]
                    tgt_lines = [
                        next(files[1]).strip().split()
                        for _ in range(num_parts)
                    ]

                    tu_list.append(
                        tu.TranslationUnit(
                            source=src_lines,
                            target=tgt_lines,
                            metadata=meta,
                            source_tokenizer=self._source_tokenizer,
                            target_tokenizer=self._target_tokenizer))

                    if len(tu_list) == self._batch_size:
                        yield tu_list, {}
                        tu_list = []

            # Preprocess.
            else:
                for line in files[0]:
                    tu_list.append(tu.TranslationUnit(source=line))
                    if len(tu_list) == self._batch_size:
                        yield tu_list, {}
                        tu_list = []

            # Flush the last, possibly incomplete, batch.
            if tu_list:
                yield tu_list, {}
Esempio n. 3
0
    def __call__(self):
        """Yield batches of translation units built from the registered files.

        Opens every path in ``self._input_paths``, passes the resulting
        ``{name: file}`` mapping to ``self._get_translation_units`` and groups
        the units it produces into batches.

        Yields:
            ``(tu_list, batch_meta)`` tuples, where ``tu_list`` holds at most
            ``self._batch_size`` translation units and ``batch_meta`` is a
            copy of ``self._batch_meta``.

        Raises:
            RuntimeError: If no input path has been registered.
        """
        import contextlib

        if not self._input_paths:
            raise RuntimeError("No files have been registered")

        # Register each handle for closing right after it is opened, so that a
        # failure while opening a later file does not leak the earlier ones.
        with contextlib.ExitStack() as stack:
            files = {}
            for name, path in self._input_paths.items():
                handle = utils.open_file(path)
                stack.callback(handle.close)
                files[name] = handle

            tu_list = []

            for unit in self._get_translation_units(files):
                tu_list.append(unit)
                if len(tu_list) == self._batch_size:
                    yield tu_list, self._batch_meta.copy()
                    tu_list = []

            # Flush the last, possibly incomplete, batch.
            if tu_list:
                yield tu_list, self._batch_meta.copy()
Esempio n. 4
0
def generate_pseudo_corpus(corpus_dir, size, name, suffix):
    """Write a synthetic corpus file and return its path.

    The file is created at ``corpus_dir.join("<name>.<suffix>")`` and contains
    ``size`` UTF-8 encoded lines of the form ``"<name> <index>"``, with the
    index running from 0 to ``size - 1``.
    """
    file_name = ".".join((name, suffix))
    corpus_path = str(corpus_dir.join(file_name))
    with utils.open_file(corpus_path, "wb") as corpus_file:
        for index in range(size):
            text = " ".join((name, str(index))) + "\n"
            corpus_file.write(text.encode("utf-8"))
    return corpus_path