def __call__(self):
    """Yield batches of sampled translation units from the parallel files.

    The source, target and optional annotation files listed in
    ``self._file.files`` are read in lockstep for
    ``self._file.lines_count`` lines. Line ``i`` is emitted
    ``self._file.random_sample.get(i, 0)`` times; lines whose count is 0
    are still read (to keep the files aligned) but produce nothing.

    Yields:
        ``(tu_list, batch_meta)`` tuples. ``tu_list`` is a list of
        ``tu.TranslationUnit`` and ``batch_meta`` is a fresh copy of the
        per-file metadata dict. A batch is emitted every time ``tu_list``
        reaches ``self._batch_size``; when ``self._batch_size`` is ``None``
        everything is yielded as one batch. A non-empty trailing batch is
        always yielded.
    """
    # Function-scope import keeps this method self-contained.
    import contextlib

    def _get_samples(src_file, tgt_file, annotations):
        """Yield one TranslationUnit per requested sample of each line."""
        for i in range(self._file.lines_count):
            # Always advance every file, even for lines that will be
            # skipped, so that source/target/annotations stay aligned.
            src_line = src_file.readline()
            tgt_line = tgt_file.readline()
            annot_lines = {
                key: annot_file.readline()
                for key, annot_file in annotations.items()
            }

            num_samples = self._file.random_sample.get(i, 0)
            if num_samples == 0:
                continue

            src_line = src_line.strip()
            tgt_line = tgt_line.strip()
            for key, line in annot_lines.items():
                annot_lines[key] = line.strip()

            # NOTE: the same ``annot_lines`` dict object is handed to every
            # repetition of this line, exactly as before.
            while num_samples > 0:
                yield tu.TranslationUnit(
                    source=src_line,
                    target=tgt_line,
                    annotations=annot_lines,
                )
                num_samples -= 1

    # ExitStack registers each handle as soon as it is opened, so a failure
    # while opening a later file no longer leaks the ones already opened.
    # Handles are also released when the generator is closed early.
    with contextlib.ExitStack() as stack:
        src_file = stack.enter_context(
            utils.open_file(self._file.files["src"])
        )
        tgt_file = stack.enter_context(
            utils.open_file(self._file.files["tgt"])
        )
        annotations = {
            key: stack.enter_context(utils.open_file(path))
            for key, path in self._file.files.get("annotations", {}).items()
        }

        batch_meta = {
            "base_name": self._file.base_name,
            "label": self._file.label,
            "no_preprocess": self._file.no_preprocess,
            "pattern": self._file.pattern,
            "root": self._file.root,
            "weight": self._file.weight,
        }
        if self._oversample_as_weights:
            batch_meta["example_weights"] = self._file.oversample

        tu_list = []
        for sample_tu in _get_samples(src_file, tgt_file, annotations):
            tu_list.append(sample_tu)
            if (
                self._batch_size is not None
                and len(tu_list) == self._batch_size
            ):
                # Copy so consumers cannot mutate the shared metadata.
                yield tu_list, batch_meta.copy()
                tu_list = []

        if tu_list:
            yield tu_list, batch_meta.copy()
def __call__(self):
    """Yield batches of translation units built from ``self._files``.

    With more than one file (postprocess), each entry ``meta`` of
    ``self._metadata`` consumes ``len(meta)`` lines from the first file and
    from the second file; every line is stripped and split on whitespace.
    With a single file (preprocess), every raw line of that file becomes
    the source of one unit.

    Yields:
        ``(tu_list, {})`` tuples. A batch is emitted every time ``tu_list``
        reaches ``self._batch_size``; a non-empty trailing batch is always
        yielded. If ``self._batch_size`` never equals the list length
        (e.g. it is ``None``), everything ends up in one final batch.
    """
    # Function-scope import keeps this method self-contained.
    import contextlib

    def _read_parts(handle, count):
        # NOTE: ``next`` raises StopIteration when the file has fewer lines
        # than the metadata requires; inside a generator Python reports
        # that as a RuntimeError (PEP 479).
        return [next(handle).strip().split() for _ in range(count)]

    def _units(files):
        """Yield the translation units for the current mode."""
        # Postprocess.
        if len(self._files) > 1:
            for meta in self._metadata:
                # TODO : prefix, features
                num_parts = len(meta)
                src_lines = _read_parts(files[0], num_parts)
                tgt_lines = _read_parts(files[1], num_parts)
                yield tu.TranslationUnit(
                    source=src_lines,
                    target=tgt_lines,
                    metadata=meta,
                    source_tokenizer=self._source_tokenizer,
                    target_tokenizer=self._target_tokenizer,
                )
        # Preprocess.
        else:
            for line in files[0]:
                yield tu.TranslationUnit(source=line)

    # ExitStack registers each handle as soon as it is opened, so a failure
    # while opening a later file no longer leaks the ones already opened.
    with contextlib.ExitStack() as stack:
        files = [
            stack.enter_context(utils.open_file(path))
            for path in self._files
        ]

        tu_list = []
        for unit in _units(files):
            tu_list.append(unit)
            if len(tu_list) == self._batch_size:
                yield tu_list, {}
                tu_list = []

        if tu_list:
            yield tu_list, {}
def __call__(self):
    """Yield batches of translation units read from the registered inputs.

    Every path in ``self._input_paths`` is opened and the resulting
    ``{name: file}`` mapping is passed to ``self._get_translation_units``;
    the units it produces are grouped into batches.

    Yields:
        ``(tu_list, batch_meta)`` tuples, where ``batch_meta`` is a fresh
        copy of ``self._batch_meta``. A batch is emitted every time
        ``tu_list`` reaches ``self._batch_size``; a non-empty trailing
        batch is always yielded.

    Raises:
        RuntimeError: if ``self._input_paths`` is empty. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # Function-scope import keeps this method self-contained.
    import contextlib

    if not self._input_paths:
        raise RuntimeError("No files have been registered")

    # ExitStack registers each handle as soon as it is opened, so a failure
    # while opening a later file no longer leaks the ones already opened.
    with contextlib.ExitStack() as stack:
        files = {
            name: stack.enter_context(utils.open_file(path))
            for name, path in self._input_paths.items()
        }

        tu_list = []
        for unit in self._get_translation_units(files):
            tu_list.append(unit)
            if len(tu_list) == self._batch_size:
                # Copy so consumers cannot mutate the shared metadata.
                yield tu_list, self._batch_meta.copy()
                tu_list = []

        if tu_list:
            yield tu_list, self._batch_meta.copy()
def generate_pseudo_corpus(corpus_dir, size, name, suffix):
    """Write a dummy corpus file of ``size`` lines and return its path.

    The file is named ``<name>.<suffix>`` and located through
    ``corpus_dir.join``. Line ``i`` (starting at 0) holds ``"<name> <i>"``
    followed by a newline, encoded as UTF-8.

    Returns:
        The path of the written file, as a string.
    """
    file_name = name + "." + suffix
    path = str(corpus_dir.join(file_name))

    with utils.open_file(path, "wb") as corpus:
        for index in range(size):
            text = name + " " + str(index) + "\n"
            # The file is opened in binary mode, so encode explicitly.
            corpus.write(text.encode("utf-8"))

    return path