def translate(opt):
    """Validate options, build a translator, and translate every shard.

    Runs the dynamic-transform inference path: each shard produced by the
    ``InferenceDataReader`` is pushed through ``translator.translate_dynamic``
    with the transform pipe assembled from ``opt.transforms``.
    """
    # Option validation must happen before anything is built from `opt`.
    ArgumentParser.validate_translate_opts(opt)
    ArgumentParser._get_all_transform_translate(opt)
    ArgumentParser._validate_transforms_opts(opt)
    ArgumentParser.validate_translate_opts_dynamic(opt)

    logger = init_logger(opt.log_file)
    translator = build_translator(opt, logger=logger, report_score=True)
    data_reader = InferenceDataReader(opt.src, opt.tgt, opt.src_feats)

    # Assemble the transform pipe from the transforms named in the options,
    # silently skipping names that were not instantiated.
    transforms_cls = get_transforms_cls(opt._all_transform)
    transforms = make_transforms(opt, transforms_cls, translator.fields)
    selected = [transforms[name] for name in opt.transforms if name in transforms]
    transform = TransformPipe.build_from(selected)

    for shard_idx, (src_shard, tgt_shard, feats_shard) in enumerate(data_reader):
        logger.info("Translating shard %d." % shard_idx)
        translator.translate_dynamic(
            src=src_shard,
            transform=transform,
            src_feats=feats_shard,
            tgt=tgt_shard,
            batch_size=opt.batch_size,
            batch_type=opt.batch_type,
            attn_debug=opt.attn_debug,
            align_debug=opt.align_debug,
        )
def build_corpora_iters(corpora, transforms, corpora_info, is_train=False,
                        skip_empty_level='warning', stride=1, offset=0):
    """Return `ParallelCorpusIterator` for all corpora defined in opts."""
    iterators = {}
    for corpus_id, corpus in corpora.items():
        # Each corpus declares (by name) which transforms apply to it;
        # names without a built transform are ignored.
        names = corpora_info[corpus_id].get('transforms', [])
        selected = [transforms[name] for name in names if name in transforms]
        pipe = TransformPipe.build_from(selected)
        logger.info(f"{corpus_id}'s transforms: {str(pipe)}")
        iterators[corpus_id] = ParallelCorpusIterator(
            corpus,
            pipe,
            infinitely=is_train,  # training iterates the corpus forever
            skip_empty_level=skip_empty_level,
            stride=stride,
            offset=offset,
        )
    return iterators
def test_transform_pipe(self):
    """Chain two transforms and verify filtering plus the statistics lifecycle."""
    # Step 1: build and warm up the first transform in the pipe (prefix).
    prefix_cls = get_transforms_cls(["prefix"])["prefix"]
    corpora = yaml.safe_load("""
        trainset:
            path_src: data/src-train.txt
            path_tgt: data/tgt-train.txt
            transforms: [prefix, filtertoolong]
            weight: 1
            src_prefix: "⦅_pf_src⦆"
            tgt_prefix: "⦅_pf_tgt⦆"
    """)
    prefix_transform = prefix_cls(Namespace(data=corpora, seed=-1))
    prefix_transform.warm_up()

    # Step 2: build the second transform (filtertoolong).
    filter_cls = get_transforms_cls(["filtertoolong"])["filtertoolong"]
    filter_transform = filter_cls(Namespace(src_seq_length=4, tgt_seq_length=4))

    # Step 3: combine both sequentially into one transform pipe.
    pipe = TransformPipe.build_from([prefix_transform, filter_transform])

    example = {
        "src": ["Hello", ",", "world", "."],
        "tgt": ["Bonjour", "le", "monde", "."],
    }
    # Steps 4-5: after prefixing, the example exceeds the length limit,
    # so the pipe drops it and returns None.
    result = pipe.apply(copy.deepcopy(example), corpus_name="trainset")
    self.assertIsNone(result)

    # Step 6: filtertoolong registered observable statistics on the pipe.
    self.assertTrue(len(pipe.statistics.observables) > 0)
    report = pipe.statistics.report()
    self.assertIsNotNone(report)
    # Step 7: reporting clears the statistics for a fresh start.
    self.assertTrue(len(pipe.statistics.observables) == 0)