def __init__(self, total_steps: int, warmup: int = 0, t_scale: float = 1.0) -> None:
    """
    Initialize a schedule that is parameterized by a total number of steps.

    :param total_steps: Total number of steps. Must be non-negative.
    :param warmup: Passed through to the parent constructor.
    :param t_scale: Passed through to the parent constructor.
    """
    super().__init__(warmup, t_scale)
    steps_are_valid = total_steps >= 0
    check_condition(steps_are_valid, "total_steps need to be >= 0.")
    self.total_steps = total_steps
def determine_context(args: argparse.Namespace, exit_stack: ExitStack) -> List[mx.Context]:
    """
    Pick the device context(s) used for training (CPU or one or more GPUs).

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib; GPU locks are registered on it.
    :return: A list with the context(s) to run on.
    """
    # CPU is the simple case: a single context and nothing to lock.
    if args.use_cpu:
        logger.info("Training Device: CPU")
        return [mx.cpu()]

    gpu_count = utils.get_num_gpus()
    check_condition(gpu_count >= 1,
                    "No GPUs found, consider running on the CPU with --use-cpu "
                    "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                    "binary isn't on the path).")

    if args.disable_device_locking:
        device_ids = utils.expand_requested_device_ids(args.device_ids)
    else:
        gpu_locks = utils.acquire_gpus(args.device_ids, lock_dir=args.lock_dir)
        device_ids = exit_stack.enter_context(gpu_locks)

    if args.batch_type == C.BATCH_TYPE_SENTENCE:
        # Sentence-based batches are split evenly across the devices.
        num_devices = len(device_ids)
        check_condition(args.batch_size % num_devices == 0,
                        "When using multiple devices the batch size must be "
                        "divisible by the number of devices. Choose a batch "
                        "size that is a multiple of %d." % num_devices)

    logger.info("Training Device(s): GPU %s", device_ids)
    return [mx.gpu(gpu_id) for gpu_id in device_ids]
def check_encoder_decoder_args(args) -> None:
    """
    Check possible encoder-decoder argument conflicts.

    Warns when embedding dropout and RNN input dropout are both enabled on the same side
    (encoder or decoder), and enforces constraints on recurrent dropout and stochastic decoders.

    :param args: Arguments as returned by argparse. The dropout arguments are expected to be
                 (encoder, decoder) pairs.
    """
    encoder_embed_dropout, decoder_embed_dropout = args.embed_dropout
    encoder_rnn_dropout_inputs, decoder_rnn_dropout_inputs = args.rnn_dropout_inputs
    encoder_rnn_dropout_states, decoder_rnn_dropout_states = args.rnn_dropout_states
    if encoder_embed_dropout > 0 and encoder_rnn_dropout_inputs > 0:
        logger.warning("Setting encoder RNN AND source embedding dropout > 0 leads to "
                       "two dropout layers on top of each other.")
    if decoder_embed_dropout > 0 and decoder_rnn_dropout_inputs > 0:
        # This message used to be a copy of the encoder warning and named the wrong side.
        logger.warning("Setting decoder RNN AND target embedding dropout > 0 leads to "
                       "two dropout layers on top of each other.")

    encoder_rnn_dropout_recurrent, decoder_rnn_dropout_recurrent = args.rnn_dropout_recurrent
    if encoder_rnn_dropout_recurrent > 0 or decoder_rnn_dropout_recurrent > 0:
        check_condition(args.rnn_cell_type == C.LSTM_TYPE,
                        "Recurrent dropout without memory loss only supported for LSTMs right now.")

    if args.rnn_decoder_stochastic:
        # NOTE(review): this only checks that --optimized-metric is truthy, while the message
        # asks for the value 'elbo' -- confirm whether an equality check is intended.
        check_condition(args.optimized_metric,
                        "When using stochastic models --optimized-metric needs to be 'elbo'.")
def __init__(self,
             rnn_config: rnn.RNNConfig,
             prefix=C.BIDIRECTIONALRNN_PREFIX,
             layout=C.TIME_MAJOR,
             encoder_class: Callable = RecurrentEncoder) -> None:
    """
    Build a forward and a reverse encoder, each with half of the configured hidden size.

    :param rnn_config: RNN configuration; num_hidden must be even.
    :param prefix: Name prefix shared by both directional encoders.
    :param layout: Layout of the encoder input. A warning is logged if it starts with 'N'.
    :param encoder_class: Callable used to construct each directional encoder.
    """
    hidden_is_even = rnn_config.num_hidden % 2 == 0
    utils.check_condition(hidden_is_even,
                          "num_hidden must be a multiple of 2 for BiDirectionalRNNEncoders.")
    self.rnn_config = rnn_config
    # Each direction gets half of the total hidden units.
    half_hidden = rnn_config.num_hidden // 2
    self.internal_rnn_config = rnn_config.copy(num_hidden=half_hidden)

    input_is_batch_major = layout[0] == 'N'
    if input_is_batch_major:
        logger.warning("Batch-major layout for encoder input. Consider using time-major layout for faster speed")

    # time-major layout as _encode needs to swap layout for SequenceReverse
    self.forward_rnn = encoder_class(rnn_config=self.internal_rnn_config,
                                     prefix=prefix + C.FORWARD_PREFIX,
                                     layout=C.TIME_MAJOR)
    self.reverse_rnn = encoder_class(rnn_config=self.internal_rnn_config,
                                     prefix=prefix + C.REVERSE_PREFIX,
                                     layout=C.TIME_MAJOR)
    self.layout = layout
    self.prefix = prefix
def __init__(self, num_embed: int, prefix: str) -> None:
    """
    Store the embedding size and name prefix after validating the size.

    :param num_embed: Embedding size. Must be even.
    :param prefix: Name prefix.
    """
    size_is_even = num_embed % 2 == 0
    error_message = ("Positional embeddings require an even embedding size it "
                     "is however %d." % num_embed)
    utils.check_condition(size_is_even, error_message)
    self.num_embed = num_embed
    self.prefix = prefix
def __init__(self,
             learning_rate: float = 0.001,
             beta1: float = 0.9,
             beta2: float = 0.999,
             beta3_batch: float = 0.999,
             beta3_checkpoint: float = 0.,
             epsilon: float = 1e-8,
             k_lo: float = 0.1,
             k_hi: float = 10,
             schedule_decay: float = 0.004,
             use_batch_objective: bool = True,
             use_checkpoint_objective: bool = False,
             use_nesterov_momentum: bool = False,
             **kwargs) -> None:
    """
    Store the optimizer hyper-parameters on the instance.

    At least one of the batch objective and the checkpoint objective must be enabled.
    The learning rate and any extra keyword arguments are forwarded to the parent constructor.
    """
    has_objective = bool(use_batch_objective or use_checkpoint_objective)
    check_condition(has_objective,
                    "Must use at least one of: batch objective, checkpoint objective")
    super().__init__(learning_rate=learning_rate, **kwargs)

    # Decay rates
    self.beta1 = beta1
    self.beta2 = beta2
    self.beta3_batch = beta3_batch
    self.beta3_checkpoint = beta3_checkpoint

    # Remaining numeric settings
    self.epsilon = epsilon
    self.k_lo = k_lo
    self.k_hi = k_hi
    self.schedule_decay = schedule_decay

    # Feature switches
    self.use_batch_objective = use_batch_objective
    self.use_checkpoint_objective = use_checkpoint_objective
    self.use_nesterov_momentum = use_nesterov_momentum
def make_inputs(inp: Optional[str],
                translator: inference.Translator,
                json_input: bool,
                inp_factors: Optional[List[str]] = None) -> Generator[inference.TranslatorInput, None, None]:
    """
    Generates TranslatorInput instances from input.

    If inp is None, lines are read from STDIN and no factor files may be given; each line is
    parsed either as a JSON string or as a factored string, depending on json_input.
    Otherwise inp and all files in inp_factors are read in parallel, one line from each per
    input, and their number must equal translator.num_source_factors.

    :param inp: The source file (possibly None).
    :param translator: Translator that will translate each line of input.
    :param json_input: Whether the input is in json format.
    :param inp_factors: Source factor files.
    :return: TranslatorInput objects.
    """
    if inp is None:
        check_condition(inp_factors is None, "Translating from STDIN, not expecting any factor files.")
        for sentence_id, line in enumerate(sys.stdin, 1):
            if json_input:
                yield inference.make_input_from_json_string(sentence_id=sentence_id, json_string=line)
            else:
                yield inference.make_input_from_factored_string(sentence_id=sentence_id,
                                                                factored_string=line,
                                                                translator=translator)
        return

    factor_paths = [] if inp_factors is None else inp_factors
    paths = [inp] + factor_paths
    check_condition(translator.num_source_factors == len(paths),
                    "Model(s) require %d factors, but %d given (through --input and --input-factors)." % (
                        translator.num_source_factors, len(paths)))
    with ExitStack() as exit_stack:
        # All files stay open until the generator is exhausted or closed.
        streams = [exit_stack.enter_context(data_io.smart_open(path)) for path in paths]
        for sentence_id, parallel_lines in enumerate(zip(*streams), 1):
            yield inference.make_input_from_multiple_strings(sentence_id=sentence_id,
                                                             strings=list(parallel_lines))
def _setup_context(args, exit_stack):
    """
    Determine the single device context to run on (CPU or one GPU).

    :param args: Arguments as returned by argparse. Reads use_cpu, device_ids,
                 disable_device_locking and lock_dir.
    :param exit_stack: An ExitStack from contextlib; an acquired GPU lock is registered on it.
    :return: The context to run on.
    """
    if args.use_cpu:
        return mx.cpu()

    num_gpus = get_num_gpus()
    check_condition(num_gpus >= 1,
                    "No GPUs found, consider running on the CPU with --use-cpu "
                    "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                    "binary isn't on the path).")
    check_condition(len(args.device_ids) == 1, "cannot run on multiple devices for now")
    gpu_id = args.device_ids[0]
    if gpu_id < 0:
        if args.disable_device_locking:
            # without locking and a negative device id we just take the first device
            # (previously an explicitly requested non-negative id was overwritten with 0 as well)
            gpu_id = 0
        else:
            # get a single (!) gpu id automatically:
            gpu_ids = exit_stack.enter_context(acquire_gpus([-1], lock_dir=args.lock_dir))
            gpu_id = gpu_ids[0]
    # NOTE(review): an explicitly requested (non-negative) device id is used as given and no
    # lock is acquired for it, even when device locking is enabled -- confirm this is intended.
    return mx.gpu(gpu_id)
def __init__(self,
             config: ConvolutionalEmbeddingConfig,
             prefix: str = C.CHAR_SEQ_ENCODER_PREFIX) -> None:
    """
    Copy the settings from the config and declare all parameter symbols.

    :param config: Convolutional embedding configuration. num_filters must contain exactly
                   max_filter_width entries.
    :param prefix: Name prefix for all parameter symbols.
    """
    utils.check_condition(len(config.num_filters) == config.max_filter_width,
                          "num_filters must have max_filter_width elements.")
    self.num_embed = config.num_embed
    self.output_dim = config.output_dim
    self.max_filter_width = config.max_filter_width
    self.num_filters = config.num_filters[:]  # copy of the configured list
    self.pool_stride = config.pool_stride
    self.num_highway_layers = config.num_highway_layers
    self.prefix = prefix
    self.dropout = config.dropout
    self.add_positional_encoding = config.add_positional_encoding

    def indexed_variable(group: str, index: int, suffix: str):
        # Symbol named <prefix><group><index><suffix>
        return mx.sym.Variable("%s%s%d%s" % (self.prefix, group, index, suffix))

    # One weight/bias pair per filter width, keyed by width (1..max_filter_width)
    filter_widths = range(1, self.max_filter_width + 1)
    self.conv_weight = {width: indexed_variable("conv_", width, "_weight") for width in filter_widths}
    self.conv_bias = {width: indexed_variable("conv_", width, "_bias") for width in filter_widths}

    self.project_weight = mx.sym.Variable(self.prefix + "project_weight")
    self.project_bias = mx.sym.Variable(self.prefix + "project_bias")

    # One gate and one transform parameter set per highway layer
    highway_layers = range(self.num_highway_layers)
    self.gate_weight = [indexed_variable("gate_", layer, "_weight") for layer in highway_layers]
    self.gate_bias = [indexed_variable("gate_", layer, "_bias") for layer in highway_layers]
    self.transform_weight = [indexed_variable("transform_", layer, "_weight") for layer in highway_layers]
    self.transform_bias = [indexed_variable("transform_", layer, "_bias") for layer in highway_layers]
def main():
    """
    Command-line entry point for translation: parse arguments, load the model(s),
    build a Translator and translate the input.
    """
    global logger

    parser = argparse.ArgumentParser(description='Translate CLI')
    arguments.add_translate_cli_args(parser)
    args = parser.parse_args()

    if args.output is not None:
        # Log to a file next to the output as well.
        log_path = "%s.%s" % (args.output, C.LOG_NAME)
        logger = setup_main_logger(__name__,
                                   console=not args.quiet,
                                   file_logging=True,
                                   path=log_path)

    if args.checkpoints is not None:
        check_condition(len(args.checkpoints) == len(args.models),
                        "must provide checkpoints for each model")

    log_basic_info(args)

    output_handler = get_output_handler(args.output_type,
                                        args.output,
                                        args.sure_align_threshold)

    with ExitStack() as exit_stack:
        context = _setup_context(args, exit_stack)

        lexicon_given = args.restrict_lexicon is not None
        models, source_vocabs, target_vocab = inference.load_models(
            context=context,
            max_input_len=args.max_input_len,
            beam_size=args.beam_size,
            batch_size=args.batch_size,
            model_folders=args.models,
            checkpoints=args.checkpoints,
            softmax_temperature=args.softmax_temperature,
            max_output_length_num_stds=args.max_output_length_num_stds,
            decoder_return_logit_inputs=lexicon_given,
            cache_output_layer_w_b=lexicon_given)

        restrict_lexicon = None  # type: Optional[TopKLexicon]
        if args.restrict_lexicon:
            restrict_lexicon = TopKLexicon(source_vocabs[0], target_vocab)
            restrict_lexicon.load(args.restrict_lexicon, k=args.restrict_lexicon_topk)

        store_beam = args.output_type == C.OUTPUT_HANDLER_BEAM_STORE
        length_penalty = inference.LengthPenalty(args.length_penalty_alpha, args.length_penalty_beta)
        translator = inference.Translator(context=context,
                                          ensemble_mode=args.ensemble_mode,
                                          bucket_source_width=args.bucket_width,
                                          length_penalty=length_penalty,
                                          models=models,
                                          source_vocabs=source_vocabs,
                                          target_vocab=target_vocab,
                                          restrict_lexicon=restrict_lexicon,
                                          store_beam=store_beam,
                                          strip_unknown_words=args.strip_unknown_words)
        read_and_translate(translator=translator,
                           output_handler=output_handler,
                           chunk_size=args.chunk_size,
                           inp=args.input,
                           inp_factors=args.input_factors,
                           json_input=args.json_input)
def average(param_paths: Iterable[str]) -> Dict[str, mx.nd.NDArray]:
    """
    Averages parameters from a list of .params file paths.

    All files must contain the same arg_param names and the same aux_param names.

    :param param_paths: List of paths to parameter files. Must contain at least one path.
    :return: Averaged parameter dictionary. Keys carry an "arg:" or "aux:" prefix.
    """
    all_arg_params = []
    all_aux_params = []
    for path in param_paths:
        logger.info("Loading parameters from '%s'", path)
        arg_params, aux_params = sockeye.utils.load_params(path)
        all_arg_params.append(arg_params)
        all_aux_params.append(aux_params)

    logger.info("%d models loaded", len(all_arg_params))
    # Without this check an empty input passes the name checks below (all() of nothing is True)
    # and then fails with an IndexError on all_arg_params[0].
    check_condition(len(all_arg_params) > 0, "No parameter files given to average")
    reference_arg_names = all_arg_params[0].keys()
    reference_aux_names = all_aux_params[0].keys()
    check_condition(all(reference_arg_names == p.keys() for p in all_arg_params),
                    "arg_param names do not match across models")
    check_condition(all(reference_aux_names == p.keys() for p in all_aux_params),
                    "aux_param names do not match across models")

    avg_params = {}
    # average arg_params
    for k in all_arg_params[0]:
        avg_params["arg:" + k] = sockeye.utils.average_arrays([p[k] for p in all_arg_params])
    # average aux_params
    for k in all_aux_params[0]:
        avg_params["aux:" + k] = sockeye.utils.average_arrays([p[k] for p in all_aux_params])

    return avg_params
def main():
    """
    Command-line entry point for translation: parse inference and device arguments,
    load the model(s) and translate the input.
    """
    global logger

    parser = argparse.ArgumentParser(description='Translate CLI')
    arguments.add_inference_args(parser)
    arguments.add_device_args(parser)
    args = parser.parse_args()

    if args.output is not None:
        # Log to a file next to the output as well.
        log_path = "%s.%s" % (args.output, C.LOG_NAME)
        logger = setup_main_logger(__name__, file_logging=True, path=log_path)

    if args.checkpoints is not None:
        check_condition(len(args.checkpoints) == len(args.models),
                        "must provide checkpoints for each model")

    log_sockeye_version(logger)
    logger.info("Command: %s", " ".join(sys.argv))
    logger.info("Arguments: %s", args)

    output_handler = sockeye.output_handler.get_output_handler(args.output_type,
                                                               args.output,
                                                               args.sure_align_threshold)

    with ExitStack() as exit_stack:
        context = _setup_context(args, exit_stack)

        loaded = sockeye.inference.load_models(context,
                                               args.max_input_len,
                                               args.beam_size,
                                               args.models,
                                               args.checkpoints,
                                               args.softmax_temperature)
        # Everything returned by load_models is passed on positionally.
        translator = sockeye.inference.Translator(context, args.ensemble_mode, *loaded)
        read_and_translate(translator, output_handler, args.input)
def determine_context(args: argparse.Namespace, exit_stack: ExitStack) -> List[mx.Context]:
    """
    Pick the device context(s) to run on (CPU or one or more GPUs).

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib; GPU locks are registered on it.
    :return: A list with the context(s) to run on.
    """
    # CPU is the simple case: a single context and nothing to lock.
    if args.use_cpu:
        logger.info("Device: CPU")
        return [mx.cpu()]

    gpu_count = utils.get_num_gpus()
    check_condition(gpu_count >= 1,
                    "No GPUs found, consider running on the CPU with --use-cpu "
                    "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                    "binary isn't on the path).")

    if args.disable_device_locking:
        device_ids = utils.expand_requested_device_ids(args.device_ids)
    else:
        gpu_locks = utils.acquire_gpus(args.device_ids, lock_dir=args.lock_dir)
        device_ids = exit_stack.enter_context(gpu_locks)

    logger.info("Device(s): GPU %s", device_ids)
    return [mx.gpu(gpu_id) for gpu_id in device_ids]
def read_sentences(path: str, vocab: Dict[str, int], add_bos=False, limit=None) -> List[List[int]]:
    """
    Reads sentences from path and creates word id sentences.

    :param path: Path to read data from.
    :param vocab: Vocabulary mapping. Must contain the UNK, BOS and EOS symbols and map
                  the PAD symbol to C.PAD_ID.
    :param add_bos: Whether to add Beginning-Of-Sentence (BOS) symbol.
    :param limit: Read limit.
    :return: List of integer sequences.
    """
    # Sanity checks on the vocabulary (the UNK check used to be stated twice).
    assert C.UNK_SYMBOL in vocab
    assert vocab[C.PAD_SYMBOL] == C.PAD_ID
    assert C.BOS_SYMBOL in vocab
    assert C.EOS_SYMBOL in vocab
    sentences = []
    for sentence_tokens in read_content(path, limit):
        sentence = tokens2ids(sentence_tokens, vocab)
        # Checked before BOS is added, so a line without tokens is always rejected.
        check_condition(sentence, "Empty sentence in file %s" % path)
        if add_bos:
            sentence.insert(0, vocab[C.BOS_SYMBOL])
        sentences.append(sentence)
    logger.info("%d sentences loaded from '%s'", len(sentences), path)
    return sentences
def __init__(self,
             weight_type: str,
             num_embed: int,
             max_seq_len: int,
             scale_up_input: bool,
             scale_down_positions: bool,
             dtype: Optional[pt.dtype] = None) -> None:
    """
    Create the positional embedding weight, either fixed or learned.

    :param weight_type: C.FIXED_POSITIONAL_EMBEDDING or C.LEARNED_POSITIONAL_EMBEDDING.
    :param num_embed: Embedding size. Must be even.
    :param max_seq_len: Number of positions for which a weight row is created.
    :param scale_up_input: Stored on the instance; not used in the constructor.
    :param scale_down_positions: If set, fixed weights are multiplied by num_embed ** -0.5.
    :param dtype: Optional data type of the weight.
    :raises ValueError: If weight_type is not supported.
    """
    size_is_even = num_embed % 2 == 0
    utils.check_condition(size_is_even,
                          "Positional embeddings require an even embedding size it "
                          "is however %d." % num_embed)
    super().__init__()
    self.weight_type = weight_type
    self.num_embed = num_embed
    self.max_seq_len = max_seq_len
    self.scale_up_input = scale_up_input
    self.scale_down_positions = scale_down_positions

    if self.weight_type == C.FIXED_POSITIONAL_EMBEDDING:
        fixed_weight = get_positional_embeddings(length=self.max_seq_len, depth=self.num_embed)
        if self.scale_down_positions:
            fixed_weight *= self.num_embed ** -0.5
        if dtype is not None:
            fixed_weight = fixed_weight.to(dtype)
        # Fixed embeddings are registered as a parameter but excluded from gradient updates.
        self.weight = pt.nn.Parameter(fixed_weight, requires_grad=False)
    elif self.weight_type == C.LEARNED_POSITIONAL_EMBEDDING:
        # Storage is only allocated here; no values are written in this constructor.
        learned_weight = pt.empty(self.max_seq_len, self.num_embed, dtype=dtype)
        self.weight = pt.nn.Parameter(learned_weight)
    else:
        raise ValueError("weight_type '%s' is not supported!" % self.weight_type)
def main():
    """
    Command-line entry point for evaluation: compute the requested metrics for one or more
    hypotheses files against a reference file.

    Without --sentence, each metric is computed once per hypotheses file and the scores of
    all files are reported together. With --sentence, the metrics are computed and reported
    for every hypothesis/reference pair.

    :raises ValueError: If an unknown metric name is requested.
    """
    # The adjacent literals previously lacked a separating space ("...are giventhe mean...").
    params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with '
                                                 'respect to a reference set. If multiple hypotheses files are given '
                                                 'the mean and standard deviation of the metrics are reported.')
    arguments.add_evaluate_args(params)
    arguments.add_logging_args(params)
    args = params.parse_args()

    if args.quiet:
        logger.setLevel(logging.ERROR)

    utils.check_condition(args.offset >= 0, "Offset should be non-negative.")
    log_sockeye_version(logger)
    logger.info("Command: %s", " ".join(sys.argv))
    logger.info("Arguments: %s", args)

    references = [' '.join(e) for e in data_io.read_content(args.references)]
    all_hypotheses = [[h.strip() for h in hypotheses] for hypotheses in args.hypotheses]
    if not args.not_strict:
        for hypotheses in all_hypotheses:
            utils.check_condition(len(hypotheses) == len(references),
                                  "Number of hypotheses (%d) and references (%d) does not match." % (len(hypotheses),
                                                                                                     len(references)))
    logger.info("%d hypothesis set(s) | %d hypotheses | %d references",
                len(all_hypotheses), len(all_hypotheses[0]), len(references))

    metric_info = ["%s\t(s_opt)" % name for name in args.metrics]
    logger.info("\t".join(metric_info))

    # Resolve every requested metric name to the function that computes it.
    metrics = []  # type: List[Tuple[str, Callable]]
    for name in args.metrics:
        if name == C.BLEU:
            func = partial(raw_corpus_bleu, offset=args.offset)
        elif name == C.CHRF:
            func = raw_corpus_chrf
        elif name == C.ROUGE1:
            func = raw_corpus_rouge1
        elif name == C.ROUGE2:
            func = raw_corpus_rouge2
        elif name == C.ROUGEL:
            func = raw_corpus_rougel
        else:
            raise ValueError("Unknown metric %s." % name)
        metrics.append((name, func))

    if not args.sentence:
        # Corpus level: one score per hypotheses file and metric, reported once at the end.
        scores = defaultdict(list)  # type: Dict[str, List[float]]
        for hypotheses in all_hypotheses:
            for name, metric in metrics:
                scores[name].append(metric(hypotheses, references))
        _print_mean_std_score(metrics, scores)
    else:
        # Sentence level: scores are computed and reported per hypothesis/reference pair.
        for hypotheses in all_hypotheses:
            for h, r in zip(hypotheses, references):
                scores = defaultdict(list)  # type: Dict[str, List[float]]
                for name, metric in metrics:
                    scores[name].append(metric([h], [r]))
                _print_mean_std_score(metrics, scores)
def _populate_bucket_batch_sizes(self):
    """
    Compute bucket-specific batch sizes (sentences, average_words) and default bucket batch size.

    If sentence-based batching: number of sentences is the same for each batch, determines the
    number of words.

    If word-based batching: number of sentences for each batch is set to the multiple of number
    of devices that produces the number of words closest to the target batch size, but never
    fewer than one sentence per device. Average target sentence length (non-padding symbols)
    is used for word number calculations.

    Does nothing if self.bucket_batch_sizes is already set.

    Sets: self.bucket_batch_sizes
    """
    # Pre-defined bucket batch sizes
    if self.bucket_batch_sizes is not None:
        return
    # Otherwise compute here
    self.bucket_batch_sizes = [None for _ in self.buckets]
    largest_total_batch_size = 0
    for buck_idx, bucket_shape in enumerate(self.buckets):
        # Target/label length with padding
        padded_seq_len = bucket_shape[1]
        # Average target/label length excluding padding
        average_seq_len = self.data_label_average_len[buck_idx]

        # Word-based: num words determines num sentences
        # Sentence-based: num sentences determines num words
        if self.batch_by_words:
            check_condition(padded_seq_len <= self.batch_size,
                            "Word batch size must cover sequence lengths for all"
                            " buckets: (%d > %d)" % (padded_seq_len, self.batch_size))
            # Multiple of number of devices (int) closest to target number of words, assuming each sentence is of
            # average length. round() can return 0 when the target is small relative to the sentence
            # length and number of devices, which would give an empty batch; use at least one multiple.
            sentences_per_device = max(1, round((self.batch_size / average_seq_len) / self.batch_num_devices))
            batch_size_seq = self.batch_num_devices * sentences_per_device
            batch_size_word = batch_size_seq * average_seq_len
        else:
            batch_size_seq = self.batch_size
            batch_size_word = batch_size_seq * average_seq_len
        self.bucket_batch_sizes[buck_idx] = BucketBatchSize(batch_size_seq, batch_size_word)
        # Track largest batch size by total elements
        largest_total_batch_size = max(largest_total_batch_size, batch_size_seq * max(*bucket_shape))

    # Final step: guarantee that largest bucket by sequence length also has largest total batch size.
    # When batching by sentences, this will already be the case.
    if self.batch_by_words:
        padded_seq_len = max(*self.buckets[-1])
        average_seq_len = self.data_label_average_len[-1]
        # Grow the last bucket's batch by one sentence per device until it is the largest.
        while self.bucket_batch_sizes[-1].batch_size * padded_seq_len < largest_total_batch_size:
            self.bucket_batch_sizes[-1] = BucketBatchSize(
                self.bucket_batch_sizes[-1].batch_size + self.batch_num_devices,
                self.bucket_batch_sizes[-1].average_words_per_batch + self.batch_num_devices * average_seq_len)
def __init__(self, updates_per_checkpoint: int, half_life: int, warmup: int = 0) -> None:
    """
    Derive the decay factor and logging interval from the half-life.

    :param updates_per_checkpoint: Number of updates per checkpoint. Must be positive.
    :param half_life: Half-life in checkpoints. Must be positive.
    :param warmup: Passed through to the parent constructor.
    """
    super().__init__(warmup)
    check_condition(updates_per_checkpoint > 0, "updates_per_checkpoint needs to be > 0.")
    check_condition(half_life > 0, "half_life needs to be > 0.")
    # T = half_life * updates_per_checkpoint is the half-life in updates.
    # factor = 3 / T gives sqrt(1 + T * factor) == 2.
    updates_per_half_life = half_life * updates_per_checkpoint
    self.factor = 3. / updates_per_half_life
    self.t_last_log = -1
    self.log_every_t = int(updates_per_half_life)
def __init__(self, base_lr: float = 1.0, warmup: int = 0, t_scale: float = 1.0) -> None:
    """
    Store the basic schedule settings.

    :param base_lr: Base learning rate.
    :param warmup: Warmup value. Must be non-negative.
    :param t_scale: Stored on the instance as-is.
    """
    self.base_lr = base_lr
    warmup_is_valid = warmup >= 0
    check_condition(warmup_is_valid, "warmup needs to be >= 0.")
    self.warmup = warmup
    self.t_scale = t_scale
    # No learning rate is known at construction time.
    self.lr = None  # type: Optional[float]
def get_num_hidden(self) -> int:
    """
    Return the representation size of this encoder.

    This is the size reported by the last encoder in the sequence, or by the one before it
    if the last encoder is a BatchMajor2TimeMajor.
    """
    last_encoder = self.encoders[-1]
    if not isinstance(last_encoder, BatchMajor2TimeMajor):
        return last_encoder.get_num_hidden()

    # The size is taken from the encoder in front of the trailing BatchMajor2TimeMajor.
    utils.check_condition(len(self.encoders) > 1,
                          "Cannot return num_hidden from a BatchMajor2TimeMajor encoder only")
    return self.encoders[-2].get_num_hidden()
def __init__(self,
             config: RecurrentDecoderConfig,
             attention: attentions.Attention,
             lexicon: Optional[lexicons.Lexicon] = None,
             prefix=C.DECODER_PREFIX) -> None:
    """
    Copy the settings from the config and declare the decoder's parameter symbols.

    :param config: Recurrent decoder configuration.
    :param attention: Attention model to use.
    :param lexicon: Optional lexicon.
    :param prefix: Name prefix for the decoder's parameter symbols.
    """
    # TODO: implement variant without input feeding
    self.rnn_config = config.rnn_config
    self.target_vocab_size = config.vocab_size
    self.num_target_embed = config.num_embed
    self.attention = attention
    self.weight_tying = config.weight_tying
    self.context_gating = config.context_gating
    self.layer_norm = config.layer_normalization
    self.lexicon = lexicon
    self.prefix = prefix

    self.num_hidden = self.rnn_config.num_hidden

    # The gating symbols only exist when context gating is enabled.
    if self.context_gating:
        self.gate_w = mx.sym.Variable("%sgate_weight" % prefix)
        self.gate_b = mx.sym.Variable("%sgate_bias" % prefix)
        self.mapped_rnn_output_w = mx.sym.Variable("%smapped_rnn_output_weight" % prefix)
        self.mapped_rnn_output_b = mx.sym.Variable("%smapped_rnn_output_bias" % prefix)
        self.mapped_context_w = mx.sym.Variable("%smapped_context_weight" % prefix)
        self.mapped_context_b = mx.sym.Variable("%smapped_context_bias" % prefix)

    # Stacked RNN
    self.rnn = rnn.get_stacked_rnn(self.rnn_config, self.prefix)
    # RNN init state parameters
    self._create_layer_parameters()

    # Hidden state parameters
    self.hidden_w = mx.sym.Variable("%shidden_weight" % prefix)
    self.hidden_b = mx.sym.Variable("%shidden_bias" % prefix)
    if self.layer_norm:
        self.hidden_norm = LayerNormalization(self.num_hidden, prefix="%shidden_norm" % prefix)
    else:
        self.hidden_norm = None

    # Embedding & output parameters
    self.embedding = encoder.Embedding(self.num_target_embed,
                                       self.target_vocab_size,
                                       prefix=C.TARGET_EMBEDDING_PREFIX,
                                       dropout=0.)  # TODO dropout?
    if not self.weight_tying:
        self.cls_w = mx.sym.Variable("%scls_weight" % prefix)
    else:
        # The output layer reuses the embedding weight, so both sizes have to agree.
        check_condition(self.num_hidden == self.num_target_embed,
                        "Weight tying requires target embedding size and rnn_num_hidden to be equal")
        self.cls_w = self.embedding.embed_weight
    self.cls_b = mx.sym.Variable("%scls_bias" % prefix)
def __init__(self, updates_per_checkpoint: int, half_life: int) -> None:
    """
    Derive the decay factor and logging interval from the half-life.

    :param updates_per_checkpoint: Number of updates per checkpoint. Must be positive.
    :param half_life: Half-life in checkpoints. Must be positive.
    """
    check_condition(updates_per_checkpoint > 0, "updates_per_checkpoint needs to be > 0.")
    check_condition(half_life > 0, "half_life needs to be > 0.")

    # Note: will be overwritten by optimizer
    self.base_lr = None
    # T = half_life * updates_per_checkpoint is the half-life in updates.
    # factor = 1 / T gives 1 + T * factor == 2.
    updates_per_half_life = half_life * updates_per_checkpoint
    self.factor = 1. / updates_per_half_life
    self.t_last_log = -1
    self.log_every_t = int(updates_per_half_life)
def check_arg_compatibility(args: argparse.Namespace):
    """
    Validate combinations of arguments that cannot be used together.

    :param args: Arguments as returned by argparse.
    """
    if args.use_fused_rnn:
        runs_on_gpu = not args.use_cpu
        check_condition(runs_on_gpu, "GPU required for FusedRNN cells")

    metric = args.optimized_metric
    metric_is_available = metric == C.BLEU or metric in args.metrics
    check_condition(metric_is_available,
                    "Must optimize either BLEU or one of tracked metrics (--metrics)")
def main():
    """
    Command-line entry point for translation: parse arguments, load the model(s),
    build a Translator and translate the input.
    """
    global logger

    parser = argparse.ArgumentParser(description='Translate CLI')
    arguments.add_translate_cli_args(parser)
    args = parser.parse_args()

    if args.output is not None:
        # Log to a file next to the output as well.
        log_path = "%s.%s" % (args.output, C.LOG_NAME)
        logger = setup_main_logger(__name__,
                                   console=not args.quiet,
                                   file_logging=True,
                                   path=log_path)

    if args.checkpoints is not None:
        check_condition(len(args.checkpoints) == len(args.models),
                        "must provide checkpoints for each model")

    log_basic_info(args)

    output_handler = sockeye.output_handler.get_output_handler(args.output_type,
                                                               args.output,
                                                               args.sure_align_threshold)

    with ExitStack() as exit_stack:
        context = _setup_context(args, exit_stack)

        lexicon_given = args.restrict_lexicon is not None
        models, vocab_source, vocab_target = sockeye.inference.load_models(
            context,
            args.max_input_len,
            args.beam_size,
            args.batch_size,
            args.models,
            args.checkpoints,
            args.softmax_temperature,
            args.max_output_length_num_stds,
            decoder_return_logit_inputs=lexicon_given,
            cache_output_layer_w_b=lexicon_given,
            input_dim=args.input_dim)

        restrict_lexicon = None  # type: TopKLexicon
        if args.restrict_lexicon:
            restrict_lexicon = TopKLexicon(vocab_source, vocab_target)
            restrict_lexicon.load(args.restrict_lexicon)

        length_penalty = sockeye.inference.LengthPenalty(args.length_penalty_alpha,
                                                         args.length_penalty_beta)
        translator = sockeye.inference.Translator(context,
                                                  args.ensemble_mode,
                                                  args.bucket_width,
                                                  length_penalty,
                                                  models,
                                                  vocab_source,
                                                  vocab_target,
                                                  restrict_lexicon,
                                                  input_dim=args.input_dim)
        read_and_translate(translator, output_handler, args.chunk_size, args.input)
def __init__(self,
             model_folder: str,
             context: mx.context.Context,
             fused: bool,
             max_input_len: Optional[int],
             beam_size: int,
             checkpoint: Optional[int] = None,
             softmax_temperature: Optional[float] = None):
    """
    Load the model configuration and parameters from a model folder and prepare the
    encoder and decoder modules for inference.

    :param model_folder: Folder containing the model configuration and parameter files.
    :param context: Context stored on the instance.
    :param fused: Passed on to the model component construction.
    :param max_input_len: Maximum input length; defaults to the model's max_seq_len if None.
    :param beam_size: Beam size. Must be smaller than the target vocabulary size.
    :param checkpoint: Checkpoint whose parameters are loaded. If not given (or falsy),
                       the best parameters are loaded.
    :param softmax_temperature: Optional softmax temperature, stored on the instance.
    """
    # load config & determine parameter file
    config_path = os.path.join(model_folder, C.CONFIG_NAME)
    super().__init__(sockeye.model.SockeyeModel.load_config(config_path))
    if checkpoint:
        params_name = C.PARAMS_NAME % checkpoint
    else:
        params_name = C.PARAMS_BEST_NAME
    fname_params = os.path.join(model_folder, params_name)

    if max_input_len is None:
        max_input_len = self.config.max_seq_len
    elif max_input_len != self.config.max_seq_len:
        logger.warning("Model was trained with max_seq_len=%d, but using max_input_len=%d.",
                       self.config.max_seq_len, max_input_len)
    self.max_input_len = max_input_len

    check_condition(beam_size < self.config.vocab_target_size,
                    'The beam size must be smaller than the target vocabulary size.')

    self.beam_size = beam_size
    self.softmax_temperature = softmax_temperature
    self.encoder_batch_size = 1
    self.context = context

    self._build_model_components(self.max_input_len, fused)
    self.encoder_module, self.decoder_module = self._build_modules()

    self.decoder_data_shapes_cache = dict()  # bucket_key -> shape cache
    # Both modules are bound with the shapes for the maximum input length.
    max_encoder_data_shapes = self._get_encoder_data_shapes(self.max_input_len)
    max_decoder_data_shapes = self._get_decoder_data_shapes(self.max_input_len)
    self.encoder_module.bind(data_shapes=max_encoder_data_shapes, for_training=False, grad_req="null")
    self.decoder_module.bind(data_shapes=max_decoder_data_shapes, for_training=False, grad_req="null")

    self.load_params_from_file(fname_params)
    self.encoder_module.init_params(arg_params=self.params, allow_missing=False)
    self.decoder_module.init_params(arg_params=self.params, allow_missing=False)
def iterate_doc_level(source_iterators: Iterator,
                      source_pre_iterators: Sequence[Iterator],
                      source_nxt_iterators: Sequence[Iterator],
                      target_pre_iterators: Sequence[Iterator],
                      target_nxt_iterators: Sequence[Iterator]):
    """
    Generator that yields current source sentences and context sentences.

    All iterators are advanced in lockstep and are required to end at the same time.

    :param source_iterators: Current source sentences.
    :param source_pre_iterators: Previous source sentences.
    :param source_nxt_iterators: Next source sentences.
    :param target_pre_iterators: Previous target sentences.
    :param target_nxt_iterators: Next target sentences.
    :return: Current source string sequence including context ones, as tuples of
             (source_pre, sources, source_nxt, target_pre, target_nxt).
    """
    # A private sentinel marks an exhausted stream, so the item that revealed a mismatch is
    # never lost. Catching StopIteration and probing the iterators afterwards discarded the
    # item already read from the longer streams, so a stream that was exactly one line
    # shorter than the others went undetected.
    exhausted = object()

    def advance(iterators: Sequence[Iterator]) -> list:
        # Read one item from every iterator; exhausted ones contribute the sentinel.
        return [next(iterator, exhausted) for iterator in iterators]

    while True:
        sources = next(source_iterators, exhausted)
        source_pre = advance(source_pre_iterators)
        source_nxt = advance(source_nxt_iterators)
        target_pre = advance(target_pre_iterators)
        target_nxt = advance(target_nxt_iterators)

        additional = source_pre + source_nxt + target_pre + target_nxt
        if sources is exhausted or any(item is exhausted for item in additional):
            # At least one stream has ended, so all of them must have ended.
            check_condition(sources is exhausted,
                            "Different number of lines in the source original data")
            check_condition(all(item is exhausted for item in additional),
                            "Different number of lines in the additional data")
            return
        yield source_pre, sources, source_nxt, target_pre, target_nxt
def get_recurrent_encoder(config: RecurrentEncoderConfig, fused: bool,
                          embed_weight: Optional[mx.sym.Symbol] = None) -> 'Encoder':
    """
    Returns a recurrent encoder with embedding, batch2time-major conversion, and bidirectional RNN.
    If num_layers > 1, adds additional uni-directional RNNs.

    :param config: Configuration for recurrent encoder.
    :param fused: Whether to use FusedRNNCell (CuDNN). Only works with GPU context.
    :param embed_weight: Optionally use an existing embedding matrix instead of creating a new one.
    :return: Encoder instance.
    """
    # TODO give more control on encoder architecture
    encoders = list()  # type: List[Encoder]

    encoders.append(Embedding(num_embed=config.num_embed,
                              vocab_size=config.vocab_size,
                              prefix=C.SOURCE_EMBEDDING_PREFIX,
                              dropout=config.embed_dropout,
                              embed_weight=embed_weight))

    if config.conv_config is not None:
        encoders.append(ConvolutionalEmbeddingEncoder(config.conv_config, prefix=C.CHAR_SEQ_ENCODER_PREFIX))
        if config.conv_config.add_positional_encoding:
            # If specified, add positional encodings to segment embeddings
            encoders.append(AddSinCosPositionalEmbeddings(num_embed=config.num_embed,
                                                          prefix="%sadd_positional_encodings" % C.CHAR_SEQ_ENCODER_PREFIX))

    encoders.append(BatchMajor2TimeMajor())

    if config.reverse_input:
        encoders.append(ReverseSequence())

    if config.rnn_config.residual:
        utils.check_condition(config.rnn_config.first_residual_layer >= 2,
                              "Residual connections on the first encoder layer are not supported")

    encoder_class = FusedRecurrentEncoder if fused else RecurrentEncoder

    # One layer bi-directional RNN:
    # encoder_class was previously not passed on, so this layer ignored `fused`.
    encoders.append(BiDirectionalRNNEncoder(rnn_config=config.rnn_config.copy(num_layers=1),
                                            prefix=C.BIDIRECTIONALRNN_PREFIX,
                                            layout=C.TIME_MAJOR,
                                            encoder_class=encoder_class))

    if config.rnn_config.num_layers > 1:
        # Stacked uni-directional RNN:
        # Because we already have a one layer bi-rnn we reduce the num_layers as well as the first_residual_layer.
        remaining_rnn_config = config.rnn_config.copy(num_layers=config.rnn_config.num_layers - 1,
                                                      first_residual_layer=config.rnn_config.first_residual_layer - 1)
        encoders.append(encoder_class(rnn_config=remaining_rnn_config,
                                      prefix=C.STACKEDRNN_PREFIX,
                                      layout=C.TIME_MAJOR))

    return EncoderSequence(encoders)
def __init__(self, reduce_factor: float, reduce_num_not_improved: int, warmup: int = 0) -> None:
    """
    Set up a schedule that reduces the learning rate when the validation score stops improving.

    :param reduce_factor: Factor by which the learning rate is reduced. Must be in ]0,1].
    :param reduce_num_not_improved: Number of non-improvements that triggers a reduction.
    :param warmup: Passed through to the parent constructor.
    """
    super().__init__(warmup)
    factor_in_range = 0.0 < reduce_factor <= 1
    check_condition(factor_in_range, "reduce_factor should be in ]0,1].")
    self.reduce_factor = reduce_factor
    self.reduce_num_not_improved = reduce_num_not_improved
    self.num_not_improved = 0
    self.lr = None  # type: float
    self.t_last_log = -1
    # With no warmup configured the schedule counts as warmed up from the start.
    self.warmed_up = not (self.warmup > 0)
    logger.info("Will reduce the learning rate by a factor of %.2f whenever"
                " the validation score doesn't improve %d times.",
                reduce_factor, reduce_num_not_improved)
def check_arg_compatibility(args: argparse.Namespace):
    """
    Validate combinations of arguments that cannot be used together.

    :param args: Arguments as returned by argparse.
    """
    metric = args.optimized_metric
    metric_is_available = metric == C.BLEU or metric in args.metrics
    check_condition(metric_is_available,
                    "Must optimize either BLEU or one of tracked metrics (--metrics)")

    if args.encoder == C.TRANSFORMER_TYPE:
        source_num_embed = args.num_embed[0]
        check_condition(args.transformer_model_size == source_num_embed,
                        "Source embedding size must match transformer model size: %s vs. %s"
                        % (args.transformer_model_size, source_num_embed))

        total_source_factor_size = sum(args.source_factors_num_embed)
        if total_source_factor_size > 0:
            # Size of the source embedding plus the sizes of all source factor embeddings.
            adjusted_size = source_num_embed + total_source_factor_size
            is_even = adjusted_size % 2 == 0
            check_condition(is_even and adjusted_size % args.transformer_attention_heads == 0,
                            "Sum of source factor sizes, i.e. num-embed plus source-factors-num-embed, (%d) "
                            "has to be even and a multiple of attention heads (%d)" % (
                                adjusted_size, args.transformer_attention_heads))

    if args.decoder == C.TRANSFORMER_TYPE:
        target_num_embed = args.num_embed[1]
        check_condition(args.transformer_model_size == target_num_embed,
                        "Target embedding size must match transformer model size: %s vs. %s"
                        % (args.transformer_model_size, target_num_embed))
def __init__(self,
             context: mx.context.Context,
             inputs: str,
             references: str,
             model: str,
             max_input_len: int,
             beam_size: int = C.DEFAULT_BEAM_SIZE,
             bucket_width_source: int = 10,
             bucket_width_target: int = 10,
             length_penalty_alpha: float = 1.0,
             length_penalty_beta: float = 0.0,
             softmax_temperature: Optional[float] = None,
             max_output_length_num_stds: int = C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH,
             ensemble_mode: str = 'linear',
             sample_size: int = -1,
             random_seed: int = 42) -> None:
    """
    Load the input/reference sentence pairs, optionally sample a subset of them, and write the
    selected sentences into the model folder.

    :param context: Context stored on the instance for later use.
    :param inputs: Path to the file with input sentences (one per line).
    :param references: Path to the file with reference sentences (one per line).
    :param model: Model folder; the selected inputs and references are written into it.
    :param max_input_len: Maximum input length (stored for later use).
    :param beam_size: Beam size (stored for later use).
    :param bucket_width_source: Source bucket width (stored for later use).
    :param bucket_width_target: Target bucket width (stored for later use).
    :param length_penalty_alpha: Length penalty alpha (stored for later use).
    :param length_penalty_beta: Length penalty beta (stored for later use).
    :param softmax_temperature: Optional softmax temperature (stored for later use).
    :param max_output_length_num_stds: Stored for later use.
    :param ensemble_mode: Ensemble mode (stored for later use).
    :param sample_size: Number of sentence pairs to sample. Values <= 0, or values not smaller
           than the number of available pairs, select all pairs.
    :param random_seed: Seed for the private random number generator used for sampling.
    """
    self.context = context
    self.max_input_len = max_input_len
    self.max_output_length_num_stds = max_output_length_num_stds
    self.ensemble_mode = ensemble_mode
    self.beam_size = beam_size
    self.bucket_width_source = bucket_width_source
    self.bucket_width_target = bucket_width_target
    self.length_penalty_alpha = length_penalty_alpha
    self.length_penalty_beta = length_penalty_beta
    self.softmax_temperature = softmax_temperature
    self.model = model

    with smart_open(inputs) as inputs_fin, smart_open(references) as references_fin:
        input_sentences = inputs_fin.readlines()
        target_sentences = references_fin.readlines()
        check_condition(len(input_sentences) == len(target_sentences), "Number of sentence pairs do not match")
        if sample_size <= 0:
            sample_size = len(input_sentences)
        if sample_size < len(input_sentences):
            # custom random number generator to guarantee the same samples across runs in order to be able to
            # compare metrics across independent runs
            random_gen = random.Random(random_seed)
            self.input_sentences, self.target_sentences = zip(
                *random_gen.sample(list(zip(input_sentences, target_sentences)), sample_size))
        else:
            self.input_sentences, self.target_sentences = input_sentences, target_sentences

    logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
                max_input_len, beam_size, model, len(self.input_sentences))

    with smart_open(os.path.join(self.model, C.DECODE_REF_NAME), 'w') as trg_out, \
            smart_open(os.path.join(self.model, C.DECODE_IN_NAME), 'w') as src_out:
        # The sentences come from readlines() and are written back unchanged;
        # writelines avoids building throwaway lists purely for side effects.
        trg_out.writelines(self.target_sentences)
        src_out.writelines(self.input_sentences)
def __init__(self, schedule: List[Tuple[float, int]], updates_per_checkpoint: int) -> None:
    """
    Set up a scheduler that follows a fixed list of (value, num_updates) steps.

    :param schedule: List of (value, num_updates) tuples. Every num_updates must be > 0 and
           divisible by updates_per_checkpoint.
    :param updates_per_checkpoint: Number of updates each num_updates entry must be a multiple of.
    """
    super().__init__()

    all_positive = all(step_updates > 0 for _, step_updates in schedule)
    check_condition(all_positive, "num_updates for each step should be > 0.")

    all_aligned = all(step_updates % updates_per_checkpoint == 0 for _, step_updates in schedule)
    check_condition(all_aligned, "num_updates for each step should be divisible by updates_per_checkpoint.")

    self.schedule = schedule

    # All integer progress counters start out at zero.
    self.current_step = self.current_step_num_updates = self.current_step_started_at = 0
    self.next_step_at = self.latest_t = 0
    self.current_rate = 0.0

    # Must run last: it is called once every counter above has been reset.
    self._update_rate(self.current_step)
def make_inputs(input_file: Optional[str],
                translator: inference.Translator,
                input_is_json: bool,
                input_factors: Optional[List[str]] = None) -> Generator[inference.TranslatorInput, None, None]:
    """
    Lazily produce one TranslatorInput per input line, with sentence ids starting at 1.

    If input_file is None, lines are read from stdin and no factor files may be given; a non-JSON
    line is then parsed as a factored string with the help of the translator.
    Otherwise input_file and all factor files are opened and read in lockstep. The number of files
    must equal the number of source factors the translator expects. With JSON input only the line
    from input_file is used.

    :param input_file: The source file (possibly None).
    :param translator: Translator that will translate each line of input.
    :param input_is_json: Whether the input is in json format.
    :param input_factors: Source factor files.
    :return: TranslatorInput objects.
    """
    if input_file is None:
        check_condition(input_factors is None, "Translating from STDIN, not expecting any factor files.")
        for sentence_id, line in enumerate(sys.stdin, 1):
            if input_is_json:
                yield inference.make_input_from_json_string(sentence_id=sentence_id, json_string=line)
            else:
                yield inference.make_input_from_factored_string(sentence_id=sentence_id,
                                                                factored_string=line,
                                                                translator=translator)
        return

    factor_files = [] if input_factors is None else input_factors
    all_files = [input_file] + factor_files
    check_condition(translator.num_source_factors == len(all_files),
                    "Model(s) require %d factors, but %d given (through --input and --input-factors)." % (
                        translator.num_source_factors, len(all_files)))

    with ExitStack() as exit_stack:
        # The ExitStack closes every opened stream when the generator finishes or is closed.
        streams = [exit_stack.enter_context(data_io.smart_open(path)) for path in all_files]
        for sentence_id, parallel_lines in enumerate(zip(*streams), 1):
            if input_is_json:
                yield inference.make_input_from_json_string(sentence_id=sentence_id,
                                                            json_string=parallel_lines[0])
            else:
                yield inference.make_input_from_multiple_strings(sentence_id=sentence_id,
                                                                 strings=list(parallel_lines))
def get_lr_scheduler(scheduler_type: str,
                     updates_per_checkpoint: int,
                     learning_rate_half_life: int,
                     learning_rate_reduce_factor: float,
                     learning_rate_reduce_num_not_improved: int,
                     learning_rate_schedule: Optional[List[Tuple[float, int]]] = None,
                     learning_rate_warmup: Optional[int] = 0) -> Optional[LearningRateScheduler]:
    """
    Build the learning rate scheduler selected by scheduler_type.

    Returns None when scheduler_type is None, and also when the plateau-reduce scheduler is
    requested with a reduce factor >= 1.0 (a warning is logged in that case).

    :param scheduler_type: Scheduler type.
    :param updates_per_checkpoint: Number of batches between checkpoints.
    :param learning_rate_half_life: Half life of the learning rate in number of checkpoints.
    :param learning_rate_reduce_factor: Factor to reduce learning rate with.
    :param learning_rate_reduce_num_not_improved: Number of checkpoints with no improvement after which
           learning rate is reduced.
    :param learning_rate_schedule: Optional fixed learning rate schedule.
    :param learning_rate_warmup: Number of batches that the learning rate is linearly increased.
    :raises: ValueError if unknown scheduler_type
    :return: Learning rate scheduler.
    """
    schedule_allowed = learning_rate_schedule is None or scheduler_type == C.LR_SCHEDULER_FIXED_STEP
    check_condition(schedule_allowed,
                    "Learning rate schedule can only be used with '%s' learning rate scheduler."
                    % C.LR_SCHEDULER_FIXED_STEP)

    if scheduler_type is None:
        return None

    if scheduler_type == C.LR_SCHEDULER_FIXED_RATE_INV_SQRT_T:
        return LearningRateSchedulerInvSqrtT(updates_per_checkpoint, learning_rate_half_life, learning_rate_warmup)

    if scheduler_type == C.LR_SCHEDULER_FIXED_RATE_INV_T:
        return LearningRateSchedulerInvT(updates_per_checkpoint, learning_rate_half_life, learning_rate_warmup)

    if scheduler_type == C.LR_SCHEDULER_FIXED_STEP:
        check_condition(learning_rate_schedule is not None,
                        "learning_rate_schedule needed for %s scheduler" % C.LR_SCHEDULER_FIXED_STEP)
        return LearningRateSchedulerFixedStep(learning_rate_schedule, updates_per_checkpoint)

    if scheduler_type == C.LR_SCHEDULER_PLATEAU_REDUCE:
        check_condition(learning_rate_reduce_factor is not None,
                        "learning_rate_reduce_factor needed for %s scheduler" % C.LR_SCHEDULER_PLATEAU_REDUCE)
        check_condition(learning_rate_reduce_num_not_improved is not None,
                        "learning_rate_reduce_num_not_improved needed for %s scheduler"
                        % C.LR_SCHEDULER_PLATEAU_REDUCE)
        if learning_rate_reduce_factor >= 1.0:
            # No plateau scheduler is created for a reduce factor of 1.0 or more.
            logger.warning("Not using %s learning rate scheduling: learning_rate_reduce_factor == 1.0"
                           % C.LR_SCHEDULER_PLATEAU_REDUCE)
            return None
        return LearningRateSchedulerPlateauReduce(learning_rate_reduce_factor,
                                                  learning_rate_reduce_num_not_improved,
                                                  learning_rate_warmup)

    raise ValueError("Unknown learning rate scheduler type %s." % scheduler_type)
def run_translate(args: argparse.Namespace):
    """
    Run translation as configured by the command-line arguments.

    Sets up logging (to a file next to the output when --output is given), builds the output
    handler, determines the single device to run on, loads the model(s) and an optional restrict
    lexicon, constructs the translator and finally reads and translates the input.

    :param args: Arguments as returned by argparse.
    """
    global logger

    if args.output is not None:
        log_path = "%s.%s" % (args.output, C.LOG_NAME)
        logger = setup_main_logger(__name__, console=not args.quiet, file_logging=True, path=log_path)

    if args.checkpoints is not None:
        check_condition(len(args.checkpoints) == len(args.models), "must provide checkpoints for each model")

    log_basic_info(args)

    output_handler = get_output_handler(args.output_type, args.output, args.sure_align_threshold)

    with ExitStack() as exit_stack:
        check_condition(len(args.device_ids) == 1, "translate only supports single device for now")
        contexts = determine_context(device_ids=args.device_ids,
                                     use_cpu=args.use_cpu,
                                     disable_device_locking=args.disable_device_locking,
                                     lock_dir=args.lock_dir,
                                     exit_stack=exit_stack)
        context = contexts[0]
        logger.info("Translate Device: %s", context)

        if args.override_dtype == C.DTYPE_FP16:
            logger.warning('Experimental feature \'--override-dtype float16\' has been used. '
                           'This feature may be removed or change its behaviour in future. '
                           'DO NOT USE IT IN PRODUCTION!')

        # Both lexicon-related model options are switched on exactly when a restrict lexicon is given.
        lexicon_given = args.restrict_lexicon is not None
        models, source_vocabs, target_vocab = inference.load_models(
            context=context,
            max_input_len=args.max_input_len,
            beam_size=args.beam_size,
            batch_size=args.batch_size,
            model_folders=args.models,
            checkpoints=args.checkpoints,
            softmax_temperature=args.softmax_temperature,
            max_output_length_num_stds=args.max_output_length_num_stds,
            decoder_return_logit_inputs=lexicon_given,
            cache_output_layer_w_b=lexicon_given,
            override_dtype=args.override_dtype)

        lexicon = None  # type: Optional[TopKLexicon]
        if args.restrict_lexicon:
            lexicon = TopKLexicon(source_vocabs[0], target_vocab)
            lexicon.load(args.restrict_lexicon, k=args.restrict_lexicon_topk)

        length_penalty = inference.LengthPenalty(args.length_penalty_alpha, args.length_penalty_beta)
        keep_beam = args.output_type == C.OUTPUT_HANDLER_BEAM_STORE

        translator = inference.Translator(context=context,
                                          ensemble_mode=args.ensemble_mode,
                                          bucket_source_width=args.bucket_width,
                                          length_penalty=length_penalty,
                                          beam_prune=args.beam_prune,
                                          beam_search_stop=args.beam_search_stop,
                                          models=models,
                                          source_vocabs=source_vocabs,
                                          target_vocab=target_vocab,
                                          restrict_lexicon=lexicon,
                                          avoid_list=args.avoid_list,
                                          store_beam=keep_beam,
                                          strip_unknown_words=args.strip_unknown_words)

        # Translation runs inside the ExitStack so whatever determine_context registered on it
        # stays active until translation has finished.
        read_and_translate(translator=translator,
                           output_handler=output_handler,
                           chunk_size=args.chunk_size,
                           input_file=args.input,
                           input_factors=args.input_factors,
                           input_is_json=args.json_input)
def __init__(self, warmup: int = 0) -> None:
    """
    Initialize the state shared by learning rate schedulers.

    :param warmup: Warmup value; must be >= 0.
    """
    # Note: will be overwritten by MXNet optimizer
    self.base_lr = None

    check_condition(warmup >= 0, "warmup needs to be >= 0.")
    self.warmup = warmup

    # Warmup logging state: the interval is a tenth of the warmup (integer division).
    self.last_warmup_log = -1
    self.log_warmup_every_t = warmup // 10