Beispiel #1
0
 def __init__(self,
              learning_rate: float = 0.001,
              beta1: float = 0.9,
              beta2: float = 0.999,
              beta3_batch: float = 0.999,
              beta3_checkpoint: float = 0.,
              epsilon: float = 1e-8,
              k_lo: float = 0.1,
              k_hi: float = 10,
              schedule_decay: float = 0.004,
              use_batch_objective: bool = True,
              use_checkpoint_objective: bool = False,
              use_nesterov_momentum: bool = False,
              **kwargs) -> None:
     """
     Validate the objective flags and store the optimizer hyper-parameters.

     Every named argument except learning_rate is stored unchanged as an
     attribute of the same name.

     :param learning_rate: Forwarded to the parent constructor.
     :param use_batch_objective: At least one of use_batch_objective and
            use_checkpoint_objective must be truthy.
     :param use_checkpoint_objective: See use_batch_objective.
     :param kwargs: Additional keyword arguments forwarded to the parent constructor.
     """
     # The condition is checked before the parent constructor runs.
     any_objective = bool(use_batch_objective or use_checkpoint_objective)
     check_condition(
         any_objective,
         "Must use at least one of: batch objective, checkpoint objective")
     super().__init__(learning_rate=learning_rate, **kwargs)

     # Decay rates
     self.beta1 = beta1
     self.beta2 = beta2
     self.beta3_batch = beta3_batch
     self.beta3_checkpoint = beta3_checkpoint

     # Remaining numeric settings
     self.epsilon = epsilon
     self.k_lo = k_lo
     self.k_hi = k_hi
     self.schedule_decay = schedule_decay

     # Feature switches
     self.use_batch_objective = use_batch_objective
     self.use_checkpoint_objective = use_checkpoint_objective
     self.use_nesterov_momentum = use_nesterov_momentum
Beispiel #2
0
    def __call__(self,
                 hidden: Union[mx.sym.Symbol, mx.nd.NDArray],
                 weight: Optional[mx.nd.NDArray] = None,
                 bias: Optional[mx.nd.NDArray] = None):
        """
        Linear transformation to vocab size. Returns logits.

        Symbol inputs are projected with the layer's own parameters (self.w, self.b).
        NDArray inputs are projected with the explicitly passed weight and bias.

        :param hidden: Decoder representation for n elements. Shape: (n, self.num_hidden).
        :param weight: Weight NDArray. Required for NDArray input, unused for Symbol input.
        :param bias: Bias NDArray. Required for NDArray input, unused for Symbol input.
        :return: Logits. Shape(n, self.vocab_size).
        """
        if not isinstance(hidden, mx.sym.Symbol):
            # Equivalent NDArray implementation (requires passed weights/biases)
            assert isinstance(hidden, mx.nd.NDArray)
            have_params = weight is not None and bias is not None
            utils.check_condition(
                have_params,
                "OutputLayer NDArray implementation requires passing weight and bias NDArrays."
            )
            # The output size is taken from the bias length.
            output_size = bias.shape[0]
            return mx.nd.FullyConnected(data=hidden,
                                        num_hidden=output_size,
                                        weight=weight,
                                        bias=bias,
                                        flatten=False)

        # Symbolic implementation
        # TODO dropout?
        return mx.sym.FullyConnected(data=hidden,
                                     num_hidden=self.vocab_size,
                                     weight=self.w,
                                     bias=self.b,
                                     flatten=False,
                                     name=C.LOGITS_NAME)
Beispiel #3
0
    def __next__(self):
        """
        Return the pre-fetched alignment and fetch the one after it.

        With add_bos, every index other than -1 is shifted by one and a 0 is prepended.
        With add_eos, the source length at the current position minus one is appended.

        :return: The (possibly adjusted) alignment.
        :raises StopIteration: If no element is left.
        """
        alignment = self._next
        if alignment is None:
            raise StopIteration

        check_condition(bool(alignment),
                        "Empty alignment in file %s" % self.path)

        if self.add_bos:
            # -1 entries are kept as they are, all other indices move one to the right
            shifted = [0]
            shifted.extend(-1 if i == -1 else i + 1 for i in alignment)
            alignment = shifted

        if self.add_eos:
            alignment.append(self.source_lengths[self.last_idx] - 1)

        self.last_idx += 1

        # Elements are only counted during the first pass
        first_pass = not self._iterated_once
        if first_pass:
            self.count += 1

        # Look ahead one element; None marks exhaustion
        upcoming = next(self._iter, None)
        self._next = upcoming
        if upcoming is None:
            self._iter = None
            if first_pass:
                self._iterated_once = True

        return alignment
Beispiel #4
0
    def __init__(self,
                 rnn_config: rnn.RNNConfig,
                 prefix=C.BIDIRECTIONALRNN_PREFIX,
                 layout=C.TIME_MAJOR,
                 encoder_class: Callable = RecurrentEncoder) -> None:
        """
        Build a forward and a reverse encoder, each with half of the configured hidden units.

        :param rnn_config: RNN configuration; num_hidden must be even.
        :param prefix: Name prefix; the forward/reverse prefixes are appended to it.
        :param layout: Layout of the encoder input. A layout starting with 'N' triggers a warning.
        :param encoder_class: Callable used to construct the two directional encoders.
        """
        full_hidden = rnn_config.num_hidden
        utils.check_condition(
            full_hidden % 2 == 0,
            "num_hidden must be a multiple of 2 for BiDirectionalRNNEncoders.")

        self.rnn_config = rnn_config
        # Each direction gets half of the hidden units
        self.internal_rnn_config = rnn_config.copy(num_hidden=rnn_config.num_hidden // 2)

        batch_major_input = layout[0] == 'N'
        if batch_major_input:
            logger.warning(
                "Batch-major layout for encoder input. Consider using time-major layout for faster speed"
            )

        # time-major layout as _encode needs to swap layout for SequenceReverse
        shared_args = dict(rnn_config=self.internal_rnn_config, layout=C.TIME_MAJOR)
        self.forward_rnn = encoder_class(prefix=prefix + C.FORWARD_PREFIX, **shared_args)
        self.reverse_rnn = encoder_class(prefix=prefix + C.REVERSE_PREFIX, **shared_args)

        self.layout = layout
        self.prefix = prefix
Beispiel #5
0
    def __next__(self):
        """
        Return the pre-fetched sentence as ids and fetch the one after it.

        The tokens are converted with tokens2ids using self.vocab. With add_bos / add_eos,
        the ids of C.BOS_SYMBOL / C.EOS_SYMBOL are added at the front / end.

        :return: List of ids for the sentence.
        :raises StopIteration: If no element is left.
        """
        tokens = self._next
        if tokens is None:
            raise StopIteration

        ids = tokens2ids(tokens, self.vocab)
        check_condition(bool(ids),
                        "Empty sentence in file %s" % self.path)

        if self.add_bos:
            ids.insert(0, self.vocab[C.BOS_SYMBOL])
        if self.add_eos:
            ids.append(self.vocab[C.EOS_SYMBOL])

        # Elements are only counted during the first pass
        first_pass = not self._iterated_once
        if first_pass:
            self.count += 1

        # Look ahead one element; None marks exhaustion
        upcoming = next(self._iter, None)
        self._next = upcoming
        if upcoming is None:
            self._iter = None
            if first_pass:
                self._iterated_once = True

        return ids
Beispiel #6
0
def read_alignment_file(path, trg_lengths, src_lengths):
    """
    Read an alignment file in either flat or multi-line format.

    The file is treated as flat when it has exactly one line per target sentence,
    otherwise as multi-line.

    :param path: path to alignment file
    :param trg_lengths: array of target lengths (for each sentence)
    :param src_lengths: array of source lengths (for each sentence)
    :return: tuple of (array of alignments (unprocessed), whether the file was multi-line)
    """
    check_condition(
        len(trg_lengths) == len(src_lengths),
        "source and target sentences must be parallel")

    # Read everything inside a context manager so the handle is closed even on error
    # (it was previously left open).
    with smart_open(path) as file:
        content = file.readlines()

    is_multiline = len(content) != len(trg_lengths)
    if is_multiline:
        alignments = _read_multiline_alignment_file(content=content,
                                                    trg_lengths=trg_lengths)
    else:
        alignments = _read_flat_alignment_file(content=content,
                                               trg_lengths=trg_lengths)

    check_condition(
        len(alignments) == len(trg_lengths), "alignment mst be parallel")
    return alignments, is_multiline
Beispiel #7
0
def main():
    """
    Command-line entry point: build a vocabulary from the input paths and write it as JSON.

    Both --num-words and --word-min-count are parsed as pairs whose two values must be equal.
    """
    parser = argparse.ArgumentParser(
        description='CLI to build source and target vocab(s).')
    arguments.add_build_vocab_args(parser)
    args = parser.parse_args()

    # Only a common value is supported for each of the pair-valued options
    num_words, num_words_other = args.num_words
    utils.check_condition(
        num_words == num_words_other,
        "Vocabulary CLI only allows a common value for --num-words")
    word_min_count, word_min_count_other = args.word_min_count
    utils.check_condition(
        word_min_count == word_min_count_other,
        "Vocabulary CLI only allows a common value for --word-min-count")

    # Replace the module-level logger with one that also logs next to the output file
    global logger
    log_path = "%s.%s" % (args.output, C.LOG_NAME)
    logger = log.setup_main_logger("build_vocab",
                                   file_logging=True,
                                   console=True,
                                   path=log_path)

    vocab = build_from_paths(args.inputs,
                             num_words=num_words,
                             min_count=word_min_count)
    logger.info("Vocabulary size: %d ", len(vocab))

    json_path = args.output + C.JSON_SUFFIX
    vocab_to_json(vocab, json_path)
Beispiel #8
0
def translate(output_handler: output_handler.OutputHandler, source_data: Iterable[str],
              translator: inference.Translator, chunk_id: int = 0,
              reference_data: Iterable[str] = None) -> float:
    """
    Translates each line from source_data, calling output handler after translating a batch.

    Inputs are numbered starting at chunk_id + 1. If source_data yields nothing and no references
    are given, nothing is translated and only the elapsed time is returned.

    :param output_handler: A handler that will be called once with the output of each translation.
    :param source_data: A enumerable list of source sentences that will be translated.
    :param translator: The translator that will be used for each line of input.
    :param chunk_id: Global id of the chunk.
    :param reference_data: A enumerable list of reference sentences that will be force aligned to the source.
    :return: Total time taken.
    """

    tic = time.time()
    if reference_data:
        pairs = itertools.zip_longest(source_data, reference_data)
    else:
        # Without references every source line is paired with None. Zipping against a
        # [None] placeholder with zip_longest would fabricate a (None, None) input
        # when source_data is empty.
        pairs = zip(source_data, itertools.repeat(None))
    trans_inputs = [translator.make_input(i, line, reference)
                    for i, (line, reference) in enumerate(pairs, chunk_id + 1)]
    if not trans_inputs:
        # Nothing to translate: avoid indexing trans_inputs[0] and dividing by zero below.
        return time.time() - tic

    if translator.dictionary is not None:
        utils.check_condition(translator.batch_size == 1, "Dictionary replacement works only with batch size 1")
        translator.seq_idx = trans_inputs[0].id - 1
    trans_outputs = translator.translate(trans_inputs)
    total_time = time.time() - tic
    # Every input is reported with the same average time
    batch_time = total_time / len(trans_inputs)
    for trans_input, trans_output in zip(trans_inputs, trans_outputs):
        output_handler.handle(trans_input, trans_output, batch_time)
    return total_time
Beispiel #9
0
def determine_context(args: argparse.Namespace,
                      exit_stack: ExitStack) -> List[mx.Context]:
    """
    Determine the context we should run on (CPU or GPU).

    With args.use_cpu a single CPU context is returned. Otherwise at least one GPU must be
    detected; the device ids are either expanded directly (device locking disabled) or
    acquired through the exit stack. For sentence-based batching the batch size must be
    divisible by the number of devices.

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib.
    :return: A list with the context(s) to run on.
    """
    if args.use_cpu:
        logger.info("Training Device: CPU")
        return [mx.cpu()]

    num_gpus = utils.get_num_gpus()
    check_condition(
        num_gpus >= 1,
        "No GPUs found, consider running on the CPU with --use-cpu "
        "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
        "binary isn't on the path).")

    if args.disable_device_locking:
        gpu_ids = utils.expand_requested_device_ids(args.device_ids)
    else:
        # Entered on the caller's exit stack
        gpu_ids = exit_stack.enter_context(
            utils.acquire_gpus(args.device_ids, lock_dir=args.lock_dir))

    if args.batch_type == C.BATCH_TYPE_SENTENCE:
        num_devices = len(gpu_ids)
        check_condition(
            args.batch_size % num_devices == 0,
            "When using multiple devices the batch size must be "
            "divisible by the number of devices. Choose a batch "
            "size that is a multiple of %d." % num_devices)

    logger.info("Training Device(s): GPU %s", gpu_ids)
    return [mx.gpu(gpu_id) for gpu_id in gpu_ids]
Beispiel #10
0
    def _populate_bucket_batch_sizes(self):
        """
        Compute one BucketBatchSize (sentences, average words) per bucket.

        Does nothing if self.bucket_batch_sizes is already set.

        Sentence-based batching: every bucket uses self.batch_size sentences; the number of
        words follows from the bucket's average target length.

        Word-based batching: the number of sentences is the multiple of the number of devices
        whose word count (at average target length, excluding padding) is closest to
        self.batch_size. Afterwards the last bucket is grown in steps of the number of devices
        until its total size (sentences * longest side of the bucket) is the largest of all.

        Sets: self.bucket_batch_sizes
        """
        if self.bucket_batch_sizes is not None:
            # Pre-defined bucket batch sizes are kept
            return

        num_devices = self.batch_num_devices
        self.bucket_batch_sizes = [None] * len(self.buckets)
        max_total_elements = 0

        for idx, shape in enumerate(self.buckets):
            # Target/label length with padding
            padded_len = shape[1]
            # Average target/label length excluding padding
            avg_len = self.data_target_average_len[idx]

            if self.batch_by_words:
                # Word-based: num words determines num sentences
                check_condition(
                    padded_len <= self.batch_size,
                    "Word batch size must cover sequence lengths for all"
                    " buckets: (%d > %d)" % (padded_len, self.batch_size))
                # Multiple of number of devices (int) closest to target number of words,
                # assuming each sentence is of average length
                sentences_per_device = round((self.batch_size / avg_len) / num_devices)
                num_sentences = num_devices * sentences_per_device
            else:
                # Sentence-based: num sentences determines num words
                num_sentences = self.batch_size

            num_words = num_sentences * avg_len
            self.bucket_batch_sizes[idx] = BucketBatchSize(num_sentences, num_words)

            # Track largest batch size by total elements
            total_elements = num_sentences * max(*shape)
            max_total_elements = max(max_total_elements, total_elements)

        if not self.batch_by_words:
            # When batching by sentences, nothing has to be adjusted
            return

        # Final step: guarantee that largest bucket by sequence length also has largest
        # total batch size.
        last_padded_len = max(*self.buckets[-1])
        last_avg_len = self.data_target_average_len[-1]
        last = self.bucket_batch_sizes[-1]
        while last.batch_size * last_padded_len < max_total_elements:
            last = BucketBatchSize(
                last.batch_size + num_devices,
                last.average_words_per_batch + num_devices * last_avg_len)
            self.bucket_batch_sizes[-1] = last
Beispiel #11
0
 def __init__(self, num_embed: int, prefix: str, scale_up_input: bool,
              scale_down_positions: bool) -> None:
     """
     Store the positional embedding settings after validating the embedding size.

     :param num_embed: Embedding size; must be even.
     :param prefix: Stored as self.prefix.
     :param scale_up_input: Stored as self.scale_up_input.
     :param scale_down_positions: Stored as self.scale_down_positions.
     """
     is_even = num_embed % 2 == 0
     utils.check_condition(
         is_even,
         "Positional embeddings require an even embedding size it "
         "is however %d." % num_embed)

     self.num_embed = num_embed
     self.prefix = prefix
     self.scale_up_input = scale_up_input
     self.scale_down_positions = scale_down_positions
Beispiel #12
0
 def __init__(self,
              num_hidden,
              prefix='lstm_',
              params=None,
              forget_bias=1.0,
              dropout: float = 0.0) -> None:
     """
     Initialize the parent cell and store a strictly positive dropout value.

     :param num_hidden: Forwarded to the parent constructor.
     :param prefix: Forwarded to the parent constructor.
     :param params: Forwarded to the parent constructor.
     :param forget_bias: Forwarded to the parent constructor.
     :param dropout: Dropout value; must be > 0.0. Stored as self.dropout.
     """
     super().__init__(num_hidden, prefix, params, forget_bias)
     # The check runs after the parent has been initialized.
     dropout_enabled = dropout > 0.0
     utils.check_condition(
         dropout_enabled, "RecurrentDropoutLSTMCell shoud have dropout > 0.0")
     self.dropout = dropout
Beispiel #13
0
 def get_num_hidden(self) -> int:
     """
     Return the representation size of this encoder.

     The size is taken from the last encoder, or from the second-to-last one
     if the last encoder is a BatchMajor2TimeMajor instance.
     """
     final_encoder = self.encoders[-1]
     if not isinstance(final_encoder, BatchMajor2TimeMajor):
         return final_encoder.get_num_hidden()

     # The size has to come from the encoder in front of the last one
     utils.check_condition(
         len(self.encoders) > 1,
         "Cannot return num_hidden from a BatchMajor2TimeMajor encoder only"
     )
     return self.encoders[-2].get_num_hidden()
Beispiel #14
0
 def __init__(self,
              kernel_width: int,
              num_hidden: int,
              act_type: str = C.GLU,
              weight_normalization: bool = False):
     """
     Store the convolution settings after validating the activation type.

     :param kernel_width: Stored as self.kernel_width.
     :param num_hidden: Stored as self.num_hidden.
     :param act_type: Activation type; must be one of C.CNN_ACTIVATION_TYPES.
     :param weight_normalization: Stored as self.weight_normalization.
     """
     super().__init__()
     self.kernel_width = kernel_width
     self.num_hidden = num_hidden

     known_activation = act_type in C.CNN_ACTIVATION_TYPES
     utils.check_condition(known_activation,
                           "Unknown activation %s." % act_type)

     self.act_type = act_type
     self.weight_normalization = weight_normalization
Beispiel #15
0
    def __call__(self, data: mx.sym.Symbol, data_length: mx.sym.Symbol,
                 seq_len: int) -> mx.sym.Symbol:
        """
        Run the convolutional block.

        The input is masked beyond each sequence's length, convolved along the time axis and
        handed to self._post_convolution.

        :param data: Input data. Shape: (batch_size, seq_len, num_hidden).
        :param data_length: Vector with sequence lengths. Shape: (batch_size,).
        :param seq_len: Maximum sequence length.
        :return: Shape: (batch_size, seq_len, num_hidden).
        :raises ValueError: If self.pad_type is neither left nor centered padding.
        """
        pad_left = self.pad_type == C.CNN_PAD_LEFT
        if pad_left:
            # we pad enough on both sides and later slice the extra padding from the right
            padding = (self.config.kernel_width - 1, )
        elif self.pad_type == C.CNN_PAD_CENTERED:
            # we pad enough so that the output size is equal to the input size and we don't need to slice
            kernel_width = self.config.kernel_width
            utils.check_condition(
                kernel_width % 2 == 1,
                "Only odd kernel widths supported, but got %d" % kernel_width)
            padding = (int((self.config.kernel_width - 1) / 2), )
        else:
            raise ValueError("Unknown pad type %s" % self.pad_type)

        num_filters = self._pre_activation_num_hidden()

        # Apply masking (so that we properly have zero padding for variable sequence length batches)
        # Note: SequenceMask expects time-major data
        # (seq_len, batch_size, num_hidden)
        time_major = mx.sym.swapaxes(data, dim1=0, dim2=1)
        masked = mx.sym.SequenceMask(data=time_major,
                                     sequence_length=data_length,
                                     use_sequence_length=True,
                                     value=0)

        # (batch_size,  num_hidden, seq_len)
        channels_first = mx.sym.transpose(masked, axes=(1, 2, 0))
        convolved = mx.sym.Convolution(data=channels_first,
                                       weight=self.conv_weight,
                                       bias=self.conv_bias,
                                       pad=padding,
                                       kernel=(self.config.kernel_width, ),
                                       num_filter=num_filters,
                                       layout="NCW")

        # (batch_size, 2 * num_hidden, seq_len)
        if pad_left:
            # keep only the first seq_len positions
            convolved = mx.sym.slice_axis(data=convolved,
                                          axis=2,
                                          begin=0,
                                          end=seq_len)

        return self._post_convolution(convolved)
Beispiel #16
0
def main():
    """
    Command-line entry point: score hypotheses against references and print the metrics.

    Without args.sentence one tab-separated line with corpus-level scores is printed;
    with it, one line per hypothesis/reference pair. Metrics other than C.BLEU and C.CHRF
    are skipped.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate translations by calculating metrics with '
        'respect to a reference set.')
    arguments.add_evaluate_args(parser)
    arguments.add_logging_args(parser)
    args = parser.parse_args()

    if args.quiet:
        logger.setLevel(logging.ERROR)

    utils.check_condition(args.offset >= 0, "Offset should be non-negative.")
    log_sockeye_version(logger)

    logger.info("Command: %s", " ".join(sys.argv))
    logger.info("Arguments: %s", args)

    references = [' '.join(tokens) for tokens in data_io.read_content(args.references)]
    hypotheses = [line.strip() for line in args.hypotheses]
    num_hypotheses = len(hypotheses)
    num_references = len(references)
    logger.info("%d hypotheses | %d references", num_hypotheses, num_references)

    if not args.not_strict:
        utils.check_condition(
            num_hypotheses == num_references,
            "Number of hypotheses (%d) and references (%d) does not match." %
            (num_hypotheses, num_references))

    # One scoring unit for the whole corpus, or one per sentence pair
    if args.sentence:
        scoring_units = zip(hypotheses, references)
    else:
        scoring_units = [(hypotheses, references)]

    for hyp, ref in scoring_units:
        scores = []
        for metric in args.metrics:
            if metric == C.BLEU:
                score = raw_corpus_bleu(hyp, ref, args.offset)
            elif metric == C.CHRF:
                score = chrf.corpus_chrf(hyp, ref, trim_whitespaces=True)
            else:
                # other metrics produce no output
                continue
            scores.append("%.6f" % score)
        print("\t".join(scores), file=sys.stdout)
Beispiel #17
0
 def __init__(self,
              updates_per_checkpoint: int,
              half_life: int,
              warmup: int = 0) -> None:
     """
     Derive the decay factor and the logging interval from the half-life.

     :param updates_per_checkpoint: Number of updates per checkpoint; must be > 0.
     :param half_life: Half-life in checkpoints; must be > 0.
     :param warmup: Forwarded to the parent constructor.
     """
     super().__init__(warmup)
     check_condition(updates_per_checkpoint > 0,
                     "updates_per_checkpoint needs to be > 0.")
     check_condition(half_life > 0, "half_life needs to be > 0.")

     # T = number of updates in one half-life
     updates_per_half_life = half_life * updates_per_checkpoint
     # factor = 3 / T. NOTE(review): presumably chosen so that 1 / sqrt(1 + T * factor) == 0.5;
     # the decay formula itself is not part of this constructor - confirm where it is applied.
     self.factor = 3. / updates_per_half_life
     self.t_last_log = -1
     self.log_every_t = int(updates_per_half_life)
Beispiel #18
0
    def load_params_from_file(self, fname: str):
        """
        Loads and sets model parameters from file.

        Only the first element returned by utils.load_params is stored (as self.params).

        :param fname: Path to load parameters from.
        """
        assert self._is_built

        file_exists = os.path.exists(fname)
        utils.check_condition(
            file_exists, "No model parameter file found under %s. "
            "This is either not a model directory or the first training "
            "checkpoint has not happened yet." % fname)

        # the second returned element is discarded
        loaded_params, _ = utils.load_params(fname)
        self.params = loaded_params
        logger.info('Loaded params from "%s"', fname)
Beispiel #19
0
    def __init__(self,
                 context: mx.context.Context,
                 inputs: str,
                 references: str,
                 model: str,
                 max_input_len: Optional[int] = None,
                 beam_size: int = C.DEFAULT_BEAM_SIZE,
                 bucket_width_source: int = 10,
                 length_penalty_alpha: float = 1.0,
                 length_penalty_beta: float = 0.0,
                 softmax_temperature: Optional[float] = None,
                 max_output_length_num_stds: int = C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH,
                 ensemble_mode: str = 'linear',
                 sample_size: int = -1,
                 random_seed: int = 42) -> None:
        """
        Store the decoding settings, load (and optionally sub-sample) the sentence pairs and
        write the selected pairs into the model folder.

        :param context: Stored as self.context.
        :param inputs: Path to the file with input sentences.
        :param references: Path to the file with reference sentences; must have as many lines as inputs.
        :param model: Model folder. The selected sentences are written below it.
        :param max_input_len: Stored as self.max_input_len.
        :param beam_size: Stored as self.beam_size.
        :param bucket_width_source: Stored as self.bucket_width_source.
        :param length_penalty_alpha: Stored as self.length_penalty_alpha.
        :param length_penalty_beta: Stored as self.length_penalty_beta.
        :param softmax_temperature: Stored as self.softmax_temperature.
        :param max_output_length_num_stds: Stored as self.max_output_length_num_stds.
        :param ensemble_mode: Stored as self.ensemble_mode.
        :param sample_size: Number of sentence pairs to sample. Values <= 0 select all pairs.
        :param random_seed: Seed of the random number generator used for sampling.
        """
        self.context = context
        self.model = model
        self.max_input_len = max_input_len
        self.max_output_length_num_stds = max_output_length_num_stds
        self.ensemble_mode = ensemble_mode
        self.beam_size = beam_size
        # fixed batch size
        self.batch_size = 16
        self.bucket_width_source = bucket_width_source
        self.length_penalty_alpha = length_penalty_alpha
        self.length_penalty_beta = length_penalty_beta
        self.softmax_temperature = softmax_temperature

        with data_io.smart_open(inputs) as inputs_fin, data_io.smart_open(references) as references_fin:
            input_sentences = inputs_fin.readlines()
            target_sentences = references_fin.readlines()
            utils.check_condition(len(input_sentences) == len(target_sentences), "Number of sentence pairs do not match")

        num_pairs = len(input_sentences)
        if sample_size <= 0:
            sample_size = num_pairs

        if sample_size >= num_pairs:
            self.input_sentences, self.target_sentences = input_sentences, target_sentences
        else:
            # custom random number generator to guarantee the same samples across runs in order to be able to
            # compare metrics across independent runs
            random_gen = random.Random(random_seed)
            all_pairs = list(zip(input_sentences, target_sentences))
            sampled_pairs = random_gen.sample(all_pairs, sample_size)
            self.input_sentences, self.target_sentences = zip(*sampled_pairs)

        logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
                    max_input_len if max_input_len is not None else -1,
                    beam_size, model, len(self.input_sentences))

        reference_path = os.path.join(self.model, C.DECODE_REF_NAME)
        input_path = os.path.join(self.model, C.DECODE_IN_NAME)
        with data_io.smart_open(reference_path, 'w') as trg_out, \
                data_io.smart_open(input_path, 'w') as src_out:
            # all references are written first, then all inputs
            for sentence in self.target_sentences:
                trg_out.write(sentence)
            for sentence in self.input_sentences:
                src_out.write(sentence)
Beispiel #20
0
    def __init__(self,
                 config: ConvolutionalEmbeddingConfig,
                 prefix: str = C.CHAR_SEQ_ENCODER_PREFIX) -> None:
        """
        Copy the settings from the config and create the parameter variables.

        One convolution weight/bias pair is created per filter width (1..max_filter_width), one
        projection weight/bias pair, and one gate and one transform weight/bias pair per highway layer.

        :param config: Configuration; num_filters must have max_filter_width elements.
        :param prefix: Name prefix for all parameter variables.
        """
        utils.check_condition(
            len(config.num_filters) == config.max_filter_width,
            "num_filters must have max_filter_width elements.")

        self.prefix = prefix
        self.num_embed = config.num_embed
        self.output_dim = config.output_dim
        self.max_filter_width = config.max_filter_width
        # copy of the configured list
        self.num_filters = config.num_filters[:]
        self.pool_stride = config.pool_stride
        self.num_highway_layers = config.num_highway_layers
        self.dropout = config.dropout
        self.add_positional_encoding = config.add_positional_encoding

        def indexed_variable(group, index, suffix):
            # Variable named <prefix><group><index><suffix>
            return mx.sym.Variable("%s%s%d%s" % (self.prefix, group, index, suffix))

        # Convolution parameters, keyed by filter width
        filter_widths = range(1, self.max_filter_width + 1)
        self.conv_weight = {width: indexed_variable("conv_", width, "_weight") for width in filter_widths}
        self.conv_bias = {width: indexed_variable("conv_", width, "_bias") for width in filter_widths}

        # Projection parameters
        self.project_weight = mx.sym.Variable(self.prefix + "project_weight")
        self.project_bias = mx.sym.Variable(self.prefix + "project_bias")

        # Highway parameters, one entry per layer
        highway_layers = range(self.num_highway_layers)
        self.gate_weight = [indexed_variable("gate_", layer, "_weight") for layer in highway_layers]
        self.gate_bias = [indexed_variable("gate_", layer, "_bias") for layer in highway_layers]
        self.transform_weight = [indexed_variable("transform_", layer, "_weight") for layer in highway_layers]
        self.transform_bias = [indexed_variable("transform_", layer, "_bias") for layer in highway_layers]
Beispiel #21
0
def dot_attention(queries: mx.sym.Symbol,
                  keys: mx.sym.Symbol,
                  values: mx.sym.Symbol,
                  lengths: Optional[mx.sym.Symbol] = None,
                  dropout: float = 0.0,
                  bias: Optional[mx.sym.Symbol] = None,
                  prefix: Optional[str] = ''):
    """
    Computes dot attention for a set of queries, keys, and values.

    Logits are the batched product of queries and transposed keys. They are masked along the
    key dimension if lengths are given, the bias is added if given, and the softmax over the
    last axis (optionally followed by dropout) weights the values.

    :param queries: Attention queries. Shape: (n, lq, d).
    :param keys: Attention keys. Shape: (n, lk, d).
    :param values: Attention values. Shape: (n, lk, dv).
    :param lengths: Optional sequence lengths of the keys. Shape: (n,).
    :param dropout: Dropout probability.
    :param bias: Optional 3d bias tensor.
    :param prefix: Optional prefix
    :return: 'Context' vectors for each query. Shape: (n, lq, dv), 'probs' vector for each query (n, lq, lk).
    """
    have_mask_source = lengths is not None or bias is not None
    utils.check_condition(
        have_mask_source,
        "Must provide either length or bias argument for masking")

    # (n, lq, lk)
    scores = mx.sym.batch_dot(lhs=queries,
                              rhs=keys,
                              transpose_b=True,
                              name='%sdot' % prefix)

    if lengths is not None:
        # mask lk dimension: SequenceMask is applied with lk as the leading axis
        # (lk, n, lq)
        scores = mx.sym.transpose(data=scores, axes=(2, 0, 1))
        scores = mx.sym.SequenceMask(data=scores,
                                     use_sequence_length=True,
                                     sequence_length=lengths,
                                     value=C.LARGE_NEGATIVE_VALUE)
        # (n, lq, lk)
        scores = mx.sym.transpose(data=scores, axes=(1, 2, 0))

    if bias is not None:
        scores = mx.sym.broadcast_add(scores, bias, name='%sbias_add' % prefix)

    probs = mx.sym.softmax(scores, axis=-1)
    if dropout > 0.0:
        probs = mx.sym.Dropout(probs, p=dropout)

    # (n, lq, lk) x (n, lk, dv) -> (n, lq, dv)
    contexts = mx.sym.batch_dot(lhs=probs, rhs=values, name='%scontexts' % prefix)
    return contexts, probs
Beispiel #22
0
def determine_decode_and_evaluate_context(
        args: argparse.Namespace, exit_stack: ExitStack,
        train_context: List[mx.Context]) -> Tuple[int, Optional[mx.Context]]:
    """
    Determine the number of sentences to decode and the context we should run on (CPU or GPU).

    If BLEU is the optimized metric and no number of sentences was requested, -1 is used.
    If nothing is to be decoded, (0, None) is returned.

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib.
    :param train_context: Context for training.
    :return: The number of sentences to decode and the single context to run on.
    """
    num_to_decode = args.decode_and_evaluate
    if num_to_decode == 0 and args.optimized_metric == C.BLEU:
        logger.info(
            "You chose BLEU as the optimized metric, will turn on BLEU monitoring during training. "
            "To control how many validation sentences are used for calculating bleu use "
            "the --decode-and-evaluate argument.")
        num_to_decode = -1

    if num_to_decode == 0:
        return 0, None

    requested_device = args.decode_and_evaluate_device_id
    if args.use_cpu or args.decode_and_evaluate_use_cpu:
        context = mx.cpu()
    elif requested_device is None:
        # default decode context is the last training device
        context = train_context[-1]
    else:
        # decode device is defined from the commandline
        num_gpus = utils.get_num_gpus()
        check_condition(
            num_gpus >= 1,
            "No GPUs found, consider running on the CPU with --use-cpu "
            "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
            "binary isn't on the path).")

        if args.disable_device_locking:
            gpu_ids = utils.expand_requested_device_ids([args.decode_and_evaluate_device_id])
        else:
            gpu_ids = exit_stack.enter_context(
                utils.acquire_gpus([args.decode_and_evaluate_device_id],
                                   lock_dir=args.lock_dir))
        context = mx.gpu(gpu_ids[0])

    logger.info("Decode and Evaluate Device(s): %s", context)
    return num_to_decode, context
Beispiel #23
0
def get_coverage(config: CoverageConfig) -> 'Coverage':
    """
    Builds the Coverage implementation selected by ``config.type``.

    :param config: Coverage configuration. ``type`` picks the implementation; ``num_hidden`` and
                   ``layer_normalization`` are forwarded to the GRU and activation variants.
    :return: Instance of Coverage.
    :raises ValueError: If ``config.type`` is not one of the known coverage types.
    """
    coverage_type = config.type

    if coverage_type == "count":
        # Count coverage is only accepted with exactly one hidden unit.
        utils.check_condition(config.num_hidden == 1, "Count coverage requires coverage_num_hidden==1")
        return CountCoverage()

    if coverage_type == "gru":
        return GRUCoverage(config.num_hidden, config.layer_normalization)

    if coverage_type in {"tanh", "sigmoid", "relu", "softrelu"}:
        # The type name is passed through as the second constructor argument.
        return ActivationCoverage(config.num_hidden, coverage_type, config.layer_normalization)

    raise ValueError("Unknown coverage type %s" % coverage_type)
Beispiel #24
0
    def _get_embed_weights(
            self) -> Tuple[mx.sym.Symbol, mx.sym.Symbol, mx.sym.Symbol]:
        """
        Creates the weight symbols for the source embedding, the target embedding and the
        target output layer, sharing symbols between them as requested by the weight tying
        configuration.

        :return: Tuple of (source embedding weight, target embedding weight, output layer weight) symbols.
        """
        source_config = self.config.config_embed_source
        assert isinstance(source_config, encoder.EmbeddingConfig)
        target_config = self.config.config_embed_target
        assert isinstance(target_config, encoder.EmbeddingConfig)

        source_shape = (source_config.vocab_size, source_config.num_embed)
        target_shape = (target_config.vocab_size, target_config.num_embed)
        output_shape = (self.config.output_layer_size, self.decoder.get_num_hidden())

        # Untied defaults: one independent variable per parameter.
        w_embed_source = mx.sym.Variable(C.SOURCE_EMBEDDING_PREFIX + "weight", shape=source_shape)
        w_embed_target = mx.sym.Variable(C.TARGET_EMBEDDING_PREFIX + "weight", shape=target_shape)
        w_out_target = mx.sym.Variable("target_output_weight", shape=output_shape)

        if not self.config.weight_tying:
            return w_embed_source, w_embed_target, w_out_target

        tying_type = self.config.weight_tying_type

        if C.WEIGHT_TYING_SRC in tying_type and C.WEIGHT_TYING_TRG in tying_type:
            logger.info("Tying the source and target embeddings.")
            # The shared variable is declared with the source embedding shape.
            shared_embedding = mx.sym.Variable(C.SHARED_EMBEDDING_PREFIX + "weight", shape=source_shape)
            w_embed_source = shared_embedding
            w_embed_target = shared_embedding

        if C.WEIGHT_TYING_SOFTMAX in tying_type:
            logger.info("Tying the target embeddings and output layer parameters.")
            utils.check_condition(
                target_config.num_embed == self.decoder.get_num_hidden(),
                "Weight tying requires target embedding size and decoder hidden size "
                + "to be equal: %d vs. %d" % (target_config.num_embed, self.decoder.get_num_hidden()))
            # Reuse the (possibly shared) target embedding as the output projection weight.
            w_out_target = w_embed_target

        return w_embed_source, w_embed_target, w_out_target
Beispiel #25
0
    def get_encodings(length, depth) -> np.ndarray:
        """
        Computes sinusoidal positional encodings.

        Along the last axis the first ``depth // 2`` entries are the sines and the remaining
        ``depth // 2`` entries are the cosines (the halves are concatenated, not interleaved).

        :param length: Number of positions to encode (positions 0 .. length - 1).
        :param depth: Size of each encoding; checked to be even.
        :return: Array of shape (1, length, depth).
        """
        utils.check_condition(depth % 2 == 0,
                              "Positional embeddings require an even embedding size it "
                              "is however %d." % depth)
        half_depth = depth // 2

        # (length, 1)
        position_column = np.arange(0, length).reshape((-1, 1))
        # (1, depth // 2)
        channel_row = np.arange(half_depth).reshape((1, -1))

        # Broadcasts to (length, depth // 2).
        angles = position_column / np.power(10000, (2 * channel_row) / depth)

        # (length, depth): sines in the first half of the columns, cosines in the second half.
        sines_then_cosines = np.concatenate((np.sin(angles), np.cos(angles)), axis=1)
        return sines_then_cosines.reshape(1, length, depth)
Beispiel #26
0
 def __init__(self, schedule: List[Tuple[float, int]],
              updates_per_checkpoint: int) -> None:
     """
     Validates the schedule and initialises the scheduler state at the first step.

     :param schedule: List of (value, num_updates) steps. Every num_updates must be positive and
                      a multiple of updates_per_checkpoint. The first element is presumably the
                      learning rate of the step (it is not inspected here; confirm in _update_rate).
     :param updates_per_checkpoint: Number of updates each step length must be divisible by.
     """
     super().__init__()

     # The second check runs only after the first has passed, as the checks are sequential.
     all_positive = all(step_updates > 0 for (_, step_updates) in schedule)
     check_condition(all_positive, "num_updates for each step should be > 0.")
     all_aligned = all(step_updates % updates_per_checkpoint == 0
                       for (_, step_updates) in schedule)
     check_condition(
         all_aligned,
         "num_updates for each step should be divisible by updates_per_checkpoint.")

     self.schedule = schedule

     # Bookkeeping starts zeroed; _update_rate is then invoked for step 0.
     self.latest_t = 0
     self.next_step_at = 0
     self.current_step_started_at = 0
     self.current_step_num_updates = 0
     self.current_rate = 0.
     self.current_step = 0
     self._update_rate(self.current_step)
Beispiel #27
0
    def __init__(self,
                 reduce_factor: float,
                 reduce_num_not_improved: int,
                 warmup: int = 0) -> None:
        """
        Sets up a scheduler that lowers the learning rate when validation stops improving.

        :param reduce_factor: Factor used for the reduction; must satisfy 0 < reduce_factor <= 1.
        :param reduce_num_not_improved: Number of non-improving validation scores that triggers
                                        a reduction (as stated in the log message below).
        :param warmup: Forwarded to the parent constructor.
        """
        super().__init__(warmup)
        factor_in_range = 0.0 < reduce_factor <= 1
        check_condition(factor_in_range, "reduce_factor should be in ]0,1].")

        self.reduce_factor = reduce_factor
        self.reduce_num_not_improved = reduce_num_not_improved

        # Mutable state.
        self.num_not_improved = 0
        self.lr = None  # type: float
        self.t_last_log = -1
        # self.warmup is presumably set by the parent constructor; a non-positive value
        # means there is nothing to warm up.
        self.warmed_up = not (self.warmup > 0)

        logger.info(
            "Will reduce the learning rate by a factor of %.2f whenever"
            " the validation score doesn't improve %d times.",
            reduce_factor, reduce_num_not_improved)
Beispiel #28
0
 def _check_dist_kvstore_requirements(self, lr_decay_opt_states_reset, lr_decay_param_reset, optimizer):
     """
     Verifies that the requested training options can be used with a distributed kvstore.

     The checks run in order: optimizer, learning rate scheduler, parameter reset,
     optimizer state reset.

     :param lr_decay_opt_states_reset: Must be falsy; optimizer state reset on learning rate decay is rejected.
     :param lr_decay_param_reset: Must be falsy; parameter reset on learning rate decay is rejected.
     :param optimizer: Optimizer identifier; must not be C.OPTIMIZER_EVE.
     """
     # In distributed training the optimizer runs remotely. Eve needs information about the loss,
     # which can then no longer be passed by accessing self.module._curr_module._optimizer.
     optimizer_allowed = optimizer != C.OPTIMIZER_EVE
     utils.check_condition(optimizer_allowed, "Eve optimizer not supported with distributed training.")

     scheduler_is_adaptive = issubclass(type(self.lr_scheduler), lr_scheduler.AdaptiveLearningRateScheduler)
     utils.check_condition(
         not scheduler_is_adaptive,
         "Adaptive learning rate schedulers not supported with a dist kvstore. "
         "Try a fixed schedule such as %s." % C.LR_SCHEDULER_FIXED_RATE_INV_SQRT_T)

     utils.check_condition(
         not lr_decay_param_reset,
         "Parameter reset when the learning rate decays not "
         "supported with distributed training.")

     utils.check_condition(
         not lr_decay_opt_states_reset,
         "Optimizer state reset when the learning rate decays "
         "not supported with distributed training.")
Beispiel #29
0
 def __init__(
     self,
     batch_size: int,
     output_folder: str,
     optimized_metric: str = C.PERPLEXITY,
     use_tensorboard: bool = False,
     cp_decoder: Optional[checkpoint_decoder.CheckpointDecoder] = None
 ) -> None:
     """
     Sets up metric bookkeeping, optional TensorBoard logging and the speedometer.

     Arguments are validated before anything else happens, so that an unsupported metric
     (or BLEU without a checkpoint decoder) is rejected without deleting an existing
     TensorBoard log directory or allocating the multiprocessing queue.

     :param batch_size: Batch size handed to the MXNet Speedometer callback.
     :param output_folder: Folder containing the metrics file and, if enabled, the TensorBoard logs.
     :param optimized_metric: Metric used for early stopping; must be one of C.METRICS.
     :param use_tensorboard: If True, an existing TensorBoard log directory below output_folder
                             is deleted and a new FileWriter is created there.
     :param cp_decoder: Optional CheckpointDecoder; required when optimized_metric is C.BLEU.
     """
     # Fail fast: validate before the destructive/allocating steps below.
     utils.check_condition(optimized_metric in C.METRICS,
                           "Unsupported metric: %s" % optimized_metric)
     if optimized_metric == C.BLEU:
         utils.check_condition(cp_decoder is not None,
                               "%s requires CheckpointDecoder" % C.BLEU)

     self.output_folder = output_folder
     # stores dicts of metric names & values for each checkpoint
     self.metrics = []  # type: List[Dict]
     self.metrics_filename = os.path.join(output_folder, C.METRICS_NAME)
     self.best_checkpoint = 0
     self.start_tic = time.time()
     self.summary_writer = None
     if use_tensorboard:
         # Imported lazily so the package is only needed when TensorBoard logging is requested.
         import tensorboard  # pylint: disable=import-error
         log_dir = os.path.join(output_folder, C.TENSORBOARD_NAME)
         if os.path.exists(log_dir):
             logger.info("Deleting existing tensorboard log dir %s",
                         log_dir)
             shutil.rmtree(log_dir)
         logger.info("Logging training events for Tensorboard at '%s'",
                     log_dir)
         self.summary_writer = tensorboard.FileWriter(log_dir)
     self.cp_decoder = cp_decoder
     # Queue and process slot for a decoder process created from the 'spawn' context.
     self.ctx = mp.get_context('spawn')  # type: ignore
     self.decoder_metric_queue = self.ctx.Queue()
     self.decoder_process = None  # type: Optional[mp.Process]
     # TODO(fhieber): MXNet Speedometer uses root logger. How to fix this?
     self.speedometer = mx.callback.Speedometer(
         batch_size=batch_size,
         frequent=C.MEASURE_SPEED_EVERY,
         auto_reset=False)
     self.optimized_metric = optimized_metric
     # Start from the worst possible value of the optimized metric.
     self.validation_best = C.METRIC_WORST[self.optimized_metric]
     logger.info("Early stopping by optimizing '%s'", self.optimized_metric)
     self.tic = 0
Beispiel #30
0
    def __init__(self,
                 prefix: str,
                 depth_att: int = 512,
                 heads: int = 8,
                 depth_out: int = 512,
                 dropout: float = 0.0) -> None:
        """
        Stores the attention dimensions and creates the hidden-to-output parameter symbols.

        :param prefix: Prefix prepended to the names of the created parameter symbols.
        :param depth_att: Attention depth; must be divisible by heads.
        :param heads: Number of attention heads.
        :param depth_out: Stored as self.depth_out (presumably the output size; not used here).
        :param dropout: Stored as self.dropout (presumably a dropout probability; not used here).
        """
        self.prefix = prefix

        heads_divide_depth = depth_att % heads == 0
        utils.check_condition(
            heads_divide_depth,
            "Number of heads (%d) must divide attention depth (%d)" % (heads, depth_att))

        self.depth = depth_att
        self.heads = heads
        self.depth_out = depth_out
        self.dropout = dropout
        # Exact division is guaranteed by the check above.
        self.depth_per_head = self.depth // self.heads

        weight_name = "%sh2o_weight" % prefix
        bias_name = "%sh2o_bias" % prefix
        self.w_h2o = mx.sym.Variable(weight_name)
        self.b_h2o = mx.sym.Variable(bias_name)