Beispiel #1
0
def finalize_config(config, override=None, options=None):
    """Builds the effective configuration from an override and/or options.

    Args:
      config: The base configuration, or None.
      override: An optional configuration override merged into a copy of config.
      options: Optional options applied on the copy after the override.

    Returns:
      The input config itself when it is None or when there is nothing to
      apply, otherwise a deep copy updated with the override and options.
    """
    nothing_to_apply = not (override or options)
    if config is None or nothing_to_apply:
        return config

    # Work on a copy so the caller's configuration is never modified.
    finalized = copy.deepcopy(config)
    if override:
        config_util.merge_config(finalized, override)
    if options:
        config_util.update_config_with_options(finalized, options)
    return finalized
    def process_input(self,
                      source,
                      target=None,
                      target_name=None,
                      metadata=None,
                      config=None,
                      options=None):
        """Processes one translation example at inference.

        Args:
          source: In preprocess, a string. In postprocess, a (possibly multipart)
            list of tokens.
          target: In preprocess, a string. In postprocess, a (possibly multipart)
            list of tokens.
          target_name: The name of the target that is passed during inference.
          metadata: Additional metadata of the input.
          config: A configuration override for this example.
          options: A dictionary with operators options.

        Returns:
          - In preprocess, a tuple (source_tokens, target_tokens, metadata).
          - In postprocess, a string (the postprocessed target)

        Raises:
          ValueError: if a configuration override is given while the processor
            configuration is a V2 configuration.
        """
        # This method should be thread-safe as the inference server is starting a new
        # thread for each request.

        # Rebuild pipeline if the example has its own configuration.
        if config:
            if config_util.is_v2_config(self._config):
                raise ValueError("Configuration override is not supported for V2 "
                                 "configurations")
            config = config_util.merge_config(copy.deepcopy(self._config), config)
            pipeline = self.build_pipeline(config)
        else:
            pipeline = self._pipeline

        tu = TranslationUnit(
            source=source,
            metadata=metadata,
            source_tokenizer=pipeline.start_state.get('src_tokenizer'),
        )

        if target is not None:
            tu.add_target(
                target,
                name=target_name,
                tokenizer=pipeline.start_state.get('tgt_tokenizer'))

        # The pipeline works on batches: wrap the unit in a single-element batch
        # and unwrap it afterwards.
        tu_batch = ([tu], {})
        tu_batch = pipeline(tu_batch, options=options)
        tu = tu_batch[0][0]

        if self._postprocess:
            return tu.tgt_detok

        src_tokens = tu.src_tok.tokens
        # Without a target, return one None placeholder per source entry so the
        # source and target outputs stay aligned.
        if tu.tgt_tok is not None:
            tgt_tokens = tu.tgt_tok.tokens
        else:
            tgt_tokens = [None for _ in src_tokens]
        return src_tokens, tgt_tokens, tu.metadata
Beispiel #3
0
def get_operator_params(config, override_label=None):
    """Extracts the operator parameters from an operator configuration.

    Args:
      config: The operator configuration. It is not modified.
      override_label: An optional label selecting an entry of the "overrides"
        section to merge into the parameters.

    Returns:
      A copy of the configuration without the "op" and "overrides" keys,
      merged with the override selected by override_label, if any.
    """
    params = copy.deepcopy(config)
    params.pop("op", None)
    overrides = params.pop("overrides", None)
    # TODO: implement multiple override labels per batch/corpus.
    has_override = overrides and override_label and override_label in overrides
    if not has_override:
        return params
    return merge_config(params, overrides[override_label])
def finalize_config(config, override=None, options=None):
    """Resolves the configuration and options to use for an example.

    Args:
      config: The base configuration, or None.
      override: An optional configuration override.
      options: Optional options.

    Returns:
      A tuple (config, options, supported_features). For a V2 configuration
      the returned config is None and the options are the ones read against
      the configuration. Otherwise the config is a possibly updated copy and
      the returned options are None once they have been read.

    Raises:
      ValueError: if an override is given for a V2 configuration.
    """
    if config is None:
        return None, options, None

    supported_features = config.get('supported_features')

    if config_util.is_v2_config(config):
        if override:
            raise ValueError(
                "Configuration override is not supported for V2 "
                "configurations")
        if options:
            options = config_util.read_options(config, options)
        # V2 configurations are never returned: only options are forwarded.
        return None, options, supported_features

    if override or options:
        # Copy before updating so the caller's configuration is untouched.
        config = copy.deepcopy(config)
        if override:
            config_util.merge_config(config, override)
        if options:
            config_util.read_options(config, options)
            options = None
    return config, options, supported_features
def get_operator_params(config, operator_type, override_label=None):
    """Extracts the operator parameters from an operator configuration.

    Args:
      config: The operator configuration. It is not modified.
      operator_type: The operator type, only used in the error message.
      override_label: An optional iterable of labels. At most one of them may
        match an entry of the "overrides" section.

    Returns:
      A copy of the configuration without the "op" and "overrides" keys,
      merged with the single matching override, if any.

    Raises:
      RuntimeError: if more than one label matches an override.
    """
    params = copy.deepcopy(config)
    params.pop("op", None)
    overrides = params.pop("overrides", None)

    if not (override_label and overrides):
        return params

    matching = [label for label in override_label if label in overrides]
    if len(matching) > 1:
        raise RuntimeError(
            "One corpus requires different overrides (%s) for the same operator (%s)."
            % (overrides, operator_type))
    if matching:
        selected = overrides[matching[0]]
        params = merge_config(params, selected)
    return params
def build_operator(
    operator_type,
    operator_cls,
    operator_params,
    global_config,
    process_type,
    build_state,
    index,
    shared_state=None,
    inference_config=None,
):
    """Instantiates an operator from its parameters.

    Args:
      operator_type: The operator type, used to derive a default name.
      operator_cls: The operator class to instantiate.
      operator_params: The operator parameters.
      global_config: The global configuration from which language information
        is propagated.
      process_type: The current process type.
      build_state: The build state passed to the operator constructor.
      index: The operator index, used to derive a default name.
      shared_state: Optional shared state, passed as an extra constructor
        argument when set.
      inference_config: Optional dictionary of per-operator parameters, keyed
        by operator name, merged into the operator parameters.

    Returns:
      The operator instance.
    """
    # Propagate source and target languages
    for side in ("source", "target"):
        _add_lang_info(operator_params, global_config, side)

    default_name = "%s_%d" % (operator_type, index + 1)
    name = operator_params.get("name", default_name)

    if inference_config:
        op_inference_config = inference_config.get(name)
        if op_inference_config:
            operator_params = merge_config(operator_params,
                                           op_inference_config)

    # Parameters are only validated when building for training.
    if process_type == ProcessType.TRAINING:
        operator_cls.validate_parameters(operator_params, name)

    logger.debug("Building operator %s", name)
    extra_args = [shared_state] if shared_state else []
    operator = operator_cls(operator_params, process_type, build_state,
                            *extra_args)

    # We set common private attributes here so that operators do not need to call
    # the base constructor.
    operator._name = name
    operator._verbose = operator_params.get("verbose", False)
    operator._process_type = process_type
    return operator
Beispiel #7
0
    def exec_function(self, args):
        """Main entrypoint.

        Resolves the configuration (optionally merged with the configuration of
        a parent model fetched from storage) and dispatches to the handler of
        the command in ``args.cmd``: 'train', 'buildvocab', 'trans', 'release',
        'serve' or 'preprocess'.

        Args:
          args: The parsed command line arguments. ``args.cmd`` selects the
            command; the other attributes read depend on the command.

        Returns:
          The result of the train, trans and preprocess-into-model wrappers
          for these commands, None otherwise.

        Raises:
          ValueError: if the parent model type is not compatible with the
            requested command.
        """
        if self._config is None and self._model is None:
            self.parser.error(
                'at least one of --config or --model options must be set')

        config = self._config or {}
        # The --model option takes priority over the model set in the configuration.
        parent_model = self._model or config.get('model')
        if parent_model is not None and not self._stateless:
            # Download model locally and merge the configuration.
            remote_model_path = self._storage.join(self._model_storage_read,
                                                   parent_model)
            model_path = os.path.join(self._models_dir, parent_model)
            # NOTE(review): should_check_integrity is not defined in this method;
            # presumably a module-level name - confirm it is in scope.
            model_config = utility.fetch_model(self._storage,
                                               remote_model_path, model_path,
                                               should_check_integrity)
            # Infer the model type from the model name when it is not declared.
            if 'modelType' not in model_config:
                if parent_model.endswith('_release'):
                    model_config['modelType'] = 'release'
                else:
                    model_config['modelType'] = 'checkpoint'
            # The model configuration is the base; the user configuration is
            # merged on top of a copy of it.
            config = config_util.merge_config(copy.deepcopy(model_config),
                                              config)
        else:
            model_path = None
            model_config = None

        if args.cmd == 'train':
            if parent_model is not None and config['modelType'] not in (
                    'checkpoint', 'base', 'preprocess'):
                raise ValueError(
                    'cannot train from a model that is not a training checkpoint, '
                    'a base model, or a preprocess model')
            return self.train_wrapper(self._task_id,
                                      config,
                                      self._storage,
                                      self._model_storage_write,
                                      self._image,
                                      parent_model=parent_model,
                                      model_path=model_path,
                                      model_config=model_config,
                                      gpuid=self._gpuid,
                                      push_model=not self._no_push)
        elif args.cmd == 'buildvocab':
            self.build_vocab(self._task_id,
                             config,
                             self._storage,
                             self._model_storage_write,
                             self._image,
                             push_model=not self._no_push)
        elif args.cmd == 'trans':
            # The checkpoint requirement is skipped in stateless mode.
            if not self._stateless and (parent_model is None or
                                        config['modelType'] != 'checkpoint'):
                raise ValueError('translation requires a training checkpoint')
            return self.trans_wrapper(
                config,
                model_path,
                self._storage,
                args.input,
                args.output,
                as_release=args.as_release,
                release_optimization_level=args.release_optimization_level,
                gpuid=self._gpuid,
                copy_source=args.copy_source,
                add_bt_tag=args.add_bt_tag,
                no_postprocess=args.no_postprocess)
        elif args.cmd == 'release':
            if not self._stateless and (parent_model is None or
                                        config['modelType'] != 'checkpoint'):
                raise ValueError('releasing requires a training checkpoint')
            # Default to the model write storage when no destination is given.
            if args.destination is None:
                args.destination = self._model_storage_write
            self.release_wrapper(config,
                                 model_path,
                                 self._image,
                                 storage=self._storage,
                                 destination=args.destination,
                                 optimization_level=args.optimization_level,
                                 gpuid=self._gpuid,
                                 push_model=not self._no_push)
        elif args.cmd == 'serve':
            if (not self._stateless and
                (parent_model is None
                 or config['modelType'] not in ('checkpoint', 'release'))):
                raise ValueError(
                    'serving requires a training checkpoint or a released model'
                )
            # A checkpoint is first released locally, then the released model
            # and its configuration are the ones being served.
            # NOTE(review): config['modelType'] is read unconditionally here; in
            # stateless mode the key is only present if the user configuration
            # defines it - confirm.
            if config['modelType'] == 'checkpoint':
                model_path = self.release_wrapper(
                    config,
                    model_path,
                    self._image,
                    local_destination=self._output_dir,
                    optimization_level=args.release_optimization_level,
                    gpuid=self._gpuid,
                    push_model=False)
                config = utility.load_model_config(model_path)
            self.serve_wrapper(config,
                               model_path,
                               args.host,
                               args.port,
                               gpuid=self._gpuid)
        elif args.cmd == 'preprocess':
            if not args.build_model:
                self.preprocess(config, self._storage)
            else:
                if parent_model is not None and config['modelType'] not in (
                        'checkpoint', 'base'):
                    raise ValueError(
                        'cannot preprocess from a model that is not a training '
                        'checkpoint or a base model')
                return self.preprocess_into_model(self._task_id,
                                                  config,
                                                  self._storage,
                                                  self._model_storage_write,
                                                  self._image,
                                                  parent_model=parent_model,
                                                  model_path=model_path,
                                                  model_config=model_config,
                                                  push_model=not self._no_push)
Beispiel #8
0
def test_key_override():
    """A None value in the override replaces the whole nested entry."""
    base = {"a": {"b": 42, "c": "d"}, "e": "f"}
    override = {"a": None}
    expected = {"a": None, "e": "f"}
    merged = config.merge_config(base, override)
    assert merged == expected
Beispiel #9
0
def test_key_replace():
    """A dictionary in the override replaces a scalar value of the base."""
    base = {"a": {"b": 42, "c": "d"}, "e": "f"}
    override = {"e": {"x": "y"}}
    expected = {"a": {"b": 42, "c": "d"}, "e": {"x": "y"}}
    merged = config.merge_config(base, override)
    assert merged == expected
Beispiel #10
0
def preprocess_example(preprocessor,
                       index,
                       raw_example,
                       config=None,
                       config_override=None):
    """Validates and preprocesses one raw example of a translation request.

    Args:
      preprocessor: The preprocessor to apply, or None to keep the raw text.
      index: The index of the example, used in error messages and in the result.
      raw_example: The example as a dictionary. It must contain a "text" field
        and may contain "mode", "config", "options", "target_prefix" and
        "fuzzy" fields.
      config: The model configuration, or None.
      config_override: The batch-level configuration override, or None.

    Returns:
      A TranslationExample in the multiparts representation.

    Raises:
      InvalidRequest: if the example is not a dictionary, has no text, or
        defines both a target prefix and a fuzzy target.
    """
    if not isinstance(raw_example, dict):
        raise InvalidRequest("example %d is not a JSON object" % index)
    source_text = raw_example.get("text")
    if source_text is None:
        raise InvalidRequest("missing text field in example %d" % index)
    mode = raw_example.get("mode", "default")

    options = None
    example_override = raw_example.get("config")

    # Resolve example options: they are only read against a configuration.
    example_options = raw_example.get("options") if config is not None else None
    if example_options:
        resolved = config_util.read_options(config, example_options)
        if config_util.is_v2_config(config):
            options = resolved
        else:
            # Otherwise the resolved options are merged into the config override.
            example_override = config_util.merge_config(
                example_override or {}, resolved)

    # Merge example-level config override into batch-level config override.
    if example_override:
        if config_override:
            config_override = config_util.merge_config(
                copy.deepcopy(config_override), example_override)
        else:
            config_override = example_override

    prefix = raw_example.get("target_prefix")
    fuzzy = raw_example.get("fuzzy")
    if prefix is not None and fuzzy is not None:
        raise InvalidRequest(
            "Using both a target prefix and a fuzzy target is currently unsupported"
        )

    target_text = None
    target_name = None
    if prefix is not None:
        target_text = prefix
    elif fuzzy is not None:
        features = config.get("supported_features") if config else None
        if features is not None and features.get("NFA", False):
            target_text, target_name = fuzzy, "fuzzy"
        else:
            logger.warning(
                "The fuzzy target is ignored because this model does not "
                "support Neural Fuzzy Adaptation")

    if preprocessor is None:
        source_tokens, target_tokens, metadata = source_text, None, None
    else:
        source_tokens, target_tokens, metadata = preprocessor.process_input(
            source_text,
            target=target_text,
            target_name=target_name,
            config=config_override,
            options=options,
        )

    # Move to the general multiparts representation.
    is_multipart = bool(source_tokens) and isinstance(source_tokens[0], list)
    if not is_multipart:
        source_tokens = [source_tokens]
        target_tokens = [target_tokens]
        metadata = [metadata]

    return TranslationExample(
        index=index,
        config=config_override,
        options=options,
        source_tokens=source_tokens,
        target_tokens=target_tokens,
        mode=mode,
        metadata=metadata,
    )
def preprocess_example(preprocessor,
                       index,
                       raw_example,
                       config=None,
                       config_override=None):
    """Validates and preprocesses one raw example of a translation request.

    Args:
      preprocessor: The preprocessor to apply, or None to keep the raw text.
      index: The index of the example, used in error messages and in the result.
      raw_example: The example as a dictionary. It must contain a 'text' field
        and may contain 'mode', 'config', 'options', 'target_prefix' and
        'fuzzy' fields.
      config: The model configuration, or None.
      config_override: The batch-level configuration override, or None.

    Returns:
      A TranslationExample in the multiparts representation.

    Raises:
      ValueError: if the example is not a dictionary, has no text, or defines
        both a target prefix and a fuzzy target.
    """
    if not isinstance(raw_example, dict):
        raise ValueError('example %d is not a JSON object' % index)
    source_text = raw_example.get('text')
    if source_text is None:
        raise ValueError('missing text field in example %d' % index)
    mode = raw_example.get('mode', 'default')

    # The example-level override is merged on top of a copy of the batch-level one.
    example_override = raw_example.get('config')
    if example_override:
        config_override = (
            config_util.merge_config(copy.deepcopy(config_override),
                                     example_override)
            if config_override else example_override)

    config, options, supported_features = finalize_config(
        config, override=config_override, options=raw_example.get('options'))

    prefix = raw_example.get('target_prefix')
    fuzzy = raw_example.get('fuzzy')
    if prefix is not None and fuzzy is not None:
        raise ValueError(
            "Using both a target prefix and a fuzzy target is currently unsupported"
        )

    target_text = None
    target_name = None
    if prefix is not None:
        target_text = prefix
    elif fuzzy is not None:
        nfa_supported = (supported_features is not None
                         and supported_features.get("NFA", False))
        if nfa_supported:
            target_text, target_name = fuzzy, "fuzzy"
        else:
            logger.warning(
                "The fuzzy target is ignored because this model does not "
                "support Neural Fuzzy Adaptation")

    if preprocessor is None:
        source_tokens, target_tokens, metadata = source_text, None, None
    else:
        source_tokens, target_tokens, metadata = preprocessor.process_input(
            source_text,
            target=target_text,
            target_name=target_name,
            config=config,
            options=options,
        )

    # Move to the general multiparts representation.
    is_multipart = bool(source_tokens) and isinstance(source_tokens[0], list)
    if not is_multipart:
        source_tokens = [source_tokens]
        target_tokens = [target_tokens]
        metadata = [metadata]

    return TranslationExample(index=index,
                              config=config,
                              options=options,
                              source_tokens=source_tokens,
                              target_tokens=target_tokens,
                              mode=mode,
                              metadata=metadata)
 def handle_request(self, request):
     """Translates the sources of a request and sends back the response.

     Args:
       request: The decoded request. It must contain a 'src' list whose items
         hold a 'text' field and optional 'config' and 'options' fields. It
         may contain an 'options' dictionary with 'timeout',
         'max_batch_size' and 'config' entries.

     The outcome is reported with self.send_result, or with self.send_error
     using status 400 for an invalid request and 504 when the translation
     timed out. Nothing is returned.
     """
     if 'src' not in request:
         self.send_error(400, 'missing src field')
         return
     results = {'tgt': []}
     if not request['src']:
         self.send_result(results)
         return
     if not isinstance(request['src'], list):
         self.send_error(400, 'src field must be a list')
         return

     # Request-level options override the server defaults.
     timeout = global_timeout
     max_batch_size = global_max_batch_size
     batch_config = config
     request_options = request.get('options')
     if isinstance(request_options, dict):
         timeout = request_options.get('timeout', timeout)
         max_batch_size = request_options.get('max_batch_size',
                                              max_batch_size)
         if 'config' in request_options:
             batch_config = config_util.merge_config(
                 copy.deepcopy(config), request_options['config'])

     extra_config = []
     batch_metadata = []
     batch_offsets = []
     batch_tokens = []
     offset = 0
     for src in request['src']:
         # Only copy the batch configuration when the example customizes it.
         local_config = batch_config
         if 'config' in src or 'options' in src:
             local_config = copy.deepcopy(local_config)
             if 'config' in src:
                 local_config = config_util.merge_config(
                     local_config, src['config'])
             if 'options' in src:
                 try:
                     config_util.update_config_with_options(
                         local_config, src['options'])
                 except ValueError as e:
                     # Exceptions have no "message" attribute in Python 3.
                     self.send_error(400, str(e))
                     return
         data = preprocess_fn(serving_state, src['text'], local_config)
         # Preprocessing may return additional metadata.
         if isinstance(data, tuple):
             tokens, metadata = data
         else:
             tokens, metadata = data, None
         # Preprocessing may split input text into multiple parts.
         if tokens and isinstance(tokens[0], list):
             size = len(tokens)
             # Flatten the parts in the batch collection.
             batch_tokens.extend(tokens)
             batch_metadata.extend(metadata)
         else:
             size = 1
             batch_tokens.append(tokens)
             batch_metadata.append(metadata)
         extra_config.append(local_config)
         # Remember which slice of the flat batch belongs to this example.
         batch_offsets.append((offset, offset + size))
         offset += size

     if max_batch_size is not None and len(
             batch_tokens) > max_batch_size:
         # Translate by chunks of at most max_batch_size entries.
         offset = 0
         batch_hypotheses = []
         while offset < len(batch_tokens):
             lower_bound = offset
             upper_bound = min(offset + max_batch_size,
                               len(batch_tokens))
             chunk_hypotheses = translate_fn(
                 batch_tokens[lower_bound:upper_bound],
                 backend_info,
                 timeout=timeout)
             if chunk_hypotheses is None:
                 # A chunk timed out: report it as a timeout of the whole
                 # request instead of failing on extend(None).
                 batch_hypotheses = None
                 break
             batch_hypotheses.extend(chunk_hypotheses)
             offset = upper_bound
     else:
         batch_hypotheses = translate_fn(batch_tokens,
                                         backend_info,
                                         timeout=timeout)
     if batch_hypotheses is None:
         self.send_error(504, 'translation request timed out')
         return

     for local_config, offset in zip(extra_config, batch_offsets):
         hypotheses = batch_hypotheses[offset[0]:offset[1]]
         num_parts = offset[1] - offset[0]
         num_hypotheses = len(hypotheses[0])
         src_tokens = batch_tokens[offset[0]:offset[1]]
         src_metadata = batch_metadata[offset[0]:offset[1]]
         result = []
         for h in range(num_hypotheses):
             if num_parts == 1:
                 src = src_tokens[0]
                 if src_metadata[0] is not None:
                     src = (src, src_metadata[0])
                 tgt = hypotheses[0][h].output
                 scores = hypotheses[0][h].score
                 attention = hypotheses[0][h].attention
             else:
                 # For multi parts inputs, send all result parts to the postprocessing.
                 src = (src_tokens, src_metadata)
                 tgt = []
                 scores = []
                 attention = None
                 for j in range(num_parts):
                     tgt.append(hypotheses[j][h].output)
                     scores.append(hypotheses[j][h].score)
             result.append(
                 _build_result(lambda src, tgt: postprocess_fn(
                     serving_state, src, tgt, local_config),
                               src,
                               tgt,
                               scores=scores,
                               attention=attention,
                               num_parts=num_parts))
         results['tgt'].append(result)
     self.send_result(results)
Beispiel #13
0
    def process_input(
        self,
        source,
        target=None,
        target_name=None,
        metadata=None,
        config=None,
        options=None,
    ):
        """Runs the pipeline on a single translation example at inference.

        Args:
          source: In preprocess, a string. In postprocess, a (possibly multipart)
            list of tokens.
          target: In preprocess, a string. In postprocess, a (possibly multipart)
            list of tokens.
          target_name: The name of the target that is passed during inference.
          metadata: Additional metadata of the input.
          config: A configuration override for this example.
          options: A dictionary with operators options.

        Returns:
          - In preprocess, a tuple (source_tokens, target_tokens, metadata).
          - In postprocess, a string (the postprocessed target)

        Raises:
          ValueError: if a configuration override is given while the processor
            configuration is a V2 configuration.
        """
        # This method should be thread-safe as the inference server is starting a new
        # thread for each request.

        # Rebuild pipeline if the example has its own configuration.
        if not config:
            pipeline = self._pipeline
        elif config_util.is_v2_config(self._config):
            raise ValueError(
                "Configuration override is not supported for V2 "
                "configurations")
        else:
            config = config_util.merge_config(copy.deepcopy(self._config),
                                              config)
            pipeline = self.build_pipeline(config)

        tu = TranslationUnit(
            source=source,
            metadata=metadata,
            source_tokenizer=pipeline.start_state.get("src_tokenizer"),
        )

        # Debug messages are prefixed with the thread identifier and the
        # processing direction.
        thread_id = threading.current_thread().ident
        proc = "Postprocess" if self._postprocess else "Preprocess"
        logger.debug("[%d] %s source input:  %s", thread_id, proc, source)

        if target is not None:
            tu.add_target(
                target,
                name=target_name,
                tokenizer=pipeline.start_state.get("tgt_tokenizer"),
            )
            logger.debug("[%d] %s target input:  %s", thread_id, proc, target)

        # The pipeline works on batches: wrap the unit and unwrap the result.
        processed_batch = pipeline(([tu], {}), options=options)
        tu = processed_batch[0][0]

        if self._postprocess:
            logger.debug("[%d] %s target output:  %s", thread_id, proc,
                         tu.tgt_detok)
            return tu.tgt_detok

        src_tokens = tu.src_tok.tokens
        if tu.tgt_tok is not None:
            tgt_tokens = tu.tgt_tok.tokens
        else:
            # One None placeholder per source entry when there is no target.
            tgt_tokens = [None for _ in src_tokens]

        logger.debug("[%d] %s source output:  %s", thread_id, proc, src_tokens)
        if tu.tgt_tok is not None:
            logger.debug("[%d] %s target output:  %s", thread_id, proc,
                         tgt_tokens)
        return src_tokens, tgt_tokens, tu.metadata