def test_api(temp_output_dir, train_opts, nuqe_opts, atol):
    # merge_namespaces and save_config_file are helpers from the
    # surrounding test module.
    from pathlib import Path

    import numpy as np

    from kiwi import constants, train, load_model

    train_opts.model = 'nuqe'
    train_opts.checkpoint_keep_only_best = 1
    all_opts = merge_namespaces(train_opts, nuqe_opts)

    config_file = Path(temp_output_dir, 'config.yaml')
    save_config_file(all_opts, config_file)

    train_run_info = train(config_file)

    predicter = load_model(train_run_info.model_path)

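    # kiwi expects raw, line-aligned text: one sentence per line, with
    # alignments given as whitespace-separated 'src-tgt' index pairs.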
    examples = {
        constants.SOURCE: open(nuqe_opts.test_source).readlines(),
        constants.TARGET: open(nuqe_opts.test_target).readlines(),
        constants.ALIGNMENTS: open(nuqe_opts.test_alignments).readlines(),
    }

    predictions = predicter.predict(examples, batch_size=train_opts.batch_size)

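    # predict() returns a dict mapping each output name (here the target
    # tags) to a list of per-word probabilities for every input sentence.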
    predictions = predictions[constants.TARGET_TAGS]
    avg_of_avgs = np.mean(list(map(np.mean, predictions)))
    max_prob = max(map(max, predictions))
    min_prob = min(map(min, predictions))
    np.testing.assert_allclose(avg_of_avgs, 0.572441, atol=atol)
    assert 0 <= min_prob <= avg_of_avgs <= max_prob <= 1
Example #2
    def inference(self, options, call=None):
        """Return a list of predictions: one list per sentence, with a
        score for each word."""
        data = options['dataset'].blind.data

        kiwi_config = kb.load_kiwi_config(options['quetch_test'])
        pred = kiwi.load_model(kiwi_config['load-model'])
        src = [' '.join(sent.src) for sent in data]
        tgt = [' '.join(sent.tgt) for sent in data]
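        # Serialize alignments as whitespace-separated 'src-tgt' index
        # pairs, the format kiwi expects.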
        alg = [
            ' '.join((f'{alg[0]}-{alg[1]}' for alg in sent.alignment))
            for sent in data
        ]
        examples = {
            kiwi.constants.SOURCE: src,
            kiwi.constants.TARGET: tgt,
            kiwi.constants.ALIGNMENTS: alg
        }

        print("Doing some inference on my blind data of size:",
              len(options['dataset'].blind.data))
        predictions = pred.predict(examples)
        return predictions['tags']
Example #3
    def setup(self, parameters):
        # Copy TRAIN_DEFAULTS so the shared default dict is not mutated in
        # place by the assignments below.
        self.translate_params['train'] = copy.deepcopy(TRAIN_DEFAULTS)
        self.translate_params['model'] = {}

        self.translate_params['train']['seed'] = int(parameters['seed'])
        self.translate_params['train']['model_type'] = parameters['model_type']
        self.translate_params['train']['patience'] = int(
            parameters['patience'])
        self.translate_params['train']['max_epochs'] = int(
            parameters['max_epochs'])
        # NOTE: Disable validation during finetuning; eval_freq,
        # eval_metrics, eval_filters, eval_beam and eval_batch_size are
        # left at their defaults.
        self.translate_params['train']['eval_freq'] = -1
        self.translate_params['train']['save_best_metrics'] = parameters[
            'save_best_metrics']
        self.translate_params['train']['eval_max_len'] = int(
            parameters['eval_max_len'])
        self.translate_params['train']['checkpoint_freq'] = int(
            parameters['checkpoint_freq'])
        # l2_reg is a float penalty weight; int() would truncate small
        # values such as 1e-5.
        self.translate_params['train']['l2_reg'] = float(parameters['l2_reg'])
        self.translate_params['train']['lr_decay'] = parameters['lr_decay']
        self.translate_params['train']['lr_decay_revert'] = parameters[
            'lr_decay_revert']
        self.translate_params['train']['lr_decay_factor'] = parameters[
            'lr_decay_factor']
        self.translate_params['train']['lr_decay_patience'] = int(
            parameters['lr_decay_patience'])
        # gclip (the gradient-clipping threshold) is likewise a float.
        self.translate_params['train']['gclip'] = float(parameters['gclip'])
        self.translate_params['train']['optimizer'] = parameters['optimizer']
        self.translate_params['train']['lr'] = parameters['lr']
        self.translate_params['train']['batch_size'] = int(
            parameters['batch_size'])
        self.translate_params['train']['save_optim_state'] = False

        self.translate_params['train']['save_path'] = Path(
            "/not/used/because/beat_platform")

        self.translate_params['model']['att_type'] = parameters['att_type']
        self.translate_params['model']['att_bottleneck'] = parameters[
            'att_bottleneck']
        self.translate_params['model']['enc_dim'] = int(parameters['enc_dim'])
        self.translate_params['model']['dec_dim'] = int(parameters['dec_dim'])
        self.translate_params['model']['emb_dim'] = int(parameters['emb_dim'])
        self.translate_params['model']['dropout_emb'] = parameters[
            'dropout_emb']
        self.translate_params['model']['dropout_ctx'] = parameters[
            'dropout_ctx']
        self.translate_params['model']['dropout_out'] = parameters[
            'dropout_out']
        self.translate_params['model']['n_encoders'] = int(
            parameters['n_encoders'])
        self.translate_params['model']['tied_emb'] = parameters['tied_emb']
        self.translate_params['model']['dec_init'] = parameters['dec_init']
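        # bucket training batches by the source side (presumably by source
        # sentence length)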
        self.translate_params['model']['bucket_by'] = "src"
        if parameters['max_len'] == "None":
            self.translate_params['model']['max_len'] = None
        else:
            self.translate_params['model']['max_len'] = int(
                parameters['max_len'])
        self.translate_params['model']['direction'] = "src:Text -> trg:Text"

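        # Keep an independent copy of the parameters for the adaptation
        # (finetuning) runs.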
        self.adapted_translate_params = copy.deepcopy(self.translate_params)

        self.qe_model = None
        # direction is e.g. 'en:Text -> de:Text'; characters 11:13 hold the
        # target-language code.
        if parameters['direction'][11:13] == 'de':
            # load the EN-DE quality estimation model
            kiwi_path = os.path.abspath(
                os.path.join(
                    os.pardir,
                    'openkiwi/trained_models/estimator_en_de.torch/estimator_en_de.torch'
                ))
            self.qe_model = kiwi.load_model(kiwi_path)

        return True
Example #4
def run(ModelClass, output_dir, pipeline_options, model_options, splits):
    model_name = getattr(ModelClass, 'title', ModelClass.__name__)
    logger.info('Jackknifing with the {} model'.format(model_name))

    # Data
    fieldset = ModelClass.fieldset(
        wmt18_format=model_options.__dict__.get('wmt18_format'))
    train_set, dev_set = train.retrieve_datasets(fieldset, pipeline_options,
                                                 model_options, output_dir)

    test_set = None
    try:
        test_set = build_test_dataset(fieldset, **vars(pipeline_options))
    except (ValueError, FileNotFoundError):
        # No test data configured or files missing; skip test predictions.
        pass

    device_id = None
    if pipeline_options.gpu_id is not None and pipeline_options.gpu_id >= 0:
        device_id = pipeline_options.gpu_id

    parent_dir = output_dir
    train_predictions = defaultdict(list)
    dev_predictions = defaultdict(list)
    test_predictions = defaultdict(list)
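    # Jackknifing: each fold trains on all-but-one split and predicts the
    # held-out split, yielding out-of-fold predictions for the whole
    # training set.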
    split_datasets = cross_split_dataset(train_set, splits)
    for i, (train_fold, pred_fold) in enumerate(split_datasets):

        run_name = 'train_split_{}'.format(i)
        output_dir = Path(parent_dir, run_name)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Train
        vocabs = utils.fields_to_vocabs(train_fold.fields)

        with tracking_logger.start_nested_run(run_name=run_name):
            train.setup(
                output_dir=output_dir,
                seed=pipeline_options.seed,
                gpu_id=pipeline_options.gpu_id,
                debug=pipeline_options.debug,
                quiet=pipeline_options.quiet,
            )

            trainer = train.retrieve_trainer(
                ModelClass,
                pipeline_options,
                model_options,
                vocabs,
                output_dir,
                device_id,
            )

            # Dataset iterators
            train_iter = build_bucket_iterator(
                train_fold,
                batch_size=pipeline_options.train_batch_size,
                is_train=True,
                device=device_id,
            )
            valid_iter = build_bucket_iterator(
                pred_fold,
                batch_size=pipeline_options.valid_batch_size,
                is_train=False,
                device=device_id,
            )

            trainer.run(train_iter, valid_iter, epochs=pipeline_options.epochs)

        # Predict with the best checkpoint from this fold
        predictor = load_model(trainer.checkpointer.best_model_path())
        train_predictions_i = predictor.run(
            pred_fold, batch_size=pipeline_options.valid_batch_size)

        dev_predictions_i = predictor.run(
            dev_set, batch_size=pipeline_options.valid_batch_size)

        test_predictions_i = None
        if test_set:
            test_predictions_i = predictor.run(
                test_set, batch_size=pipeline_options.valid_batch_size)

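        # Free GPU memory between folds.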
        torch.cuda.empty_cache()

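        # Out-of-fold train predictions are concatenated across folds;
        # dev/test predictions are collected per fold and averaged below.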
        for output_name in train_predictions_i:
            train_predictions[output_name] += train_predictions_i[output_name]
            dev_predictions[output_name].append(dev_predictions_i[output_name])
            if test_set:
                test_predictions[output_name].append(
                    test_predictions_i[output_name])

    dev_predictions = average_all(dev_predictions)
    if test_set:
        test_predictions = average_all(test_predictions)

    save_predicted_probabilities(parent_dir,
                                 train_predictions,
                                 prefix=const.TRAIN)
    save_predicted_probabilities(parent_dir, dev_predictions, prefix=const.DEV)
    if test_set:
        save_predicted_probabilities(parent_dir,
                                     test_predictions,
                                     prefix=const.TEST)

    teardown(pipeline_options)

    return train_predictions
Example #5
from flask import Flask, jsonify, request
import kiwi
from kiwi import constants as const
from flask_cors import CORS

# load model
model = kiwi.load_model('trained_models/estimator_en_de/estimator_en_de.torch')
# app
app = Flask(__name__)
CORS(app)


red_array = []    # words flagged as likely BAD
green_array = []  # words judged OK


def make_color(text, color):
    if color == 'red':
        red_array.append(text)
    else:
        green_array.append(text)


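# Words whose estimated BAD probability reaches the threshold are flagged red.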
def get_color(bad_prob, threshold):
    return 'green' if bad_prob < threshold else 'red'


# routes
@app.route('/', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    threshold = 0.7
    source = data['source']
    mt = data['mt']
    # A minimal completion of the truncated call, assuming the estimator
    # expects one source and one MT sentence per request:
    model_out = model.predict({
        const.SOURCE: [source],
        const.TARGET: [mt],
    })
    return jsonify(model_out)
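
For reference, a minimal client for the route above; the host, port, and
payload keys are assumptions read off the handler:

import requests

resp = requests.post('http://localhost:5000/', json={
    'source': 'this is the source sentence',
    'mt': 'dies ist die maschinelle Übersetzung',
})
print(resp.json())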