import numpy as np
from pathlib import Path

from kiwi import constants

# merge_namespaces and save_config_file are assumed to be helpers supplied
# by the surrounding test suite (e.g. its conftest).


def test_api(temp_output_dir, train_opts, nuqe_opts, atol):
    from kiwi import train, load_model

    # Train a small NuQE model from a saved config file.
    train_opts.model = 'nuqe'
    train_opts.checkpoint_keep_only_best = 1
    all_opts = merge_namespaces(train_opts, nuqe_opts)

    config_file = Path(temp_output_dir, 'config.yaml')
    save_config_file(all_opts, config_file)
    train_run_info = train(config_file)

    # Reload the trained model and predict on the test files.
    predicter = load_model(train_run_info.model_path)
    examples = {
        constants.SOURCE: open(nuqe_opts.test_source).readlines(),
        constants.TARGET: open(nuqe_opts.test_target).readlines(),
        constants.ALIGNMENTS: open(nuqe_opts.test_alignments).readlines(),
    }
    predictions = predicter.predict(examples, batch_size=train_opts.batch_size)

    # Word-level tags: sanity-check the aggregate statistics.
    predictions = predictions[constants.TARGET_TAGS]
    avg_of_avgs = np.mean(list(map(np.mean, predictions)))
    max_prob = max(map(max, predictions))
    min_prob = min(map(min, predictions))
    np.testing.assert_allclose(avg_of_avgs, 0.572441, atol=atol)
    assert 0 <= min_prob <= avg_of_avgs <= max_prob <= 1
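
# Minimal standalone sketch of the predict API exercised by test_api above;
# the model path here is hypothetical, and the examples dict mirrors the
# keys used in the test.
import numpy as np

from kiwi import constants, load_model

predicter = load_model('runs/nuqe/best_model.torch')  # hypothetical path
examples = {
    constants.SOURCE: ['a source sentence .\n'],
    constants.TARGET: ['a target sentence .\n'],
    constants.ALIGNMENTS: ['0-0 1-1 2-2 3-3\n'],
}
predictions = predicter.predict(examples, batch_size=1)
# predictions[constants.TARGET_TAGS] holds one list of per-word
# probabilities in [0, 1] for each input sentence.
print(np.mean(predictions[constants.TARGET_TAGS][0]))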

def inference(self, options, call=None):
    """Return a list of predictions: one list per sentence, with one
    number for each word in the sentence."""
    # Note: the original signature used a mutable default (call=[]).
    data = options['dataset'].blind.data
    kiwi_config = kb.load_kiwi_config(options['quetch_test'])
    pred = kiwi.load_model(kiwi_config['load-model'])

    # Convert the blind data into the plain-text format kiwi expects.
    src = [' '.join(sent.src) for sent in data]
    tgt = [' '.join(sent.tgt) for sent in data]
    alg = [
        ' '.join(f'{a[0]}-{a[1]}' for a in sent.alignment)
        for sent in data
    ]
    examples = {
        kiwi.constants.SOURCE: src,
        kiwi.constants.TARGET: tgt,
        kiwi.constants.ALIGNMENTS: alg,
    }

    print("Doing some inference on my blind data of size:", len(data))
    predictions = pred.predict(examples)
    return predictions['tags']
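
# Hedged sketch of the inputs inference() above expects, pieced together
# from its attribute accesses; SimpleNamespace stands in for the real
# dataset classes, and the config path is hypothetical.
from types import SimpleNamespace

sent = SimpleNamespace(
    src=['ein', 'Satz'],
    tgt=['a', 'sentence'],
    alignment=[(0, 0), (1, 1)],
)
options = {
    'dataset': SimpleNamespace(blind=SimpleNamespace(data=[sent])),
    'quetch_test': 'config/quetch_test.yaml',  # hypothetical path
}
# predictions = self.inference(options)  # one list of word scores per sentence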

def setup(self, parameters):
    # Copy the defaults so the shared TRAIN_DEFAULTS dict is not mutated
    # in place.
    self.translate_params['train'] = dict(TRAIN_DEFAULTS)
    self.translate_params['model'] = {}
    train_cfg = self.translate_params['train']
    model_cfg = self.translate_params['model']

    train_cfg['seed'] = int(parameters['seed'])
    train_cfg['model_type'] = parameters['model_type']
    train_cfg['patience'] = int(parameters['patience'])
    train_cfg['max_epochs'] = int(parameters['max_epochs'])
    # NOTE: validation is disabled during finetuning; the other eval_*
    # options (eval_metrics, eval_filters, eval_beam, eval_batch_size)
    # are deliberately left at their defaults.
    train_cfg['eval_freq'] = -1
    train_cfg['save_best_metrics'] = parameters['save_best_metrics']
    train_cfg['eval_max_len'] = int(parameters['eval_max_len'])
    train_cfg['checkpoint_freq'] = int(parameters['checkpoint_freq'])
    # float, not int: fractional values would be truncated (or rejected)
    # by int().
    train_cfg['l2_reg'] = float(parameters['l2_reg'])
    train_cfg['lr_decay'] = parameters['lr_decay']
    train_cfg['lr_decay_revert'] = parameters['lr_decay_revert']
    train_cfg['lr_decay_factor'] = parameters['lr_decay_factor']
    train_cfg['lr_decay_patience'] = int(parameters['lr_decay_patience'])
    train_cfg['gclip'] = float(parameters['gclip'])
    train_cfg['optimizer'] = parameters['optimizer']
    train_cfg['lr'] = parameters['lr']
    train_cfg['batch_size'] = int(parameters['batch_size'])
    train_cfg['save_optim_state'] = False
    train_cfg['save_path'] = Path("/not/used/because/beat_platform")

    model_cfg['att_type'] = parameters['att_type']
    model_cfg['att_bottleneck'] = parameters['att_bottleneck']
    model_cfg['enc_dim'] = int(parameters['enc_dim'])
    model_cfg['dec_dim'] = int(parameters['dec_dim'])
    model_cfg['emb_dim'] = int(parameters['emb_dim'])
    model_cfg['dropout_emb'] = parameters['dropout_emb']
    model_cfg['dropout_ctx'] = parameters['dropout_ctx']
    model_cfg['dropout_out'] = parameters['dropout_out']
    model_cfg['n_encoders'] = int(parameters['n_encoders'])
    model_cfg['tied_emb'] = parameters['tied_emb']
    model_cfg['dec_init'] = parameters['dec_init']
    model_cfg['bucket_by'] = "src"
    if parameters['max_len'] == "None":
        model_cfg['max_len'] = None
    else:
        model_cfg['max_len'] = int(parameters['max_len'])
    model_cfg['direction'] = "src:Text -> trg:Text"

    self.adapted_translate_params = copy.deepcopy(self.translate_params)

    self.qe_model = None
    # Crude check of the target-language code embedded in the direction
    # string (characters 11:13); only an EN-DE QE model is available.
    if parameters['direction'][11:13] == 'de':
        # load EN-DE model
        kiwi_path = os.path.abspath(
            os.path.join(
                os.pardir,
                'openkiwi/trained_models/estimator_en_de.torch/'
                'estimator_en_de.torch',
            )
        )
        self.qe_model = kiwi.load_model(kiwi_path)
    return True
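
# Hedged example of a `parameters` mapping that setup() above can consume.
# The keys are taken directly from the accesses in setup(); every value is
# illustrative only, not a recommended configuration.
example_parameters = {
    'seed': '42',
    'model_type': 'nmt',
    'patience': '10',
    'max_epochs': '50',
    'save_best_metrics': True,
    'eval_max_len': '100',
    'checkpoint_freq': '5000',
    'l2_reg': '0.0',
    'lr_decay': 'plateau',
    'lr_decay_revert': False,
    'lr_decay_factor': 0.5,
    'lr_decay_patience': '2',
    'gclip': '1.0',
    'optimizer': 'adam',
    'lr': 0.0004,
    'batch_size': '64',
    'att_type': 'mlp',
    'att_bottleneck': 'ctx',
    'enc_dim': '256',
    'dec_dim': '256',
    'emb_dim': '128',
    'dropout_emb': 0.3,
    'dropout_ctx': 0.5,
    'dropout_out': 0.5,
    'n_encoders': '2',
    'tied_emb': False,
    'dec_init': 'mean_ctx',
    'max_len': 'None',
    # setup() only inspects direction[11:13]; the real format of this
    # string is not visible in the snippet, so this is a placeholder whose
    # characters 11:13 spell the target language code.
    'direction': '.' * 11 + 'de',
}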

def run(ModelClass, output_dir, pipeline_options, model_options, splits):
    model_name = getattr(ModelClass, 'title', ModelClass.__name__)
    logger.info('Jackknifing with the {} model'.format(model_name))

    # Data
    fieldset = ModelClass.fieldset(
        wmt18_format=model_options.__dict__.get('wmt18_format')
    )
    train_set, dev_set = train.retrieve_datasets(
        fieldset, pipeline_options, model_options, output_dir
    )
    test_set = None
    try:
        test_set = build_test_dataset(fieldset, **vars(pipeline_options))
    except (ValueError, FileNotFoundError):
        pass

    device_id = None
    if pipeline_options.gpu_id is not None and pipeline_options.gpu_id >= 0:
        device_id = pipeline_options.gpu_id

    parent_dir = output_dir
    train_predictions = defaultdict(list)
    dev_predictions = defaultdict(list)
    test_predictions = defaultdict(list)

    split_datasets = cross_split_dataset(train_set, splits)
    for i, (train_fold, pred_fold) in enumerate(split_datasets):
        run_name = 'train_split_{}'.format(i)
        output_dir = Path(parent_dir, run_name)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Train
        vocabs = utils.fields_to_vocabs(train_fold.fields)
        tracking_run = tracking_logger.start_nested_run(run_name=run_name)
        with tracking_run:
            train.setup(
                output_dir=output_dir,
                seed=pipeline_options.seed,
                gpu_id=pipeline_options.gpu_id,
                debug=pipeline_options.debug,
                quiet=pipeline_options.quiet,
            )
            trainer = train.retrieve_trainer(
                ModelClass,
                pipeline_options,
                model_options,
                vocabs,
                output_dir,
                device_id,
            )

            # Dataset iterators
            train_iter = build_bucket_iterator(
                train_fold,
                batch_size=pipeline_options.train_batch_size,
                is_train=True,
                device=device_id,
            )
            valid_iter = build_bucket_iterator(
                pred_fold,
                batch_size=pipeline_options.valid_batch_size,
                is_train=False,
                device=device_id,
            )

            trainer.run(train_iter, valid_iter, epochs=pipeline_options.epochs)

        # Predict with the best checkpoint of this fold
        predictor = load_model(trainer.checkpointer.best_model_path())
        train_predictions_i = predictor.run(
            pred_fold, batch_size=pipeline_options.valid_batch_size
        )
        dev_predictions_i = predictor.run(
            dev_set, batch_size=pipeline_options.valid_batch_size
        )
        test_predictions_i = None
        if test_set:
            test_predictions_i = predictor.run(
                test_set, batch_size=pipeline_options.valid_batch_size
            )

        torch.cuda.empty_cache()

        for output_name in train_predictions_i:
            train_predictions[output_name] += train_predictions_i[output_name]
            dev_predictions[output_name].append(dev_predictions_i[output_name])
            if test_set:
                test_predictions[output_name].append(
                    test_predictions_i[output_name]
                )

    dev_predictions = average_all(dev_predictions)
    if test_set:
        test_predictions = average_all(test_predictions)

    save_predicted_probabilities(
        parent_dir, train_predictions, prefix=const.TRAIN
    )
    save_predicted_probabilities(parent_dir, dev_predictions, prefix=const.DEV)
    if test_set:
        save_predicted_probabilities(
            parent_dir, test_predictions, prefix=const.TEST
        )

    teardown(pipeline_options)

    return train_predictions
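
# Toy illustration of the jackknife bookkeeping in run() above: held-out
# fold predictions are concatenated to cover the whole training set, while
# dev/test predictions from each fold are averaged. This assumes (from its
# name and usage) that average_all averages fold predictions element-wise.
import numpy as np

folds_dev_preds = [[0.1, 0.4], [0.3, 0.2], [0.2, 0.3]]  # one list per fold
dev_avg = np.mean(folds_dev_preds, axis=0)  # -> array([0.2, 0.3])
print(dev_avg)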

from flask import Flask, jsonify, request
import kiwi
from kiwi import constants as const
from flask_cors import CORS

# load model
model = kiwi.load_model('trained_models/estimator_en_de/estimator_en_de.torch')

# app
app = Flask(__name__)
CORS(app)

# Module-level buckets filled by make_color(); they were referenced but
# never defined in the original snippet.
red_array = []
green_array = []


def make_color(text, color):
    if color == 'red':
        red_array.append(text)
    else:
        green_array.append(text)


def get_color(bad_prob, threshold):
    return 'green' if bad_prob < threshold else 'red'


# routes
@app.route('/', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    threshold = 0.7
    source = data['source']
    mt = data['mt']
    # The original snippet breaks off inside this call; the completion
    # below follows the examples-dict shape used with kiwi's predict
    # elsewhere in this file (lists of sentences keyed by kiwi constants).
    model_out = model.predict({
        const.SOURCE: [source],
        const.TARGET: [mt],
    })
    # Assumed response shape: per-word BAD probabilities for the single
    # input sentence under const.TARGET_TAGS.
    bad_probs = model_out[const.TARGET_TAGS][0]
    colors = [get_color(p, threshold) for p in bad_probs]
    return jsonify({'tags': bad_probs, 'colors': colors})
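
# Hedged usage sketch for the route above, assuming the app is served on
# the Flask default http://localhost:5000:
import requests

resp = requests.post(
    'http://localhost:5000/',
    json={'source': 'ein kleiner Satz .', 'mt': 'a small sentence .'},
)
print(resp.json())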