Python get_exact_match_metrics Exemples, src.eval.eval_tools.get_exact_match_metrics Python Exemples

Exemple #1

0

Afficher le fichier

def ensemble():
    """Decode one data split with an ensemble of checkpoints and report scores.

    Loads the processed dataset, picks the 'test' or 'dev' split based on
    ``args.test``, restores one framework per entry of ``ensemble_model_dirs``
    and runs inference through the first framework with all models passed as
    ``model_ensemble``. Top-1 predictions are written to a text file inside
    the first framework's ``model_dir`` and the exact-match metrics are
    printed.

    All configuration (``args``, ``ensemble_model_dirs``) is read from the
    enclosing module scope; nothing is returned.
    """
    dataset = data_loader.load_processed_data(args)
    split = 'test' if args.test else 'dev'
    dev_examples = dataset[split]
    print('{} dev examples loaded'.format(len(dev_examples)))

    # Only WikiSQL gets a database engine; it is built from '<split>.db'.
    engine = None
    if args.dataset_name == 'wikisql':
        engine = DBEngine(os.path.join(args.data_dir, '{}.db'.format(split)))

    # Instantiate every framework first, then restore each checkpoint.
    sps = [EncoderDecoderLFramework(args) for _ in ensemble_model_dirs]
    for member, model_dir in zip(sps, ensemble_model_dirs):
        checkpoint_path = os.path.join(model_dir, 'model-best.16.tar')
        member.schema_graphs = dataset['schema']
        member.load_checkpoint(checkpoint_path)
        member.cuda()
        member.eval()

    # The first framework drives inference and owns the cache and output dir.
    lead = sps[0]
    pred_restored_cache = lead.load_pred_restored_cache()
    pred_restored_cache_size = sum(map(len, pred_restored_cache.values()))

    out_dict = lead.inference(
        dev_examples,
        restore_clause_order=args.process_sql_in_execution_order,
        pred_restored_cache=pred_restored_cache,
        check_schema_consistency_=args.sql_consistency_check,
        engine=engine,
        inline_eval=True,
        model_ensemble=[member.mdl for member in sps],
        verbose=True)

    if args.process_sql_in_execution_order:
        # Persist the cache only when inference actually added entries.
        updated_cache = out_dict['pred_restored_cache']
        newly_cached_size = (
            sum(map(len, updated_cache.values())) - pred_restored_cache_size)
        if newly_cached_size > 0:
            lead.save_pred_restored_cache(updated_cache, newly_cached_size)

    out_name = 'predictions.ens.{}.{}.{}.{}.txt'.format(
        args.beam_size, args.bs_alpha, split, len(ensemble_model_dirs))
    out_txt = os.path.join(lead.model_dir, out_name)
    with open(out_txt, 'w') as o_f:
        assert(len(dev_examples) == len(out_dict['pred_decoded']))
        for idx, pred_sql in enumerate(out_dict['pred_decoded']):
            # Only the top-ranked hypothesis of each example is saved.
            if args.dataset_name == 'wikisql':
                record = json.dumps(
                    {'sql': pred_sql[0], 'table_id': dev_examples[idx].db_name})
            else:
                record = pred_sql[0]
            o_f.write('{}\n'.format(record))
        print('Model predictions saved to {}'.format(out_txt))

    print('{} set performance'.format(split.upper()))
    metrics = eval_tools.get_exact_match_metrics(
        dev_examples, out_dict['pred_decoded'], engine=engine)
    report_lines = (
        ('Top-1 exact match: {:.3f}', 'top_1_em'),
        ('Top-2 exact match: {:.3f}', 'top_2_em'),
        ('Top-3 exact match: {:.3f}', 'top_3_em'),
        ('Top-5 exact match: {:.3f}', 'top_5_em'),
        ('Top-10 exact match: {:.3f}', 'top_10_em'),
    )
    for template, metric_key in report_lines:
        print(template.format(metrics[metric_key]))

Exemple #2

0

Afficher le fichier

Fichier : experiments.py Projet : todpole3/TabularSemanticParsing

 def evaluate(examples, out_dict):
     """Print exact-match metrics for a set of decoded predictions.

     Args:
         examples: Gold examples, passed straight through to
             ``eval_tools.get_exact_match_metrics``.
         out_dict: Inference output; only ``out_dict['pred_decoded']`` is
             read.

     ``engine`` and ``args`` come from the enclosing scope. Execution-match
     numbers are printed only when ``args.dataset_name`` is ``'wikisql'``.
     Nothing is returned.
     """
     metrics = eval_tools.get_exact_match_metrics(examples,
                                                  out_dict['pred_decoded'],
                                                  engine=engine)
     top_ks = (1, 2, 3, 5, 10)
     for k in top_ks:
         print('Top-{} exact match: {:.3f}'.format(
             k, metrics['top_{}_em'.format(k)]))
     if args.dataset_name == 'wikisql':
         for k in top_ks:
             # The Top-10 label previously read "exet match"; all execution
             # labels now share the same "exe match" wording.
             print('Top-{} exe match: {:.3f}'.format(
                 k, metrics['top_{}_ex'.format(k)]))
     print('Table error: {:.3f}'.format(metrics['table_err']))

Exemple #3

0

Afficher le fichier

 def evaluate(examples, out_dict):
     """Print exact-match metrics and save them to a performance file.

     Args:
         examples: Gold examples, passed straight through to
             ``eval_tools.get_exact_match_metrics``.
         out_dict: Inference output; only ``out_dict['pred_decoded']`` is
             read.

     ``engine``, ``args`` and ``sp`` come from the enclosing scope.
     Execution-match numbers are printed only when ``args.dataset_name`` is
     ``'wikisql'``, but every metric key (including the ``*_ex`` ones) is
     written to the performance file under ``sp.model_dir``. Nothing is
     returned.
     """
     metrics = eval_tools.get_exact_match_metrics(
         examples, out_dict['pred_decoded'], engine=engine)
     top_ks = (1, 2, 3, 5, 10)
     em_keys = ['top_{}_em'.format(k) for k in top_ks]
     ex_keys = ['top_{}_ex'.format(k) for k in top_ks]

     for k, key in zip(top_ks, em_keys):
         print('Top-{} exact match: {:.3f}'.format(k, metrics[key]))
     if args.dataset_name == 'wikisql':
         for k, key in zip(top_ks, ex_keys):
             # The Top-10 label previously read "exet match"; all execution
             # labels now share the same "exe match" wording.
             print('Top-{} exe match: {:.3f}'.format(k, metrics[key]))
     print('Table error: {:.3f}'.format(metrics['table_err']))

     # NOTE(review): the file name uses the second '/'-separated component of
     # args.data_dir; this raises IndexError when data_dir has no '/' --
     # confirm the expected path layout.
     performance = os.path.join(sp.model_dir, f"test_performance_{args.data_dir.split('/')[1]}_{args.beam_size}.txt")
     # Same key order as before: all *_em, then all *_ex, then table_err.
     metric_keys = em_keys + ex_keys + ['table_err']
     with open(performance, 'w') as pf:
         for key in metric_keys:
             pf.write(f'{key}: {metrics[key]:.3f}\n')

Exemple #4

0

Afficher le fichier

    def run_train(self, train_data, dev_data):
        """Run the training loop with periodic dev-set evaluation.

        Training proceeds in intervals of ``num_peek_steps`` mini-batches.
        After each interval the average training loss is reported, the model
        is evaluated on ``dev_data``, and a checkpoint is saved (with
        ``is_best=True``) whenever the dev metric is at least as good as the
        best seen so far. Progress is logged to Weights & Biases.

        Args:
            train_data: Training examples. Examples are read for
                ``dataset_id`` when ``args.augment_with_wikisql`` is set and
                for ``hardness`` when ``args.curriculum_interval > 0``. The
                list is shuffled in place unless it was replaced by a
                filtered copy first.
            dev_data: Development examples used for periodic evaluation.

        Returns nothing (there is no ``return`` statement in the body).
        """
        self.print_model_parameters()

        # wandb is imported lazily, only when training actually starts.
        import wandb
        wandb.init(project='smore-{}-group-{}-final'.format(
            self.args.dataset_name,
            get_no_join_tag(self.args, separator_in_front=True)),
                   group=get_wandb_group(self.args),
                   name=get_wandb_tag(self.args))
        os.environ["WANDB_RUN_GROUP"] = get_wandb_group(self.args)
        wandb.watch(self)

        if self.args.augment_with_wikisql:
            # Separate WikiSQL examples from the main training examples so the
            # two pools can be sampled with different batch sizes.
            train_data_, train_data_augment = [], []
            for example in train_data:
                if example.dataset_id == WIKISQL:
                    train_data_augment.append(example)
                else:
                    train_data_.append(example)
            train_data = train_data_
            # 70% of each batch comes from the main pool, the rest from the
            # WikiSQL pool.
            train_batch_size = round(self.train_batch_size * 0.7)
            train_augment_batch_size = self.train_batch_size - train_batch_size

            dev_data_, dev_data_augment = [], []
            for example in dev_data:
                if example.dataset_id == WIKISQL:
                    dev_data_augment.append(example)
                else:
                    dev_data_.append(example)
                # NOTE(review): this rebinding sits inside the loop body,
                # unlike the train_data rebinding above. The loop keeps
                # iterating over the original list object, so the result is
                # the same, but it probably belongs one level out -- confirm.
                dev_data = dev_data_
            print('**************************')
            print('{} training examples'.format(len(train_data)))
            print('{} augmented training examples'.format(
                len(train_data_augment)))
            print('train batch size = {}'.format(train_batch_size))
            print('train augment batch size = {}'.format(
                train_augment_batch_size))
            print('{} dev examples'.format(len(dev_data)))
            print('{} augmented dev examples'.format(len(dev_data_augment)))
            print('**************************')
        else:
            train_batch_size = self.train_batch_size
            train_augment_batch_size = 0

        # Track training losses and dev metric changes.
        ############################
        epoch_losses = []
        best_dev_metrics = 0
        dev_metrics_history = []
        ############################

        # Keep a full copy so later curriculum stages can re-filter from it.
        all_train_data = copy.deepcopy(train_data)
        # Curriculum learning (start from the easy categories)
        if self.args.curriculum_interval > 0:
            # assert(self.args.curriculum_interval % self.args.num_peek_steps == 0)
            train_data = [
                exp for exp in all_train_data
                if exp.hardness in ['easy', 'medium']
            ]
            print('Curriculumn: [easy, medium] ({}) ------'.format(
                len(train_data)))

        # All step counts below are expressed in mini-batches, i.e. scaled by
        # the number of gradient-accumulation steps.
        num_steps = self.num_steps * self.num_accumulation_steps
        num_peek_steps = self.num_peek_steps * self.num_accumulation_steps
        curriculum_interval = self.args.curriculum_interval * self.num_accumulation_steps

        random.shuffle(train_data)
        if self.args.augment_with_wikisql:
            random.shuffle(train_data_augment)
            augment_example_id = 0
        # example_id is the read cursor into train_data.
        step_id, example_id = 0, 0

        self.optim.zero_grad()
        self.train()

        for interval_step_id in range(self.start_step, num_steps,
                                      num_peek_steps):
            # Switch back to training mode (evaluation below calls self.eval())
            self.train()

            for s_id in tqdm(range(num_peek_steps)):
                step_id = interval_step_id + s_id
                if self.log_in_wandb(step_id / self.num_accumulation_steps):
                    # presumably param group 0 is the base learning rate and
                    # group 1 the fine-tuning rate -- verify against the
                    # optimizer setup.
                    wandb.log({
                        'learning_rate/{}'.format(self.dataset):
                        self.optim.param_groups[0]['lr']
                    })
                    wandb.log({
                        'fine_tuning_rate/{}'.format(self.dataset):
                        self.optim.param_groups[1]['lr']
                    })

                batch_end = example_id + train_batch_size
                # Curriculum stage changes happen exactly at 1x and 2x
                # curriculum_interval; each change reshuffles and restarts the
                # read cursor.
                if curriculum_interval > 0 and step_id % curriculum_interval == 0 and \
                        0 < step_id / curriculum_interval <= 2:
                    if float(step_id) / curriculum_interval == 1:
                        train_data = [
                            exp for exp in all_train_data
                            if exp.hardness in ['easy', 'medium', 'hard']
                        ]
                        print('Curriculumn: [easy, medium, hard] ({}) ------'.
                              format(len(train_data)))
                    elif float(step_id) / curriculum_interval == 2:
                        train_data = all_train_data
                        print(
                            'Curriculumn: [easy, medium, hard, extra] ({}) ------'
                            .format(len(train_data)))
                    random.shuffle(train_data)
                    example_id, batch_end = 0, train_batch_size
                # Not enough examples left for a full batch: reshuffle and
                # start over from the beginning.
                if batch_end > len(train_data):
                    random.shuffle(train_data)
                    example_id, batch_end = 0, train_batch_size
                mini_batch = train_data[example_id:batch_end]
                example_id = batch_end
                if self.args.augment_with_wikisql:
                    # Same wrap-around logic for the WikiSQL pool, then append
                    # its slice to the mini-batch.
                    augment_batch_end = augment_example_id + train_augment_batch_size
                    if augment_batch_end > len(train_data_augment):
                        random.shuffle(train_data_augment)
                        augment_example_id, augment_batch_end = 0, train_augment_batch_size
                    mini_batch += train_data_augment[
                        augment_example_id:augment_batch_end]
                    augment_example_id = augment_batch_end

                formatted_batch = self.format_batch(mini_batch)
                loss = self.loss(formatted_batch)
                loss.backward()
                # presumably self.loss already divides by the number of
                # accumulation steps, hence the rescaling for reporting --
                # TODO confirm.
                epoch_losses.append(float(loss) * self.num_accumulation_steps)

                # Apply an optimizer update once per accumulation window.
                if (step_id + 1) % self.num_accumulation_steps == 0:
                    # Gradient clipping
                    if self.grad_norm > 0:
                        nn.utils.clip_grad_norm_(self.parameters(),
                                                 self.grad_norm)
                    # Update learning rate scheduler
                    # NOTE(review): the scheduler is stepped before the
                    # optimizer here; recent PyTorch documentation recommends
                    # the opposite order -- confirm this is intentional.
                    self.lr_scheduler.step()
                    # Update parameters
                    self.optim.step()
                    self.optim.zero_grad()

            # Report the mean training loss of the interval that just ended,
            # then reset the accumulator.
            if step_id > 0 and (step_id + 1) % num_peek_steps == 0:
                stdout_msg = 'Step {}: average training loss = {}'.format(
                    step_id / self.num_accumulation_steps,
                    np.mean(epoch_losses))
                print(stdout_msg)
                wandb.log({
                    'cross_entropy_loss/{}'.format(self.dataset):
                    np.mean(epoch_losses)
                })
                epoch_losses = []

            # Evaluate on the dev set and checkpoint if the metric did not drop.
            if step_id > 0 and (step_id + 1) % num_peek_steps == 0:
                self.eval()
                if self.args.process_sql_in_execution_order:
                    # Remember the cache size before inference so that growth
                    # can be detected afterwards.
                    pred_restored_cache = self.load_pred_restored_cache()
                    pred_restored_cache_size = sum(
                        len(v) for v in pred_restored_cache.values())
                else:
                    pred_restored_cache = None
                # A database engine is only created for WikiSQL ('dev.db').
                engine_path = os.path.join(
                    self.args.data_dir,
                    'dev.db') if self.args.dataset_name == 'wikisql' else None
                engine = DBEngine(engine_path) if engine_path else None

                output_dict = self.inference(
                    dev_data,
                    restore_clause_order=self.args.
                    process_sql_in_execution_order,
                    pred_restored_cache=pred_restored_cache,
                    check_schema_consistency_=self.args.sql_consistency_check,
                    engine=engine,
                    inline_eval=True,
                    verbose=False)
                metrics = eval_tools.get_exact_match_metrics(
                    dev_data, output_dict['pred_decoded'], engine=engine)
                dev_metrics_history.append(metrics)

                # Model selection metric: top-1 execution match for WikiSQL,
                # top-1 exact match otherwise.
                eval_metrics = metrics[
                    'top_1_ex'] if self.args.dataset_name == 'wikisql' else metrics[
                        'top_1_em']
                wandb.log(
                    {'dev_exact_match/{}'.format(self.dataset): eval_metrics})

                print('Dev set performance:')
                print('Top-1 exact match: {}'.format(metrics['top_1_em']))
                print('Top-3 exact match: {}'.format(metrics['top_3_em']))
                if self.args.dataset_name == 'wikisql':
                    print('Top-1 exe acc: {}'.format(metrics['top_1_ex']))
                    print('Top-3 exe acc: {}'.format(metrics['top_3_ex']))

                # Ties count as an improvement (>=), so the latest of equally
                # good checkpoints is the one kept as best.
                if eval_metrics >= best_dev_metrics:
                    best_dev_metrics = eval_metrics
                    self.save_checkpoint(step_id,
                                         step_id / num_peek_steps,
                                         output_dict['pred_decoded'],
                                         is_best=True)
                # Every third evaluation also scores the held-out WikiSQL dev
                # examples (no engine, so exact match only).
                if self.args.augment_with_wikisql and (step_id + 1) % (
                        num_peek_steps * 3) == 0:
                    wikisql_output_dict = self.inference(dev_data_augment,
                                                         inline_eval=True,
                                                         verbose=False)
                    wikisql_metrics = eval_tools.get_exact_match_metrics(
                        dev_data_augment, wikisql_output_dict['pred_decoded'])
                    wandb.log({
                        'wikisql_dev_exact_match/{}'.format(self.dataset):
                        wikisql_metrics['top_1_em']
                    })
                    print('WikiSQL dev set performance:')
                    print('Top-1 exact match: {}'.format(
                        wikisql_metrics['top_1_em']))
                    print('Top-3 exact match: {}'.format(
                        wikisql_metrics['top_3_em']))
                # Save the prediction-restoration cache only if inference
                # added new entries to it.
                if self.args.process_sql_in_execution_order:
                    new_pred_restored_cache_size = sum(
                        len(v)
                        for v in output_dict['pred_restored_cache'].values())
                    newly_cached_size = new_pred_restored_cache_size - pred_restored_cache_size
                    if newly_cached_size > 0:
                        self.save_pred_restored_cache(
                            output_dict['pred_restored_cache'],
                            newly_cached_size)