Example #1
0
import os
import shutil
from os.path import join

# Project-specific helpers used below (create_argparser, get_props_to_evaluate,
# load_docs, get_best_model_picker, history_improvement_controller,
# dump_dict_as_json, transformer_from_props, splits_cycle,
# get_experiments_report) are assumed to be importable from the surrounding
# codebase.


def main():
    args = create_argparser().parse_args()
    base_props, props_to_evaluate = get_props_to_evaluate(args)
    dataset, unlabeled_docs = load_docs(args)
    props_picker, props_best_results = get_best_model_picker(
        history_improvement_controller(len(props_to_evaluate)))

    if not props_to_evaluate:
        print("No props found")
        return

    if os.path.exists(args.out_dir) and (not os.path.isdir(args.out_dir)
                                         or os.listdir(args.out_dir)):
        print("Output path should either not exists or be empty directory")
        return

    # Evaluate each candidate props configuration in turn.
    for props_idx, props in enumerate(props_to_evaluate):
        cur_props_path = join(args.out_dir, f'props_{props_idx}')
        os.makedirs(cur_props_path, exist_ok=True)
        dump_dict_as_json(props, join(cur_props_path, 'props.json'))

        # Apply the configured transformers to both labeled and unlabeled data.
        with transformer_from_props(props.get("transformers", {})) as t:
            tr_dataset = dataset.transformed_by(t)
            tr_unlabeled_docs = ([t.transform(doc) for doc in unlabeled_docs]
                                 if unlabeled_docs is not None else None)

        mean_main_score, mean_scores = splits_cycle(args.task_name, args.seeds,
                                                    props, props_idx,
                                                    tr_dataset,
                                                    tr_unlabeled_docs,
                                                    cur_props_path)
        props_picker(mean_main_score, mean_scores, lambda: None)

    best_props_idx = props_best_results.best_score_idx
    best_main_score, best_scores = props_best_results.best_scores
    print(
        f"Overall experiment best score: {best_main_score:.4f}, props: #{best_props_idx}"
    )

    # Copy the best-seed artifacts of the winning configuration for each split.
    best_props_path = join(args.out_dir, 'best_props')
    os.makedirs(best_props_path)

    for split_idx in range(dataset.splits_number):
        split_path = join(args.out_dir, f'props_{best_props_idx}',
                          f'split_{split_idx}')
        split_best_seed = join(split_path, 'best_seed')

        shutil.copytree(split_best_seed,
                        join(best_props_path, f'split_{split_idx}'))
        shutil.copy(join(split_path, 'mean_results.json'),
                    join(best_props_path, f'split_{split_idx}'))

    # Persist the overall best scores and a full experiments report.
    dump_dict_as_json({
        **best_scores, "props_num": best_props_idx
    }, join(args.out_dir, "best_results.json"))
    dump_dict_as_json(
        get_experiments_report(props_best_results, base_props,
                               props_to_evaluate),
        join(args.out_dir, "experiments_report.json"))
Example #2
0
import json
import os
from os.path import join
from typing import List

# `Document` is a project-specific type assumed to be importable from the
# surrounding codebase.


def get_evaluating_hook(dev_docs: List[Document], train_docs: List[Document],
                        evaluate, base_path: str, early_stopping_rounds: int):
    """Build an epoch-end hook that evaluates on the dev docs and keeps the best model.

    Returns a tuple (hook, results_storage): the hook is called as
    hook(classifier, epoch) after each epoch and returns True once early
    stopping is triggered; results_storage tracks the best dev scores.
    """

    stats_path = join(base_path, "best_model_stats")
    os.makedirs(stats_path, exist_ok=True)
    model_path = join(base_path, "best_model")
    os.makedirs(model_path, exist_ok=True)

    picker, results_storage = get_best_model_picker(
        history_improvement_controller(early_stopping_rounds))

    def save_clf_and_dump_stats(classifier, stats_generator):
        classifier.save(model_path)

        if stats_generator is not None:
            for i, doc in enumerate(dev_docs):
                with open(join(stats_path, doc.name + '_stats.txt'),
                          'w',
                          encoding='utf-8') as f:
                    f.write(stats_generator(i))

    def apply(classifier, epoch):
        print(f"Epoch {epoch}, dev results:")
        dev_main_score, dev_scores, stats_generator = evaluate(classifier,
                                                               dev_docs,
                                                               need_stats=True)
        print("Score={:.4f}".format(dev_main_score))
        print()

        stopped = picker(
            dev_main_score, dev_scores,
            lambda: save_clf_and_dump_stats(classifier, stats_generator))

        # Every 10 epochs, additionally report scores on the training set.
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, train results:")
            train_main_score, train_scores, _ = evaluate(classifier,
                                                         train_docs,
                                                         need_stats=False)
            print("Score={:.4f}".format(train_main_score))
            print()
        else:
            train_scores = None

        scores = {'dev': dev_scores}
        if train_scores is not None:
            scores['train'] = train_scores
        # Append this epoch's scores to the running results log.
        with open(join(base_path, 'results.txt'), 'a', encoding='utf-8') as f:
            f.write(
                json.dumps({f"epoch_{epoch}": scores},
                           indent=4,
                           sort_keys=True))
            f.write('\n\n')

        return stopped

    return apply, results_storage
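
For context, a minimal usage sketch of the returned pair, assuming a hypothetical train_one_epoch step and classifier object; every name not defined in the example above is an illustrative placeholder.

# Illustrative wiring only: `classifier`, `train_one_epoch`, `evaluate`,
# `dev_docs`, `train_docs` and the paths are placeholders, not project code.
hook, results_storage = get_evaluating_hook(dev_docs, train_docs, evaluate,
                                            base_path='experiments/run_0',
                                            early_stopping_rounds=5)

for epoch in range(1, 101):
    train_one_epoch(classifier, train_docs)  # hypothetical training step
    if hook(classifier, epoch):              # True once early stopping fires
        break

best_main_score, best_scores = results_storage.best_scores
print(f"Best dev score: {best_main_score:.4f}")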