def create_active_learner(self):
        """Build the MNLP active learner on top of a BERT model and start it.

        Seeds torch from ``config.RANDOM_STATE``, builds a vocabulary from
        ``train_dataset_for_voc``, wires a ``LibActBertCreator`` into an MNLP
        query strategy, stores the resulting ``ActiveLearner`` on
        ``self.active_learner`` and calls its ``start()``.
        """
        torch.manual_seed(self.config.RANDOM_STATE)

        vocabulary = Vocabulary.from_instances(
            self.train_dataset_for_voc.instances)

        creator_kwargs = dict(
            tokenizer_name=self.config.BERT_MODEL_TYPE,
            bert_model_type=self.config.BERT_MODEL_TYPE,
            cache_dir=self.config.CACHE_DIR,
            n_epochs=self.config.N_EPOCHS,
            lr=self.config.LEARNING_RATE,
            bs=self.config.BATCH_SIZE,
            ebs=self.config.PRED_BATCH_SIZE,
            patience=self.config.PATIENCE,
            additional_X=self._additional_X,
            additional_y=self._additional_y,
            vocab=vocabulary,
            reader=self.reader,
            bs_pred=self.config.PRED_BATCH_SIZE,
        )
        model_factory = LibActBertCreator(**creator_kwargs)

        def build_strategy(trn_ds):
            # A fresh model is created every time the strategy is constructed,
            # so the configuration is read at call time, not at definition time.
            model = model_factory(
                valid_ratio=self.config.VALIDATION_RATIO,
                retrain_epochs=self.config.N_EPOCHS,
                autofill_similar_objects=True,
                n_upsample_positive=self.config.UPSAMPLE_POSITIVE,
            )
            return StrategyMNLP(dataset=trn_ds, model=model)

        strategy_ctor = make_libact_strategy_ctor(
            build_strategy,
            max_samples_number=self.config.MAX_SAMPLES_NUMBER)

        # One random start step is requested only when no seed label exists.
        has_seed_label = any(
            [e is not None for e in self.y_seed_dict])
        start_steps = 0 if has_seed_label else 1

        # ActiveLearner implements the actual AL loop.
        learner = ActiveLearner(
            active_learn_alg_ctor=strategy_ctor,
            X_full_dataset=self.X_helper.texts.tolist(),
            y_full_dataset=self.y_seed_dict,
            rnd_start_steps=start_steps)
        self.active_learner = learner

        self.active_learner.start()
# Example #2
# 0
def run_experiment(data_path,
                   models_path,
                   ranking_strategy,
                   n_al_iterations,
                   emb_name,
                   max_samples_number,
                   n_seeds_random,
                   percent=True,
                   batch_size=64,
                   batch_size_pred=64,
                   max_epochs=20,
                   learning_rate=0.1,
                   repeat=1):
    """Run one emulated active-learning experiment with a sequence tagger.

    Statistics are dumped to ``statistics{repeat}.json`` inside
    ``models_path/emb_name/ranking_strategy``. If that file already exists
    the function returns immediately without training anything.

    Args:
        data_path: Passed to ``load_task`` to obtain train/test data and the
            tag dictionary.
        models_path: Root directory; the experiment directory is created
            below it.
        ranking_strategy: Name resolved through ``strategies_to_try``.
        n_al_iterations: Forwarded as ``max_iterations`` to
            ``emulate_active_learning``.
        emb_name: Name resolved through ``get_embeddings``.
        max_samples_number: Samples per query; replaced by 2% of the
            unlabelled pool when ``percent`` is true.
        n_seeds_random: Forwarded as ``n_seeds_per_class`` to
            ``y_train2y_seed``.
        percent: When true, derive ``max_samples_number`` from the pool size.
        batch_size: Training mini-batch size.
        batch_size_pred: Evaluation mini-batch size.
        max_epochs: Maximum training epochs per retraining.
        learning_rate: Learning rate handed to the active student.
        repeat: Index used in the statistics file name.
    """
    models_path = os.path.join(models_path, emb_name, ranking_strategy)
    os.makedirs(models_path, exist_ok=True)

    strat = strategies_to_try(ranking_strategy)
    emb = get_embeddings(emb_name)

    stats_name = f'statistics{repeat}.json'
    print(f'Opened statistics{repeat}.json')
    if os.path.exists(os.path.join(models_path, stats_name)):
        print(f'statistics{repeat}.json already exists. Next')
        return

    LOGGER.info(f'Strategy {ranking_strategy} is running')

    X_train, X_test, y_train, y_test, tag_dictionary = load_task(data_path)
    y_seed = y_train2y_seed(y_train, n_seeds_per_class=n_seeds_random)

    if percent:
        # Only the number of labelled seeds matters here. Counting them
        # directly avoids building a NumPy array out of variable-length label
        # sequences, and replaces a per-item rescan of y_seed whose extra
        # branch could never be taken for a labelled item.
        n_labelled = sum(answ is not None for answ in y_seed)
        max_samples_number = int((len(y_seed) - n_labelled) * 0.02)

    tagger = SequenceTagger(hidden_size=128,
                            embeddings=emb(),
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=True)

    active_tagger = SequenceTaggerActiveStudent(
        tagger,
        base_path=models_path,
        reset_model_before_train=True,
        mini_batch_size=batch_size,
        eval_mini_batch_size=batch_size_pred,
        checkpoint=False,
        # Honour the caller's value; it used to be hard-coded to 0.1, which
        # is also the parameter's default.
        learning_rate=learning_rate,
        index_subset=False,
        save_all_models=False,
        save_final_model=False,
        anneal_with_restarts=True,
        max_epochs=max_epochs)

    active_learn_alg_ctor = make_libact_strategy_ctor(
        lambda tr_ds: strat(tr_ds, active_tagger),
        max_samples_number=max_samples_number)

    LOGGER.info('Active learning...')

    active_learner = ActiveLearner(active_learn_alg_ctor=active_learn_alg_ctor,
                                   y_dtype='str',
                                   X_full_dataset=X_train,
                                   y_full_dataset=y_seed,
                                   X_test_dataset=X_test,
                                   y_test_dataset=y_test,
                                   model_evaluate=active_tagger,
                                   eval_metrics=[f1_score],
                                   rnd_start_steps=0,
                                   rnd_start_samples=max_samples_number)

    statistics = emulate_active_learning(y_train,
                                         active_learner,
                                         models_path,
                                         stats_name,
                                         max_iterations=n_al_iterations)
    dump_file(statistics, models_path, stats_name)

    print(f'Experiment {repeat} ended')
# Example #3
# 0
def _select_seed_labels(y_train, config, percent=0.02):
    """Return ``(y_seed, max_samples_number)`` for one repeat.

    In percent mode the seed labels come from ``y_train2y_seed_percent`` and
    the query size is ``percent`` of the dataset; the same ``percent`` value
    is passed to both so they cannot drift apart. Otherwise the seed labels
    come from ``y_train2y_seed`` and the query size is taken from the config.
    """
    if not config.al.percent:
        return y_train2y_seed(y_train), config.al.max_samples_number

    print('FULL:', len(y_train))
    y_seed = y_train2y_seed_percent(y_train, percent)
    # Count labelled items directly instead of boolean-indexing a NumPy array
    # built from variable-length label sequences.
    n_labelled = sum(answ is not None for answ in y_seed)
    print('2PERCENT:', n_labelled)
    return y_seed, int(len(y_seed) * percent)


def _experiment_models_path(config):
    """Return the output directory for the configured model type.

    Raises:
        ValueError: If ``config.model.model_type`` is not a flair, crf or
            bert model.
    """
    model_name = config.model.model_type
    if 'flair' in model_name:
        bayes_type = config.model.bayes_type if config.model.bayes else 'no_bayes'
        return os.path.join(
            config.exp_path,
            f'{model_name}_{config.model.emb_name}_{bayes_type}/{config.al.strat_name}')
    if model_name == 'crf':
        return os.path.join(config.exp_path, model_name)
    if model_name == 'bert':
        bayes_type = config.model.bayes_type if config.model.bayes else 'no_bayes'
        return os.path.join(config.exp_path, f'{model_name}_{bayes_type}')
    raise ValueError(f'Unsupported model type: {model_name!r}')


def _build_active_tagger(config, models_path, tag_dictionary):
    """Create the model wrapper for the configured model type.

    Returns:
        A pair ``(active_tagger, fit_model)`` where ``fit_model`` is the flag
        later forwarded to ``emulate_active_learning``.

    Raises:
        ValueError: If ``config.model.model_type`` is not a flair, crf or
            bert model.
    """
    model_name = config.model.model_type

    if 'flair' in model_name:
        print(model_name)
        print('Embeddings', config.model.emb_name)
        emb = get_embeddings(config.model.emb_name)

        tagger = SequenceTagger(hidden_size=config.model.hidden_size,
                                embeddings=emb(),
                                tag_dictionary=tag_dictionary,
                                tag_type=config.data.task,
                                use_crf=True)
        print(config.model.bayes)
        if config.model.bayes:
            print('BAYES CHOSEN')
            convert_to_mc_dropout(
                tagger,
                (nn.Dropout, flair.nn.WordDropout, flair.nn.LockedDropout),
                option='flair')
            wrapper_cls = LibActFlairBayes
        else:
            wrapper_cls = LibActFlair

        # Both wrappers take exactly the same construction arguments.
        active_tagger = wrapper_cls(tagger,
                                    base_path=models_path,
                                    reset_model_before_train=True,
                                    mini_batch_size=config.model.bs,
                                    eval_mini_batch_size=config.model.ebs,
                                    checkpoint=False,
                                    learning_rate=config.model.lr,
                                    index_subset=False,
                                    save_all_models=False,
                                    max_epochs=config.model.n_epochs,
                                    min_learning_rate=config.model.min_lr)
        if config.model.bayes:
            print(active_tagger)
        return active_tagger, False

    if model_name == 'crf':
        active_tagger = LibActCrf(algorithm="lbfgs",
                                  c1=0.1,
                                  c2=0.1,
                                  max_iterations=100,
                                  all_possible_transitions=True)
        return active_tagger, True

    if model_name == 'bert':
        libactnn = LibActNNBayes if config.model.bayes else LibActNN
        index2tag = ['[PAD]'] + tag_dictionary.get_items()
        tag2index = {tag: idx for idx, tag in enumerate(index2tag)}
        active_tagger = create_libact_adaptor_bert(
            tag2index, index2tag, libactnn, config.model, config.cache_dir)
        return active_tagger, False

    raise ValueError(f'Unsupported model type: {model_name!r}')


def run_experiment(config):
    """Run ``config.n_repeats`` emulated active-learning experiments.

    For every repeat a fresh seed label set and model are created, active
    learning is emulated for ``config.al.n_iterations`` iterations and the
    statistics are dumped to ``statistics{repeat}.json`` in the model's
    output directory. A repeat whose statistics file already exists is
    skipped.

    Args:
        config: Experiment configuration; the ``al``, ``data`` and ``model``
            sections plus ``exp_path``, ``cache_dir`` and ``n_repeats`` are
            read.

    Raises:
        ValueError: If ``config.model.model_type`` is not a flair, crf or
            bert model.
    """
    print('Active learning strategy:', config.al.strat_name)

    print('Loading task...', config.data.task)
    preprocess = (config.model.model_type == 'crf')
    print(config.data.data_folder)
    X_train, X_test, y_train, y_test, tag_dictionary = load_task(
        config.data.data_folder,
        config.data.task,
        config.data.tag_column,
        preprocess)
    print('Done.')

    strat = strategies_to_try(config.al.strat_name)
    # The output directory depends only on the config, so resolve it once;
    # this also rejects an unsupported model type before any training.
    models_path = _experiment_models_path(config)

    for repeat in range(config.n_repeats):
        print(f'######################==Repeat {repeat} ==#####################')

        # Seed labels are selected per repeat. A copy of this step that ran
        # before the loop was dropped because its result was always
        # overwritten here.
        y_seed, max_samples_number = _select_seed_labels(y_train, config)
        print('MAX_SAMPLES:', max_samples_number)

        os.makedirs(models_path, exist_ok=True)
        stats_name = f'statistics{repeat}.json'
        if os.path.exists(os.path.join(models_path, stats_name)):
            print(f'statistics{repeat}.json already exists. Next')
            continue

        active_tagger, fit_model = _build_active_tagger(
            config, models_path, tag_dictionary)

        # Use the query size computed above: in percent mode it differs from
        # config.al.max_samples_number, which was previously passed here
        # unconditionally.
        active_learn_alg_ctor = make_libact_strategy_ctor(
            lambda tr_ds: strat(tr_ds, active_tagger),
            max_samples_number=max_samples_number)

        active_learner = ActiveLearner(active_learn_alg_ctor=active_learn_alg_ctor,
                                       y_dtype='str',
                                       X_full_dataset=X_train,
                                       y_full_dataset=y_seed,
                                       X_test_dataset=X_test,
                                       y_test_dataset=y_test,
                                       model_evaluate=active_tagger,
                                       eval_metrics=[f1_score],
                                       rnd_start_steps=0)

        statistics = emulate_active_learning(y_train, active_learner,
                                             max_iterations=config.al.n_iterations,
                                             fit_model=fit_model)
        dump_file(statistics, models_path, stats_name)