def create_active_learner(self):
    """Build the active learner for this object and start it.

    Seeds torch from ``config.RANDOM_STATE``, derives a vocabulary from
    ``train_dataset_for_voc``, wires a ``LibActBertCreator`` into an MNLP
    query strategy and stores the started ``ActiveLearner`` on
    ``self.active_learner``.
    """
    torch.manual_seed(self.config.RANDOM_STATE)

    vocabulary = Vocabulary.from_instances(
        self.train_dataset_for_voc.instances)

    model_factory = LibActBertCreator(
        tokenizer_name=self.config.BERT_MODEL_TYPE,
        bert_model_type=self.config.BERT_MODEL_TYPE,
        cache_dir=self.config.CACHE_DIR,
        n_epochs=self.config.N_EPOCHS,
        lr=self.config.LEARNING_RATE,
        bs=self.config.BATCH_SIZE,
        ebs=self.config.PRED_BATCH_SIZE,
        patience=self.config.PATIENCE,
        additional_X=self._additional_X,
        additional_y=self._additional_y,
        vocab=vocabulary,
        reader=self.reader,
        bs_pred=self.config.PRED_BATCH_SIZE,
    )

    def build_strategy(trn_ds):
        # A fresh model is created every time the strategy is instantiated.
        fresh_model = model_factory(
            valid_ratio=self.config.VALIDATION_RATIO,
            retrain_epochs=self.config.N_EPOCHS,
            autofill_similar_objects=True,
            n_upsample_positive=self.config.UPSAMPLE_POSITIVE,
        )
        return StrategyMNLP(dataset=trn_ds, model=fresh_model)

    strategy_ctor = make_libact_strategy_ctor(
        build_strategy,
        max_samples_number=self.config.MAX_SAMPLES_NUMBER,
    )

    # With no seed annotation at all, one random step is requested first;
    # otherwise no random start step is used.
    nothing_labeled = all(entry is None for entry in self.y_seed_dict)
    rnd_start_steps = 1 if nothing_labeled else 0

    # The ActiveLearner object implements the AL logic itself.
    self.active_learner = ActiveLearner(
        active_learn_alg_ctor=strategy_ctor,
        X_full_dataset=self.X_helper.texts.tolist(),
        y_full_dataset=self.y_seed_dict,
        rnd_start_steps=rnd_start_steps,
    )
    self.active_learner.start()
def run_experiment(data_path, models_path, ranking_strategy, n_al_iterations,
                   emb_name, max_samples_number, n_seeds_random, percent=True,
                   batch_size=64, batch_size_pred=64, max_epochs=20,
                   learning_rate=0.1, repeat=1):
    """Run one emulated active-learning experiment with a sequence tagger.

    Results are written to ``statistics{repeat}.json`` inside
    ``models_path/emb_name/ranking_strategy``. If that file already exists
    the function returns without doing any training.

    Args:
        data_path: Location handed to ``load_task``.
        models_path: Base output directory; the embedding name and the
            strategy name are appended to it.
        ranking_strategy: Strategy name resolved via ``strategies_to_try``.
        n_al_iterations: Passed as ``max_iterations`` to
            ``emulate_active_learning``.
        emb_name: Embedding name resolved via ``get_embeddings``.
        max_samples_number: Number of samples per query. Ignored when
            ``percent`` is true (see below).
        n_seeds_random: Passed as ``n_seeds_per_class`` to
            ``y_train2y_seed``.
        percent: When true, ``max_samples_number`` is replaced by 2% of the
            number of seed entries that are still ``None``.
        batch_size: Training mini-batch size.
        batch_size_pred: Evaluation mini-batch size.
        max_epochs: Maximum number of training epochs.
        learning_rate: Learning rate forwarded to the tagger trainer.
        repeat: Index used in the name of the statistics file.

    Returns:
        None.
    """
    models_path = os.path.join(models_path, emb_name, ranking_strategy)
    os.makedirs(models_path, exist_ok=True)
    strat = strategies_to_try(ranking_strategy)
    emb = get_embeddings(emb_name)

    stats_file = f'statistics{repeat}.json'
    print(f'Opened statistics{repeat}.json')
    if os.path.exists(os.path.join(models_path, stats_file)):
        print(f'statistics{repeat}.json already exists. Next')
        return

    LOGGER.info(f'Strategy {ranking_strategy} is running')
    X_train, X_test, y_train, y_test, tag_dictionary = load_task(data_path)
    y_seed = y_train2y_seed(y_train, n_seeds_per_class=n_seeds_random)

    if percent:
        # Count labeled seeds directly instead of masking a NumPy array:
        # only the count is needed, and building an array from a mix of
        # None and variable-length tag lists is fragile.
        # NOTE(review): the previous code also tested
        # `all(e is None for e in y_seed)` per labeled entry, which can never
        # be true there; possibly `answ` was meant - confirm.
        n_labeled = sum(answ is not None for answ in y_seed)
        max_samples_number = int((len(y_seed) - n_labeled) * 0.02)

    tagger = SequenceTagger(hidden_size=128,
                            embeddings=emb(),
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=True)

    active_tagger = SequenceTaggerActiveStudent(
        tagger,
        base_path=models_path,
        reset_model_before_train=True,
        mini_batch_size=batch_size,
        eval_mini_batch_size=batch_size_pred,
        checkpoint=False,
        # Honour the caller's value; the default (0.1) matches the constant
        # that used to be hard-coded here.
        learning_rate=learning_rate,
        index_subset=False,
        save_all_models=False,
        save_final_model=False,
        anneal_with_restarts=True,
        max_epochs=max_epochs)

    active_learn_alg_ctor = make_libact_strategy_ctor(
        lambda tr_ds: strat(tr_ds, active_tagger),
        max_samples_number=max_samples_number)

    LOGGER.info('Active learning...')
    active_learner = ActiveLearner(active_learn_alg_ctor=active_learn_alg_ctor,
                                   y_dtype='str',
                                   X_full_dataset=X_train,
                                   y_full_dataset=y_seed,
                                   X_test_dataset=X_test,
                                   y_test_dataset=y_test,
                                   model_evaluate=active_tagger,
                                   eval_metrics=[f1_score],
                                   rnd_start_steps=0,
                                   rnd_start_samples=max_samples_number)

    statistics = emulate_active_learning(y_train, active_learner, models_path,
                                         stats_file,
                                         max_iterations=n_al_iterations)
    dump_file(statistics, models_path, stats_file)
    print(f'Experiment {repeat} ended')
def run_experiment(config):
    """Run ``config.n_repeats`` emulated active-learning experiments.

    For every repeat a seed labeling is drawn, a tagger is built according
    to ``config.model.model_type`` (a type containing ``'flair'``, ``'crf'``
    or ``'bert'``), active learning is emulated for up to
    ``config.al.n_iterations`` iterations and the statistics are dumped to
    ``statistics{repeat}.json`` in a model-specific directory below
    ``config.exp_path``. Repeats whose statistics file already exists are
    skipped.

    When ``config.al.percent`` is set, the per-query sample budget is 2% of
    the seed list length and is passed to the query strategy; otherwise
    ``config.al.max_samples_number`` is used.

    Args:
        config: Experiment configuration exposing the ``al``, ``data`` and
            ``model`` sections plus ``exp_path``, ``cache_dir`` and
            ``n_repeats``.

    Raises:
        ValueError: If ``config.model.model_type`` is not supported.
    """
    print('Active learning strategy:', config.al.strat_name)
    print('Loading task...', config.data.task)
    preprocess = (config.model.model_type == 'crf')
    print(config.data.data_folder)
    X_train, X_test, y_train, y_test, tag_dictionary = load_task(
        config.data.data_folder, config.data.task, config.data.tag_column,
        preprocess)
    print('Done.')

    model_name = config.model.model_type

    for repeat in range(config.n_repeats):
        print(f'######################==Repeat {repeat} ==#####################')
        strat = strategies_to_try(config.al.strat_name)

        # NOTE(review): seeds are drawn once per repeat only. If the seeding
        # helpers consume a global RNG, confirm that no experiment relies on
        # an additional throw-away draw before the loop.
        y_seed, max_samples_number = _seed_labels_for_repeat(config, y_train)
        print('MAX_SAMPLES:', max_samples_number)

        if 'flair' in config.model.model_type:
            print(config.model.model_type)
            bayes_type = (config.model.bayes_type
                          if config.model.bayes else 'no_bayes')
            models_path = os.path.join(
                config.exp_path,
                f'{model_name}_{config.model.emb_name}_{bayes_type}/{config.al.strat_name}')
            if _ensure_dir_and_check_done(models_path, repeat):
                continue

            print('Embeddings', config.model.emb_name)
            emb = get_embeddings(config.model.emb_name)
            tagger = SequenceTagger(hidden_size=config.model.hidden_size,
                                    embeddings=emb(),
                                    tag_dictionary=tag_dictionary,
                                    tag_type=config.data.task,
                                    use_crf=True)

            # Both flair wrappers take exactly the same arguments.
            wrapper_kwargs = dict(
                base_path=models_path,
                reset_model_before_train=True,
                mini_batch_size=config.model.bs,
                eval_mini_batch_size=config.model.ebs,
                checkpoint=False,
                learning_rate=config.model.lr,
                index_subset=False,
                save_all_models=False,
                max_epochs=config.model.n_epochs,
                min_learning_rate=config.model.min_lr)

            print(config.model.bayes)
            if config.model.bayes:
                print('BAYES CHOSEN')
                convert_to_mc_dropout(
                    tagger,
                    (nn.Dropout, flair.nn.WordDropout, flair.nn.LockedDropout),
                    option='flair')
                active_tagger = LibActFlairBayes(tagger, **wrapper_kwargs)
                print(active_tagger)
            else:
                active_tagger = LibActFlair(tagger, **wrapper_kwargs)
            fit_model = False

        elif config.model.model_type == 'crf':
            models_path = os.path.join(config.exp_path, model_name)
            if _ensure_dir_and_check_done(models_path, repeat):
                continue

            active_tagger = LibActCrf(algorithm="lbfgs",
                                      c1=0.1,
                                      c2=0.1,
                                      max_iterations=100,
                                      all_possible_transitions=True)
            fit_model = True

        elif config.model.model_type == 'bert':
            if config.model.bayes:
                libactnn = LibActNNBayes
                bayes_type = config.model.bayes_type
            else:
                libactnn = LibActNN
                bayes_type = 'no_bayes'

            models_path = os.path.join(config.exp_path,
                                       f'{model_name}_{bayes_type}')
            if _ensure_dir_and_check_done(models_path, repeat):
                continue

            index2tag = ['[PAD]'] + tag_dictionary.get_items()
            tag2index = {e: i for i, e in enumerate(index2tag)}
            active_tagger = create_libact_adaptor_bert(
                tag2index, index2tag, libactnn, config.model, config.cache_dir)
            fit_model = False

        else:
            # Fail early with a clear message instead of hitting an unbound
            # (or stale, from a previous repeat) `active_tagger` further down.
            raise ValueError(
                f'Unsupported model type: {config.model.model_type!r}')

        # Use the budget computed for this repeat (the value printed as
        # MAX_SAMPLES above), so the 2% setting actually takes effect.
        active_learn_alg_ctor = make_libact_strategy_ctor(
            lambda tr_ds: strat(tr_ds, active_tagger),
            max_samples_number=max_samples_number)

        active_learner = ActiveLearner(
            active_learn_alg_ctor=active_learn_alg_ctor,
            y_dtype='str',
            X_full_dataset=X_train,
            y_full_dataset=y_seed,
            X_test_dataset=X_test,
            y_test_dataset=y_test,
            model_evaluate=active_tagger,
            eval_metrics=[f1_score],
            rnd_start_steps=0)

        statistics = emulate_active_learning(
            y_train, active_learner,
            max_iterations=config.al.n_iterations,
            fit_model=fit_model)
        dump_file(statistics, models_path, f'statistics{repeat}.json')


def _seed_labels_for_repeat(config, y_train):
    """Return ``(y_seed, max_samples_number)`` for a single repeat."""
    if not config.al.percent:
        return y_train2y_seed(y_train), config.al.max_samples_number

    print('FULL:', len(y_train))
    y_seed = y_train2y_seed_percent(y_train)
    # Plain count of labeled entries; no NumPy round-trip is needed just to
    # report how many seeds carry a label.
    n_labeled = sum(answ is not None for answ in y_seed)
    print('2PERCENT:', n_labeled)
    # NOTE(review): 0.02 presumably mirrors the default fraction used by
    # y_train2y_seed_percent - confirm.
    return y_seed, int(len(y_seed) * 0.02)


def _ensure_dir_and_check_done(models_path, repeat):
    """Create ``models_path`` and report whether this repeat already ran."""
    os.makedirs(models_path, exist_ok=True)
    if os.path.exists(os.path.join(models_path, f'statistics{repeat}.json')):
        print(f'statistics{repeat}.json already exists. Next')
        return True
    return False