def main(args): task_seq = [ # acquire_method(sub_acquire_method): random(""), no-dete("DAL","BALD"), dete("coreset","entropy",...) # ../../datasets/answer_selection/YahooCQA/data/data-FD/ #evidence, diversity # BiLSTM CNN { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "no-dete", "sub_acquire_method": "DAL", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 0, "sample_method": "No-Deterministic+DALLL2Layer+0", }, { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "no-dete", "sub_acquire_method": "DAL", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 16, "sample_method": "No-Deterministic+DALLL2Layer+16", }, { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "no-dete", "sub_acquire_method": "DAL", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 32, "sample_method": "No-Deterministic+DALLL2Layer+32", }, { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "no-dete", "sub_acquire_method": "DAL", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 64, "sample_method": "No-Deterministic+DALLL2Layer+64", }, { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "no-dete", "sub_acquire_method": "DAL", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 128, "sample_method": "No-Deterministic+DALLL2Layer+128", }, { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "random", "sub_acquire_method": "", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 0, "sample_method": "No-Deterministic+randommm2Layer+0", }, { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "random", "sub_acquire_method": "", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 16, "sample_method": "No-Deterministic+randommm2Layer+16", }, { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "random", "sub_acquire_method": "", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 32, "sample_method": "No-Deterministic+randommm2Layer+32", }, { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "random", "sub_acquire_method": "", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 64, "sample_method": "No-Deterministic+randommm2Layer+64", }, { "model_name": "BiLSTM", "group_name": "[tkde]BiLSTM+Pets+MRR+160+160", "max_performance": 0.80, "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/", "acquire_method": "random", "sub_acquire_method": "", "unsupervised_method": '', "submodular_k": 1.5, "num_acquisitions_round": 50, "init_question_num": 32, "acquire_question_num_per_round": 32, "warm_start_random_seed": 128, "sample_method": "No-Deterministic+randommm2Layer+128", }, ] allMethods_results = [ ] #Record the performance results of each method during active learning for config in task_seq: print("-------------------{}-{}-------------------".format( config["group_name"], config["sample_method"])) ####################################### initial setting ########################################### data_path = config["data_path"] model_name = config[ "model_name"] if "model_name" in config else 'BiLSTM' num_acquisitions_round = config["num_acquisitions_round"] acquire_method = config["acquire_method"] sub_acquire_method = config["sub_acquire_method"] init_question_num = config[ "init_question_num"] if "init_question_num" in config else 160 # number of initial training samples acquire_question_num_per_round = config[ "acquire_question_num_per_round"] if "acquire_question_num_per_round" in config else 20 #Number of samples collected per round warm_start_random_seed = config[ "warm_start_random_seed"] # the random seed for selecting the initial training set sample_method = config["sample_method"] loader = Loader() print('model:', model_name) print('dataset:', data_path) print('acquisition method:', acquire_method, "+", sub_acquire_method) if not os.path.exists(args.result_path): os.makedirs(args.result_path) if not os.path.exists(os.path.join(args.result_path, model_name)): os.makedirs(os.path.join(args.result_path, model_name)) if not os.path.exists( os.path.join(args.result_path, model_name, 'active_checkpoint', acquire_method)): os.makedirs( os.path.join(args.result_path, model_name, 'active_checkpoint', acquire_method)) #### If the data is not compiled, compile; otherwise load directly if (os.path.exists(os.path.join(data_path, 'mappings.pkl')) and os.path.exists(os.path.join(data_path, 'train.pkl')) and os.path.exists(os.path.join(data_path, 'test.pkl'))): mappings = pkl.load( open(os.path.join(data_path, 'mappings.pkl'), 'rb')) train_data = pkl.load( open(os.path.join(data_path, 'train.pkl'), 'rb')) test_data = pkl.load( open(os.path.join(data_path, 'test.pkl'), 'rb')) else: train_data, test_data, mappings = loader.load_yahoo( data_path, args.pretrained_word_embedding, args.word_embedding_dim, args.answer_count) pkl.dump(mappings, open(os.path.join(data_path, 'mappings.pkl'), 'wb')) pkl.dump(train_data, open(os.path.join(data_path, 'train.pkl'), 'wb')) pkl.dump(test_data, open(os.path.join(data_path, 'test.pkl'), 'wb')) #word embedding word_to_id = mappings['word_to_id'] tag_to_id = mappings['tag_to_id'] word_embeds = mappings[ 'word_embeds'] if args.use_pretrained_word_embedding else None word_vocab_size = len(word_to_id) print( ' The total amount of training data:%d\n' % len( train_data ), # Total number of training samples (number of question answer pair) 'The total amount of val data:%d\n' % len(test_data)) acquisition_function = Acquisition(train_data, seed=warm_start_random_seed, answer_count=args.answer_count, cuda_device=args.device[0], batch_size=args.sampling_batch_size, submodular_k=config["submodular_k"]) checkpoint_path = os.path.join(args.result_path, 'active_checkpoint', config["group_name"], sample_method) if not os.path.exists(checkpoint_path): os.makedirs(checkpoint_path) method_result = [ ] # Record the performance results of each method during active learning ####################################### acquire data and retrain ########################################### for i in range(num_acquisitions_round): print("current round:{}".format(i)) #-------------------acquisition--------------------- if i == 0: #first round acq = init_question_num a_m = "random" m_p = "" else: acq = acquire_question_num_per_round a_m = acquire_method m_p = os.path.join(checkpoint_path, 'modelweights') acquisition_function.obtain_data( train_data, model_path=m_p, model_name=model_name, acquire_questions_num=acq, method=a_m, sub_method=sub_acquire_method, unsupervised_method=config["unsupervised_method"]) # -------------------prepare training data--------------------- ''' train_data format: { 'str_words_q': str_words_q, # question word segmentation 'str_words_a': str_words_a, # answer word segmentation 'words_q': words_q, # question word id 'words_a': words_a, # answer word id 'tag': tag, # sample tag id } ''' new_train_index = (acquisition_function.train_index).copy() sorted_train_index = list(new_train_index) sorted_train_index.sort() labeled_train_data = [train_data[i] for i in sorted_train_index] print("Labeled training samples: {}".format( len(acquisition_function.train_index))) # -------------------------------------train-------------------------------------- print('.............Recreate the model...................') if model_name == 'BiLSTM': model = BiLSTM(word_vocab_size, args.word_embedding_dim, args.word_hidden_dim, args.target_size, pretrained=word_embeds, with_sim_features=args.with_sim_feature, cuda_device=args.device[0]) if model_name == 'CNN': model = CNN(word_vocab_size, args.word_embedding_dim, args.word_out_channels, args.target_size, pretrained=word_embeds, cuda_device=args.device[0]) model.cuda(args.device[0]) trainer = Trainer(model, model_name, tag_to_id, answer_count=args.answer_count, cuda_device=args.device[0], sampling_number=args.sampling_number) test_performance = trainer.train_supervisedLearning( args.num_epochs, labeled_train_data, test_data, args.learning_rate, checkpoint_path=checkpoint_path, batch_size=args.batch_size) print('.' * 50) print("Test performance: {}".format(test_performance)) print('*' * 50) #--------------------------Send data for a visual web page------------------------------ max_performance = config[ "max_performance"] if "max_performance" in config else 0 if "group_name" in config: updateLineChart(str(test_performance), sample_method, gp_name=config["group_name"], max=max_performance) else: updateLineChart(str(test_performance), sample_method, max=max_performance) # method_result.append(test_performance) # # print("acquire_method: {},sub_acquire_method: {}, warm_start_random_seed{}" # .format(acquire_method, sub_acquire_method, warm_start_random_seed)) # print(method_result) # allMethods_results.append(method_result) shutil.rmtree(checkpoint_path)
def main(args): task_seq = [ # The config for a task: # acquire_method(sub_acquire_method): random(""), no-dete("DASL","DAL","BALD"), dete("coreset","entropy",...) { "model_name": "BiLSTM", "group_name": "[2.18-?]BiLSTM+FD+MRR+200+200", "max_performance": 0.80, "data_path": "data/YahooCQA/data-FD/", "acquire_method": "no-dete", "sub_acquire_method": "DASL", "num_acquisitions_round": 37, "init_question_num": 40, "acquire_question_num_per_round": 40, "warm_start_random_seed": 16, "sample_method": "No-Deterministic+DASL2+seed16", }, ] allMethods_results = [ ] #Record the performance results of each method during active learning for config in task_seq: print("-----------------------{}-{}-----------------------".format( config["group_name"], config["sample_method"])) ####################################### initial setting ########################################### data_path = config[ "data_path"] if "data_path" in config else "data/YahooCQA/data-FD/" model_name = config["model_name"] if "model_name" in config else 'CNN' num_acquisitions_round = config["num_acquisitions_round"] acquire_method = config["acquire_method"] sub_acquire_method = config["sub_acquire_method"] init_question_num = config[ "init_question_num"] if "init_question_num" in config else 160 # number of initial training samples acquire_question_num_per_round = config[ "acquire_question_num_per_round"] if "acquire_question_num_per_round" in config else 20 #Number of samples collected per round warm_start_random_seed = config[ "warm_start_random_seed"] # the random seed for selecting the initial training set sample_method = config["sample_method"] loader = Loader() print('model:', model_name) print('dataset:', data_path) print('acquisition method:', acquire_method, "+", sub_acquire_method) if not os.path.exists(args.result_path): os.makedirs(args.result_path) if not os.path.exists(os.path.join(args.result_path, model_name)): os.makedirs(os.path.join(args.result_path, model_name)) if not os.path.exists( os.path.join(args.result_path, model_name, 'active_checkpoint', acquire_method)): os.makedirs( os.path.join(args.result_path, model_name, 'active_checkpoint', acquire_method)) #### If the data is not compiled, compile; otherwise load directly if (os.path.exists(os.path.join(data_path, 'mappings.pkl')) and os.path.exists(os.path.join(data_path, 'train.pkl')) and os.path.exists(os.path.join(data_path, 'val.pkl')) and os.path.exists(os.path.join(data_path, 'test.pkl'))): mappings = pkl.load( open(os.path.join(data_path, 'mappings.pkl'), 'rb')) train_data = pkl.load( open(os.path.join(data_path, 'train.pkl'), 'rb')) val_data = pkl.load(open(os.path.join(data_path, 'val.pkl'), 'rb')) test_data = pkl.load( open(os.path.join(data_path, 'test.pkl'), 'rb')) else: train_data, val_data, test_data, mappings = loader.load_yahoo( data_path, args.pretrained_word_embedding, args.word_embedding_dim, args.answer_count) pkl.dump(train_data, open(os.path.join(data_path, 'train.pkl'), 'wb')) pkl.dump(val_data, open(os.path.join(data_path, 'val.pkl'), 'wb')) pkl.dump(test_data, open(os.path.join(data_path, 'test.pkl'), 'wb')) pkl.dump(mappings, open(os.path.join(data_path, 'mappings.pkl'), 'wb')) #word embedding word_to_id = mappings['word_to_id'] tag_to_id = mappings['tag_to_id'] word_embeds = mappings[ 'word_embeds'] if args.use_pretrained_word_embedding else None word_vocab_size = len(word_to_id) total_sentences = len( train_data ) # Total number of training samples (number of question answer pair) print( 'After training data is loaded, the total amount of training data: %d' % total_sentences) acquisition_function = Acquisition(train_data, seed=warm_start_random_seed, answer_count=args.answer_count) method_result = [ ] # Record the performance results of each method during active learning ####################################### acquire data and retrain ########################################### for i in range(num_acquisitions_round): print("current round:{}".format(i)) #-------------------acquisition--------------------- if i == 0: #first round acq = init_question_num a_m = "random" m_p = "" acquisition_function.obtain_data(train_data, model_path="", acquire=init_question_num, method="random") else: acq = acquire_question_num_per_round a_m = acquire_method m_p = os.path.join(checkpoint_path, 'modelweights') acquisition_function.obtain_data(model_path=m_p, model_name=model_name, data=train_data, acquire=acq, method=a_m, sub_method=sub_acquire_method) # -------------------prepare training data--------------------- ''' train_data的每个元素格式: { 'str_words_q': str_words_q, # question word segmentation 'str_words_a': str_words_a, # answer word segmentation 'words_q': words_q, # question word id 'words_a': words_a, # answer word id 'tag': tag, # sample tag id } ''' new_train_index = (acquisition_function.train_index).copy() sorted_train_index = list(new_train_index) sorted_train_index.sort() labeled_train_data = [train_data[i] for i in sorted_train_index] active_train_data = dict() active_train_data['labeled_train_data'] = labeled_train_data active_train_data[ 'pseudo_train_data'] = acquisition_function.pseudo_train_data print("Labeled training samples: {}".format( len(acquisition_function.train_index))) print("Unlabeled sample remaining: {}".format( len(train_data) - len(acquisition_function.train_index))) # -------------------------------------train-------------------------------------- checkpoint_folder = os.path.join('active_checkpoint', acquire_method, "fixed") checkpoint_path = os.path.join(args.result_path, model_name, checkpoint_folder) if not os.path.exists(checkpoint_path): os.makedirs(checkpoint_path) print('.............Recreate the model...................') if model_name == 'BiLSTM': model = BiLSTM(word_vocab_size, args.word_embedding_dim, args.word_hidden_dim, args.target_size, pretrained=word_embeds, with_sim_features=args.with_sim_feature, double_embedding=args.double_embedding) if model_name == 'CNN': model = CNN(word_vocab_size, args.word_embedding_dim, args.word_out_channels, args.target_size, pretrained=word_embeds, double_embedding=args.double_embedding) model.cuda() trainer = Trainer(model, args.result_path, model_name, tag_to_id, answer_count=args.answer_count) if active_train_data['pseudo_train_data']: noActiveTrain = { "acquisition_function": acquisition_function, "model_path": m_p, "model_name": model_name, "train_data": train_data, "acquire": acq, "method": a_m, "sub_method": sub_acquire_method } test_performance = trainer.train_selfPacedLearning( noActiveTrain, args.num_epochs, active_train_data, val_data, test_data, args.mu, args.learning_rate, checkpoint_folder=checkpoint_folder, batch_size=args.batch_size) else: test_performance = trainer.train_supervisedLearning( args.num_epochs, active_train_data, val_data, test_data, args.learning_rate, checkpoint_folder=checkpoint_folder, batch_size=args.batch_size) print('*' * 50) print("Test performance: {}".format(test_performance)) print('-' * 80) #--------------------------Send data for a visual web page------------------------------ max_performance = config[ "max_performance"] if "max_performance" in config else 0 if "group_name" in config: updateLineChart(str(test_performance), sample_method, gp_name=config["group_name"], max=max_performance) else: updateLineChart(str(test_performance), sample_method, max=max_performance)