def __init__(self, word_vocab_size, word_embedding_dim, word_out_channels,
             output_size, dropout_p=0.5, pretrained=None, cuda_device=0):
    super(CNN, self).__init__()

    self.cuda_device = cuda_device
    self.word_vocab_size = word_vocab_size
    self.word_embedding_dim = word_embedding_dim
    self.word_out_channels = word_out_channels
    self.initializer = Initializer()
    self.loader = Loader()

    self.embedding = nn.Embedding(word_vocab_size, word_embedding_dim)
    if pretrained is not None:
        self.embedding.weight = nn.Parameter(torch.FloatTensor(pretrained))

    # Q_CNN: encodes the question into a word_out_channels-dim vector
    self.question_encoder = EncoderCNN(word_vocab_size, word_embedding_dim,
                                       word_out_channels)
    # A_CNN: encodes the answer into a word_out_channels-dim vector
    self.answer_encoder = EncoderCNN(word_vocab_size, word_embedding_dim,
                                     word_out_channels)

    # Bilinear interaction matrix for the question/answer similarity feature
    self.interaction = nn.Parameter(
        torch.FloatTensor(word_out_channels, word_out_channels).uniform_(0, .1))

    self.dropout = nn.Dropout(p=dropout_p)

    # [question; similarity; answer]: two encodings plus one scalar feature
    hidden_size = word_out_channels * 2 + 1
    self.linear = nn.Linear(hidden_size, hidden_size // 2)
    self.linear2 = nn.Linear(hidden_size // 2, output_size)
    self.Tanh = nn.Tanh()
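# The class's real forward() is not part of this section. The sketch below is a
# hypothetical reconstruction consistent with the dimensions above: it assumes
# the encoders consume padded word-id tensors directly, and that the similarity
# feature is the bilinear form q^T M a implied by self.interaction and the
# "+ 1" in hidden_size. Argument names words_q/words_a are assumptions.
def forward(self, words_q, words_a):
    q = self.question_encoder(words_q)   # (batch, word_out_channels)
    a = self.answer_encoder(words_a)     # (batch, word_out_channels)
    # scalar bilinear similarity q^T M a per pair, shape (batch, 1)
    sim = (torch.mm(q, self.interaction) * a).sum(dim=1, keepdim=True)
    features = self.dropout(torch.cat([q, sim, a], dim=1))  # (batch, 2C + 1)
    hidden = self.Tanh(self.linear(features))
    return self.linear2(hidden)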
def __init__(self, word_vocab_size, word_embedding_dim, word_hidden_dim,
             output_size, pretrained=None, n_layers=1, bidirectional=True,
             dropout_p=0.5, with_sim_features=True, cuda_device=0):
    super(BiLSTM, self).__init__()

    self.cuda_device = cuda_device
    self.word_vocab_size = word_vocab_size
    self.word_embedding_dim = word_embedding_dim
    self.word_hidden_dim = word_hidden_dim
    self.initializer = Initializer()
    self.loader = Loader()
    self.with_sim_features = with_sim_features

    self.embedding = nn.Embedding(word_vocab_size, word_embedding_dim)
    if pretrained is not None:
        self.embedding.weight = nn.Parameter(torch.FloatTensor(pretrained))

    # Q_LSTM: encodes the question
    self.question_encoder = EncoderRNN(word_vocab_size, word_embedding_dim,
                                       word_hidden_dim, n_layers=n_layers,
                                       bidirectional=bidirectional,
                                       cuda_device=self.cuda_device)
    # A_LSTM: encodes the answer
    self.answer_encoder = EncoderRNN(word_vocab_size, word_embedding_dim,
                                     word_hidden_dim, n_layers=n_layers,
                                     bidirectional=bidirectional,
                                     cuda_device=self.cuda_device)

    # Each encoder emits n_layers * word_hidden_dim features (doubled when
    # bidirectional); the classifier sees the question and answer encodings
    # concatenated, hence the leading factor of 2.
    hidden_size = 2 * (2 * n_layers * word_hidden_dim
                       if bidirectional else n_layers * word_hidden_dim)

    if self.with_sim_features:
        word_out_dim = (2 * n_layers * word_hidden_dim
                        if bidirectional else n_layers * word_hidden_dim)
        # Bilinear interaction matrix for the question/answer similarity feature
        self.interaction = nn.Parameter(
            torch.FloatTensor(word_out_dim, word_out_dim).uniform_(0, .1))
        hidden_size += 1  # one extra slot for the scalar similarity

    self.dropout = nn.Dropout(p=dropout_p)
    self.linear = nn.Linear(hidden_size, hidden_size // 2)
    self.linear2 = nn.Linear(hidden_size // 2, output_size)
    self.Tanh = nn.Tanh()
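# Quick dimension check for the classifier input width. The concrete value
# word_hidden_dim=160 is an assumption inferred from the
# "[tkde]BiLSTM+Pets+MRR+160+160" group names used in the tasks below.
n_layers, word_hidden_dim, bidirectional = 1, 160, True
per_tower = 2 * n_layers * word_hidden_dim if bidirectional else n_layers * word_hidden_dim
hidden_size = 2 * per_tower + 1  # question tower + answer tower + similarity scalar
assert (per_tower, hidden_size) == (320, 641)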
def main(args):
    # acquire_method(sub_acquire_method): random(""), no-dete("DAL","BALD"), dete("coreset","entropy",...)
    # ../../datasets/answer_selection/YahooCQA/data/data-FD/
    # evidence, diversity
    # BiLSTM CNN
    base_config = {
        "model_name": "BiLSTM",
        "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
        "max_performance": 0.80,
        "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
        "unsupervised_method": '',
        "submodular_k": 1.5,
        "num_acquisitions_round": 50,
        "init_question_num": 32,
        "acquire_question_num_per_round": 32,
    }
    # One DAL run and one random-baseline run per warm-start seed.
    task_seq = [
        dict(base_config,
             acquire_method="no-dete", sub_acquire_method="DAL",
             warm_start_random_seed=seed,
             sample_method="No-Deterministic+DALLL2Layer+{}".format(seed))
        for seed in (0, 16, 32, 64, 128)
    ] + [
        dict(base_config,
             acquire_method="random", sub_acquire_method="",
             warm_start_random_seed=seed,
             sample_method="No-Deterministic+randommm2Layer+{}".format(seed))
        for seed in (0, 16, 32, 64, 128)
    ]

    allMethods_results = []  # performance results of each method during active learning

    for config in task_seq:
        print("-------------------{}-{}-------------------".format(
            config["group_name"], config["sample_method"]))

        # ----------------------------- initial setting -----------------------------
        data_path = config["data_path"]
        model_name = config["model_name"] if "model_name" in config else 'BiLSTM'
        num_acquisitions_round = config["num_acquisitions_round"]
        acquire_method = config["acquire_method"]
        sub_acquire_method = config["sub_acquire_method"]
        # number of initial training samples
        init_question_num = config["init_question_num"] if "init_question_num" in config else 160
        # number of samples collected per round
        acquire_question_num_per_round = (config["acquire_question_num_per_round"]
                                          if "acquire_question_num_per_round" in config else 20)
        # random seed for selecting the initial training set
        warm_start_random_seed = config["warm_start_random_seed"]
        sample_method = config["sample_method"]

        loader = Loader()

        print('model:', model_name)
        print('dataset:', data_path)
        print('acquisition method:', acquire_method, "+", sub_acquire_method)

        if not os.path.exists(args.result_path):
            os.makedirs(args.result_path)
        if not os.path.exists(os.path.join(args.result_path, model_name)):
            os.makedirs(os.path.join(args.result_path, model_name))
        if not os.path.exists(os.path.join(args.result_path, model_name,
                                           'active_checkpoint', acquire_method)):
            os.makedirs(os.path.join(args.result_path, model_name,
                                     'active_checkpoint', acquire_method))

        # If the data has not been preprocessed yet, build it; otherwise load it directly.
        if (os.path.exists(os.path.join(data_path, 'mappings.pkl'))
                and os.path.exists(os.path.join(data_path, 'train.pkl'))
                and os.path.exists(os.path.join(data_path, 'test.pkl'))):
            mappings = pkl.load(open(os.path.join(data_path, 'mappings.pkl'), 'rb'))
            train_data = pkl.load(open(os.path.join(data_path, 'train.pkl'), 'rb'))
            test_data = pkl.load(open(os.path.join(data_path, 'test.pkl'), 'rb'))
        else:
            train_data, test_data, mappings = loader.load_yahoo(
                data_path, args.pretrained_word_embedding,
                args.word_embedding_dim, args.answer_count)
            pkl.dump(mappings, open(os.path.join(data_path, 'mappings.pkl'), 'wb'))
            pkl.dump(train_data, open(os.path.join(data_path, 'train.pkl'), 'wb'))
            pkl.dump(test_data, open(os.path.join(data_path, 'test.pkl'), 'wb'))

        # word embedding
        word_to_id = mappings['word_to_id']
        tag_to_id = mappings['tag_to_id']
        word_embeds = mappings['word_embeds'] if args.use_pretrained_word_embedding else None

        word_vocab_size = len(word_to_id)

        # total number of training samples (question-answer pairs)
        print(' The total amount of training data:%d\n' % len(train_data),
              'The total amount of val data:%d\n' % len(test_data))

        acquisition_function = Acquisition(train_data,
                                           seed=warm_start_random_seed,
                                           answer_count=args.answer_count,
                                           cuda_device=args.device[0],
                                           batch_size=args.sampling_batch_size,
                                           submodular_k=config["submodular_k"])

        checkpoint_path = os.path.join(args.result_path, 'active_checkpoint',
                                       config["group_name"], sample_method)
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)

        method_result = []  # performance results of this method during active learning

        # ----------------------------- acquire data and retrain -----------------------------
        for i in range(num_acquisitions_round):
            print("current round:{}".format(i))

            # ------------------- acquisition ---------------------
            if i == 0:  # first round: seed the labeled pool at random
                acq = init_question_num
                a_m = "random"
                m_p = ""
            else:
                acq = acquire_question_num_per_round
                a_m = acquire_method
                m_p = os.path.join(checkpoint_path, 'modelweights')

            acquisition_function.obtain_data(train_data,
                                             model_path=m_p,
                                             model_name=model_name,
                                             acquire_questions_num=acq,
                                             method=a_m,
                                             sub_method=sub_acquire_method,
                                             unsupervised_method=config["unsupervised_method"])

            # ------------------- prepare training data ---------------------
            '''
            train_data format:
            {
                'str_words_q': str_words_q,  # question tokens
                'str_words_a': str_words_a,  # answer tokens
                'words_q': words_q,          # question word ids
                'words_a': words_a,          # answer word ids
                'tag': tag,                  # sample tag id
            }
            '''
            new_train_index = (acquisition_function.train_index).copy()
            sorted_train_index = list(new_train_index)
            sorted_train_index.sort()
            labeled_train_data = [train_data[i] for i in sorted_train_index]

            print("Labeled training samples: {}".format(len(acquisition_function.train_index)))

            # ------------------------------------- train --------------------------------------
            print('.............Recreate the model...................')
            if model_name == 'BiLSTM':
                model = BiLSTM(word_vocab_size,
                               args.word_embedding_dim,
                               args.word_hidden_dim,
                               args.target_size,
                               pretrained=word_embeds,
                               with_sim_features=args.with_sim_feature,
                               cuda_device=args.device[0])
            if model_name == 'CNN':
                model = CNN(word_vocab_size,
                            args.word_embedding_dim,
                            args.word_out_channels,
                            args.target_size,
                            pretrained=word_embeds,
                            cuda_device=args.device[0])
            model.cuda(args.device[0])

            trainer = Trainer(model, model_name, tag_to_id,
                              answer_count=args.answer_count,
                              cuda_device=args.device[0],
                              sampling_number=args.sampling_number)
            test_performance = trainer.train_supervisedLearning(args.num_epochs,
                                                                labeled_train_data,
                                                                test_data,
                                                                args.learning_rate,
                                                                checkpoint_path=checkpoint_path,
                                                                batch_size=args.batch_size)
            print('.' * 50)
            print("Test performance: {}".format(test_performance))
            print('*' * 50)

            # ------------------- send data to the visualization web page -------------------
            max_performance = config["max_performance"] if "max_performance" in config else 0
            if "group_name" in config:
                updateLineChart(str(test_performance), sample_method,
                                gp_name=config["group_name"], max=max_performance)
            else:
                updateLineChart(str(test_performance), sample_method, max=max_performance)

            # method_result.append(test_performance)

        # print("acquire_method: {}, sub_acquire_method: {}, warm_start_random_seed: {}"
        #       .format(acquire_method, sub_acquire_method, warm_start_random_seed))
        # print(method_result)
        # allMethods_results.append(method_result)
        shutil.rmtree(checkpoint_path)
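# Illustrative entry point only. The repository's real argument parser is not
# shown in this section, so every flag name below mirrors an `args.*` attribute
# the drivers above read, and every default value is a guess, not repo code.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path', default='result/')
    parser.add_argument('--device', type=int, nargs='+', default=[0])
    parser.add_argument('--word_embedding_dim', type=int, default=300)
    parser.add_argument('--word_hidden_dim', type=int, default=160)
    parser.add_argument('--word_out_channels', type=int, default=200)
    parser.add_argument('--target_size', type=int, default=2)
    parser.add_argument('--answer_count', type=int, default=5)
    parser.add_argument('--num_epochs', type=int, default=20)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--sampling_batch_size', type=int, default=64)
    parser.add_argument('--sampling_number', type=int, default=20)
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    parser.add_argument('--use_pretrained_word_embedding', action='store_true')
    parser.add_argument('--pretrained_word_embedding', default='')
    parser.add_argument('--with_sim_feature', action='store_true')
    main(parser.parse_args())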
def main(args):
    task_seq = [
        # The config for a task:
        # acquire_method(sub_acquire_method): random(""), no-dete("DAL","BALD"), dete("coreset","entropy",...)
        # "../../datasets/answer_selection/YahooCQA/data/data-FD/"
        # {"model_name": "CNN", "group_name": "[mlabs]KIM+DAL+1e4trn", "max_performance": 0.90,
        #  "data_path": "../../datasets/rcv2/", "acquire_method": "random", "sub_acquire_method": "",
        #  "unsupervised_method": 'submodular', "submodular_k": 2, "num_acquisitions_round": 20,
        #  "init_question_num": 400, "acquire_question_num_per_round": 400,
        #  "warm_start_random_seed": 0, "sample_method": "Random+400*20-800b-0"},
        # {"model_name": "CNN", "group_name": "[mlabs]KIM+DAL+1e4trn", "max_performance": 0.90,
        #  "data_path": "../../datasets/rcv2/", "acquire_method": "random", "sub_acquire_method": "",
        #  "unsupervised_method": 'submodular', "submodular_k": 2, "num_acquisitions_round": 20,
        #  "init_question_num": 400, "acquire_question_num_per_round": 400,
        #  "warm_start_random_seed": 16, "sample_method": "Random+400*20-800b-16"},
        # {"model_name": "CNN", "group_name": "[mlabs]KIM+DAL+1e4trn", "max_performance": 0.90,
        #  "data_path": "../../datasets/rcv2/", "acquire_method": "random", "sub_acquire_method": "",
        #  "unsupervised_method": 'submodular', "submodular_k": 2, "num_acquisitions_round": 20,
        #  "init_question_num": 400, "acquire_question_num_per_round": 400,
        #  "warm_start_random_seed": 32, "sample_method": "Random+400*20-800b-32"},
        # {"model_name": "CNN", "group_name": "[mlabs]KIM+DAL+1e4trn", "max_performance": 0.90,
        #  "data_path": "../../datasets/rcv2/", "acquire_method": "random", "sub_acquire_method": "",
        #  "unsupervised_method": 'submodular', "submodular_k": 2, "num_acquisitions_round": 20,
        #  "init_question_num": 400, "acquire_question_num_per_round": 400,
        #  "warm_start_random_seed": 64, "sample_method": "Random+400*20-800b-64"},
        #
        # {"model_name": "CNN", "group_name": "[mlabs]KIM+DAL+1e4trn", "max_performance": 0.90,
        #  "data_path": "../../datasets/rcv2/", "acquire_method": "no-dete", "sub_acquire_method": "DAL",
        #  "unsupervised_method": 'submodular', "submodular_k": 2, "num_acquisitions_round": 20,
        #  "init_question_num": 400, "acquire_question_num_per_round": 400,
        #  "warm_start_random_seed": 0, "sample_method": "No-Deterministic+DAL-400*20-800b+0"},
        # {"model_name": "CNN", "group_name": "[mlabs]KIM+DAL+1e4trn", "max_performance": 0.90,
        #  "data_path": "../../datasets/rcv2/", "acquire_method": "no-dete", "sub_acquire_method": "DAL",
        #  "unsupervised_method": 'submodular', "submodular_k": 2, "num_acquisitions_round": 20,
        #  "init_question_num": 400, "acquire_question_num_per_round": 400,
        #  "warm_start_random_seed": 16, "sample_method": "No-Deterministic+DAL-400*20-800b+16"},
        {
            "model_name": "CNN",
            "group_name": "[mlabs]KIM+DAL+1e4trn",
            "max_performance": 0.90,
            "data_path": "../../datasets/rcv2/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "BEL",
            "unsupervised_method": 'submodular',
            "submodular_k": 2,
            "num_acquisitions_round": 20,
            "init_question_num": 400,
            "acquire_question_num_per_round": 400,
            "warm_start_random_seed": 0,
            "sample_method": "No-Deterministic+BEL-400*20-800b+0",
        },
        {
            "model_name": "CNN",
            "group_name": "[mlabs]KIM+DAL+1e4trn",
            "max_performance": 0.90,
            "data_path": "../../datasets/rcv2/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "BEL",
            "unsupervised_method": 'submodular',
            "submodular_k": 2,
            "num_acquisitions_round": 20,
            "init_question_num": 400,
            "acquire_question_num_per_round": 400,
            "warm_start_random_seed": 64,
            "sample_method": "No-Deterministic+BEL-400*20-800b+64",
        },
    ]

    allMethods_results = []  # performance results of each method during active learning

    for config in task_seq:
        print("-------------------{}-{}-------------------".format(
            config["group_name"], config["sample_method"]))

        # ----------------------------- initial setting -----------------------------
        data_path = config["data_path"]
        model_name = config["model_name"] if "model_name" in config else 'CNN'
        num_acquisitions_round = config["num_acquisitions_round"]
        acquire_method = config["acquire_method"]
        sub_acquire_method = config["sub_acquire_method"]
        # number of initial training samples
        init_question_num = config["init_question_num"] if "init_question_num" in config else 800
        # number of samples collected per round
        acquire_question_num_per_round = (config["acquire_question_num_per_round"]
                                          if "acquire_question_num_per_round" in config else 100)
        # random seed for selecting the initial training set
        warm_start_random_seed = config["warm_start_random_seed"]
        sample_method = config["sample_method"]
        # visual_data_path = os.path.join("result", sample_method + ".txt")

        loader = Loader()

        print('model:', model_name)
        print('dataset:', data_path)
        print('acquisition method:', acquire_method, "+", sub_acquire_method)

        if not os.path.exists(args.result_path):
            os.makedirs(args.result_path)
        if not os.path.exists(os.path.join(args.result_path, model_name)):
            os.makedirs(os.path.join(args.result_path, model_name))
        if not os.path.exists(os.path.join(args.result_path, model_name,
                                           'active_checkpoint', acquire_method)):
            os.makedirs(os.path.join(args.result_path, model_name,
                                     'active_checkpoint', acquire_method))

        data = loader.load_rcv2(data_path)
        train_data = data['train_points']
        val_data = data['test_points']
        train_data = train_data[:10000]
        val_data = val_data[:2000]

        # word embedding
        word_embeds = data['embed'] if args.use_pretrained_word_embedding else None
        word_vocab_size = len(data['vocab'][1])

        # the val split doubles as the test set here
        print(' The total amount of training data:%d\n' % len(train_data),
              'The total amount of val data:%d\n' % len(val_data),
              'The total amount of test data:%d' % len(val_data))

        acquisition_function = Acquisition(train_data,
                                           seed=warm_start_random_seed,
                                           cuda_device=args.device[0],
                                           batch_size=args.sampling_batch_size,
                                           submodular_k=config["submodular_k"])

        checkpoint_path = os.path.join(args.result_path, 'active_checkpoint',
                                       config["group_name"], sample_method)
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)

        # with open(visual_data_path, 'a') as f:
        #     print(config["group_name"], sample_method, num_acquisitions_round, sep='\t', file=f)

        method_result = []  # performance results of this method during active learning

        # ----------------------------- acquire data and retrain -----------------------------
        for i in range(num_acquisitions_round):
            print("current round:{}".format(i))

            # ------------------- acquisition ---------------------
            if i == 0:  # first round: seed the labeled pool at random
                acq = init_question_num
                a_m = "random"
                m_p = ""
            else:
                acq = acquire_question_num_per_round
                a_m = acquire_method
                m_p = os.path.join(checkpoint_path, 'modelweights')

            acquisition_function.obtain_data(train_data,
                                             model_path=m_p,
                                             model_name=model_name,
                                             acquire_num=acq,
                                             method=a_m,
                                             sub_method=sub_acquire_method,
                                             unsupervised_method=config["unsupervised_method"],
                                             round=i)

            # ------------------- prepare training data ---------------------
            '''
            train_data format:
            {
                'str_words_q': str_words_q,  # question tokens
                'str_words_a': str_words_a,  # answer tokens
                'words_q': words_q,          # question word ids
                'words_a': words_a,          # answer word ids
                'tag': tag,                  # sample tag id
            }
            '''
            new_train_index = (acquisition_function.train_index).copy()
            sorted_train_index = list(new_train_index)
            sorted_train_index.sort()
            labeled_train_data = [train_data[i] for i in sorted_train_index]

            print("Labeled training samples: {}".format(len(acquisition_function.train_index)))

            # ------------------------------------- train --------------------------------------
            print('.............Recreate the model...................')
            if model_name == 'BiLSTM':
                model = BiLSTM(word_vocab_size,
                               args.word_embedding_dim,
                               args.word_hidden_dim,
                               args.target_size,
                               pretrained=word_embeds,
                               with_sim_features=args.with_sim_feature,
                               cuda_device=args.device[0])
            if model_name == 'CNN':
                model = CNN(word_vocab_size,
                            args.word_embedding_dim,
                            args.word_out_channels,
                            args.target_size,
                            pretrained=word_embeds,
                            cuda_device=args.device[0])
            model.cuda(args.device[0])

            trainer = Trainer(model, args.result_path, model_name,
                              eval_begin=1,
                              cuda_device=args.device[0],
                              top_k=args.top_k)
            test_performance = trainer.train_supervisedLearning(args.num_epochs,
                                                                labeled_train_data,
                                                                val_data,
                                                                args.learning_rate,
                                                                checkpoint_path=checkpoint_path,
                                                                batch_size=args.batch_size)
            print('.' * 50)
            print("Test performance: {}".format(test_performance))
            print('*' * 50)

            # ------------------- send data to the visualization web page -------------------
            max_performance = config["max_performance"] if "max_performance" in config else 0
            if "group_name" in config:
                updateLineChart(str(test_performance), sample_method,
                                gp_name=config["group_name"], max=max_performance)
            else:
                updateLineChart(str(test_performance), sample_method, max=max_performance)

            method_result.append(test_performance)

            # if not os.path.exists(visual_data_path):
            #     # deleted by zip.sh, so recreate the file and write the header again
            #     with open(visual_data_path, 'a') as f:
            #         print(config["group_name"], sample_method, num_acquisitions_round, sep='\t', file=f)
            # with open(visual_data_path, 'a') as f:
            #     print("acq round {} : \t {}".format(i, test_performance), file=f)

        print("acquire_method: {}, sub_acquire_method: {}, warm_start_random_seed: {}"
              .format(acquire_method, sub_acquire_method, warm_start_random_seed))
        print(method_result)

        # with open(visual_data_path, 'a') as f:
        #     print("acquire_method: {}, sub_acquire_method: {}, warm_start_random_seed: {}"
        #           .format(acquire_method, sub_acquire_method, warm_start_random_seed), file=f)
        #     print(method_result, file=f)
        #     print('', file=f)

        allMethods_results.append(method_result)
        shutil.rmtree(checkpoint_path)

        # Persist the per-round acquisition details for offline analysis.
        with open(config["group_name"] + sample_method.split('+')[1].split('-')[0] + "_detail.pkl", 'wb') as f:
            pkl.dump(acquisition_function.savedData, f)
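# Sketch: reading one of the "<group><method>_detail.pkl" dumps back for offline
# analysis. For the BEL/seed-0 task above the name resolves to
# "[mlabs]KIM+DAL+1e4trnBEL_detail.pkl". The internal structure of
# Acquisition.savedData is not documented in this section, so the inspection
# below is deliberately generic.
import pickle as pkl

with open("[mlabs]KIM+DAL+1e4trnBEL_detail.pkl", 'rb') as f:
    saved = pkl.load(f)
print(type(saved))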
def main(args):
    task_seq = [
        # The config for a task:
        # acquire_method(sub_acquire_method): random(""), no-dete("DASL","DAL","BALD"), dete("coreset","entropy",...)
        {
            "model_name": "BiLSTM",
            "group_name": "[2.18-?]BiLSTM+FD+MRR+200+200",
            "max_performance": 0.80,
            "data_path": "data/YahooCQA/data-FD/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "DASL",
            "num_acquisitions_round": 37,
            "init_question_num": 40,
            "acquire_question_num_per_round": 40,
            "warm_start_random_seed": 16,
            "sample_method": "No-Deterministic+DASL2+seed16",
        },
    ]

    allMethods_results = []  # performance results of each method during active learning

    for config in task_seq:
        print("-----------------------{}-{}-----------------------".format(
            config["group_name"], config["sample_method"]))

        # ----------------------------- initial setting -----------------------------
        data_path = config["data_path"] if "data_path" in config else "data/YahooCQA/data-FD/"
        model_name = config["model_name"] if "model_name" in config else 'CNN'
        num_acquisitions_round = config["num_acquisitions_round"]
        acquire_method = config["acquire_method"]
        sub_acquire_method = config["sub_acquire_method"]
        # number of initial training samples
        init_question_num = config["init_question_num"] if "init_question_num" in config else 160
        # number of samples collected per round
        acquire_question_num_per_round = (config["acquire_question_num_per_round"]
                                          if "acquire_question_num_per_round" in config else 20)
        # random seed for selecting the initial training set
        warm_start_random_seed = config["warm_start_random_seed"]
        sample_method = config["sample_method"]

        loader = Loader()

        print('model:', model_name)
        print('dataset:', data_path)
        print('acquisition method:', acquire_method, "+", sub_acquire_method)

        if not os.path.exists(args.result_path):
            os.makedirs(args.result_path)
        if not os.path.exists(os.path.join(args.result_path, model_name)):
            os.makedirs(os.path.join(args.result_path, model_name))
        if not os.path.exists(os.path.join(args.result_path, model_name,
                                           'active_checkpoint', acquire_method)):
            os.makedirs(os.path.join(args.result_path, model_name,
                                     'active_checkpoint', acquire_method))

        # If the data has not been preprocessed yet, build it; otherwise load it directly.
        if (os.path.exists(os.path.join(data_path, 'mappings.pkl'))
                and os.path.exists(os.path.join(data_path, 'train.pkl'))
                and os.path.exists(os.path.join(data_path, 'val.pkl'))
                and os.path.exists(os.path.join(data_path, 'test.pkl'))):
            mappings = pkl.load(open(os.path.join(data_path, 'mappings.pkl'), 'rb'))
            train_data = pkl.load(open(os.path.join(data_path, 'train.pkl'), 'rb'))
            val_data = pkl.load(open(os.path.join(data_path, 'val.pkl'), 'rb'))
            test_data = pkl.load(open(os.path.join(data_path, 'test.pkl'), 'rb'))
        else:
            train_data, val_data, test_data, mappings = loader.load_yahoo(
                data_path, args.pretrained_word_embedding,
                args.word_embedding_dim, args.answer_count)
            pkl.dump(train_data, open(os.path.join(data_path, 'train.pkl'), 'wb'))
            pkl.dump(val_data, open(os.path.join(data_path, 'val.pkl'), 'wb'))
            pkl.dump(test_data, open(os.path.join(data_path, 'test.pkl'), 'wb'))
            pkl.dump(mappings, open(os.path.join(data_path, 'mappings.pkl'), 'wb'))

        # word embedding
        word_to_id = mappings['word_to_id']
        tag_to_id = mappings['tag_to_id']
        word_embeds = mappings['word_embeds'] if args.use_pretrained_word_embedding else None

        word_vocab_size = len(word_to_id)
        # total number of training samples (question-answer pairs)
        total_sentences = len(train_data)
        print('After training data is loaded, the total amount of training data: %d' % total_sentences)

        acquisition_function = Acquisition(train_data,
                                           seed=warm_start_random_seed,
                                           answer_count=args.answer_count)

        method_result = []  # performance results of this method during active learning

        # ----------------------------- acquire data and retrain -----------------------------
        for i in range(num_acquisitions_round):
            print("current round:{}".format(i))

            # ------------------- acquisition ---------------------
            if i == 0:  # first round: seed the labeled pool at random
                acq = init_question_num
                a_m = "random"
                m_p = ""
                acquisition_function.obtain_data(train_data,
                                                 model_path="",
                                                 acquire=init_question_num,
                                                 method="random")
            else:
                acq = acquire_question_num_per_round
                a_m = acquire_method
                m_p = os.path.join(checkpoint_path, 'modelweights')
                acquisition_function.obtain_data(model_path=m_p,
                                                 model_name=model_name,
                                                 data=train_data,
                                                 acquire=acq,
                                                 method=a_m,
                                                 sub_method=sub_acquire_method)

            # ------------------- prepare training data ---------------------
            '''
            Format of each element of train_data:
            {
                'str_words_q': str_words_q,  # question tokens
                'str_words_a': str_words_a,  # answer tokens
                'words_q': words_q,          # question word ids
                'words_a': words_a,          # answer word ids
                'tag': tag,                  # sample tag id
            }
            '''
            new_train_index = (acquisition_function.train_index).copy()
            sorted_train_index = list(new_train_index)
            sorted_train_index.sort()
            labeled_train_data = [train_data[i] for i in sorted_train_index]

            active_train_data = dict()
            active_train_data['labeled_train_data'] = labeled_train_data
            active_train_data['pseudo_train_data'] = acquisition_function.pseudo_train_data

            print("Labeled training samples: {}".format(len(acquisition_function.train_index)))
            print("Unlabeled samples remaining: {}".format(
                len(train_data) - len(acquisition_function.train_index)))

            # ------------------------------------- train --------------------------------------
            checkpoint_folder = os.path.join('active_checkpoint', acquire_method, "fixed")
            checkpoint_path = os.path.join(args.result_path, model_name, checkpoint_folder)
            if not os.path.exists(checkpoint_path):
                os.makedirs(checkpoint_path)

            print('.............Recreate the model...................')
            if model_name == 'BiLSTM':
                model = BiLSTM(word_vocab_size,
                               args.word_embedding_dim,
                               args.word_hidden_dim,
                               args.target_size,
                               pretrained=word_embeds,
                               with_sim_features=args.with_sim_feature,
                               double_embedding=args.double_embedding)
            if model_name == 'CNN':
                model = CNN(word_vocab_size,
                            args.word_embedding_dim,
                            args.word_out_channels,
                            args.target_size,
                            pretrained=word_embeds,
                            double_embedding=args.double_embedding)
            model.cuda()

            trainer = Trainer(model, args.result_path, model_name, tag_to_id,
                              answer_count=args.answer_count)

            # If DASL produced pseudo-labeled data, train with self-paced learning;
            # otherwise fall back to plain supervised training.
            if active_train_data['pseudo_train_data']:
                noActiveTrain = {
                    "acquisition_function": acquisition_function,
                    "model_path": m_p,
                    "model_name": model_name,
                    "train_data": train_data,
                    "acquire": acq,
                    "method": a_m,
                    "sub_method": sub_acquire_method
                }
                test_performance = trainer.train_selfPacedLearning(noActiveTrain,
                                                                   args.num_epochs,
                                                                   active_train_data,
                                                                   val_data,
                                                                   test_data,
                                                                   args.mu,
                                                                   args.learning_rate,
                                                                   checkpoint_folder=checkpoint_folder,
                                                                   batch_size=args.batch_size)
            else:
                test_performance = trainer.train_supervisedLearning(args.num_epochs,
                                                                    active_train_data,
                                                                    val_data,
                                                                    test_data,
                                                                    args.learning_rate,
                                                                    checkpoint_folder=checkpoint_folder,
                                                                    batch_size=args.batch_size)
            print('*' * 50)
            print("Test performance: {}".format(test_performance))
            print('-' * 80)

            # ------------------- send data to the visualization web page -------------------
            max_performance = config["max_performance"] if "max_performance" in config else 0
"max_performance"] if "max_performance" in config else 0 if "group_name" in config: updateLineChart(str(test_performance), sample_method, gp_name=config["group_name"], max=max_performance) else: updateLineChart(str(test_performance), sample_method, max=max_performance)
np.random.seed(0)
random.seed(0)


def theLoss(x, target):
    # print('x:', x.size())
    # print('target:', target.size())
    # return nn.BCELoss()(x.squeeze(2), target)
    # return nn.BCELoss()(x, target)
    return nn.MultiLabelSoftMarginLoss()(x, target)


eurlex_path = r"../../datasets/eurLex/"
rcv2_path = r"../../datasets/rcv2/"

loader = Loader()  # lowercase so the Loader class itself is not shadowed
data = loader.load_rcv2(datapath=rcv2_path, vocab_size=30000)
train_data = data['train_points']
val_data = data['test_points']
train_data = train_data[:8000]
# the val split is small, so cap it at 2000 points
val_data = val_data[:2000]

model = CNN(word_vocab_size=30000,
            word_embedding_dim=300,
            word_out_channels=200,
            output_size=103,  # number of labels in this multi-label setup
            pretrained=data['embed'])
model = nn.DataParallel(model).cuda()
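# The section ends before any training code, so the loop below is a minimal
# sketch, NOT the repository's Trainer. Assumptions: `iterate_batches` is a
# hypothetical helper yielding padded LongTensors of question/answer word ids
# plus a multi-hot (batch, 103) label tensor, and model(words_q, words_a)
# returns raw logits of the same shape (MultiLabelSoftMarginLoss applies the
# sigmoid internally, so no activation is needed on the output).
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(10):
    for words_q, words_a, tags in iterate_batches(train_data, batch_size=64):
        optimizer.zero_grad()
        logits = model(words_q.cuda(), words_a.cuda())
        loss = theLoss(logits, tags.cuda())
        loss.backward()
        optimizer.step()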