def main():
    logger = Logger(args.logdir)
    params = dict()
    params['batch_size'] = args.batch_size
    params['data_dir'] = args.path_to_train_data
    params['major'] = 'users'
    params['itemIdInd'] = 1
    params['userIdInd'] = 0
    print("Loading training data")
    data_layer = input_layer.UserItemRecDataProvider(params=params)
    print("Data loaded")
    print("Total items found: {}".format(len(data_layer.data.keys())))
    print("Vector dim: {}".format(data_layer.vector_dim))

    print("Loading eval data")
    eval_params = copy.deepcopy(params)
    # must set eval batch size to 1 to make sure no examples are missed
    eval_params['data_dir'] = args.path_to_eval_data
    eval_data_layer = input_layer.UserItemRecDataProvider(
        params=eval_params,
        user_id_map=data_layer.userIdMap,  # the mappings are provided
        item_id_map=data_layer.itemIdMap)
    eval_data_layer.src_data = data_layer.data

    rencoder = model.AutoEncoder(
        layer_sizes=[data_layer.vector_dim] +
        [int(l) for l in args.hidden_layers.split(',')],
        nl_type=args.non_linearity_type,
        is_constrained=args.constrained,
        dp_drop_prob=args.drop_prob,
        last_layer_activations=not args.skip_last_layer_nl)

    model_checkpoint = args.logdir + "/model"
    path_to_model = Path(model_checkpoint)
    if path_to_model.is_file():
        print("Loading model from: {}".format(model_checkpoint))
        rencoder.load_state_dict(torch.load(model_checkpoint))

    print('######################################################')
    print('######################################################')
    print('############# AutoEncoder Model: #####################')
    print(rencoder)
    print('######################################################')
    print('######################################################')

    gpu_ids = [int(g) for g in args.gpu_ids.split(',')]
    print('Using GPUs: {}'.format(gpu_ids))
    if len(gpu_ids) > 1:
        rencoder = nn.DataParallel(rencoder, device_ids=gpu_ids)
    rencoder = rencoder.cuda()

    if args.optimizer == "adam":
        optimizer = optim.Adam(rencoder.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
    elif args.optimizer == "adagrad":
        optimizer = optim.Adagrad(rencoder.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    elif args.optimizer == "momentum":
        optimizer = optim.SGD(rencoder.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=args.weight_decay)
        scheduler = MultiStepLR(optimizer,
                                milestones=[24, 36, 48, 66, 72],
                                gamma=0.5)
    elif args.optimizer == "rmsprop":
        optimizer = optim.RMSprop(rencoder.parameters(),
                                  lr=args.lr,
                                  momentum=0.9,
                                  weight_decay=args.weight_decay)
    else:
        raise ValueError('Unknown optimizer kind')

    t_loss = 0.0
    t_loss_denom = 0.0
    global_step = 0

    if args.noise_prob > 0.0:
        dp = nn.Dropout(p=args.noise_prob)

    for epoch in range(args.num_epochs):
        print('Doing epoch {} of {}'.format(epoch, args.num_epochs))
        e_start_time = time.time()
        rencoder.train()
        total_epoch_loss = 0.0
        denom = 0.0
        if args.optimizer == "momentum":
            scheduler.step()
        for i, mb in enumerate(data_layer.iterate_one_epoch()):
            inputs = Variable(mb.cuda().to_dense())
            optimizer.zero_grad()
            outputs = rencoder(inputs)
            loss, num_ratings = model.MSEloss(outputs, inputs)
            loss = loss / num_ratings
            loss.backward()
            optimizer.step()
            global_step += 1
            t_loss += loss.data[0]
            t_loss_denom += 1

            if i % args.summary_frequency == 0:
                print('[%d, %5d] RMSE: %.7f' %
                      (epoch, i, sqrt(t_loss / t_loss_denom)))
                logger.scalar_summary("Training_RMSE",
                                      sqrt(t_loss / t_loss_denom),
                                      global_step)
                t_loss = 0
                t_loss_denom = 0.0
                log_var_and_grad_summaries(logger, rencoder.encode_w,
                                           global_step, "Encode_W")
                log_var_and_grad_summaries(logger, rencoder.encode_b,
                                           global_step, "Encode_b")
                if not rencoder.is_constrained:
                    log_var_and_grad_summaries(logger, rencoder.decode_w,
                                               global_step, "Decode_W")
                    log_var_and_grad_summaries(logger, rencoder.decode_b,
                                               global_step, "Decode_b")

            total_epoch_loss += loss.data[0]
            denom += 1

            # if args.aug_step > 0 and i % args.aug_step == 0 and i > 0:
            if args.aug_step > 0:
                # Magic data augmentation trick happen here
                for t in range(args.aug_step):
                    inputs = Variable(outputs.data)
                    if args.noise_prob > 0.0:
                        inputs = dp(inputs)
                    optimizer.zero_grad()
                    outputs = rencoder(inputs)
                    loss, num_ratings = model.MSEloss(outputs, inputs)
                    loss = loss / num_ratings
                    loss.backward()
                    optimizer.step()

        e_end_time = time.time()
        print('Total epoch {} finished in {} seconds with TRAINING RMSE loss: {}'
              .format(epoch, e_end_time - e_start_time,
                      sqrt(total_epoch_loss / denom)))
        logger.scalar_summary("Training_RMSE_per_epoch",
                              sqrt(total_epoch_loss / denom), epoch)
        logger.scalar_summary("Epoch_time", e_end_time - e_start_time, epoch)
        if epoch % 3 == 0 or epoch == args.num_epochs - 1:
            eval_loss = do_eval(rencoder, eval_data_layer)
            print('Epoch {} EVALUATION LOSS: {}'.format(epoch, eval_loss))
            logger.scalar_summary("EVALUATION_RMSE", eval_loss, epoch)
            print("Saving model to {}".format(model_checkpoint + ".epoch_" +
                                              str(epoch)))
            torch.save(rencoder.state_dict(),
                       model_checkpoint + ".epoch_" + str(epoch))

    print("Saving model to {}".format(model_checkpoint + ".last"))
    torch.save(rencoder.state_dict(), model_checkpoint + ".last")
def test_adagrad_sparse(self):
    self._test_rosenbrock_sparse(
        lambda params: optim.Adagrad(params, lr=1e-1))
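# Minimal stand-alone sketch (not part of the test above): Adagrad is one of
# the optimizers that accepts sparse gradients, e.g. from a sparse embedding.
import torch
import torch.nn as nn
import torch.optim as optim

emb = nn.Embedding(1000, 16, sparse=True)   # produces sparse gradients
opt = optim.Adagrad(emb.parameters(), lr=1e-1)

idx = torch.randint(0, 1000, (32,))
loss = emb(idx).pow(2).sum()
opt.zero_grad()
loss.backward()      # emb.weight.grad is a sparse tensor
opt.step()           # Adagrad applies the sparse update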
            torch.matmul(self.Wo, embedding_tensor) + self.bo)
        u_value = torch.tanh(
            torch.matmul(self.Wu, embedding_tensor) + self.bu)
        cell = i_gate * u_value
        hidden = o_gate * torch.tanh(cell)
        logits = (torch.matmul(self.Why, hidden) + self.by).view(
            1, output_size)
        target = Var(torch.LongTensor([int(scores[index])]))
        loss = F.nll_loss(F.log_softmax(logits, dim=1), target)
        return (loss, hidden, cell)

    return rec(0)[0]


net = TreeNet()
opt = optim.Adagrad(net.parameters(), lr=learning_rate)
epocNum = 6
loopStart = time.time()
loss_save = []
for epoc in range(epocNum):
    total_loss = 0
    for n in range(tree_data_size):
        opt.zero_grad()
        loss = net.forward(scores[n], words[n], lchs[n], rchs[n])
        total_loss += loss.data[0]
        loss.backward()
        opt.step()
    loss_save.append(total_loss / tree_data_size)
    print("epoc {}, average_loss {}".format(epoc,
                                            total_loss / tree_data_size))
def configure_optimizers__adagrad(self):
    optimizer = optim.Adagrad(self.parameters(), lr=self.learning_rate)
    return optimizer
def adagrad_constructor(params):
    adagrad = optim.Adagrad(params, lr=1e-1)
    return StochasticWeightAveraging(adagrad, swa_start=1000, swa_freq=1,
                                     swa_lr=1e-2)
            nn.ReLU(), nn.BatchNorm1d(h_dim), nn.Dropout(),
            nn.Linear(h_dim, input_dim), nn.ReLU(),
            nn.BatchNorm1d(input_dim))

    def forward(self, x):
        output = self.EnE(x)
        Xhat = self.DeE(output)
        return Xhat, output


torch.cuda.manual_seed_all(42)
AutoencoderE = AE()
solverE = optim.Adagrad(AutoencoderE.parameters(), lr=lrE)
rec_criterion = torch.nn.MSELoss()

for it in range(epoch):
    epoch_cost4 = 0
    num_minibatches = int(n_sampE / mb_size)
    for i, (dataE, dataM, dataC, target) in enumerate(trainLoader):
        AutoencoderE.train()
        Dat_train = torch.cat((dataE, dataM, dataC), 1)
        Dat_hat, ZX = AutoencoderE(Dat_train)
        loss = rec_criterion(Dat_hat, Dat_train)
    def step(itr, step_size):
        if itr % (2 * step_size) < step_size:
            return (itr % (2 * step_size)) / step_size
        return (2 * step_size - (itr % (2 * step_size))) / step_size

    return lr_lambda


# In[ ]:

loss_function = nn.CrossEntropyLoss()
clr = cyclical_lr(500, 1e-5, 1e-2)
optimizer = optim.Adagrad(parameters_to_update, lr=1.0)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])


# In[ ]:

def train_model(model, data_loaders, loss_function, optimizer, num_epochs,
                device):
    start = time.time()
    val_accuracies = list()
    max_val_accuracy = 0.0
    optimal_model_parameters = copy.deepcopy(model.state_dict())
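# Hedged, self-contained sketch of the same pattern (the model, factory name
# and learning-rate bounds below are illustrative stand-ins, not the
# notebook's objects): a cyclical lambda wrapped in LambdaLR is stepped once
# per batch so the factor sweeps between the minimum and maximum rate.
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)
optimizer = optim.Adagrad(model.parameters(), lr=1.0)  # base lr of 1.0

def triangular(step_size, min_lr=1e-5, max_lr=1e-2):
    # LambdaLR multiplies the base lr (here 1.0) by the returned factor.
    def lr_lambda(itr):
        cycle_pos = itr % (2 * step_size)
        frac = cycle_pos / step_size if cycle_pos < step_size \
            else (2 * step_size - cycle_pos) / step_size
        return min_lr + (max_lr - min_lr) * frac
    return lr_lambda

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [triangular(500)])

for itr in range(5):
    optimizer.step()        # normally preceded by loss.backward()
    scheduler.step()        # advance the cycle once per iteration
    print(scheduler.get_last_lr())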
def main(args): episodes = split_data(args.data) #episodes = episodes[:len(episodes)//30] # for debug valid_rate = 0.15 episodes = np.array(episodes, dtype=object) valid_num = int(valid_rate * len(episodes)) valid_episodes = episodes[:valid_num] episodes = episodes[valid_num:] vocab2index, index2vocab, embedding_weight, embedding_dim = build_vocab( episodes, args.embedding, 100, train_oov=False) episodes_text2index(episodes, vocab2index) episodes_text2index(valid_episodes, vocab2index) batch_size = args.batch_size #batch_list = get_batch_list(episodes, batch_size) #valid_batch_list = get_batch_list(valid_episodes, batch_size) save_round = 1 date = datetime.datetime.now().strftime("%d-%H-%M") save_path = 'model/model_{}'.format(date) print('save_path = {}'.format(save_path)) if not os.path.exists(save_path): os.makedirs(save_path, exist_ok=True) with open(os.path.join(save_path, 'vocab.pickle'), 'wb') as f: pickle.dump({ 'vocab2index': vocab2index, 'index2vocab': index2vocab }, f) log_file = codecs.open(os.path.join(save_path, 'log'), 'w') embedding_weight = torch.Tensor(embedding_weight) #oracle = generator.Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA) #oracle.load_state_dict(torch.load(oracle_state_dict_path)) #oracle_samples = torch.load(oracle_samples_path).type(torch.LongTensor) # a new oracle can be generated by passing oracle_init=True in the generator constructor # samples for the new oracle can be generated using helpers.batchwise_sample() gen = generator.Generator(embedding_dim, GEN_HIDDEN_DIM, len(vocab2index), MAX_SEQ_LEN, embedding_weight, gpu=CUDA) dis = discriminator.Discriminator(embedding_dim, DIS_HIDDEN_DIM, len(vocab2index), MAX_SEQ_LEN, embedding_weight, gpu=CUDA) if CUDA: #oracle = oracle.cuda() gen = gen.cuda() dis = dis.cuda() #oracle_samples = oracle_samples.cuda() #for parameters in gen.parameters(): # print(parameters) # GENERATOR MLE TRAINING print('Starting Generator MLE Training...') gen_optimizer = optim.Adam(filter(lambda p: p.requires_grad, gen.parameters()), lr=1e-2) train_generator_MLE(gen, gen_optimizer, episodes, valid_episodes, batch_size, MLE_TRAIN_EPOCHS) # torch.save(gen.state_dict(), pretrained_gen_path) # gen.load_state_dict(torch.load(pretrained_gen_path)) # PRETRAIN DISCRIMINATOR print('\nStarting Discriminator Training...') dis_optimizer = optim.Adagrad( filter(lambda p: p.requires_grad, dis.parameters())) train_discriminator(dis, dis_optimizer, episodes, valid_episodes, gen, batch_size, 1, 1) # torch.save(dis.state_dict(), pretrained_dis_path) # dis.load_state_dict(torch.load(pretrained_dis_path)) # ADVERSARIAL TRAINING #print('\nStarting Adversarial Training...') #oracle_loss = helpers.batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN, #start_letter=START_LETTER, gpu=CUDA) #print('\nInitial Oracle Sample Loss : %.4f' % oracle_loss) for epoch in range(ADV_TRAIN_EPOCHS): print('\n--------\nEPOCH %d\n--------' % (epoch + 1)) # TRAIN GENERATOR print('\nAdversarial Training Generator : ', end='') sys.stdout.flush() train_generator_PG(gen, gen_optimizer, dis, batch_size, episodes, 1, 20) # TRAIN DISCRIMINATOR print('\nAdversarial Training Discriminator : ') train_discriminator(dis, dis_optimizer, episodes, valid_episodes, gen, batch_size, 1, 1)
def main(args): """This major function controls finding data, splitting train and validation data, building datasets, building dataloaders, building a model, loading a model, training a model, testing a model, and writing a submission""" best_acc = 0 # Specify the GPUs to use print("Finding GPUs...") gpus = list(range(torch.cuda.device_count())) print('--- GPUS: {} ---'.format(str(gpus))) if "train" in args.modes.lower(): # List the trainval folders print("Load trainval data...") trainval_folder_names = [ x for x in os.listdir(args.trainval_data_path) if os.path.isdir(os.path.join(args.trainval_data_path, x)) ] more_train_img_names = [ x for x in os.listdir( os.path.join(args.more_train_data_path, 'JPEGImages')) ] # Figure out how many folders to use for training and validation num_train_folders = int( len(trainval_folder_names) * args.trainval_split_percentage) num_more_train_imgs = len(more_train_img_names) num_val_folders = len(trainval_folder_names) - num_train_folders print("Building dataset split...") print("--- Number of train folders: {} ---".format(num_train_folders)) print("--- Number of additional train images: {} ---".format( num_more_train_imgs)) print("--- Number of val folders: {} ---".format(num_val_folders)) # Choose the training and validation folders random.shuffle( trainval_folder_names) # TODO if loading a model, be careful train_folder_names = trainval_folder_names[:num_train_folders] val_folder_names = trainval_folder_names[num_train_folders:] # Make dataloaders print("Making train and val dataloaders...") train_loader = make_dataloader(train_folder_names, args.trainval_data_path, args.batch_size, args.task, args.modes) more_train_loader = make_dataloader(more_train_img_names, args.more_train_data_path, args.batch_size, args.task, args.modes, xml=True) val_loader = make_dataloader(val_folder_names, args.trainval_data_path, args.batch_size, args.task, args.modes) # Build and load the model model = build_model(args, gpus) model = load_model(args, model, args.load_epoch) # Declare the optimizer, learning rate scheduler, and training loops. Note that models are saved to the current directory. 
print("Creating optimizer and scheduler...") if args.task == 4: if args.optimizer_string == 'RMSprop': optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer_string == 'Adam': optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer_string == 'SGD': optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer_string == 'Adagrad': optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer_string == 'Adadelta': optimizer = optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) else: optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.3, patience=10, verbose=True) else: optimizer = optim.Adam(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay, amsgrad=True) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5, verbose=True) # This trainer class does all the work print("Instantiating runner...") if args.task == 2: runner = Runner(model, optimizer, sum_mse, args.task, args.save_dir) else: runner = Runner(model, optimizer, sum_cross_entropy, args.task, args.save_dir) best_acc = 0 if "train" in args.modes.lower(): print("Begin training... {}, lr:{} + wd:{} + opt:{} + bs:{} ".format( str(args.model), str(args.lr), str(args.weight_decay), str(args.optimizer_string), str(args.batch_size))) best_acc = runner.loop(args.num_epoch, train_loader, more_train_loader, val_loader, scheduler, args.batch_size) args.save_path = save_path = args.save_dir.split( '/')[-1] + '-' + args.model + '-' + str(best_acc) + '-' + str( args.lr) + '-' + str(args.weight_decay) + '-' + str( args.optimizer_string) + '-' + str(args.batch_size) if "test" in args.modes.lower(): print("Load test data...") # Get test folder names test_folder_names = [ x for x in os.listdir(args.test_data_path) if os.path.isdir(os.path.join(args.test_data_path, x)) ] # Switch to eval mode model = build_model(args, gpus) model = load_model(args, model, 9999) model.eval() # Make test dataloader print("Making test dataloaders...") test_loader = make_dataloader(test_folder_names, args.test_data_path, args.batch_size, args.task, 'test') # Run the dataloader through the neural network print("Conducting a test...") _, _, outputs, logits = runner.test(test_loader, args.batch_size) # Write the submission to CSV print("Writing a submission to \"csvs/{}.csv\"...".format(save_path)) if args.task == 2: with open('csvs/' + save_path + '.csv', 'w') as sub: sub.write('guid/image/axis,value\n') for name, val in outputs: # Build path mod_name = name.split('/')[5] + '/' + name.split( '/')[6].split('_')[0] x = val[0] y = val[1] z = val[2] # Print and write row sub.write(mod_name + '/x,' + str(x) + '\n') sub.write(mod_name + '/y,' + str(y) + '\n') sub.write(mod_name + '/z,' + str(z) + '\n') np.save('logits/' + save_path + '.npy', np.array([l for p, l in logits])) else: print( "writing a submission to \"csvs/{}.csv\"...".format(save_path)) with open('csvs/' + save_path + '.csv', 'w') as sub: sub.write('guid/image,label\n') for name, val in outputs: # Build path mod_name = name.split('/')[4] + '/' + name.split( '/')[5].split('_')[0] mod_val = int(list_mapping[int(np.argmax(val))]) # Print and write row sub.write(mod_name + ',' + str(mod_val) + '\n') np.save('logits/' + save_path + '.npy', np.array([l for 
p, l in logits])) # TODO average multiple logits results # This function loads these logits but they should be reshaped with .reshape(-1, 23) # test_logits = np.load('logits/'+save_path+'.npy') #print("0s: {}".format(str(np.count_nonzero(test_logits == 0.0)))) #print("1s: {}".format(str(np.count_nonzero(test_logits == 1.0)))) #print("2s: {}".format(str(np.count_nonzero(test_logits == 2.0)))) print('Done!')
def train(data): print("Training model...") data.show_data_summary() save_data_name = data.model_dir + ".dset" data.save(save_data_name) if data.sentence_classification: model = SentClassifier(data) else: model = SeqLabel(data) # loss_function = nn.NLLLoss() if data.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) elif data.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) else: print("Optimizer illegal: %s" % (data.optimizer)) exit(1) best_dev = -10 # data.HP_iteration = 1 ## start training for idx in range(data.HP_iteration): epoch_start = time.time() temp_start = epoch_start print("Epoch: %s/%s" % (idx, data.HP_iteration)) if data.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) instance_count = 0 sample_id = 0 sample_loss = 0 total_loss = 0 right_token = 0 whole_token = 0 random.shuffle(data.train_Ids) ## set model in train model model.train() model.zero_grad() batch_size = data.HP_batch_size batch_id = 0 train_num = len(data.train_Ids) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = data.train_Ids[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, True, data.sentence_classification) instance_count += 1 loss, tag_seq = model.neg_log_likelihood_loss( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) right, whole = predict_check(tag_seq, batch_label, mask, data.sentence_classification) right_token += right whole_token += whole # print("loss:",loss.item()) sample_loss += loss.item() total_loss += loss.item() if end % 500 == 0: temp_time = time.time() temp_cost = temp_time - temp_start temp_start = temp_time print( " Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token)) if sample_loss > 1e8 or str(sample_loss) == "nan": print( "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." ) exit(1) sys.stdout.flush() sample_loss = 0 loss.backward() optimizer.step() model.zero_grad() temp_time = time.time() temp_cost = temp_time - temp_start print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" % (end, temp_cost, sample_loss, right_token, whole_token, (right_token + 0.) / whole_token)) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print( "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % (idx, epoch_cost, train_num / epoch_cost, total_loss)) print("totalloss:", total_loss) if total_loss > 1e8 or str(total_loss) == "nan": print( "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...." 
) exit(1) # continue speed, acc, p, r, f, _, _ = evaluate(data, model, "dev") dev_finish = time.time() dev_cost = dev_finish - epoch_finish if data.seg: current_score = f print( "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (dev_cost, speed, acc, p, r, f)) else: current_score = acc print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc)) if current_score > best_dev: if data.seg: print("Exceed previous best f score:", best_dev) else: print("Exceed previous best acc score:", best_dev) # model_name = data.model_dir +'.'+ str(idx) + ".model" model_name = data.model_dir + ".model" print("Save current best model in file:", model_name) torch.save(model.state_dict(), model_name) best_dev = current_score # ## decode test speed, acc, p, r, f, _, _ = evaluate(data, model, "test") test_finish = time.time() test_cost = test_finish - dev_finish if data.seg: print( "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (test_cost, speed, acc, p, r, f)) else: print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc)) gc.collect()
model = resnet.ResNet18().cuda()
if args.method == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
elif args.method == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
elif args.method == 'hadam':
    import Hadam
    optimizer = Hadam.Hadam(model.parameters(), lr=args.lr,
                            fraction=args.fraction, eta=args.eta,
                            gamma=args.gamma,
                            bias_correction=args.bias_correction)
elif args.method == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)

model.train()
for epoch in range(start_epoch + 1, args.Nepoch + 1):
    for batch_idx, (data, target) in enumerate(train_loader):
        # Get Samples
        data, target = Variable(data).cuda(), Variable(target).cuda()

        # Init
        optimizer.zero_grad()

        # Predict
        y_pred = model(data)

        # Calculate loss
        loss = F.cross_entropy(y_pred, target)
def prepare(args):
    global trainloader
    global testloader
    global net
    global criterion
    global optimizer

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True,
                                            transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                              shuffle=True, num_workers=2)
    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True,
                                           transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset, batch_size=100,
                                             shuffle=False, num_workers=2)
    # classes = ('plane', 'car', 'bird', 'cat', 'deer',
    #            'dog', 'frog', 'horse', 'ship', 'truck')

    # Model
    print('==> Building model..')
    if args['model'] == 'vgg':
        net = VGG('VGG19')
    if args['model'] == 'resnet18':
        net = ResNet18()
    if args['model'] == 'googlenet':
        net = GoogLeNet()
    if args['model'] == 'densenet121':
        net = DenseNet121()
    if args['model'] == 'mobilenet':
        net = MobileNet()
    if args['model'] == 'dpn92':
        net = DPN92()
    if args['model'] == 'shufflenetg2':
        net = ShuffleNetG2()
    if args['model'] == 'senet18':
        net = SENet18()
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(net.parameters(), lr=args['lr'],
    #                       momentum=0.9, weight_decay=5e-4)
    if args['optimizer'] == 'SGD':
        optimizer = optim.SGD(net.parameters(), lr=args['lr'], momentum=0.9,
                              weight_decay=5e-4)
    if args['optimizer'] == 'Adadelta':
        optimizer = optim.Adadelta(net.parameters(), lr=args['lr'])
    if args['optimizer'] == 'Adagrad':
        optimizer = optim.Adagrad(net.parameters(), lr=args['lr'])
    if args['optimizer'] == 'Adam':
        optimizer = optim.Adam(net.parameters(), lr=args['lr'])
    if args['optimizer'] == 'Adamax':
        optimizer = optim.Adamax(net.parameters(), lr=args['lr'])
def init_optimizers(optimizer: str, model_named_parameters: Generator,
                    learning_rate: float, adam_epsilon: float, weight_decay):
    """
    @param optimizer: parameter to choose the optimizer
    @param model_named_parameters: model parameters
    @param learning_rate: learning rate
    @param adam_epsilon: adam epsilon value
    @param weight_decay: weight decay
    @return: return optimizer
    """
    no_decay = ["bias", "LayerNorm.weight"]
    # Materialize the generator so it can be iterated over twice below.
    model_named_parameters = list(model_named_parameters)
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model_named_parameters
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p for n, p in model_named_parameters
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    if optimizer.lower() == "adamax":
        optimizer = optim.Adamax(optimizer_grouped_parameters,
                                 lr=learning_rate, eps=adam_epsilon)
    elif optimizer.lower() == "adamw":
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate,
                          eps=adam_epsilon)
    elif optimizer.lower() == "adam":
        optimizer = optim.Adam(optimizer_grouped_parameters, lr=learning_rate,
                               eps=adam_epsilon)
    elif optimizer.lower() == "radam":
        optimizer = RAdam(optimizer_grouped_parameters, lr=learning_rate,
                          eps=adam_epsilon)
    elif optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(optimizer_grouped_parameters,
                                   lr=learning_rate, eps=adam_epsilon)
    elif optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(optimizer_grouped_parameters,
                                  lr=learning_rate, eps=adam_epsilon)
    else:
        optimizer = optim.SGD(optimizer_grouped_parameters, lr=learning_rate)
    return optimizer
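# Hypothetical usage sketch for the helper above (the model and hyperparameter
# values are illustrative, not from the original code).
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8), nn.Linear(8, 2))
opt = init_optimizers(optimizer="adagrad",
                      model_named_parameters=model.named_parameters(),
                      learning_rate=1e-2,
                      adam_epsilon=1e-8,
                      weight_decay=0.01)
print(opt)   # two param groups: decayed weights, and undecayed bias terms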
def train(args,data,model): logger.info("Training modules...") model.show_model_summary(logger) print("Training Parameters:%s",args) if args.optimizer.lower() == "sgd": optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,weight_decay=args.l2) elif args.optimizer.lower() == "adagrad": optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.l2) elif args.optimizer.lower() == "adadelta": optimizer = optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.l2) elif args.optimizer.lower() == "rmsprop": optimizer = optim.RMSprop(model.parameters(), lr=args.lr, weight_decay=args.l2) elif args.optimizer.lower() == "adam": optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2) else: print("Optimizer illegal: %s"%(args.optimizer)) exit(1) best_dev = 0 ## start training for idx in range(args.iteration): epoch_start = time.time() temp_start = epoch_start #print("Epoch: %s/%s" %(idx,modules.iteration)) if args.optimizer == "SGD": optimizer = lr_decay(optimizer, idx, args.lr_decay, args.lr) instance_count = 0 sample_loss = 0 total_loss = 0 sample_whole_token = 0 sample_H2B_high_right_token = 0 sample_H2B_bot_right_token = 0 sample_H2B_all_right_token = 0 sample_B2H_high_right_token = 0 sample_B2H_bot_right_token = 0 sample_B2H_all_right_token = 0 random.shuffle(data.train_Ids) model.train() model.zero_grad() batch_size = args.batch_size train_num = len(data.train_Ids) total_batch = train_num//batch_size+1 for batch_id in range(total_batch): start = batch_id*batch_size end = (batch_id+1)*batch_size if end >train_num: end = train_num instance = data.train_Ids[start:end] if not instance: continue batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_hlabel,batch_llabel, mask =\ batchify_sequence_labeling_with_label(instance, args.gpu,args.max_sent_length,True) instance_count += 1 if args.model == 'DUAL': H2BH_loss, H2BB_loss, B2HB_loss, B2HH_loss, H2BH_tag_seqs, H2BB_tag_seqs, B2HB_tag_seqs, B2HH_tag_seqs = model.calculate_loss( batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_hlabel, batch_llabel,mask) H2B_whole, H2B_high_right, H2B_bot_right, H2B_all_right = predict_check(H2BH_tag_seqs, H2BB_tag_seqs,batch_hlabel, batch_llabel,mask) sample_whole_token += H2B_whole sample_H2B_high_right_token += H2B_high_right sample_H2B_bot_right_token += H2B_bot_right sample_H2B_all_right_token += H2B_all_right _, B2H_high_right, B2H_bot_right, B2H_all_right = predict_check(B2HH_tag_seqs, B2HB_tag_seqs,batch_hlabel, batch_llabel,mask) sample_B2H_high_right_token += B2H_high_right sample_B2H_bot_right_token += B2H_bot_right sample_B2H_all_right_token += B2H_all_right loss = args.H2BH*H2BH_loss + args.H2BB*H2BB_loss + args.B2HB*B2HB_loss + args.B2HH*B2HH_loss elif args.model == 'H2B': H2BH_loss, H2BB_loss, H2BH_tag_seqs, H2BB_tag_seqs = model.calculate_loss(batch_word, batch_wordlen,batch_char, batch_charlen, batch_charrecover,batch_hlabel, batch_llabel,mask) H2B_whole, H2B_high_right, H2B_bot_right, H2B_all_right = predict_check(H2BH_tag_seqs, H2BB_tag_seqs, batch_hlabel, batch_llabel, mask) sample_whole_token += H2B_whole sample_H2B_high_right_token += H2B_high_right sample_H2B_bot_right_token += H2B_bot_right sample_H2B_all_right_token += H2B_all_right loss = args.H2BH * H2BH_loss + args.H2BB * H2BB_loss elif args.model == 'B2H': B2HB_loss, B2HH_loss, B2HB_tag_seqs, B2HH_tag_seqs = model.calculate_loss(batch_word, batch_wordlen,batch_char, batch_charlen, 
batch_charrecover,batch_hlabel, batch_llabel,mask) B2H_whole, B2H_high_right, B2H_bot_right, B2H_all_right = predict_check(B2HH_tag_seqs, B2HB_tag_seqs,batch_hlabel, batch_llabel,mask) sample_whole_token += B2H_whole sample_B2H_high_right_token += B2H_high_right sample_B2H_bot_right_token += B2H_bot_right sample_B2H_all_right_token += B2H_all_right loss = args.B2HB * B2HB_loss + args.B2HH * B2HH_loss sample_loss += loss.item() total_loss += loss.item() #if end%(10*args.batch_size) == 0: if end % (10*args.batch_size) == 0: temp_time = time.time() temp_cost = temp_time - temp_start temp_start = temp_time print(" Instance: %s; Time: %.2fs; loss: %.4f;Token Num:%s ||| H2B Hacc:%.4f;Bacc: %.4f;" "ALLacc:%.4f|||||B2H Hacc:%.4f;Bacc:%.4f;ALLacc:%.4f" % (end, temp_cost, sample_loss, sample_whole_token, (sample_H2B_high_right_token + 0.)/ sample_whole_token,(sample_H2B_bot_right_token + 0.)/ sample_whole_token, (sample_H2B_all_right_token + 0.)/ sample_whole_token,(sample_B2H_high_right_token + 0.)/ sample_whole_token, (sample_B2H_bot_right_token + 0.)/ sample_whole_token,(sample_B2H_all_right_token + 0.) / sample_whole_token)) if sample_loss > 1e8 or str(sample_loss) == "nan": print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....") exit(1) sys.stdout.flush() sample_loss = 0 sample_whole_token = 0 sample_H2B_high_right_token = 0 sample_H2B_bot_right_token = 0 sample_H2B_all_right_token = 0 sample_B2H_high_right_token = 0 sample_B2H_bot_right_token = 0 sample_B2H_all_right_token = 0 loss.backward() if args.clip: torch.nn.utils.clip_grad_norm_(model.parameters(),args.clip) optimizer.step() model.zero_grad() temp_time = time.time() temp_cost = temp_time - temp_start print(" Instance: %s; Time: %.2fs; loss: %.4f;Token Num:%s ||| H2B Hacc:%.4f;Bacc: %.4f;" "ALLacc:%.4f|||||B2H Hacc:%.4f;Bacc:%.4f;ALLacc:%.4f" % (end, temp_cost, sample_loss, sample_whole_token, (sample_H2B_high_right_token + 0.) / sample_whole_token,(sample_H2B_bot_right_token + 0.) / sample_whole_token, (sample_H2B_all_right_token + 0.) / sample_whole_token,(sample_B2H_high_right_token + 0.) / sample_whole_token, (sample_B2H_bot_right_token + 0.) / sample_whole_token,(sample_B2H_all_right_token + 0.) / sample_whole_token)) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start logger.info("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss)) print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % (idx, epoch_cost, train_num / epoch_cost, total_loss)) #print("totalloss:", total_loss) if total_loss > 1e8 or str(total_loss) == "nan": print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....") exit(1) # continue if args.model == 'DUAL': H2B_evals,B2H_evals, H2B_results,B2H_results= evaluate(data, model,logger, "dev",best_dev=best_dev) current_score = B2H_evals[2] elif args.model == 'H2B': H2B_evals, _,_,_ = evaluate(data, model,logger, "dev",best_dev=best_dev) current_score = H2B_evals[2] elif args.model == 'B2H': B2H_evals, _,_,_ = evaluate(data, model,logger, "dev",best_dev=best_dev) current_score = B2H_evals[2] if current_score > best_dev: print("New f score %f > previous %f ,Save current best modules in file:%s" % (current_score,best_dev,args.load_model_name)) torch.save(model.state_dict(), args.load_model_name) best_dev = current_score gc.collect()
def train(data, name, save_dset, save_model_dir, seg=True, ignore=False, cove_flag=False): print('---Training model---') data.show_data_summary() save_data_name = save_dset save_data_setting(data, save_data_name) model = NER(data, cove_flag) if data.gpu: model = model.cuda() if data.optim.lower() == 'adam': optimizer = optim.Adam(model.parameters()) elif data.optim.lower() == 'rmsprop': optimizer = optim.RMSprop(model.parameters()) elif data.optim.lower() == 'adadelta': optimizer = optim.Adadelta(model.parameters()) elif data.optim.lower() == 'adagrad': optimizer = optim.Adagrad(model.parameters()) elif data.optim.lower() == 'sgd': optimizer = optim.SGD(model.parameters(), lr=data.lr, momentum=data.momentum) else: optimizer = None print('Error optimizer selection, please check config.optim.') exit(1) best_dev = -1 epoch = data.iteration vis = visdom.Visdom() losses = [] all_F = [[0., 0., 0.]] dict_F = {} label_F = [] for idx in range(epoch): epoch_start = time.time() tmp_start = epoch_start print('Epoch: %s/%s' % (idx, epoch)) if data.optim.lower() == 'sgd': optimizer = lr_decay(optimizer, idx, data.lr_decay, data.lr) instance_count = 0 sample_loss = 0 total_loss = 0 right_token = 0 whole_token = 0 random.shuffle(data.train_ids) model.train() batch_size = data.batch_size train_num = len(data.train_ids) total_batch = train_num // batch_size for batch_id in range(total_batch): model.zero_grad() start = batch_id * batch_size end = (batch_id + 1) * batch_size # if end > train_num: # break # #end = train_num instance = data.train_ids[start:end] # if not instance: # continue batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.gpu) instance_count += 1 loss, tag_seq = model.neg_log_likelihood_loss( batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) right, whole = predict_check(tag_seq, batch_label, mask) right_token += right whole_token += whole sample_loss += loss.data[0] total_loss += loss.data[0] if end % 500 == 0: tmp_time = time.time() tmp_cost = tmp_time - tmp_start tmp_start = tmp_time print( '\tInstance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f' % (end, tmp_cost, sample_loss, right_token, whole_token, (right_token + 0.0) / whole_token)) sys.stdout.flush() losses.append(sample_loss / 500.0) Lwin = 'Loss of ' + name vis.line(np.array(losses), X=np.array([i for i in range(len(losses))]), win=Lwin, opts={ 'title': Lwin, 'legend': ['loss'] }) sample_loss = 0 loss.backward() if data.clip: torch.nn.utils.clip_grad_norm(model.parameters(), 10.0) optimizer.step() # tmp_time = time.time() # tmp_cost = tmp_time - tmp_start # print('\tInstance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f' # % (end, tmp_cost, sample_loss, right_token, whole_token, (right_token+0.0) / whole_token)) epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print( 'Epoch: %s training finished. 
Time: %.2fs, speed: %.2ft/s, total_loss: %s' % (idx, epoch_cost, train_num / epoch_cost, total_loss)) speed, acc, p, r, f_dev, dict_dev = evaluate(data, model, 'dev', ignore=ignore) dev_finish = time.time() dev_cost = dev_finish - epoch_finish if seg: current_score = f_dev print( 'Dev: time: %.2fs, speed: %.2ft/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f' % (dev_cost, speed, acc, p, r, f_dev)) else: current_score = acc print('Dev: time: %.2fs, speed: %.2ft/s; acc: %.4f' % (dev_cost, speed, acc)) if current_score > best_dev: if seg: print('Exceed previous best f score: ', best_dev) else: print('Exceed previous best acc score: ', best_dev) model_name = save_model_dir + '/' + name torch.save(model.state_dict(), model_name) best_dev = current_score with open( save_model_dir + '/' + name + '_eval_' + str(idx) + '.txt', 'w') as f: if seg: f.write('acc: %.4f, p: %.4f, r: %.4f, f: %.4f' % (acc, p, r, best_dev)) f.write('acc: %.4f, p: %.4f' % (acc, p)) else: f.write('acc: %.4f' % acc) speed, acc, p, r, f_test, dict_test = evaluate(data, model, 'test', ignore=ignore) test_finish = time.time() test_cost = test_finish - epoch_finish if seg: print( 'Test: time: %.2fs, speed: %.2ft/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f' % (test_cost, speed, acc, p, r, f_test)) else: print('Test: time: %.2fs, speed: %.2ft/s; acc: %.4f' % (test_cost, speed, acc)) speed, acc, p, r, f_train, dict_train = evaluate(data, model, 'train', ignore=ignore) all_F.append([f_train * 100.0, f_dev * 100.0, f_test * 100.0]) Fwin = 'F1-score of ' + name + ' {train, dev, test}' vis.line(np.array(all_F), X=np.array([i for i in range(len(all_F))]), win=Fwin, opts={ 'title': Fwin, 'legend': ['train', 'dev', 'test'] }) if dict_train: for key, value in dict_train.items(): if key not in label_F: dict_F[key] = [[0., 0., 0.]] label_F.append(key) dict_F[key].append([ dict_train[key] * 100.0, dict_dev[key] * 100.0, dict_test[key] * 100.0 ]) Fwin = 'F1-score of ' + name + '_' + key + ' {train, dev, test}' vis.line(np.array(dict_F[key]), X=np.array([i for i in range(len(dict_F[key]))]), win=Fwin, opts={ 'title': Fwin, 'legend': ['train', 'dev', 'test'] }) gc.collect()
def main(args):
    np.random.seed(args.seed)
    th.manual_seed(args.seed)
    th.cuda.manual_seed(args.seed)

    cuda = args.gpu >= 0
    device = th.device('cuda:{}'.format(
        args.gpu)) if cuda else th.device('cpu')
    if cuda:
        th.cuda.set_device(args.gpu)

    trainset = data.SST()
    train_loader = DataLoader(dataset=trainset,
                              batch_size=args.batch_size,
                              collate_fn=batcher(device),
                              shuffle=True,
                              num_workers=0)

    model = TreeLSTM(trainset.num_vocabs,
                     args.x_size,
                     args.h_size,
                     trainset.num_classes,
                     args.dropout,
                     cell_type='childsum' if args.child_sum else 'nary',
                     pretrained_emb=trainset.pretrained_emb).to(device)
    print(model)

    params_ex_emb = [
        x for x in list(model.parameters())
        if x.requires_grad and x.size(0) != trainset.num_vocabs
    ]
    params_emb = list(model.embedding.parameters())

    optimizer = optim.Adagrad([{
        'params': params_ex_emb,
        'lr': args.lr,
        'weight_decay': args.weight_decay
    }, {
        'params': params_emb,
        'lr': 0.1 * args.lr
    }])

    for epoch in range(args.epochs):
        model.train()
        count = 0
        t_epoch = time.time()
        for step, batch in enumerate(train_loader):
            g = batch.graph
            n = g.number_of_nodes()
            h = th.zeros((n, args.h_size)).to(device)
            c = th.zeros((n, args.h_size)).to(device)

            logits = model(batch, h, c)
            logp = F.log_softmax(logits, 1)
            loss = F.nll_loss(logp, batch.label,
                              reduction='elementwise_mean')

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            count += 1

        if cuda:
            th.cuda.synchronize()
        t_epoch_end = time.time()
        print('Epoch {:05d} batch {} training time {:.4f}s'.format(
            epoch, count, t_epoch_end - t_epoch))
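# Stand-alone sketch of the per-group learning-rate pattern used above
# (module sizes and rates are illustrative): the embedding table gets a
# smaller learning rate than the rest of the model by passing two parameter
# groups to Adagrad.
import torch.nn as nn
import torch.optim as optim

emb = nn.Embedding(100, 32)
head = nn.Linear(32, 5)
base_lr = 0.05
opt = optim.Adagrad([
    {'params': head.parameters(), 'lr': base_lr, 'weight_decay': 1e-4},
    {'params': emb.parameters(), 'lr': 0.1 * base_lr},
])
for group in opt.param_groups:
    print(group['lr'])   # 0.05 for the head, 0.005 for the embeddings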
def test_optimizer(data): print('---Test Optimizers---') model_SGD = NER(data) model_Adam = NER(data) model_RMSprop = NER(data) model_Adadelta = NER(data) model_Adagrad = NER(data) if data.gpu: model_SGD = model_SGD.cuda() model_Adam = model_Adam.cuda() model_RMSprop = model_RMSprop.cuda() model_Adadelta = model_Adadelta.cuda() model_Adagrad = model_Adagrad.cuda() optimizer_SGD = optim.SGD(model_SGD.parameters(), lr=data.lr, momentum=data.momentum) optimizer_Adam = optim.Adam(model_Adam.parameters()) optimizer_RMSprop = optim.RMSprop(model_RMSprop.parameters()) optimizer_Adadelta = optim.Adadelta(model_Adadelta.parameters()) optimizer_Adagrad = optim.Adagrad(model_Adagrad.parameters()) epoch = data.iteration vis = visdom.Visdom() losses = [] train_F = [[0., 0., 0., 0., 0.]] dev_F = [[0., 0., 0., 0., 0.]] test_F = [[0., 0., 0., 0., 0.]] for idx in range(epoch): epoch_start = time.time() print('Epoch: %s/%s' % (idx, epoch)) optimizer_SGD = lr_decay(optimizer_SGD, idx, data.lr_decay, data.lr) instance_count = 0 sample_loss_SGD = 0 sample_loss_Adam = 0 sample_loss_RMSprop = 0 sample_loss_Adadelta = 0 sample_loss_Adagrad = 0 random.shuffle(data.train_ids) model_SGD.train() model_Adam.train() model_RMSprop.train() model_Adadelta.train() model_Adagrad.train() model_SGD.zero_grad() model_Adam.zero_grad() model_RMSprop.zero_grad() model_Adadelta.zero_grad() model_Adagrad.zero_grad() batch_size = data.batch_size train_num = len(data.train_ids) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = data.train_ids[start:end] if not instance: continue batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.gpu) instance_count += 1 loss_SGD, tag_seq_SGD = model_SGD.neg_log_likelihood_loss( batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) loss_Adam, tag_seq_Adam = model_Adam.neg_log_likelihood_loss( batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) loss_RMSprop, tag_seq_RMSprop = model_RMSprop.neg_log_likelihood_loss( batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) loss_Adadelta, tag_seq_Adadelta = model_Adadelta.neg_log_likelihood_loss( batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) loss_Adagrad, tag_seq_Adagrad = model_Adagrad.neg_log_likelihood_loss( batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) sample_loss_SGD += loss_SGD.data[0] sample_loss_Adam += loss_Adam.data[0] sample_loss_RMSprop += loss_RMSprop.data[0] sample_loss_Adadelta += loss_Adadelta.data[0] sample_loss_Adagrad += loss_Adagrad.data[0] if end % 500 == 0: sys.stdout.flush() losses.append([ sample_loss_SGD / 50.0, sample_loss_Adam / 50.0, sample_loss_RMSprop / 50.0, sample_loss_Adadelta / 50.0, sample_loss_Adagrad / 50.0 ]) Lwin = 'Loss of Optimizers' vis.line(np.array(losses), X=np.array([i for i in range(len(losses))]), win=Lwin, opts={ 'title': Lwin, 'legend': ['SGD', 'Adam', 'RMSprop', 'Adadelta', 'Adagrad'] }) sample_loss_SGD = 0 sample_loss_Adam = 0 sample_loss_RMSprop = 0 sample_loss_Adadelta = 0 sample_loss_Adagrad = 0 loss_SGD.backward() loss_Adam.backward() loss_RMSprop.backward() loss_Adadelta.backward() loss_Adagrad.backward() # if data.clip: # 
torch.nn.utils.clip_grad_norm(model.parameters(), 10.0) optimizer_SGD.step() optimizer_Adam.step() optimizer_RMSprop.step() optimizer_Adadelta.step() optimizer_Adagrad.step() model_SGD.zero_grad() model_Adam.zero_grad() model_RMSprop.zero_grad() model_Adadelta.zero_grad() model_Adagrad.zero_grad() epoch_finish = time.time() epoch_cost = epoch_finish - epoch_start print('Epoch: %s training finished. Time: %.2fs, speed: %.2ft/s' % (idx, epoch_cost, train_num / epoch_cost)) speed, acc, p, r, f_train_SGD, _ = evaluate(data, model_SGD, 'train') speed, acc, p, r, f_train_Adam, _ = evaluate(data, model_Adam, 'train') speed, acc, p, r, f_train_RMSprop, _ = evaluate( data, model_RMSprop, 'train') speed, acc, p, r, f_train_Adadelta, _ = evaluate( data, model_Adadelta, 'train') speed, acc, p, r, f_train_Adagrad, _ = evaluate( data, model_Adagrad, 'train') train_F.append([ f_train_SGD * 100, f_train_Adam * 100, f_train_RMSprop * 100, f_train_Adadelta * 100, f_train_Adagrad * 100 ]) train_Fwin = 'F1-score of Optimizers{train}' vis.line(np.array(train_F), X=np.array([i for i in range(len(train_F))]), win=train_Fwin, opts={ 'title': train_Fwin, 'legend': ['SGD', 'Adam', 'RMSprop', 'Adadelta', 'Adagrad'] }) speed, acc, p, r, f_dev_SGD, _ = evaluate(data, model_SGD, 'dev') speed, acc, p, r, f_dev_Adam, _ = evaluate(data, model_Adam, 'dev') speed, acc, p, r, f_dev_RMSprop, _ = evaluate(data, model_RMSprop, 'dev') speed, acc, p, r, f_dev_Adadelta, _ = evaluate(data, model_Adadelta, 'dev') speed, acc, p, r, f_dev_Adagrad, _ = evaluate(data, model_Adagrad, 'dev') dev_F.append([ f_dev_SGD * 100, f_dev_Adam * 100, f_dev_RMSprop * 100, f_dev_Adadelta * 100, f_dev_Adagrad * 100 ]) dev_Fwin = 'F1-score of Optimizers{dev}' vis.line(np.array(dev_F), X=np.array([i for i in range(len(dev_F))]), win=dev_Fwin, opts={ 'title': dev_Fwin, 'legend': ['SGD', 'Adam', 'RMSprop', 'Adadelta', 'Adagrad'] }) speed, acc, p, r, f_test_SGD, _ = evaluate(data, model_SGD, 'test') speed, acc, p, r, f_test_Adam, _ = evaluate(data, model_Adam, 'test') speed, acc, p, r, f_test_RMSprop, _ = evaluate(data, model_RMSprop, 'test') speed, acc, p, r, f_test_Adadelta, _ = evaluate( data, model_Adadelta, 'test') speed, acc, p, r, f_test_Adagrad, _ = evaluate(data, model_Adagrad, 'test') test_F.append([ f_test_SGD * 100, f_test_Adam * 100, f_test_RMSprop * 100, f_test_Adadelta * 100, f_test_Adagrad * 100 ]) test_Fwin = 'F1-score of Optimizers{test}' vis.line(np.array(test_F), X=np.array([i for i in range(len(test_F))]), win=test_Fwin, opts={ 'title': test_Fwin, 'legend': ['SGD', 'Adam', 'RMSprop', 'Adadelta', 'Adagrad'] }) gc.collect()
margin = 4
use_rank_weight = True
lr = 0.2
user_negs_n = 5000
n_negative = 10
topk = 5

train1_pd, test1_pd, test2_pd, test3_pd, test4_pd, most_popular_items, n_users, n_items = movielens(
    'datasets/ml/ratings.csv')
n_users = int(n_users)
n_items = int(n_items)

network = model_R.KVMRN(dim=dim, n_users=n_users, n_items=n_items,
                        memory_size=memory_size)
network = network.cuda()
optimizer = optim.Adagrad(network.parameters(), lr=lr)

# valid_users = valid_pd['user'].sample(1000).values
test_pds = [test1_pd, test2_pd, test3_pd, test4_pd]
# test_pds = [test1_pd]
train_pd = train1_pd
previous_test_pd = train1_pd

for test_part, test_pd in enumerate(test_pds):
    train_users = train_pd['user'].values
    train_items = train_pd['item'].values
    all_users_in_train = set(list(train_users))
    all_items_in_train = set(list(train_items))
    user_to_train_set = dict()
    user_to_test_set = dict()
def __init__(self, input_size, bert_input_size, inference_type="zeroshot", num_topics=10, model_type='prodLDA', hidden_sizes=(100, 100), activation='softplus', dropout=0.2, learn_priors=True, batch_size=64, lr=2e-3, momentum=0.99, solver='adam', num_epochs=100, num_samples=10, reduce_on_plateau=False, topic_prior_mean=0.0, topic_prior_variance=None, num_data_loader_workers=0): """ :param input_size: int, dimension of input :param bert_input_size: int, dimension of input that comes from BERT embeddings :param inference_type: string, you can choose between the contextual model and the combined model :param num_topics: int, number of topic components, (default 10) :param model_type: string, 'prodLDA' or 'LDA' (default 'prodLDA') :param hidden_sizes: tuple, length = n_layers, (default (100, 100)) :param activation: string, 'softplus', 'relu', 'sigmoid', 'swish', 'tanh', 'leakyrelu', 'rrelu', 'elu', 'selu' (default 'softplus') :param dropout: float, dropout to use (default 0.2) :param learn_priors: bool, make priors a learnable parameter (default True) :param batch_size: int, size of batch to use for training (default 64) :param lr: float, learning rate to use for training (default 2e-3) :param momentum: float, momentum to use for training (default 0.99) :param solver: string, optimizer 'adam' or 'sgd' (default 'adam') :param num_samples: int, number of times theta needs to be sampled :param num_epochs: int, number of epochs to train for, (default 100) :param reduce_on_plateau: bool, reduce learning rate by 10x on plateau of 10 epochs (default False) :param num_data_loader_workers: int, number of data loader workers (default cpu_count). set it to 0 if you are using Windows """ assert isinstance(input_size, int) and input_size > 0, \ "input_size must by type int > 0." assert isinstance(num_topics, int) and input_size > 0, \ "num_topics must by type int > 0." assert model_type in ['LDA', 'prodLDA'], \ "model must be 'LDA' or 'prodLDA'." assert isinstance(hidden_sizes, tuple), \ "hidden_sizes must be type tuple." assert activation in ['softplus', 'relu', 'sigmoid', 'swish', 'tanh', 'leakyrelu', 'rrelu', 'elu', 'selu'], \ "activation must be 'softplus', 'relu', 'sigmoid', 'swish', 'leakyrelu'," \ " 'rrelu', 'elu', 'selu' or 'tanh'." assert dropout >= 0, "dropout must be >= 0." # assert isinstance(learn_priors, bool), "learn_priors must be boolean." assert isinstance(batch_size, int) and batch_size > 0, \ "batch_size must be int > 0." assert lr > 0, "lr must be > 0." assert isinstance(momentum, float) and momentum > 0 and momentum <= 1, \ "momentum must be 0 < float <= 1." assert solver in ['adagrad', 'adam', 'sgd', 'adadelta', 'rmsprop'], \ "solver must be 'adam', 'adadelta', 'sgd', 'rmsprop' or 'adagrad'" assert isinstance(reduce_on_plateau, bool), \ "reduce_on_plateau must be type bool." 
assert isinstance(topic_prior_mean, float), \ "topic_prior_mean must be type float" # and topic_prior_variance >= 0, \ # assert isinstance(topic_prior_variance, float), \ # "topic prior_variance must be type float" self.input_size = input_size self.num_topics = num_topics self.model_type = model_type self.hidden_sizes = hidden_sizes self.activation = activation self.dropout = dropout self.learn_priors = learn_priors self.batch_size = batch_size self.lr = lr self.num_samples = num_samples self.bert_size = bert_input_size self.momentum = momentum self.solver = solver self.num_epochs = num_epochs self.reduce_on_plateau = reduce_on_plateau self.num_data_loader_workers = num_data_loader_workers self.topic_prior_mean = topic_prior_mean self.topic_prior_variance = topic_prior_variance # init inference avitm network self.model = DecoderNetwork(input_size, self.bert_size, inference_type, num_topics, model_type, hidden_sizes, activation, dropout, self.learn_priors, self.topic_prior_mean, self.topic_prior_variance) self.early_stopping = EarlyStopping(patience=5, verbose=False) # init optimizer if self.solver == 'adam': self.optimizer = optim.Adam(self.model.parameters(), lr=lr, betas=(self.momentum, 0.99)) elif self.solver == 'sgd': self.optimizer = optim.SGD(self.model.parameters(), lr=lr, momentum=self.momentum) elif self.solver == 'adagrad': self.optimizer = optim.Adagrad(self.model.parameters(), lr=lr) elif self.solver == 'adadelta': self.optimizer = optim.Adadelta(self.model.parameters(), lr=lr) elif self.solver == 'rmsprop': self.optimizer = optim.RMSprop(self.model.parameters(), lr=lr, momentum=self.momentum) # init lr scheduler if self.reduce_on_plateau: self.scheduler = ReduceLROnPlateau(self.optimizer, patience=10) # performance attributes self.best_loss_train = float('inf') # training attributes self.model_dir = None self.train_data = None self.nn_epoch = None # learned topics self.best_components = None # Use cuda if available if torch.cuda.is_available(): self.USE_CUDA = True else: self.USE_CUDA = False if self.USE_CUDA: self.model = self.model.cuda()
def main(): global args args = parse_args() # global logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) formatter = logging.Formatter( "[%(asctime)s] %(levelname)s:%(name)s:%(message)s") # file logger fh = logging.FileHandler(os.path.join(args.save, args.expname) + '.log', mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) logger.addHandler(fh) # console logger ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) # argument validation args.cuda = args.cuda and torch.cuda.is_available() device = torch.device("cuda:0" if args.cuda else "cpu") if args.sparse and args.wd != 0: logger.error('Sparsity and weight decay are incompatible, pick one!') exit() logger.debug(args) torch.manual_seed(args.seed) random.seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True if not os.path.exists(args.save): os.makedirs(args.save) train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # write unique words from all token files sick_vocab_file = os.path.join(args.data, 'sick.vocab') if not os.path.isfile(sick_vocab_file): token_files_b = [ os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir] ] token_files_a = [ os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir] ] token_files = token_files_a + token_files_b sick_vocab_file = os.path.join(args.data, 'sick.vocab') utils.build_vocab(token_files, sick_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=sick_vocab_file, data=[ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]) logger.debug('==> SICK vocabulary size : %d ' % vocab.size()) # load SICK dataset splits train_file = os.path.join(args.data, 'sick_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SICKDataset(train_dir, vocab, args.num_classes) torch.save(train_dataset, train_file) logger.debug('==> Size of train data : %d ' % len(train_dataset)) dev_file = os.path.join(args.data, 'sick_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes) torch.save(dev_dataset, dev_file) logger.debug('==> Size of dev data : %d ' % len(dev_dataset)) test_file = os.path.join(args.data, 'sick_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SICKDataset(test_dir, vocab, args.num_classes) torch.save(test_dataset, test_file) logger.debug('==> Size of test data : %d ' % len(test_dataset)) # initialize model, criterion/loss_function, optimizer model = ABCNN(vocab.size(), args.input_dim, args.mem_dim, args.hidden_dim, args.num_classes, args.sparse, args.freeze_embed) criterion = nn.KLDivLoss() # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if os.path.isfile(emb_file): emb = torch.load(emb_file, map_location='cpu') #改 else: # load glove embeddings and vocab glove_vocab, glove_emb = utils.load_word_vectors( os.path.join(args.glove, 'glove.840B.300d')) logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.zeros(vocab.size(), glove_emb.size(1), dtype=torch.float, device=device) emb.normal_(0, 0.05) # zero out the embeddings for padding and other special words if they are 
absent in vocab for idx, item in enumerate([ Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD ]): emb[idx].zero_() for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex( word)] torch.save(emb, emb_file) # plug these into embedding matrix inside model model.emb.weight.data.copy_(emb) model.to(device), criterion.to(device) if args.optim == 'adam': optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'sgd': optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) # create trainer object for training and testing trainer = Trainer(args, model, criterion, optimizer, device) best = -float('inf') for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred = trainer.test(train_dataset) dev_loss, dev_pred = trainer.test(dev_dataset) test_loss, test_pred = trainer.test(test_dataset) train_pearson = metrics.pearson(train_pred, train_dataset.labels) train_mse = metrics.mse(train_pred, train_dataset.labels) logger.info( '==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format( epoch, train_loss, train_pearson, train_mse)) dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels) dev_mse = metrics.mse(dev_pred, dev_dataset.labels) logger.info( '==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format( epoch, dev_loss, dev_pearson, dev_mse)) test_pearson = metrics.pearson(test_pred, test_dataset.labels) test_mse = metrics.mse(test_pred, test_dataset.labels) logger.info( '==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format( epoch, test_loss, test_pearson, test_mse)) if best < test_pearson: best = test_pearson checkpoint = { 'model': trainer.model.state_dict(), 'optim': trainer.optimizer, 'pearson': test_pearson, 'mse': test_mse, 'args': args, 'epoch': epoch } logger.debug( '==> New optimum found, checkpointing everything now...') torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
def configure_optimizers(self):
    # Adagrad creates its state tensors immediately; the model is not yet on the GPU at this point.
    return optim.Adagrad(self.parameters())
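# Because Adagrad allocates its per-parameter accumulator state as soon as the optimizer is
# constructed, building it before the model has been moved to the GPU leaves that state on the
# CPU. A minimal sketch of the usual workaround (names here are illustrative, not from the
# snippet above): move the model first, then construct the optimizer.
import torch
import torch.nn as nn
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = nn.Linear(10, 2).to(device)      # move parameters to the target device first
opt = optim.Adagrad(net.parameters())  # accumulator buffers are then created on that device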
def main():
    import argparse
    fmt_class = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=fmt_class)

    group = parser.add_argument_group('Data')
    group.add_argument('dataset', choices=sorted(DATASETS.keys()),
                       help='dataset to be used')
    parser.add_argument('--no-cuda', action='store_true',
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=0, help='random seed')

    group = parser.add_argument_group('Semantic Loss')
    group.add_argument('-a', '--alpha', type=float, default=1.0,
                       help='trade-off between losses')

    group = parser.add_argument_group('Neural Net')
    group.add_argument('--n-epochs', type=int, default=100,
                       help='number of epochs to train')
    group.add_argument('--batch-size', type=int, default=64,
                       help='batch size for training and evaluation')
    group.add_argument('--lr', type=float, default=1.0, help='learning rate')
    group.add_argument('--gamma', type=float, default=0.7,
                       help='learning rate step gamma')
    args = parser.parse_args()

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    # Load the data
    dataset = DATASETS[args.dataset]()
    indices = list(range(dataset.data.shape[0]))
    tr, ts = train_test_split(indices, test_size=0.2)
    tr_loader, ts_loader = nndt.dataset_to_loaders(dataset, tr, ts, device,
                                                   batch_size=args.batch_size)

    # Build the semantic loss
    sl = nndt.DecisionTreeLoss(dataset).fit(dataset.data[tr], dataset.target[tr])
    sl.sync()

    # Build the neural net
    n_inputs = dataset.data.shape[1]
    net = nndt.FeedForwardNetwork(dataset, n_inputs).to(device)

    # Evaluate the NN+DT combo
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.n_epochs + 1):
        nndt.train(net, device, tr_loader, optimizer, sl, args.alpha)
        label_loss, distillation_loss, n_correct = nndt.test(net, device, ts_loader, sl)
        print(f'{epoch} : ll={label_loss:5.3f} dl={distillation_loss:5.3f} acc={n_correct}')
        scheduler.step()
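# nndt.train is not shown here; --alpha is described only as a "trade-off between losses".
# A hypothetical sketch of how such a trade-off is typically applied inside one training step
# (sl_loss_fn and the batch layout are assumptions, not the real nndt API):
def train_step_sketch(net, batch_x, batch_y, optimizer, sl_loss_fn, alpha):
    import torch.nn.functional as F
    optimizer.zero_grad()
    logits = net(batch_x)
    label_loss = F.cross_entropy(logits, batch_y)  # supervised term
    semantic_loss = sl_loss_fn(logits)             # tree-derived term (assumed callable)
    loss = label_loss + alpha * semantic_loss      # alpha trades off the two terms
    loss.backward()
    optimizer.step()
    return loss.item()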
def train():
    print("start")
    cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
    cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
    cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.')
    cmd.add_argument('--cont', default=1, type=int,
                     help='if 1, continue from the previously saved model')  # save here so it can be reloaded
    cmd.add_argument('--train_path', required=True, help='The path to the training file.')
    cmd.add_argument('--valid_path', help='The path to the development file.')
    cmd.add_argument('--test_path', help='The path to the testing file.')
    cmd.add_argument('--config_path', required=True, help='the path to the config file.')
    cmd.add_argument("--word_embedding", help="The path to word vectors.")
    cmd.add_argument('--optimizer', default='sgd', choices=['sgd', 'adam', 'adagrad'],
                     help='the type of optimizer: valid options=[sgd, adam, adagrad]')
    cmd.add_argument("--lr", type=float, default=0.01, help='the learning rate.')
    cmd.add_argument("--lr_decay", type=float, default=0, help='the learning rate decay.')
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size", "--batch", type=int, default=128, help='the batch size.')
    cmd.add_argument("--max_epoch", type=int, default=100, help='the maximum number of iterations.')
    cmd.add_argument("--clip_grad", type=float, default=5, help='the gradient clipping threshold.')
    cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.')
    cmd.add_argument('--min_count', type=int, default=5, help='minimum word count.')
    cmd.add_argument('--max_vocab_size', type=int, default=150000, help='maximum vocabulary size.')
    cmd.add_argument('--save_classify_layer', default=True, action='store_true',
                     help="whether to save the classify layer")
    cmd.add_argument('--valid_size', type=int, default=0,
                     help="size of validation dataset when there's no valid.")
    cmd.add_argument('--eval_steps', required=False, type=int, help='report every xx batches.')
    print("arguments parsed")
    opt = cmd.parse_args(sys.argv[2:])

    with open(opt.config_path, 'r') as fin:
        config = json.load(fin)

    # Dump configurations
    print(opt)
    print(config)

    # set seed.
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    if opt.gpu >= 0:
        torch.cuda.set_device(opt.gpu)
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)
    print("GPU setup done")
    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()

    token_embedder_name = config['token_embedder']['name'].lower()
    token_embedder_max_chars = config['token_embedder'].get('max_characters_per_token', None)
    '''
    if token_embedder_name == 'cnn':
        train_data = read_corpus_yield(opt.train_path, token_embedder_max_chars, opt.max_sent_len)
    elif token_embedder_name == 'lstm':
        train_data = read_corpus(opt.train_path, opt.max_sent_len)
    else:
        raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
    '''
    # logging.info('training instance: {}, training tokens: {}.'.format(len(train_data),
    #              sum([len(s) - 1 for s in train_data])))
    print("finished reading corpus")

    if opt.valid_path is not None:
        if token_embedder_name == 'cnn':
            valid_data = read_corpus(opt.valid_path, token_embedder_max_chars, opt.max_sent_len)
        elif token_embedder_name == 'lstm':
            valid_data = read_corpus(opt.valid_path, opt.max_sent_len)
        else:
            raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
        logging.info('valid instance: {}, valid tokens: {}.'.format(
            len(valid_data), sum([len(s) - 1 for s in valid_data])))
    elif opt.valid_size > 0:
        # note: train_data is only read lazily inside the epoch loop below
        train_data, valid_data = divide(train_data, opt.valid_size)
        logging.info('training instance: {}, training tokens after division: {}.'.format(
            len(train_data), sum([len(s) - 1 for s in train_data])))
        logging.info('valid instance: {}, valid tokens: {}.'.format(
            len(valid_data), sum([len(s) - 1 for s in valid_data])))
    else:
        valid_data = None

    if opt.test_path is not None:
        if token_embedder_name == 'cnn':
            test_data = read_corpus(opt.test_path, token_embedder_max_chars, opt.max_sent_len)
        elif token_embedder_name == 'lstm':
            test_data = read_corpus(opt.test_path, opt.max_sent_len)
        else:
            raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))
        logging.info('testing instance: {}, testing tokens: {}.'.format(
            len(test_data), sum([len(s) - 1 for s in test_data])))
    else:
        test_data = None

    print("starting word embedding")
    if opt.word_embedding is not None:
        print("loading pretrained word embedding")
        embs = load_embedding(opt.word_embedding)
        word_lexicon = {word: i for i, word in enumerate(embs[0])}
    else:
        embs = None
        word_lexicon = {}
    '''
    # Maintain the vocabulary. The vocabulary is used in either WordEmbeddingInput or softmax classification.
    vocab = get_truncated_vocab(train_data, opt.min_count)
    # Ensure index of '<oov>' is 0
    for special_word in ['<oov>', '<bos>', '<eos>', '<pad>']:
        if special_word not in word_lexicon:
            word_lexicon[special_word] = len(word_lexicon)
    for word, _ in vocab:
        if word not in word_lexicon:
            word_lexicon[word] = len(word_lexicon)

    # Word Embedding
    if config['token_embedder']['word_dim'] > 0:
        word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon,
                                        fix_emb=False, embs=embs)
        logging.info('Word embedding size: {0}'.format(len(word_emb_layer.word2id)))
    else:
        word_emb_layer = None
    logging.info('Vocabulary size: {0}'.format(len(word_lexicon)))
    print("word embedding done, starting character embedding")

    # Character Lexicon
    if config['token_embedder']['char_dim'] > 0:
        char_lexicon = {}
        for sentence in train_data:
            for word in sentence:
                for ch in word:
                    if ch not in char_lexicon:
                        char_lexicon[ch] = len(char_lexicon)
        for special_char in ['<bos>', '<eos>', '<oov>', '<pad>', '<bow>', '<eow>']:
            if special_char not in char_lexicon:
                char_lexicon[special_char] = len(char_lexicon)
        char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon, fix_emb=False)
        logging.info('Char embedding size: {0}'.format(len(char_emb_layer.word2id)))
    else:
        char_lexicon = None
        char_emb_layer = None
    '''
    ''' changed here: just load the saved dictionaries instead '''
    if config['token_embedder']['char_dim'] > 0:
        char_lexicon = {}
        with codecs.open(os.path.join(opt.model, 'char.dic'), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    tokens.insert(0, '\u3000')
                token, i = tokens
                char_lexicon[token] = int(i)
        char_emb_layer = EmbeddingLayer(config['token_embedder']['char_dim'], char_lexicon,
                                        fix_emb=False, embs=None)
        logging.info('char embedding size: ' + str(len(char_emb_layer.word2id)))
    else:
        char_lexicon = None
        char_emb_layer = None

    # For a model trained with a word-form word encoder.
    if config['token_embedder']['word_dim'] > 0:
        word_lexicon = {}
        with codecs.open(os.path.join(opt.model, 'word.dic'), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    tokens.insert(0, '\u3000')
                token, i = tokens
                word_lexicon[token] = int(i)
        word_emb_layer = EmbeddingLayer(config['token_embedder']['word_dim'], word_lexicon,
                                        fix_emb=False, embs=None)
        logging.info('word embedding size: ' + str(len(word_emb_layer.word2id)))
    else:
        word_lexicon = None
        word_emb_layer = None

    print("character embedding done, starting batching")
    # train = create_batches(train_data, opt.batch_size, word_lexicon, char_lexicon, config, use_cuda=use_cuda)
    if opt.eval_steps is None:
        # opt.eval_steps = len(train[0])
        opt.eval_steps = 4096  # len(train_data) / opt.batch_size
    logging.info('Evaluate every {0} batches.'.format(opt.eval_steps))

    if valid_data is not None:
        valid = create_batches(valid_data, opt.batch_size, word_lexicon, char_lexicon, config,
                               sort=False, shuffle=False, use_cuda=use_cuda)
    else:
        valid = None

    if test_data is not None:
        test = create_batches(test_data, opt.batch_size, word_lexicon, char_lexicon, config,
                              sort=False, shuffle=False, use_cuda=use_cuda)
    else:
        test = None

    label_to_ix = word_lexicon
    logging.info('vocab size: {0}'.format(len(label_to_ix)))
    nclasses = len(label_to_ix)

    print("building the model")
    model = Model(config, word_emb_layer, char_emb_layer, nclasses, use_cuda)
    logging.info(str(model))
    if use_cuda:
        model = model.cuda()
    if opt.cont == 1:
        print("loaded the previous model!!")
        model.load_model(opt.model)

    print("setting up the optimizer")
    need_grad = lambda x: x.requires_grad
    if opt.optimizer.lower() == 'adam':
        optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=opt.lr)
    elif opt.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(filter(need_grad, model.parameters()), lr=opt.lr)
    elif opt.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(filter(need_grad, model.parameters()), lr=opt.lr)
    else:
        raise ValueError('Unknown optimizer {}'.format(opt.optimizer.lower()))

    print("creating the model directory")
    try:
        os.makedirs(opt.model)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    print("writing the dictionaries")
    print(opt.cont)
    print(opt.model)
    """
    if config['token_embedder']['char_dim'] > 0:
        with codecs.open(os.path.join(opt.model, 'char.dic'), 'w', encoding='utf-8') as fpo:
            for ch, i in char_emb_layer.word2id.items():
                print('{0}\t{1}'.format(ch, i), file=fpo)
    with codecs.open(os.path.join(opt.model, 'word.dic'), 'w', encoding='utf-8') as fpo:
        for w, i in word_lexicon.items():
            print('{0}\t{1}'.format(w, i), file=fpo)
    """
    json.dump(vars(opt),
              codecs.open(os.path.join(opt.model, 'config.json'), 'w', encoding='utf-8'))

    best_train = 1e+8
    best_valid = 1e+8
    test_result = 1e+8

    print("finally starting training")
    for epoch in range(opt.max_epoch):
        train_data = read_corpus_yield(opt.train_path, token_embedder_max_chars, opt.max_sent_len)
        train = create_batches(train_data, opt.batch_size, word_lexicon, char_lexicon, config,
                               use_cuda=use_cuda)
        best_train, best_valid, test_result = train_model(
            epoch, opt, model, optimizer, train, valid, test,
            best_train, best_valid, test_result)
        if opt.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= opt.lr_decay

    if valid_data is None:
        logging.info("best train ppl: {:.6f}.".format(best_train))
    elif test_data is None:
        logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(best_train, best_valid))
    else:
        logging.info("best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}.".format(
            best_train, best_valid, test_result))
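# divide(train_data, opt.valid_size) is called above but not defined in this snippet.
# A minimal sketch of what such a helper could look like, assuming it carves a validation set
# of the requested size off the shuffled training sentences (the real helper may differ):
import random

def divide_sketch(train_data, valid_size, seed=1):
    data = list(train_data)
    random.Random(seed).shuffle(data)
    return data[valid_size:], data[:valid_size]  # (train, valid)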
def trainEpochs(decoder, n_epochs, print_every=1000, plot_every=100, learning_rate=0.01,
                total_batch=100, batch_size=1, penalty=(1, 0.5), gamma=0.1):
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # criterion = nn.CrossEntropyLoss()
    criterion = nn.MSELoss()
    scheduler = optim.lr_scheduler.StepLR(decoder_optimizer, step_size=2, gamma=gamma)
    iter = 0
    for epoch in range(1, n_epochs + 1):
        # note: `start` is reused here as a batch index, shadowing the timer above
        start, end = 0, batch_size
        if epoch > 5:
            # after epoch 5, switch from SGD to Adagrad
            # (the StepLR scheduler still points at the original SGD optimizer)
            decoder_optimizer = optim.Adagrad(decoder.parameters(), lr=learning_rate)
        # verbose = (iter % print_every == 0)
        while end <= total_batch:
            iter += 1
            target_tensor = torch.from_numpy(np.array(train_Y[start:end][:])).to(device).float()
            input_tensor = torch.from_numpy(np.array(train_X[start:end][:])).to(device).float()
            # target_tensor = torch.from_numpy(np.array(train_Y[num])).to(device).float()
            # input_tensor = Variable(input_tensor, requires_grad=True)
            # print(input_tensor.shape, target_tensor.shape, decoder)
            # print(decoder_optimizer, criterion)
            loss, decoder_output = train(input_tensor, target_tensor, decoder,
                                         decoder_optimizer, criterion)
            print_loss_total += loss
            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                # print(decoder_output.view(-1).detach().cpu().numpy())
                # print(target_tensor)
                # print(decoder_optimizer)
                print("loss%i/%i:" % (iter, n_epochs * (total_batch // batch_size)), print_loss_avg)
                print_loss_total = 0
                # training_progress = validation(decoder, train_X, train_Y)
                training_progress = (decoder_output.view(batch_size, -1).cpu().detach().numpy(),
                                     train_Y[start:end])
                f = open('/home/yixing/Fischer/DeepPerformance/Bi-LSTM-CNN_batch_progress.pkl', "wb")
                pickle.dump(training_progress, f)
                f.close()
                torch.save(decoder.state_dict(),
                           '/home/yixing/Fischer/DeepPerformance/Bi-LSTM-CNN_batch_7L1.pt')
            start += batch_size
            end += batch_size
        scheduler.step()
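# trainEpochs calls a train(input_tensor, target_tensor, decoder, decoder_optimizer, criterion)
# helper that is not shown. Judging from how its return values are used above, it performs one
# optimization step and returns (scalar loss, raw decoder output); a sketch under that
# assumption, with shapes assumed to already match:
def train_step_sketch(input_tensor, target_tensor, decoder, decoder_optimizer, criterion):
    decoder_optimizer.zero_grad()
    decoder_output = decoder(input_tensor)
    loss = criterion(decoder_output, target_tensor)
    loss.backward()
    decoder_optimizer.step()
    return loss.item(), decoder_output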
for param in net.cnn.parameters():
    param.requires_grad = True

# fc_params = list(map(id, net.cnn.fc.parameters()))
# base_params = list(filter(lambda p: id(p) not in fc_params, net.cnn.parameters()))
# optimizer = optim.Adagrad([{'params': base_params},
#                            {'params': net.cnn.fc.parameters(), 'lr': 0.005}
#                            ], lr=0.0005, weight_decay=0.005)
# start_epoch = 0
# optimizer = optim.Adam(net.cnn.fc.parameters(), weight_decay=0.0005)
# optimizer = torch.optim.SGD([
#     {'params': base_params},
#     {'params': net.cnn.fc.parameters(), 'lr': 1}
# ], lr=1e-4, momentum=0.9, weight_decay=0.0005)

from zeroshot.cub_test import zsl_test, gzsl_test
import copy

optimizer = optim.Adagrad(net.cnn.parameters(), lr=0.001, weight_decay=0.005)
for epoch in range(start_epoch, 500):
    train(epoch, net, optimizer)
    test(epoch, net)
    if epoch > 10:
        net1 = copy.deepcopy(net)
        zsl_test(epoch, net1, optimizer)
        del net1
        # net2 = copy.deepcopy(net)
        # gzsl_test(epoch, net2, optimizer)
        # del net2
log.close()
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.FC = torch.nn.Sequential(
            nn.Linear(Z_in, 1),
            nn.Dropout(rate),
            nn.Sigmoid())

    def forward(self, x):
        return self.FC(x)


torch.cuda.manual_seed_all(42)

AutoencoderE = AEE()
solverE = optim.Adagrad(AutoencoderE.parameters(), lr=lrE)

Clas = Classifier()
SolverClass = optim.Adagrad(Clas.parameters(), lr=lrCL, weight_decay=wd)
C_loss = torch.nn.BCELoss()

for it in range(epoch):
    epoch_cost4 = 0
    epoch_cost3 = []
    num_minibatches = int(n_sampE / mb_size)
    for i, (dataE, target) in enumerate(trainLoader):
        flag = 0
        AutoencoderE.train()
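# The minibatch loop above is truncated right after AutoencoderE.train(). A hypothetical,
# standalone sketch of the kind of step that usually follows in an encoder + classifier setup
# like this one (AEE is assumed to return a latent code; names are illustrative):
def classifier_step_sketch(autoencoder, classifier, enc_opt, clf_opt, bce, dataE, target):
    # one optimization step: encode, classify, BCE loss, update both modules
    z = autoencoder(dataE)
    pred = classifier(z)
    loss = bce(pred.squeeze(), target.float())
    enc_opt.zero_grad()
    clf_opt.zero_grad()
    loss.backward()
    enc_opt.step()
    clf_opt.step()
    return loss.item()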
def test_learning_v3():
    embedding_size = 10
    batch_size = 16

    triples, hops = [], []
    for i in range(16):
        triples += [(f'a{i}', 'p', f'b{i}'), (f'b{i}', 'q', f'c{i}')]
        hops += [(f'a{i}', 'r', f'c{i}')]

    # entities are the union of subjects and objects
    entity_lst = sorted({s for (s, _, _) in triples + hops} | {o for (_, _, o) in triples + hops})
    predicate_lst = sorted({p for (_, p, _) in triples + hops})

    nb_entities, nb_predicates = len(entity_lst), len(predicate_lst)

    entity_to_index = {e: i for i, e in enumerate(entity_lst)}
    predicate_to_index = {p: i for i, p in enumerate(predicate_lst)}

    torch.manual_seed(0)

    kernel = GaussianKernel()

    entity_embeddings = nn.Embedding(nb_entities, embedding_size * 2, sparse=True)
    predicate_embeddings = nn.Embedding(nb_predicates, embedding_size * 2, sparse=True)

    fact_rel = torch.from_numpy(np.array([predicate_to_index[p] for (_, p, _) in triples]))
    fact_arg1 = torch.from_numpy(np.array([entity_to_index[s] for (s, _, _) in triples]))
    fact_arg2 = torch.from_numpy(np.array([entity_to_index[o] for (_, _, o) in triples]))
    facts = [fact_rel, fact_arg1, fact_arg2]

    model = NeuralKB(entity_embeddings=entity_embeddings,
                     predicate_embeddings=predicate_embeddings,
                     kernel=kernel, facts=facts)

    reformulator = AttentiveReformulator(2, predicate_embeddings)
    hoppy = SimpleHoppy(model, entity_embeddings, hops=reformulator)

    N3_reg = N3()

    params = [p for p in hoppy.parameters()
              if not torch.equal(p, entity_embeddings.weight)
              and not torch.equal(p, predicate_embeddings.weight)]

    loss_function = nn.CrossEntropyLoss(reduction='mean')

    p_emb = predicate_embeddings(torch.from_numpy(np.array([predicate_to_index['p']])))
    q_emb = predicate_embeddings(torch.from_numpy(np.array([predicate_to_index['q']])))
    # r_emb = predicate_embeddings(torch.from_numpy(np.array([predicate_to_index['r']])))

    optimizer = optim.Adagrad(params, lr=0.1)

    hops_data = []
    for i in range(128):
        hops_data += hops

    batches = make_batches(len(hops_data), batch_size)

    c, d = 0.0, 0.0

    for batch_start, batch_end in batches:
        hops_batch = hops_data[batch_start:batch_end]

        s_lst = [s for (s, _, _) in hops_batch]
        p_lst = [p for (_, p, _) in hops_batch]
        o_lst = [o for (_, _, o) in hops_batch]

        xs_np = np.array([entity_to_index[s] for s in s_lst])
        xp_np = np.array([predicate_to_index[p] for p in p_lst])
        xo_np = np.array([entity_to_index[o] for o in o_lst])

        xs = torch.from_numpy(xs_np)
        xp = torch.from_numpy(xp_np)
        xo = torch.from_numpy(xo_np)

        xs_emb = entity_embeddings(xs)
        xp_emb = predicate_embeddings(xp)
        xo_emb = entity_embeddings(xo)

        sp_scores, po_scores = hoppy.forward(xp_emb, xs_emb, xo_emb)

        loss = loss_function(sp_scores, xo) + loss_function(po_scores, xs)

        factors = [hoppy.factor(e) for e in [xp_emb, xs_emb, xo_emb]]
        loss += 0.1 * N3_reg(factors)

        tmp = hoppy.hops(xp_emb)
        hop_1_emb = tmp[0]
        hop_2_emb = tmp[1]

        c = kernel.pairwise(p_emb, hop_1_emb).mean().cpu().detach().numpy()
        d = kernel.pairwise(q_emb, hop_2_emb).mean().cpu().detach().numpy()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    assert c > 0.95
    assert d > 0.95
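# make_batches(len(hops_data), batch_size) above is assumed to yield (start, end) index pairs
# covering the data in order; a minimal sketch consistent with how its output is consumed:
def make_batches_sketch(size, batch_size):
    return [(start, min(start + batch_size, size)) for start in range(0, size, batch_size)]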
def main(args):
    np.random.seed(args.seed)
    th.manual_seed(args.seed)
    th.cuda.manual_seed(args.seed)

    best_epoch = -1
    best_dev_acc = 0

    cuda = args.gpu >= 0
    device = th.device('cuda:{}'.format(args.gpu)) if cuda else th.device('cpu')
    if cuda:
        th.cuda.set_device(args.gpu)

    trainset = SST()
    train_loader = DataLoader(dataset=trainset, batch_size=args.batch_size,
                              collate_fn=batcher(device), shuffle=True, num_workers=0)
    devset = SST(mode='dev')
    dev_loader = DataLoader(dataset=devset, batch_size=100,
                            collate_fn=batcher(device), shuffle=False, num_workers=0)
    testset = SST(mode='test')
    test_loader = DataLoader(dataset=testset, batch_size=100,
                             collate_fn=batcher(device), shuffle=False, num_workers=0)

    model = TreeLSTM(trainset.num_vocabs, args.x_size, args.h_size,
                     trainset.num_classes, args.dropout,
                     cell_type='childsum' if args.child_sum else 'nary',
                     pretrained_emb=trainset.pretrained_emb).to(device)
    print(model)

    params_ex_emb = [x for x in list(model.parameters())
                     if x.requires_grad and x.size(0) != trainset.num_vocabs]
    params_emb = list(model.embedding.parameters())

    for p in params_ex_emb:
        if p.dim() > 1:
            INIT.xavier_uniform_(p)

    optimizer = optim.Adagrad([
        {'params': params_ex_emb, 'lr': args.lr, 'weight_decay': args.weight_decay},
        {'params': params_emb, 'lr': 0.1 * args.lr}])

    dur = []
    for epoch in range(args.epochs):
        t_epoch = time.time()
        model.train()
        for step, batch in enumerate(train_loader):
            g = batch.graph
            n = g.number_of_nodes()
            h = th.zeros((n, args.h_size)).to(device)
            c = th.zeros((n, args.h_size)).to(device)
            if step >= 3:
                t0 = time.time()  # tik

            logits = model(batch, h, c)
            logp = F.log_softmax(logits, 1)
            loss = F.nll_loss(logp, batch.label, reduction='sum')
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step >= 3:
                dur.append(time.time() - t0)  # tok

            if step > 0 and step % args.log_every == 0:
                pred = th.argmax(logits, 1)
                acc = th.sum(th.eq(batch.label, pred))
                root_ids = [i for i in range(batch.graph.number_of_nodes())
                            if batch.graph.out_degree(i) == 0]
                root_acc = np.sum(batch.label.cpu().data.numpy()[root_ids] ==
                                  pred.cpu().data.numpy()[root_ids])
                print("Epoch {:05d} | Step {:05d} | Loss {:.4f} | Acc {:.4f} | "
                      "Root Acc {:.4f} | Time(s) {:.4f}".format(
                          epoch, step, loss.item(),
                          1.0 * acc.item() / len(batch.label),
                          1.0 * root_acc / len(root_ids), np.mean(dur)))
        print('Epoch {:05d} training time {:.4f}s'.format(epoch, time.time() - t_epoch))

        # eval on dev set
        accs = []
        root_accs = []
        model.eval()
        for step, batch in enumerate(dev_loader):
            g = batch.graph
            n = g.number_of_nodes()
            with th.no_grad():
                h = th.zeros((n, args.h_size)).to(device)
                c = th.zeros((n, args.h_size)).to(device)
                logits = model(batch, h, c)

            pred = th.argmax(logits, 1)
            acc = th.sum(th.eq(batch.label, pred)).item()
            accs.append([acc, len(batch.label)])
            root_ids = [i for i in range(batch.graph.number_of_nodes())
                        if batch.graph.out_degree(i) == 0]
            root_acc = np.sum(batch.label.cpu().data.numpy()[root_ids] ==
                              pred.cpu().data.numpy()[root_ids])
            root_accs.append([root_acc, len(root_ids)])

        dev_acc = 1.0 * np.sum([x[0] for x in accs]) / np.sum([x[1] for x in accs])
        dev_root_acc = 1.0 * np.sum([x[0] for x in root_accs]) / np.sum([x[1] for x in root_accs])
        print("Epoch {:05d} | Dev Acc {:.4f} | Root Acc {:.4f}".format(
            epoch, dev_acc, dev_root_acc))

        if dev_root_acc > best_dev_acc:
            best_dev_acc = dev_root_acc
            best_epoch = epoch
            th.save(model.state_dict(), 'best_{}.pkl'.format(args.seed))
        else:
            if best_epoch <= epoch - 10:
                break

        # lr decay
        for param_group in optimizer.param_groups:
            param_group['lr'] = max(1e-5, param_group['lr'] * 0.99)  # 10
            print(param_group['lr'])

    # test
    model.load_state_dict(th.load('best_{}.pkl'.format(args.seed)))
    accs = []
    root_accs = []
    model.eval()
    for step, batch in enumerate(test_loader):
        g = batch.graph
        n = g.number_of_nodes()
        with th.no_grad():
            h = th.zeros((n, args.h_size)).to(device)
            c = th.zeros((n, args.h_size)).to(device)
            logits = model(batch, h, c)

        pred = th.argmax(logits, 1)
        acc = th.sum(th.eq(batch.label, pred)).item()
        accs.append([acc, len(batch.label)])
        root_ids = [i for i in range(batch.graph.number_of_nodes())
                    if batch.graph.out_degree(i) == 0]
        root_acc = np.sum(batch.label.cpu().data.numpy()[root_ids] ==
                          pred.cpu().data.numpy()[root_ids])
        root_accs.append([root_acc, len(root_ids)])

    test_acc = 1.0 * np.sum([x[0] for x in accs]) / np.sum([x[1] for x in accs])
    test_root_acc = 1.0 * np.sum([x[0] for x in root_accs]) / np.sum([x[1] for x in root_accs])
    print('------------------------------------------------------------------------------------')
    print("Epoch {:05d} | Test Acc {:.4f} | Root Acc {:.4f}".format(
        best_epoch, test_acc, test_root_acc))
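# The loop above checkpoints only model.state_dict(). Since Adagrad keeps per-parameter
# accumulators, resuming training exactly would also require the optimizer state; a small
# illustrative sketch using the th alias from the snippet above (the file name is arbitrary):
def save_full_checkpoint_sketch(model, optimizer, epoch, path):
    # save weights together with Adagrad's accumulator state so training can resume exactly
    th.save({'model': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'epoch': epoch}, path)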
class FirstNet(nn.Module):
    def __init__(self, size):
        super(FirstNet, self).__init__()
        self.size = size
        self.fc0 = nn.Linear(size, 20)
        self.fc1 = nn.Linear(20, 3)

    def forward(self, x):
        x = x.view(-1, self.size)
        x = F.relu(self.fc0(x))
        x = self.fc1(x)
        # x = F.relu(self.fc2(x))
        return F.log_softmax(x, dim=1)


model = FirstNet(X.shape[1])
print(model)
optimizer = optim.Adagrad(model.parameters(), lr=0.3)


def train(epoch, model):
    model.train()
    t_loss = 0
    correct = 0
    for batch_idx, (data, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        labels = labels.long()
        loss = F.nll_loss(output, labels)
        loss.backward()
        optimizer.step()
        pred = output.data.max(1, keepdim=True)[1]  # get the index of the max log-probability
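# train() above breaks off right after computing pred. A hypothetical, standalone sketch of
# how such a tally usually continues (train_loader, optimizer, and F are the names from the
# snippet above; this is not the author's original continuation):
def train_sketch(epoch, model):
    model.train()
    t_loss = 0.0
    correct = 0
    for data, labels in train_loader:
        optimizer.zero_grad()
        output = model(data)
        labels = labels.long()
        loss = F.nll_loss(output, labels)
        loss.backward()
        optimizer.step()
        t_loss += loss.item()
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(labels.view_as(pred)).sum().item()
    print('epoch {}: avg loss {:.4f}, accuracy {:.4f}'.format(
        epoch, t_loss / len(train_loader), correct / len(train_loader.dataset)))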
epoch1 = 6
# optimizer = optim.Adagrad(optim_params, lr=0.001, weight_decay=0.005)
optimizer = optim.Adam(optim_params, weight_decay=0.005)
if start_epoch < epoch1:
    for epoch in range(start_epoch, epoch1):
        train(epoch, net, optimizer)
        test(epoch, net)
    start_epoch = epoch1

fc_params = list(map(id, net.fc2.parameters()))
base_params = list(filter(lambda p: id(p) not in fc_params, net.parameters()))
for param in base_params:
    param.requires_grad = True
optimizer = optim.Adagrad(base_params, lr=0.001, weight_decay=0.005)

from zeroshot.awa2_test import zsl_test
import copy

for epoch in range(start_epoch, 100):
    train(epoch, net, optimizer)
    test(epoch, net)
    if epoch > 6:
        net1 = copy.deepcopy(net)
        zsl_test(epoch, net1, optimizer)
        del net1
        # net2 = copy.deepcopy(net)
        # gzsl_test(epoch, net2, optimizer)
        # del net2
log.close()
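# The commented-out lines in the snippets above sketch a common fine-tuning pattern: a small
# learning rate for the pretrained backbone and a larger one for the freshly initialized head.
# A minimal illustrative version with Adagrad parameter groups, taking net.fc2 as the head as
# assumed above (the specific learning rates are only examples):
head_params = list(net.fc2.parameters())
head_ids = set(map(id, head_params))
backbone_params = [p for p in net.parameters() if id(p) not in head_ids]
optimizer = optim.Adagrad([
    {'params': backbone_params, 'lr': 0.0005},  # small lr for pretrained layers
    {'params': head_params, 'lr': 0.005},       # larger lr for the new classifier head
], weight_decay=0.005)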