def prepare(self, char2id):
    send_output("\n>> Started preparing {} dataset".format(self.name), 1)
    self.train_input, self.train_target = self.__prepare_data(
        self.train_data, char2id)
    self.val_input, self.val_target = self.__prepare_data(
        self.val_data, char2id)
    self.test_input, self.test_target = self.__prepare_data(
        self.test_data, char2id)
    send_output("<< Finished preparing {} dataset".format(self.name), 1)

    # The raw token lists are no longer needed once the prepared
    # inputs/targets have been built
    del self.train_data, self.val_data, self.test_data
def extract_chars(self):
    send_output(
        ">>> Started extracting chars from {} dataset".format(self.name), 1)

    # Collect every distinct character appearing in the training tokens
    ret = {
        c
        for sample in self.train_data for token in sample for c in token[0]
    }
    send_output(
        "<<< Finished extracting chars from {} dataset".format(self.name), 1)
    return ret
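
# Example on a hypothetical mini-dataset: if self.train_data were
#   [[("casa", "N"), ("azul", "ADJ")]]
# then extract_chars() would return {'c', 'a', 's', 'z', 'u', 'l'}.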
def accuracy(device, model, datasets):
    name2dataset = {d.name: d for d in datasets}
    for d in datasets:
        d.class_correct = 0
        d.class_total = 0

    model.eval()
    for itr in get_batches(datasets, "test"):
        # Getting vars
        inputs, targets, dataset_name = itr

        # Setting the input and the target (sending to GPU if needed)
        inputs = [[word.to(device) for word in sample] for sample in inputs]
        targets = torch.nn.utils.rnn.pad_sequence(targets,
                                                  batch_first=True).to(device)

        # Feeding the model
        output = model(inputs)

        # Convert output scores to predicted classes
        _, pred = torch.max(output[dataset_name], 2)

        # Flattening the prediction vector
        pred = pred.view(1, -1)

        # Calculate test accuracy for each token
        for ii in range(output["length"]):
            if ii >= len(targets[0]):
                break
            # Skip ids 0 and 1 (padding and BOS/EOS delimiters)
            if targets.data[0][ii].item() <= 1:
                continue
            label, predicted = targets.data[0][ii], pred.data[0][ii]
            name2dataset[
                dataset_name].class_correct += 1 if label == predicted else 0
            name2dataset[dataset_name].class_total += 1

    soma_correct = np.sum([d.class_correct for d in datasets])
    soma_total = np.sum([d.class_total for d in datasets])
    accuracy_ = 100. * soma_correct / soma_total
    out_str = '\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
        accuracy_, soma_correct, soma_total)
    send_output(out_str, 0)

    for d in datasets:
        accuracy_d = 100. * d.class_correct / d.class_total
        out_str = '\nTest Accuracy (on {} Dataset): {:.2f}% ({}/{})'.format(
            d.name, accuracy_d, d.class_correct, d.class_total)
        send_output(out_str, 0)
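
# A minimal shape sketch for the argmax step above (assumption: each dataset
# head produces scores of shape (batch, seq_len, n_tags)):
#   scores = torch.randn(1, 3, 4)      # batch=1, seq_len=3, n_tags=4
#   _, pred = torch.max(scores, 2)     # pred.shape == (1, 3)
# Each entry of `pred` is the id of the highest-scoring tag for that token.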
def build_char_dict(datasets):
    send_output("\n>> Building char dict...", 1)
    extracted_chars = set()
    for dataset in datasets:
        extracted_chars = extracted_chars.union(dataset.extract_chars())
    chars = [' ', 'UNK'] + list(sorted(extracted_chars))

    # Building the vocabulary structures
    char2id = {char: index for index, char in enumerate(chars)}
    id2char = [char for char, _ in char2id.items()]
    send_output("<< Finished building dicts!", 1)
    return char2id, id2char
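
# Example: if the datasets only contained the characters {'a', 'b'}, then
#   chars   == [' ', 'UNK', 'a', 'b']
#   char2id == {' ': 0, 'UNK': 1, 'a': 2, 'b': 3}
#   id2char == [' ', 'UNK', 'a', 'b']
# so id 0 always maps to the space character and id 1 to the 'UNK' placeholder.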
def extract_tag_dict(self):
    send_output(">>> Started building tag dict for dataset", 1)
    extracted_tags = {
        token[1]
        for sample in self.train_data for token in sample
    }
    tags = list(sorted(extracted_tags))

    # Building the tag dictionaries; ids 0 and 1 are reserved for the
    # BOS/EOS sentence delimiters
    self.tag2id = {"BOS": 0, "EOS": 1}
    for tag in tags:
        if tag not in self.tag2id:
            self.tag2id[tag] = len(self.tag2id)
    self.id2tag = [tag for tag, _ in self.tag2id.items()]
    send_output("<<< Finished building tag dict for dataset", 1)
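
# Example: for a training set whose tags are {"N", "V"}, this produces
#   tag2id == {"BOS": 0, "EOS": 1, "N": 2, "V": 3}
#   id2tag == ["BOS", "EOS", "N", "V"]
# which is why accuracy() skips target ids <= 1: they are the BOS/EOS
# delimiters, not real tags.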
def __init__(self, path_to_files, dataset_name, use_delimiters=True,
             use_train=True, use_val=True):
    self.name = dataset_name
    send_output("\n>> Initializing {} dataset".format(self.name), 1)

    # Loading each dataset split
    send_output(">>> Started loading dataset", 1)
    self.train_data = self.__load_data(path_to_files[0])
    self.val_data = self.__load_data(path_to_files[1])
    self.test_data = self.__load_data(path_to_files[2])
    send_output("<<< Finished loading dataset", 1)

    # Parsing
    send_output(">>> Started parsing data from dataset", 1)
    self.train_data, self.word_train_size = self.__parse_data(
        self.train_data, use_delimiters)
    self.val_data, self.word_val_size = self.__parse_data(
        self.val_data, use_delimiters)
    self.test_data, self.word_test_size = self.__parse_data(
        self.test_data, use_delimiters)
    send_output("<<< Finished parsing data from dataset", 1)

    # Setting bool flags
    self.use_train = use_train
    self.use_val = use_val

    # Setting up the tag dicts
    self.extract_tag_dict()

    # Train, val and test sizes (in sentences)
    self.sent_train_size = len(self.train_data[0])
    self.sent_val_size = len(self.val_data[0])
    self.sent_test_size = len(self.test_data[0])

    # Training and validation loss accumulators
    self.train_loss = 0.0
    self.val_loss = 0.0

    # Test counters
    self.class_correct = [0 for _ in range(len(self.tag2id))]
    self.class_total = [0 for _ in range(len(self.tag2id))]
    send_output("<< Finished initializing {} dataset".format(self.name), 1)
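
# Example construction (hypothetical file paths and class name, since the
# enclosing class is not shown in this excerpt):
#   macmorpho = Dataset(
#       ["data/train.txt", "data/val.txt", "data/test.txt"], "macmorpho",
#       use_delimiters=True)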
def train(device, model, datasets, min_val_loss=np.inf):
    # Optimizer and loss function
    optimizer = torch.optim.Adadelta(model.parameters())
    criterion = torch.nn.CrossEntropyLoss()
    name2dataset = {d.name: d for d in datasets}

    for epoch in range(EPOCHS):
        inicio = time.time()
        for d in datasets:
            d.train_loss = d.val_loss = 0.0

        model.train()
        for itr in get_batches(datasets, "train", BATCH_SIZE, "visconde"):
            # Getting vars
            inputs, targets, dataset_name = itr

            # Setting the input and the target (sending to GPU if needed)
            inputs = [[word.to(device) for word in sample]
                      for sample in inputs]
            targets = torch.nn.utils.rnn.pad_sequence(
                targets, batch_first=True).to(device)

            # Feeding the model
            output = model(inputs)

            # Resetting the gradients
            optimizer.zero_grad()

            # Calculating the loss and the gradients
            loss = criterion(
                output[dataset_name].view(BATCH_SIZE * output["length"], -1),
                targets.view(BATCH_SIZE * output["length"]))
            loss.backward()

            # `clip_grad_norm_` helps prevent the exploding gradient problem
            # in RNNs / LSTMs
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           GRADIENT_CLIPPING)

            # Adjusting the weights
            optimizer.step()

            # Updating the train loss
            name2dataset[dataset_name].train_loss += loss.item() * BATCH_SIZE

        model.eval()
        for itr in get_batches(datasets, "val"):
            # Getting vars
            inputs, targets, dataset_name = itr

            # Setting the input and the target (sending to GPU if needed)
            inputs = [[word.to(device) for word in sample]
                      for sample in inputs]
            targets = torch.nn.utils.rnn.pad_sequence(
                targets, batch_first=True).to(device)

            # Feeding the model
            output = model(inputs)

            # Calculating the validation loss
            loss = criterion(output[dataset_name].view(output["length"], -1),
                             targets.view(output["length"]))

            # Updating the loss accumulator
            name2dataset[dataset_name].val_loss += loss.item()

        # Normalizing the losses
        for i in range(len(datasets)):
            if datasets[i].use_train:
                datasets[i].train_loss /= datasets[i].sent_train_size
            if datasets[i].use_val:
                datasets[i].val_loss /= datasets[i].sent_val_size

        # Verbose
        out_str = "\n=======================================================================================\n"
        current_lr = optimizer.param_groups[0]['lr']
        total_train_loss = sum(
            [d.train_loss for d in datasets if d.use_train])
        total_val_loss = sum([d.val_loss for d in datasets if d.use_val])
        duration = time.time() - inicio
        out_str += (
            "Epoch: {}\tLearning Rate: {:.3f}\tTotal Training Loss: {:.6f}\tTotal Validation Loss: {:.6f}\tDuration: {:.3f}\n"
            .format(epoch, current_lr, total_train_loss, total_val_loss,
                    duration))
        for d in datasets:
            if d.use_train and d.use_val:
                out_str += (
                    '>> Dataset {}:\tTraining Loss: {:.6f}\tValidation Loss: {:.6f}\n'
                    .format(d.name, d.train_loss, d.val_loss))
            elif d.use_train and not d.use_val:
                out_str += ('>> Dataset {}:\tTraining Loss: {:.6f}\n'.format(
                    d.name, d.train_loss))
            elif not d.use_train and d.use_val:
                out_str += (
                    '>> Dataset {}:\tValidation Loss: {:.6f}\n'.format(
                        d.name, d.val_loss))
        out_str += (
            "----------------------------------------------------------------------------------------\n"
        )

        # Saving the best model
        out_str += ('Comparing loss on {} dataset(s)\n'.format(
            [d.name for d in datasets if d.use_val]))
        if total_val_loss <= min_val_loss:
            torch.save(model.state_dict(), STATE_DICT_PATH)
            out_str += (
                'Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...\n'
                .format(min_val_loss, total_val_loss))
            min_val_loss = total_val_loss
        out_str += (
            "=======================================================================================\n"
        )
        send_output(out_str, 0)

    return model, min_val_loss
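
# A typical call pattern for train() (a sketch, kept as comments; it assumes
# `device`, `model`, and `datasets` are already built, e.g. as in the disabled
# blocks below, and that STATE_DICT_PATH is a writable checkpoint path):
#   model, min_val_loss = train(device, model, datasets)
#   model.load_state_dict(torch.load(STATE_DICT_PATH))  # restore best weights
#   accuracy(device, model, datasets)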
#########################################################################################
'''
# dataset building
datasets = load_datasets()

# builds char-id table
char2id, id2char = build_char_dict(datasets)

# converts text to id from chars
for dataset in datasets:
    dataset.prepare(char2id)

# prints the datasets details
for dataset in datasets:
    send_output(str(dataset), 1)
'''
#########################################################################################
#########                                                                    ############
#########                  DEFINING MODELS AND TRAINING                      ############
#########                                                                    ############
#########################################################################################
'''
# building model
pos_model = POSTagger(
    CharBILSTM(CHAR_EMBEDDING_DIM, WORD_EMBEDDING_DIM, char2id),
    WordBILSTM(WORD_EMBEDDING_DIM), WordBILSTM(WORD_EMBEDDING_DIM),
    BILSTM_SIZE, datasets)
pos_model.to(device)