Example #1
    def prepare(self, char2id):
        send_output("\n>> Started preparing {} dataset".format(self.name), 1)
        self.train_input, self.train_target = self.__prepare_data(
            self.train_data, char2id)
        self.val_input, self.val_target = self.__prepare_data(
            self.val_data, char2id)
        self.test_input, self.test_target = self.__prepare_data(
            self.test_data, char2id)
        send_output("<< Finished preparing {} dataset".format(self.name), 1)
        # Free the raw text once the tensorized inputs and targets exist
        del self.train_data, self.val_data, self.test_data
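Usage sketch (hedged): prepare is called once per dataset after the shared
char vocabulary exists; the names below come from Examples #4 and #8.

for dataset in datasets:
    dataset.prepare(char2id)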
Example #2
    def extract_chars(self):
        send_output(
            ">>> Started extracting chars from {} dataset".format(self.name),
            1)
        # Collect every distinct character of every word (token[0]) in train_data
        ret = {
            c
            for sample in self.train_data for token in sample for c in token[0]
        }
        send_output(
            "<<< Finished extracting chars from {} dataset".format(self.name),
            1)
        return ret
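A minimal standalone sketch of the comprehension above, with hypothetical
training data shaped the way this code consumes it (each sample is a list of
(word, tag) pairs, so token[0] is the word):

train_data = [[("casa", "NOUN"), ("azul", "ADJ")]]
chars = {c for sample in train_data for token in sample for c in token[0]}
assert chars == {"c", "a", "s", "z", "u", "l"}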
Example #3
def accuracy(device, model, datasets):
    name2dataset = {d.name: d for d in datasets}

    for d in datasets:
        d.class_correct = 0
        d.class_total = 0

    model.eval()
    for itr in get_batches(datasets, "test"):
        # Getting vars
        inputs, targets, dataset_name = itr

        # Setting the input and the target (sending to GPU if needed)
        inputs = [[word.to(device) for word in sample] for sample in inputs]

        targets = torch.nn.utils.rnn.pad_sequence(targets,
                                                  batch_first=True).to(device)

        # Feeding the model
        output = model(inputs)
        # convert output probabilities to predicted class
        _, pred = torch.max(output[dataset_name], 2)

        # Reshaping the prediction vector into a single row
        pred = pred.view(1, -1)

        # calculate test accuracy for each object class
        for ii in range(output["length"]):
            if ii >= len(targets[0]):
                break
            # Skip padding/BOS/EOS positions (tag ids 0 and 1; see extract_tag_dict)
            if targets.data[0][ii].item() <= 1:
                continue

            label, predicted = targets.data[0][ii], pred.data[0][ii]
            name2dataset[
                dataset_name].class_correct += 1 if label == predicted else 0
            name2dataset[dataset_name].class_total += 1

    soma_correct = np.sum([d.class_correct for d in datasets])
    soma_total = np.sum([d.class_total for d in datasets])
    accuracy_ = 100. * soma_correct / soma_total
    out_str = '\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
        accuracy_, soma_correct, soma_total)

    send_output(out_str, 0)

    for d in datasets:
        accuracy_d = 100. * d.class_correct / d.class_total
        out_str = '\nTest Accuracy (on {} Dataset): {:.2f}% ({}/{})'.format(
            d.name, accuracy_d, d.class_correct, d.class_total)
        send_output(out_str, 0)
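Usage sketch (hedged): evaluation would normally run after restoring the best
checkpoint saved by train() in Example #7; all names are assumed in scope.

model.load_state_dict(torch.load(STATE_DICT_PATH))
accuracy(device, model, datasets)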
Example #4
def build_char_dict(datasets):
    send_output("\n>> Building char dict...", 1)

    extracted_chars = set()
    for dataset in datasets:
        extracted_chars = extracted_chars.union(dataset.extract_chars())
    chars = [' ', 'UNK'] + list(sorted(extracted_chars))

    # Building the vocabulary lookup tables
    char2id = {char: index for index, char in enumerate(chars)}
    id2char = list(chars)

    send_output("<< Finished building dicts!", 1)
    return char2id, id2char
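A standalone sketch of the resulting tables (the extracted chars here are
hypothetical); ids 0 and 1 are reserved for the space and 'UNK' entries:

chars = [' ', 'UNK', 'a', 'b']
char2id = {char: index for index, char in enumerate(chars)}
id2char = list(chars)
assert char2id == {' ': 0, 'UNK': 1, 'a': 2, 'b': 3}
assert all(id2char[char2id[c]] == c for c in chars)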
Example #5
    def extract_tag_dict(self):
        send_output(">>> Started building tag dict for dataset", 1)
        extracted_tags = {
            token[1]
            for sample in self.train_data for token in sample
        }
        tags = list(sorted(extracted_tags))

        self.tag2id = {"BOS": 0, "EOS": 1}
        for tag in tags:
            if tag not in self.tag2id:
                self.tag2id[tag] = len(self.tag2id)

        # Building the reverse lookup (id -> tag)
        self.id2tag = list(self.tag2id)

        send_output("<<< Finished building tag dict for dataset", 1)
Example #6
    def __init__(self,
                 path_to_files,
                 dataset_name,
                 use_delimiters=True,
                 use_train=True,
                 use_val=True):
        self.name = dataset_name

        send_output("\n>> Initializing {} dataset".format(self.name), 1)
        # Loading each dataset split (train, val, test)
        send_output(">>> Started loading dataset", 1)
        self.train_data = self.__load_data(path_to_files[0])
        self.val_data = self.__load_data(path_to_files[1])
        self.test_data = self.__load_data(path_to_files[2])

        send_output("<<< Finished loading dataset", 1)

        # Parsing
        send_output(">>> Started parsing data from dataset", 1)
        self.train_data, self.word_train_size = self.__parse_data(
            self.train_data, use_delimiters)
        self.val_data, self.word_val_size = self.__parse_data(
            self.val_data, use_delimiters)
        self.test_data, self.word_test_size = self.__parse_data(
            self.test_data, use_delimiters)

        send_output("<<< Finished parsing data from dataset", 1)

        # Setting bool flags
        self.use_train = use_train
        self.use_val = use_val

        # Setup tag dicts
        self.extract_tag_dict()

        # Train, val and test data size
        self.sent_train_size = len(self.train_data[0])
        self.sent_val_size = len(self.val_data[0])
        self.sent_test_size = len(self.test_data[0])

        # Setting training and val loss
        self.train_loss = 0.0
        self.val_loss = 0.0

        # Setting test counters
        self.class_correct = [0 for _ in range(len(self.tag2id))]
        self.class_total = [0 for _ in range(len(self.tag2id))]

        send_output("<< Finished initializing {} dataset".format(self.name), 1)
Example #7
def train(device, model, datasets, min_val_loss=np.inf):

    # optimizer and loss function
    optimizer = torch.optim.Adadelta(model.parameters())
    criterion = torch.nn.CrossEntropyLoss()

    name2dataset = {d.name: d for d in datasets}

    for epoch in range(EPOCHS):
        inicio = time.time()

        for d in datasets:
            d.train_loss = d.val_loss = 0.0

        model.train()
        for itr in get_batches(datasets, "train", BATCH_SIZE, "visconde"):
            # Getting vars
            inputs, targets, dataset_name = itr

            # Setting the input and the target (sending to GPU if needed)
            inputs = [[word.to(device) for word in sample]
                      for sample in inputs]
            targets = torch.nn.utils.rnn.pad_sequence(
                targets, batch_first=True).to(device)

            # Feeding the model
            output = model(inputs)

            # Resetting the gradients
            optimizer.zero_grad()

            # Calculating the loss and the gradients
            loss = criterion(
                output[dataset_name].view(BATCH_SIZE * output["length"], -1),
                targets.view(BATCH_SIZE * output["length"]))
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           GRADIENT_CLIPPING)

            # Adjusting the weights
            optimizer.step()

            # Updating the train loss
            name2dataset[dataset_name].train_loss += loss.item() * BATCH_SIZE

        model.eval()
        for itr in get_batches(datasets, "val"):
            # Getting vars
            inputs, targets, dataset_name = itr

            # Setting the input and the target (sending to GPU if needed)
            inputs = [[word.to(device) for word in sample]
                      for sample in inputs]
            targets = torch.nn.utils.rnn.pad_sequence(
                targets, batch_first=True).to(device)

            # Feeding the model
            output = model(inputs)

            # Calculating the loss and the gradients
            loss = criterion(output[dataset_name].view(output["length"], -1),
                             targets.view(output["length"]))

            # Updating the validation loss accumulator
            name2dataset[dataset_name].val_loss += loss.item()

        # Normalizing the losses
        for i in range(len(datasets)):
            if datasets[i].use_train:
                datasets[i].train_loss /= datasets[i].sent_train_size
            if datasets[i].use_val:
                datasets[i].val_loss /= datasets[i].sent_val_size

        # Verbose
        out_str = "\n======================================================================================="
        current_lr = optimizer.param_groups[0]['lr']
        total_train_loss = sum([d.train_loss for d in datasets if d.use_train])
        total_val_loss = sum([d.val_loss for d in datasets if d.use_val])
        duration = time.time() - inicio
        out_str += (
            "Epoch: {} \t Learning Rate: {:.3f}\tTotal Training Loss: {:.6f} \tTotal Validation Loss: {:.6f} \t Duration: {:.3f}\n"
            .format(epoch, current_lr, total_train_loss, total_val_loss,
                    duration))

        for d in datasets:
            if d.use_train and d.use_val:
                out_str += (
                    '>> Dataset {}:\tTraining Loss: {:.6f}\tValidation Loss:{:.6f}\n'
                    .format(d.name, d.train_loss, d.val_loss))
            elif d.use_train and not d.use_val:
                out_str += ('>> Dataset {}:\tTraining Loss: {:.6f}\n'.format(
                    d.name, d.train_loss))
            elif not d.use_train and d.use_val:
                out_str += ('>> Dataset {}:\tValidation Loss: {:.6f}\n'.format(
                    d.name, d.val_loss))

        out_str += (
            "----------------------------------------------------------------------------------------\n"
        )

        # Saving the best model
        out_str += ('Comparing loss on {} dataset(s)\n'.format(
            [d.name for d in datasets if d.use_val]))

        if total_val_loss <= min_val_loss:
            torch.save(model.state_dict(), STATE_DICT_PATH)
            out_str += (
                'Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...\n'
                .format(min_val_loss, total_val_loss))
            min_val_loss = total_val_loss
        out_str += (
            "=======================================================================================\n"
        )

        send_output(out_str, 0)

    return model, min_val_loss
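Usage sketch (hedged): the returned min_val_loss lets a later call keep
comparing against the best checkpoint from an earlier run; pos_model, device
and datasets are defined in Example #8.

pos_model, best = train(device, pos_model, datasets)
pos_model, best = train(device, pos_model, datasets, min_val_loss=best)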
Example #8

# dataset building
datasets = load_datasets()

# builds char-id table
char2id, id2char = build_char_dict(datasets)

# converts the text of each dataset into char ids
for dataset in datasets:
    dataset.prepare(char2id)

# prints each dataset's details
for dataset in datasets:
    send_output(str(dataset), 1)
'''
#########################################################################################
#########                                                                    ############
#########                     DEFINING MODELS AND TRAINING                   ############
#########                                                                    ############
#########################################################################################
'''

# building model
pos_model = POSTagger(
    CharBILSTM(CHAR_EMBEDDING_DIM, WORD_EMBEDDING_DIM, char2id),
    WordBILSTM(WORD_EMBEDDING_DIM), WordBILSTM(WORD_EMBEDDING_DIM),
    BILSTM_SIZE, datasets)
pos_model.to(device)
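
A plausible continuation of this script (not part of the original snippet),
chaining Examples #7 and #3: train, restore the best checkpoint, evaluate.

pos_model, min_val_loss = train(device, pos_model, datasets)
pos_model.load_state_dict(torch.load(STATE_DICT_PATH))
accuracy(device, pos_model, datasets)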