Example #1
class ACLIMDB(BaseData):
    # root = 'data/aclImdb/'
    config = ConfigRNN.instance()
    root = os.path.join('data', 'aclImdb')
    data = None

    def __init__(self, batch_size, embed_method, is_eval, debug):
        BaseData.__init__(self)
        self.batch_size = batch_size
        self.embed_method = embed_method
        self.data = Imdb(root=self.root,
                         embed_method=self.embed_method,
                         train=not is_eval,
                         debug=debug)

    def load(self):
        additional_options = {
            'num_workers': 0,
            'pin_memory': True
        } if self.cuda else {}
        # TODO(hyungsun): make this class adapt word embedding dynamically.
        return torch.utils.data.DataLoader(self.data,
                                           batch_size=self.batch_size,
                                           shuffle=self.config.SHUFFLE,
                                           drop_last=True,
                                           **additional_options)
Example #2
class RNN(nn.Module):
    """TODO(hyungsun): Let model classes have optimizer and loss function.
    """
    config = ConfigRNN.instance()

    def __init__(self, pretrained):
        super(RNN, self).__init__()
        # Use a separate flag so nn.Module.cuda() is not shadowed.
        self.cuda_available = torch.cuda.is_available()
        self.device = torch.device('cuda' if self.cuda_available else 'cpu')
        self.embed = nn.Embedding.from_pretrained(pretrained)
        self.lstm = nn.LSTM(self.config.EMBED_SIZE, self.config.HIDDEN_SIZE)
        self.linear = nn.Linear(self.config.HIDDEN_SIZE,
                                self.config.OUTPUT_SIZE)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        hidden, cell = self.init_hidden()
        embed = self.embed(inputs).squeeze(2)
        out, (hidden, cell) = self.lstm(embed, (hidden, cell))
        linear = self.linear(out)
        output = self.softmax(linear[-1])
        return output, hidden, cell

    def init_hidden(self):
        hidden = Variable(
            torch.zeros(1, self.config.BATCH_SIZE, self.config.HIDDEN_SIZE))
        cell = Variable(
            torch.zeros(1, self.config.BATCH_SIZE, self.config.HIDDEN_SIZE))
        if self.cuda_available:
            hidden = hidden.cuda()
            cell = cell.cuda()

        return hidden, cell
Example #3
def main():
    config = ConfigRNN.instance()
    loader = ACLIMDB(batch_size=config.BATCH_SIZE,
                     embed_method=config.EMBED_METHOD,
                     is_eval=config.EVAL_MODE,
                     debug=config.CONSOLE_LOGGING)
    embedding_model = loader.data.embedding_model

    # TODO(hyungsun): This code is temporary. Erase it later.
    if config.SAVE_EMBED_MODEL:
        embedding_model.save("embed_model.wv")
        return
    if embedding_model == "DEFAULT":
        model = RNN()
    else:
        vectors = loader.data.embedding_model.wv.vectors

        # Add padding for masking.
        vectors = np.append(np.array([100 * [0]]), vectors, axis=0)
        model = RNN(torch.from_numpy(vectors).float())

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY)
    trainer = RNNTrainer(model, loader, optimizer)
    if config.EVAL_MODE:
        trainer.evaluate()
    else:
        trainer.train(config.MAX_EPOCH)
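
The np.append(np.array([100 * [0]]), vectors, axis=0) line above prepends an all-zero row so that index 0 acts as the padding vector once the matrix reaches nn.Embedding.from_pretrained. A minimal sketch of the same idea, using a made-up 4-word vocabulary and 100-dimensional vectors (the sizes here are assumptions, not values from ConfigRNN):

import numpy as np
import torch
import torch.nn as nn

# Pretend these rows came from a trained Word2Vec model (4 words x 100 dims).
vectors = np.random.rand(4, 100).astype(np.float32)

# Prepend a zero row: index 0 now maps to the all-zero "padding" vector.
vectors = np.append(np.array([100 * [0]], dtype=np.float32), vectors, axis=0)

embed = nn.Embedding.from_pretrained(torch.from_numpy(vectors).float())
print(embed(torch.tensor([0])))  # the padding row: all zeros
print(embed(torch.tensor([1])))  # the first real word vector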
Example #4
class Embed:
    embed_model_path = "embedmodel"
    embed_model_name = "model.wv"
    config = ConfigRNN.instance()
    embedding_model = None

    def __init__(self):
        # Load embedding model.
        try:
            saved_file = os.path.join(self.embed_model_path, self.embed_model_name)
            self.embedding_model = KeyedVectors.load(saved_file, mmap='r')
        except FileNotFoundError:
            # No saved embedding model was found; embedding_model stays None.
            pass
        words = self.embedding_model.wv.index2entity
        self.word_to_idx = {words[i]: i for i in range(0, len(words))}

    def get_embedding_model(self):
        return self.embedding_model

    def review2vec(self, review):
        word_vectors = []
        for word in review.split():
            print(word)
            alphabetic_word = to_alphabetic(word)
            if len(alphabetic_word) == 0:
                continue
            try:
                word_vectors.append([self.word_to_idx[alphabetic_word]])
            except KeyError:
                # print('An excluded word:', alphabetic_word)
                pass
        return torch.from_numpy(np.array([word_vectors])).long()
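
A rough usage sketch for Embed.review2vec, assuming a KeyedVectors file has already been saved to embedmodel/model.wv (the path from the class attributes above) and that to_alphabetic is importable from the same module; the review text is made up:

embed = Embed()
review = "this movie was surprisingly good"
vector = embed.review2vec(review)

# One index per vocabulary word that was kept, wrapped in a batch of size 1,
# i.e. shape (1, number_of_kept_words, 1).
print(vector.shape)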
Example #5
class RNNEvaluator(Evaluator):
    config = ConfigRNN.instance()

    def __init__(self, model, optimizer):
        super().__init__(model, optimizer)
        if self.config.LOGGING_ENABLE:
            from tensor_board_logger import TensorBoardLogger
            self.logger = TensorBoardLogger(
                os.path.join("logs", model.__class__.__name__))

        self.current_epoch = 0
        self.model.eval()

        # Load model & optimizer.
        checkpoint = self.load_checkpoint()
        try:
            self.optimizer.load_state_dict(checkpoint["optimizer"])
            # 89527, 100 -> 271, 100
            self.model.load_state_dict(checkpoint["model"])
        except KeyError:
            # There is no checkpoint
            pass

    def evaluate(self, review_vector):
        with torch.no_grad():
            review_vector = review_vector.to(torch.device(self.device_name))
            input_data = review_vector.view(
                -1, 1,
                1)  # (num of words / batch size) * batch size * index size(1)
            output, _, _ = self.model(input_data)
            prediction = output.max(
                1, keepdim=True)[1]  # get the index of the max log-probability
        if prediction.eq(torch.tensor([1, 0])):
            return POSITIVE
        return NEGATIVE
Example #6
def prepare():
    config = ConfigRNN.instance()
    embed = Embed()
    embedding_model = embed.get_embedding_model()
    if config.EMBED_METHOD == "DEFAULT":
        model = ReviewParser()
    else:
        model = ReviewParser(
            pretrained=torch.from_numpy(embedding_model.wv.vectors).float())
    evaluator = RNNEvaluator(model)
    return evaluator, embed
Example #7
def main():
    config = ConfigRNN.instance()
    loader = ACLIMDB(batch_size=config.BATCH_SIZE,
                     word_embedding=config.WORD_EMBEDDING,
                     is_eval=False,
                     debug=config.DEBUG_MODE)
    vectors = loader.data.embedding_model.wv.vectors
    model = RNN(torch.from_numpy(vectors).float())

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY)
    trainer = RNNTrainer(model, loader, optimizer)
    trainer.train(config.MAX_EPOCH, config.BATCH_SIZE)
Example #8
class RNN(nn.Module):
    config = ConfigRNN.instance()

    def __init__(self, pretrained=None):
        super(RNN, self).__init__()
        if pretrained is None:
            self.embed = nn.Embedding(self.config.VOCAB_SIZE, self.config.EMBED_SIZE)
        else:
            self.embed = nn.Embedding.from_pretrained(pretrained)
        self.lstm = nn.LSTM(self.config.EMBED_SIZE, self.config.HIDDEN_SIZE)
        self.linear = nn.Linear(self.config.HIDDEN_SIZE, self.config.OUTPUT_SIZE)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, target):
        # Permute and squeeze as preprocessing so the sentence lengths are easy to extract.
        _inputs = inputs.permute(1, 0, 2).squeeze(2)
        input_lengths = torch.LongTensor([torch.max(_inputs[i, :].data.nonzero()) + 1 for i in range(_inputs.size(0))])
        input_lengths, sorted_idx = input_lengths.sort(0, descending=True)

        # Sort the word-index tensor so the longest sentences come first.
        input_seq2idx = _inputs[sorted_idx]

        # Convert the word-index tensor into a word-vector tensor.
        embedded = self.embed(input_seq2idx)

        # A hidden state extracted from a packed input skips all of the padding.
        output, (hidden, cell) = self.lstm(embedded, (self.init_hidden()))

        # Since hidden equals the last output, feed hidden rather than re-padding packed_output.
        linear = self.linear(output)

        # Soft max.
        output = self.softmax(linear[-1])

        return output, hidden, cell

    def init_hidden(self):
        hidden = Variable(torch.zeros(1, self.config.BATCH_SIZE, self.config.HIDDEN_SIZE))
        cell = Variable(torch.zeros(1, self.config.BATCH_SIZE, self.config.HIDDEN_SIZE))
        if torch.cuda.is_available():
            hidden = hidden.cuda()
            cell = cell.cuda()

        return hidden, cell
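
The input_lengths computation above finds, for each row of the padded index batch, the position of the last nonzero entry plus one. A tiny sketch of that trick on a fabricated padded batch (index 0 is the padding value, as in the data pipeline):

import torch

_inputs = torch.tensor([[4, 2, 7, 0, 0],
                        [3, 0, 0, 0, 0]])
input_lengths = torch.LongTensor(
    [int(_inputs[i].nonzero().max()) + 1 for i in range(_inputs.size(0))])
print(input_lengths)  # tensor([3, 1])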
Example #9
def main():
    config = ConfigRNN.instance()
    loader = ACLIMDB(batch_size=config.BATCH_SIZE,
                     embed_method=config.EMBED_METHOD,
                     is_eval=False,
                     debug=config.DEBUG_MODE)
    embedding_model = loader.data.embedding_model
    if embedding_model == "DEFAULT":
        model = RNN()
    else:
        vectors = loader.data.embedding_model.wv.vectors

        # Add padding for masking.
        vectors = np.append(np.array([100 * [0]]), vectors, axis=0)
        model = RNN(torch.from_numpy(vectors).float())

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY)
    trainer = RNNTrainer(model, loader, optimizer)
    trainer.train(config.MAX_EPOCH, config.BATCH_SIZE)
Example #10
class RNNEvaluator(Evaluator):
    config = ConfigRNN.instance()

    def __init__(self, model):
        super().__init__(model)
        self.current_epoch = 0
        # self.model.eval()

        # Load model & optimizer.
        checkpoint = self.load_checkpoint()
        try:
            self.model.load_state_dict(checkpoint["model"])
        except KeyError:
            # There is no checkpoint
            pass

    def evaluate(self, review_vectors):
        with torch.no_grad():
            for review_vector in review_vectors:
                input_data = review_vector.to(torch.device(self.device_name))
                return self.model(input_data)
Example #11
def main():
    config = ConfigRNN.instance()
    embed = Embed()
    # TODO(kyungsoo): Make this working.
    embedding_model = embed.get_embedding_model()
    if embedding_model == "DEFAULT":
        model = RNN()
    else:
        vectors = embedding_model.wv.vectors

        # Add padding for masking.
        vectors = np.append(np.array([100 * [0]]), vectors, axis=0)
        model = RNN(torch.from_numpy(vectors).float())

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY)
    trainer = RNNEvaluator(model, optimizer)

    # TODO(kyungsoo): Make this working.
    review_vector = embed.review2vec(sys.argv[1])
    print(trainer.evaluate(review_vector=review_vector))
Example #12
class RNNTrainer(Trainer):
    config = ConfigRNN.instance()

    def __init__(self, model, data_loader, optimizer):
        super().__init__(model, data_loader, optimizer)
        if self.config.LOGGING_ENABLE:
            from tensor_board_logger import TensorBoardLogger
            self.logger = TensorBoardLogger(
                os.path.join("logs", model.__class__.__name__))

        self.current_epoch = 0

    def train(self, max_epoch, batch_size):
        print("Training started")
        if torch.cuda.is_available():
            self.model = self.model.cuda()

        # Set model to train mode.
        self.model.train()
        epoch_resume = 0
        if self.config.CHECKPOINT_ENABLE:
            checkpoint = self.load_checkpoint()
            try:
                epoch_resume = checkpoint["epoch"]
                self.optimizer.load_state_dict(checkpoint["optimizer"])
                self.model.load_state_dict(checkpoint["model"])
            except KeyError:
                # There is no checkpoint
                pass
        for epoch in range(epoch_resume, max_epoch):
            accuracy_sum = 0
            loss_sum = 0
            self.current_epoch = epoch
            for batch_idx, (_data, target) in enumerate(self.data_loader):
                # Transpose vector to make it (num of words / batch size) * batch size * index size(1).
                _data = np.transpose(_data, (1, 0, 2))
                _data, target = _data.to(device=self.device), target.to(
                    device=self.device)

                # Initialize the gradient of model
                self.optimizer.zero_grad()
                output, hidden, cell = self.model(_data)
                loss = self.config.CRITERION(output, target)
                loss.backward()
                self.optimizer.step()
                if self.config.DEBUG_MODE:
                    print("Train Epoch: {}/{} [{}/{} ({:.0f}%)]".format(
                        epoch, max_epoch, batch_idx * len(_data),
                        len(self.data_loader.dataset),
                        100. * batch_idx / len(self.data_loader)))
                    print("Loss: {:.6f}".format(loss.item()))
                    print("target : ", target)
                    print("output : ", output, end="\n\n")
                accuracy = self.get_accuracy(target, output)
                accuracy_sum += accuracy
                loss_sum += loss.item()  # accumulate a float, not the graph-bearing tensor
            if self.config.LOGGING_ENABLE:
                if len(self.data_loader) == 0:
                    raise Exception("Data size is smaller than batch size.")
                loss_avg = loss_sum / len(self.data_loader)
                accuracy_avg = accuracy_sum / len(self.data_loader)
                # TODO(kyungsoo): Make Tensorboard automatically execute when train.py runs if it is possible
                self.logger.log(loss_avg, accuracy_avg,
                                self.model.named_parameters(),
                                self.current_epoch)
                self.save_checkpoint({
                    "epoch": epoch + 1,
                    "model": self.model.state_dict(),
                    "optimizer": self.optimizer.state_dict(),
                })
        print("End")

    def evaluate(self, batch_size):
        print("Evaluation started")

        # Set model to eval mode.
        self.model.eval()
        if self.config.CHECKPOINT_ENABLE:
            checkpoint = self.load_checkpoint()
            try:
                self.optimizer.load_state_dict(checkpoint["optimizer"])
                self.model.load_state_dict(checkpoint["model"])
            except KeyError:
                # There is no checkpoint
                pass

        test_loss = 0
        correct = 0
        with torch.no_grad():
            for _data, target in self.data_loader:
                _data, target = _data.to(self.device), target.to(self.device)
                input_data = _data.view(
                    -1, batch_size, 1
                )  # (num of words / batch size) * batch size * index size(1)
                output, _, _ = self.model(input_data)
                test_loss += self.config.CRITERION(
                    output, target).item()  # sum up batch loss
                prediction = output.max(1, keepdim=True)[
                    1]  # get the index of the max log-probability
                correct += prediction.eq(
                    target.view_as(prediction)).sum().item()
        test_loss /= len(self.data_loader.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.
              format(test_loss, correct, len(self.data_loader.dataset),
                     100. * correct / len(self.data_loader.dataset)))
        print("End")
Example #13
class Imdb(data.Dataset):
    """Note: This class assume that the data is in root directory already just for now.
     Args:
        root (string): Root directory of dataset where ``processed/training.pt``
            and  ``processed/test.pt`` exist.
        train (bool, optional): If True, creates dataset from ``training.pt``,
            otherwise from ``test.pt``.
    """
    processed_folder = 'processed'
    pickled_folder = 'pickled'
    pickle_file = 'sentences.pickle'
    training_file = 'training.pt'
    test_file = 'test.pt'
    bow_file = 'labeledBow.feat'
    vocab_file = 'imdb.vocab'
    config = ConfigRNN.instance()

    # Constants for word embedding.
    embedding_dimension = config.EMBED_SIZE

    # Pre-trained embedding model for nn.Embedding.
    # See https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding.from_pretrained
    embedding_model = None
    lengths = None

    def __init__(self, root, embed_method, train=True, debug=False):
        self.root = os.path.expanduser(root)
        self.train = train  # training set or test set
        self.max_num_words = 0  # To make a 2-dimensional tensor with an uneven list of vectors
        self.debug_mode = debug

        if self.debug_mode:
            self.processed_folder = 'debug_' + self.processed_folder
            self.pickled_folder = 'debug_' + self.pickled_folder

        self.pickle_path = os.path.join(self.root, self.pickled_folder)
        self.processed_path = os.path.join(self.root, self.processed_folder)

        if self.debug_mode:
            try:
                shutil.rmtree(self.pickle_path)
                shutil.rmtree(self.processed_path)
            except FileNotFoundError:
                pass

        if not self._check_exists():
            self.download()

        if embed_method == 'CBOW':
            sg = 0
        elif embed_method == 'SKIP_GRAM':
            sg = 1
        elif embed_method == 'DEFAULT':
            sg = None
        else:
            print(embed_method, "is not supported.")
            return

        if sg is None:
            words = self.extract_words()
        else:
            self.embedding_model = word2vec.Word2Vec(
                sentences=self.extract_sentences(),
                size=self.embedding_dimension,
                window=2,
                min_count=5,
                workers=12,
                iter=100,
                sg=sg,
            )
            words = self.embedding_model.wv.index2entity

            # Insert pre-defined padding word to mask while training.
            words.insert(0, self.config.PAD_WORD)

        self.word_to_idx = {words[i]: i for i in range(0, len(words))}

        if not self._check_processed():
            self.pre_process(embed_method)

        if self.train:
            self.train_data, self.train_labels = torch.load(
                os.path.join(self.root, self.processed_folder,
                             self.training_file))
        else:
            self.test_data, self.test_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.test_file))

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (vector, target) where target is index of the target class.
        """
        if self.train:
            vector, target = self.train_data[index], self.train_labels[index]
        else:
            vector, target = self.test_data[index], self.test_labels[index]
        return vector, target

    def extract_words(self):
        pickle_path = os.path.join(self.root, self.pickled_folder)
        pickle_file = 'words.pickle'

        if self.debug_mode:
            try:
                shutil.rmtree(pickle_path)
            except FileNotFoundError:
                pass

        try:
            with open(os.path.join(pickle_path, pickle_file), 'rb') as f:
                print("Sentences will be loaded from pickled file: " +
                      pickle_file)
                return pickle.load(f)
        except FileNotFoundError:
            print("Cannot find pickled file to load sentences.")
            pass
        except Exception as error:
            raise error

        print("Extracting words...")
        words = set()
        for mode in ['train', 'test']:
            for classification in ['pos', 'neg', 'unsup']:
                if mode == 'test' and classification == 'unsup':
                    # There is no test/unsup in our data.
                    continue
                file_path = os.path.join(self.root, mode, classification)
                for root, dirs, files in os.walk(file_path):
                    test_index = 0
                    for file in files:
                        test_index += 1
                        if self.debug_mode and test_index > TEST_DATA_SIZE:
                            break
                        with open(os.path.join(file_path, file)) as f:
                            sentences = f.readlines()
                            for sentence in sentences:
                                new_words = set(word_tokenize(sentence))
                                words = words.union(new_words)
        alphabetic_words = []
        for word in words:
            word = to_alphabetic(word)
            if len(word) > 0:
                alphabetic_words.append(word)
        try:
            os.mkdir(pickle_path)
        except FileExistsError:
            # The pickle folder already exists.
            pass

        # Cache the filtered words so later runs load exactly what is returned.
        with open(os.path.join(pickle_path, pickle_file), 'wb') as f:
            pickle.dump(alphabetic_words, f, pickle.HIGHEST_PROTOCOL)

        print("Done.")
        return alphabetic_words

    def extract_sentences(self):
        """Extract sentences from data set for Word2Vec model.
        See https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec for detail.

        :return: sentences type of list of list.
        """
        try:
            with open(os.path.join(self.pickle_path, self.pickle_file),
                      'rb') as f:
                print("Sentences will be loaded from pickled file: " +
                      self.pickle_file)
                return pickle.load(f)
        except FileNotFoundError:
            print("Cannot find pickled file to load sentences.")
            pass
        except Exception as error:
            raise error

        print("Extracting sentences...")
        sentences = []
        for mode in ['train', 'test']:
            for classification in ['pos', 'neg', 'unsup']:
                if mode == 'test' and classification == 'unsup':
                    # There is no test/unsup in our data.
                    continue
                path = os.path.join(self.root, mode, classification)
                # Collect the filtered words of every review sentence under this path.
                test_index = 0
                for sentence in word2vec.PathLineSentences(path):
                    test_index += 1
                    if self.debug_mode and test_index > TEST_DATA_SIZE:
                        break

                    alphabetic_words = list(
                        map(lambda x: to_alphabetic(x), sentence))
                    words = list(
                        filter(lambda x: len(x) != 0, alphabetic_words))
                    sentences += words
        # Sentences look like [[review.split()], [...], ...].
        sentences = [sentences]
        try:
            os.mkdir(self.pickle_path)
        except FileExistsError:
            # The pickle folder already exists.
            pass

        with open(os.path.join(self.pickle_path, self.pickle_file), 'wb') as f:
            pickle.dump(sentences, f, pickle.HIGHEST_PROTOCOL)

        print("Done.")
        return sentences

    def make_vectors_w2v(self, path):
        partial_vectors = []
        sentences = word2vec.LineSentence(path)
        for sentence in sentences:
            word_vectors = []
            for word in sentence:
                alphabetic_word = to_alphabetic(word)
                if len(alphabetic_word) == 0:
                    continue
                try:
                    word_vectors.append([self.word_to_idx[alphabetic_word]])
                except KeyError:
                    # print('An excluded word:', alphabetic_word)
                    pass
            self.max_num_words = max(self.max_num_words, len(word_vectors))
            partial_vectors.append(
                torch.from_numpy(np.array(word_vectors)).long())
        return partial_vectors

    def make_vectors_default(self, path):
        partial_vectors = []
        with open(path) as f:
            sentences = f.readlines()
            for sentence in sentences:
                word_vectors = []
                words = word_tokenize(sentence)
                for word in words:
                    alphabetic_word = to_alphabetic(word)
                    if len(alphabetic_word) == 0:
                        continue
                    try:
                        word_vectors.append(
                            [self.word_to_idx[alphabetic_word]])
                    except KeyError:
                        # print('An excluded word:', alphabetic_word)
                        pass

                self.max_num_words = max(self.max_num_words, len(word_vectors))
                partial_vectors.append(
                    torch.from_numpy(np.array(word_vectors)).long())
        return partial_vectors

    def pre_process(self, embed_method):
        """Select a pre-process function to execute and save the result in file system.
        """
        print("Processing...")
        padding_value = 0
        training_set, test_set = None, None
        for mode in ['train', 'test']:
            grades, vectors = [], []
            for classification in ['pos', 'neg']:
                for root, dirs, files in os.walk(
                        os.path.join(self.root, mode, classification)):
                    test_index = 0
                    for file_name in files:
                        test_index += 1
                        if self.debug_mode and test_index > TEST_DATA_SIZE:
                            break

                        # Get grade from filename such as "0_3.txt"
                        grade = 0 if int(
                            file_name.split('_')[1][:-4]) > 5 else 1
                        grades.append(grade)
                        if embed_method == 'DEFAULT':
                            partial_vectors = self.make_vectors_default(
                                os.path.join(root, file_name))
                            vectors.extend(partial_vectors)
                        else:
                            partial_vectors = self.make_vectors_w2v(
                                os.path.join(root, file_name))
                            vectors.extend(partial_vectors)

            mode_set = (pad_sequence(vectors,
                                     max_len=self.max_num_words,
                                     batch_first=True,
                                     padding_value=padding_value),
                        torch.from_numpy(np.array(grades)).long())
            if mode == 'train':
                training_set = mode_set
            else:
                test_set = mode_set

        try:
            os.mkdir(os.path.join(self.root, self.processed_folder))
        except FileExistsError:
            # 'processed' folder already exists.
            pass

        with open(
                os.path.join(self.root, self.processed_folder,
                             self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(
                os.path.join(self.root, self.processed_folder, self.test_file),
                'wb') as f:
            torch.save(test_set, f)
        print("Done.")

    def __len__(self):
        if self.train:
            return len(self.train_data)
        else:
            return len(self.test_data)

    def _check_exists(self):
        """Check if the dataset is downloaded."""
        return os.path.exists(self.root)

    def _check_processed(self):
        """Check if the dataset is preprocessed."""
        return os.path.exists(os.path.join(self.root, self.processed_folder, self.training_file)) and \
            os.path.exists(os.path.join(self.root, self.processed_folder, self.test_file))

    def download(self):
        """Download the Imdb review data if it doesn't exist in processed_folder already."""
        # TODO(hyungsun): Implement if needed.
        pass

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Split: {}\n'.format(
            'train' if self.train is True else 'test')
        fmt_str += '    Root Location: {}\n'.format(self.root)
        return fmt_str
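
The pad_sequence call in pre_process takes a max_len argument, so it appears to be a project-specific helper; the stock torch.nn.utils.rnn.pad_sequence only pads up to the longest sequence in the batch. A minimal sketch of the same padding step with the stock function and fabricated word-index lists:

import torch
from torch.nn.utils.rnn import pad_sequence

# Three "reviews" of different lengths, one index per word (made-up values).
reviews = [torch.tensor([[3], [7], [2]]),
           torch.tensor([[5]]),
           torch.tensor([[9], [4]])]

padded = pad_sequence(reviews, batch_first=True, padding_value=0)
print(padded.shape)  # (3, 3, 1): batch x longest review x index size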
Example #14
class RNNTrainer(Trainer):
    config = ConfigRNN.instance()

    def __init__(self, model, data_loader, optimizer):
        super().__init__(model, data_loader, optimizer)
        if self.config.BOARD_LOGGING:
            from tensor_board_logger import TensorBoardLogger
            self.logger = TensorBoardLogger(
                os.path.join("logs", model.__class__.__name__))

        self.current_epoch = 0

    def train(self, max_epoch):
        print("Training started")

        # Set model to train mode.
        self.model.train()
        epoch_resume = 0
        if self.config.SAVE_CHECKPOINT:
            checkpoint = self.load_checkpoint()
            try:
                epoch_resume = checkpoint["epoch"]
                self.optimizer.load_state_dict(checkpoint["optimizer"])
                self.model.load_state_dict(checkpoint["model"])
            except KeyError:
                # There is no checkpoint
                pass
        for epoch in range(epoch_resume, max_epoch):
            self.current_epoch = epoch
            for batch_idx, (_data, target) in enumerate(self.data_loader):
                # Transpose vector to make it (num of words / batch size) * batch size * index size(1).
                _data = np.transpose(_data, (1, 0, 2))
                _data, target = _data.to(device=self.device), target.to(
                    device=self.device)

                # Initialize the gradient of model
                self.optimizer.zero_grad()
                output, hidden, cell, sorted_target = self.model(_data, target)
                loss = self.config.CRITERION(output, sorted_target)
                loss.backward()
                self.optimizer.step()
                if self.config.CONSOLE_LOGGING:
                    print("Train Epoch: {}/{} [{}/{} ({:.0f}%)]".format(
                        epoch, max_epoch, batch_idx * _data.shape[1],
                        len(self.data_loader.dataset),
                        100. * batch_idx / len(self.data_loader)))
                    print("Loss: {:.6f}".format(loss.item()))
                    print("target : ", target)
                    print("output : ", output, end="\n\n")
                accuracy = self.get_accuracy(sorted_target, output)
            if self.config.BOARD_LOGGING:
                if len(self.data_loader) == 0:
                    raise Exception("Data size is smaller than batch size.")
                # TODO(kyungsoo): Make Tensorboard automatically execute when train.py runs if it is possible
                self.logger.log(loss.item(), accuracy,
                                self.model.named_parameters(),
                                self.current_epoch)
                self.save_checkpoint({
                    "epoch": epoch + 1,
                    "model": self.model.state_dict(),
                    "optimizer": self.optimizer.state_dict(),
                })
        print("End")

    def evaluate(self):
        print("Evaluation started")

        # Set model to eval mode.
        self.model.eval()
        checkpoint = self.load_checkpoint()
        try:
            self.optimizer.load_state_dict(checkpoint["optimizer"])
            self.model.load_state_dict(checkpoint["model"])
        except KeyError:
            raise Exception("No checkpoint to evaluate.")

        correct = 0
        with torch.no_grad():
            for _data, target in self.data_loader:
                # Transpose vector to make it (num of words / batch size) * batch size * index size(1).
                _data = np.transpose(_data, (1, 0, 2))
                _data, target = _data.to(device=self.device), target.to(
                    device=self.device)

                # Initialize the gradient of model
                self.optimizer.zero_grad()
                output, hidden, cell, sorted_target = self.model(_data, target)

                _, argmax = torch.max(output, 1)
                correct += (
                    sorted_target == argmax.squeeze()).nonzero().size(0)

        size = len(self.data_loader.dataset)
        print('\nAccuracy: {}/{} ({:.2f})%\n'.format(correct, size,
                                                     100. * correct / size))
        print("End")
Example #15
class RNN(nn.Module):
    """TODO(hyungsun): Let model classes have optimizer and loss function.
    """
    config = ConfigRNN.instance()

    def __init__(self, pretrained=None):
        super(RNN, self).__init__()
        self.cuda_available = torch.cuda.is_available()
        self.device = torch.device('cuda' if self.cuda_available else 'cpu')
        if pretrained is None:
            self.embed = nn.Embedding(self.config.VOCAB_SIZE,
                                      self.config.EMBED_SIZE)
        else:
            self.embed = nn.Embedding.from_pretrained(pretrained)
        self.lstm = nn.LSTM(self.config.EMBED_SIZE, self.config.HIDDEN_SIZE)
        self.linear = nn.Linear(self.config.HIDDEN_SIZE,
                                self.config.OUTPUT_SIZE)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, target):
        # Permute and squeeze as preprocessing so the sentence lengths are easy to extract.
        _inputs = inputs.permute(1, 0, 2).squeeze(2)
        input_lengths = torch.LongTensor([
            torch.max(_inputs[i, :].data.nonzero()) + 1
            for i in range(_inputs.size(0))
        ])
        input_lengths, sorted_idx = input_lengths.sort(0, descending=True)

        # Sort the word-index tensor so the longest sentences come first.
        input_seq2idx = _inputs[sorted_idx]

        # Sort the targets in the same order as the sorted sentences.
        sorted_target = target[sorted_idx]

        # Convert the word-index tensor into a word-vector tensor.
        embedded = self.embed(input_seq2idx)

        # Pack the word indices so that only valid hidden states are produced.
        packed_input = pack_padded_sequence(embedded,
                                            input_lengths,
                                            batch_first=True)

        # The hidden state extracted from packed_input skips all of the padding.
        packed_output, (hidden, cell) = self.lstm(packed_input,
                                                  (self.init_hidden()))

        # Since hidden equals the last output, feed hidden rather than re-padding packed_output.
        linear = self.linear(hidden)

        # Soft max.
        output = self.softmax(linear.squeeze())

        return output, hidden, cell, sorted_target

    def init_hidden(self):
        hidden = Variable(
            torch.zeros(1, self.config.BATCH_SIZE, self.config.HIDDEN_SIZE))
        cell = Variable(
            torch.zeros(1, self.config.BATCH_SIZE, self.config.HIDDEN_SIZE))
        if self.cuda_available:
            hidden = hidden.cuda()
            cell = cell.cuda()

        return hidden, cell
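
The comments in the forward pass above describe the sort-then-pack pattern: order the batch by descending length, pack the embedded sequences so the LSTM never sees padding, and read the prediction off the final hidden state. A self-contained sketch of that pattern on fabricated data (the vocabulary size, batch size, and dimensions below are assumptions, not values from ConfigRNN):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

batch, max_len, embed_size, hidden_size = 4, 6, 8, 16
embed = nn.Embedding(20, embed_size, padding_idx=0)
lstm = nn.LSTM(embed_size, hidden_size)

# A padded word-index batch (0 is the padding index) and the true lengths.
indices = torch.randint(1, 20, (batch, max_len))
lengths = [6, 3, 5, 2]
for row, length in enumerate(lengths):
    indices[row, length:] = 0
lengths = torch.tensor(lengths)

# Sort by length, longest first, exactly as the forward pass above does.
lengths, sorted_idx = lengths.sort(0, descending=True)
indices = indices[sorted_idx]

# Pack so the LSTM skips the padded positions entirely.
packed = pack_padded_sequence(embed(indices), lengths, batch_first=True)
packed_output, (hidden, cell) = lstm(packed)
print(hidden.shape)  # (1, batch, hidden_size): one final hidden state per review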