Example #1
        def test_fit_with_replace_mincount_min_count_ngram(fname):
            dictionary = SupervisedDictionary(replace_OOV_word=True,
                                              min_count=3,
                                              replace_word="<UNK>",
                                              size_word_n_gram=2,
                                              word_n_gram_min_count=1,
                                              label_separator="\t",
                                              line_break_word="")

            dictionary.fit(fname)

            # word vocab related test
            assert len(dictionary.word_vocab) == 5  # c d e f <UNK>
            assert dictionary.size_word_vocab == 5
            assert dictionary.num_words == np.sum(np.arange(7))

            # n-gram related test
            assert len(
                dictionary.ngram_vocab) == 5  # c-c d-d e-e f-f <UNK>-<UNK>
            assert np.sum(dictionary.ngram_vocab.id2freq) == 2 + 3 + 4 + 5 + 1

            # label related test
            assert len(dictionary.label_vocab) == 6

            assert dictionary.size_total_vocab == 10
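
Taken together, the assertions in these examples pin the fname fixture corpus down fairly tightly: six tab-separated lines, one distinct label per line, with the i-th letter of "abcdef" repeated i times (21 tokens in total, matching np.sum(np.arange(7))). A minimal pytest fixture consistent with those counts; the label strings and file name are assumptions:

        import pytest

        @pytest.fixture
        def fname(tmp_path):
            # Six tab-separated lines: "label<i>\t<letter repeated i times>",
            # e.g. "label3\tc c c", giving word frequencies a:1, b:2, ..., f:6
            # and six distinct labels.
            lines = ["label{}\t{}".format(i, " ".join([w] * i))
                     for i, w in enumerate("abcdef", start=1)]
            path = tmp_path / "train.tsv"
            path.write_text("\n".join(lines))
            return str(path)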
Example #2
        def test_fit_without_replacement_with_mincount(fname):
            dictionary = SupervisedDictionary(replace_OOV_word=False,
                                              min_count=1,
                                              replace_word="<UNK>",
                                              size_word_n_gram=2,
                                              word_n_gram_min_count=2,
                                              label_separator="\t",
                                              line_break_word="")

            dictionary.fit(fname)

            # word vocab related test
            assert len(dictionary.word_vocab) == 6  # a b c d e f
            assert dictionary.size_word_vocab == 6
            assert dictionary.num_words == np.sum(np.arange(7))

            # n-gram related test
            assert len(dictionary.ngram_vocab) == 4  # c-c d-d e-e f-f
            assert np.sum(
                dictionary.ngram_vocab.id2freq) == np.sum(np.arange(6)) - 1

            # label related test
            assert len(dictionary.label_vocab) == 6

            assert dictionary.size_total_vocab == 10
Example #3
        def test_fit_with_replace_mincount(fname):
            dictionary = SupervisedDictionary(replace_OOV_word=True,
                                              min_count=3,
                                              replace_word="<UNK>",
                                              size_word_n_gram=1,
                                              word_n_gram_min_count=1,
                                              label_separator="\t",
                                              line_break_word="</s>")

            dictionary.fit(fname)
            # word vocab related test
            assert len(dictionary.word_vocab) == 6  # <UNK> c d e f </s>
            assert dictionary.size_word_vocab == 6
            assert dictionary.num_words == np.sum(np.arange(7)) + 6
            assert dictionary.size_total_vocab == 6

            # n-gram related test
            assert len(dictionary.ngram_vocab) == 0

            # label related test
            assert len(dictionary.label_vocab) == 6
Example #4
        def test_without_ngram(fname):
            dictionary = SupervisedDictionary(replace_OOV_word=False,
                                              min_count=2,
                                              replace_word="",
                                              size_word_n_gram=1,
                                              word_n_gram_min_count=1,
                                              label_separator="\t",
                                              line_break_word="")

            dictionary.fit(fname)
            X, y = dictionary.transform(fname)
            assert len(X[0]) == 0
            assert y[0] == 0
            np.testing.assert_array_equal(X[-1], np.zeros(6, dtype=np.int64))
            recovered_sentence = dictionary.recover_sentence_from_ids(X[1])
            assert recovered_sentence == ["b", "b"]
Example #5
        def test_with_ngram(fname):
            dictionary = SupervisedDictionary(replace_OOV_word=True,
                                              min_count=3,
                                              replace_word="<UNK>",
                                              size_word_n_gram=2,
                                              word_n_gram_min_count=1,
                                              label_separator="\t",
                                              line_break_word="")

            dictionary.fit(fname)
            X, y = dictionary.transform(fname)
            assert len(X[1]) == 3  # <UNK> <UNK> <UNK>-<UNK>
            recovered_sentence = dictionary.recover_sentence_from_ids(X[1])
            assert recovered_sentence == ["<UNK>", "<UNK>", "<UNK>-<UNK>"]

            recovered_sentence = dictionary.recover_sentence_from_ids(X[3])
            assert recovered_sentence == [
                "d", "d", "d", "d", "d-d", "d-d", "d-d"
            ]

            dictionary = SupervisedDictionary(replace_OOV_word=True,
                                              min_count=3,
                                              replace_word="<UNK>",
                                              size_word_n_gram=2,
                                              word_n_gram_min_count=3,
                                              label_separator="\t",
                                              line_break_word="")

            dictionary.fit(fname)
            X, y = dictionary.transform(fname)
            print(X)
            assert len(X[1]) == 2  # <UNK> <UNK>
            recovered_sentence = dictionary.recover_sentence_from_ids(X[1])
            assert recovered_sentence == ["<UNK>", "<UNK>"]

            recovered_sentence = dictionary.recover_sentence_from_ids(X[2])
            assert recovered_sentence == ["c", "c", "c"]
Example #6
        def test_predefined_vocab(fname):
            # min count == 1
            dictionary = SupervisedDictionary(replace_OOV_word=False,
                                              min_count=1,
                                              replace_word="",
                                              size_word_n_gram=1,
                                              word_n_gram_min_count=1,
                                              label_separator="\t",
                                              line_break_word="")

            dictionary.fit(fname)
            dictionary.update_vocab_from_word_set(PREDEFINED_VOCAB)

            # word vocab related test
            assert len(dictionary.word_vocab) == 3  # a b c
            assert dictionary.size_word_vocab == 3
            assert dictionary.num_words == 1 + 2 + 3
            assert dictionary.size_total_vocab == 3

            # n-gram related test
            assert len(dictionary.ngram_vocab) == 0

            # label related test
            assert len(dictionary.label_vocab) == 6

            dictionary = SupervisedDictionary(replace_OOV_word=True,
                                              min_count=1,
                                              replace_word="<UNK>",
                                              size_word_n_gram=1,
                                              word_n_gram_min_count=1,
                                              label_separator="\t",
                                              line_break_word="")

            dictionary.fit(fname)
            dictionary.update_vocab_from_word_set(PREDEFINED_VOCAB)

            # word vocab related test
            assert len(dictionary.word_vocab) == 4  # a b c <UNK>
            assert dictionary.size_word_vocab == 4
            assert dictionary.num_words == np.sum(np.arange(7))
            assert dictionary.size_total_vocab == 4

            # n-gram related test
            assert len(dictionary.ngram_vocab) == 0

            # label related test
            assert len(dictionary.label_vocab) == 6

            # min_count == 2
            dictionary = SupervisedDictionary(replace_OOV_word=False,
                                              min_count=2,
                                              replace_word="",
                                              size_word_n_gram=1,
                                              word_n_gram_min_count=1,
                                              label_separator="\t",
                                              line_break_word="")

            dictionary.fit(fname)
            dictionary.update_vocab_from_word_set(PREDEFINED_VOCAB)

            # word vocab related test
            assert len(dictionary.word_vocab) == 2  # b c
            assert dictionary.size_word_vocab == 2
            assert dictionary.num_words == 2 + 3
            assert dictionary.size_total_vocab == 2

            # n-gram related test
            assert len(dictionary.ngram_vocab) == 0

            # label related test
            assert len(dictionary.label_vocab) == 6

            dictionary = SupervisedDictionary(replace_OOV_word=True,
                                              min_count=2,
                                              replace_word="<UNK>",
                                              size_word_n_gram=1,
                                              word_n_gram_min_count=1,
                                              label_separator="\t",
                                              line_break_word="")

            dictionary.fit(fname)

            print(dictionary.word_vocab.id2word)
            print(dictionary.word_vocab.word2id)
            dictionary.update_vocab_from_word_set(PREDEFINED_VOCAB)

            print(dictionary.word_vocab.id2word)
            print(dictionary.word_vocab.word2id)
            # word vocab related test
            assert len(dictionary.word_vocab) == 3  # b c <UNK>
            assert dictionary.size_word_vocab == 3
            assert dictionary.num_words == np.sum(np.arange(7))
            assert dictionary.size_total_vocab == 3

            # n-gram related test
            assert len(dictionary.ngram_vocab) == 0

            # label related test
            assert len(dictionary.label_vocab) == 6
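
The checks above only work out if PREDEFINED_VOCAB contains exactly a, b, and c: with min_count=1 the updated vocabulary keeps three words (four with <UNK>), and with min_count=2 the word "a" (frequency 1) drops out. A definition consistent with every assertion here would be:

        PREDEFINED_VOCAB = {"a", "b", "c"}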
import os
import random

import numpy as np
import optuna
import torch
import torch.nn.functional as F
import torch.optim as optim
from gensim.models import KeyedVectors
from hydra import utils

# Project-local imports assumed by this snippet (module paths are not shown
# in the source): SupervisedDictionary, SupervisedFastText, get_datasets,
# datasets2data_loaders, evaluation,
# initialise_word_embeddigns_from_pretrained_embeddings
class Objective(object):
    def __init__(self, hydra_cfg, logger):
        self.logger = logger
        self.hydra_cfg = hydra_cfg
        self.seed = hydra_cfg['parameters']['seed']
        self.metric = hydra_cfg['parameters']['metric']

        self.device = torch.device(
            'cuda:{}'.format(hydra_cfg['parameters']['gpu_id']
                             ) if torch.cuda.is_available() else 'cpu')

        working_dir = utils.get_original_cwd() + '/'
        training_path = working_dir + hydra_cfg['dataset']['path'] + hydra_cfg[
            'dataset']['train_fname']
        is_replaced_OOV = hydra_cfg['parameters']['replace_OOV'] > 0

        # load embeddings
        pretrained_path = hydra_cfg['parameters']['pre_trained']
        pretrained_vocab = {}
        if pretrained_path:
            pretrained_path = working_dir + hydra_cfg['parameters'][
                'pre_trained']
            self.logger.info('Loading pre-trained word embeddings {}\n'.format(
                pretrained_path))
            pretrained_w2v = KeyedVectors.load_word2vec_format(
                fname=pretrained_path)
            pretrained_vocab = set(pretrained_w2v.vocab.keys())
            assert hydra_cfg['parameters']['ngram'] == 1

        self.dictionary = SupervisedDictionary(
            replace_OOV_word=is_replaced_OOV,
            min_count=hydra_cfg['parameters']['min_count'],
            replace_word='<OOV>',
            size_word_n_gram=hydra_cfg['parameters']['ngram'],
            word_n_gram_min_count=hydra_cfg['parameters']
            ['word_n_gram_min_count'],
            label_separator=hydra_cfg['parameters']['label_separator'],
            line_break_word='')

        self.logger.info('Use {}\n'.format(self.device))

        self.dictionary.fit(training_path)

        if pretrained_vocab:
            self.dictionary.update_vocab_from_word_set(pretrained_vocab)

        self.train_set, self.val_set = get_datasets(
            cfg=hydra_cfg,
            dictionary=self.dictionary,
            working_dir=working_dir,
            training_path=training_path,
            include_test=False)

        pretrained_word_vectors = None
        dim = self.hydra_cfg['parameters']['dim']

        self.pooling = self.hydra_cfg['parameters']['pooling']

        OOV_initialized_method = self.hydra_cfg['parameters']['initialize_oov']
        self.is_freeze = self.hydra_cfg['parameters']['freeze'] > 0

        if pretrained_vocab:  # pre-trained embeddings were loaded above
            pretrained_word_vectors = initialise_word_embeddigns_from_pretrained_embeddings(
                pretrained_w2v,
                self.dictionary,
                OOV_initialized_method,
                rnd=np.random.RandomState(self.seed))
            dim = pretrained_word_vectors.shape[1]
        self.pretrained_word_vectors = pretrained_word_vectors
        self.dim = dim

        self.logger.info('#training_data: {}, #val_data: {}\n'.format(
            len(self.train_set), len(self.val_set)))
        self.logger.info(
            'In training data, the size of word vocab: {} ngram vocab: {}, total: {} \n'
            .format(self.dictionary.size_word_vocab,
                    self.dictionary.size_ngram_vocab,
                    self.dictionary.size_total_vocab))

    def __call__(self, trial: optuna.Trial):
        torch.manual_seed(self.seed)
        random.seed(self.seed)

        train_data_loader, val_data_loader = datasets2data_loaders(
            self.train_set, self.val_set, test_set=None, num_workers=1)

        epochs = self.hydra_cfg['parameters']['epochs']

        # Calculate an objective value by using the extra arguments.
        model = SupervisedFastText(V=self.dictionary.size_total_vocab,
                                   num_classes=len(
                                       self.dictionary.label_vocab),
                                   embedding_dim=self.dim,
                                   pretrained_emb=self.pretrained_word_vectors,
                                   freeze=self.is_freeze,
                                   pooling=self.pooling).to(self.device)

        initial_lr = trial.suggest_loguniform(
            'lr', self.hydra_cfg['optuna']['lr_min'],
            self.hydra_cfg['optuna']['lr_max'])

        optimizer = optim.SGD(model.parameters(), lr=initial_lr)

        # parameters for update learning rate
        num_tokens = self.dictionary.num_words

        learning_rate_schedule = self.hydra_cfg['parameters']['lr_update_rate']
        total_num_processed_tokens_in_training = epochs * num_tokens
        num_processed_tokens = 0
        local_processed_tokens = 0
        N = len(train_data_loader.dataset)

        best_val_loss = np.finfo(0.).max
        best_val_acc = np.finfo(0.).min
        save_fname = os.getcwd() + '/' + '{}.pt'.format(
            trial.number)  # file name to store best model's weights

        for epoch in range(epochs):
            # begin training phase
            sum_loss = 0.
            correct = 0
            model.train()

            for sentence, label, n_tokens in train_data_loader:
                sentence, label = sentence.to(self.device), label.to(
                    self.device)
                optimizer.zero_grad()
                output = model(sentence)
                loss = F.nll_loss(output, label)
                loss.backward()
                optimizer.step()
                pred = output.argmax(1, keepdim=False)
                correct += pred.eq(label).sum().item()
                sum_loss += loss.item()

                # update learning rate
                # ref: https://github.com/facebookresearch/fastText/blob/6d7c77cd33b23eec26198fdfe10419476b5364c7/src/fasttext.cc#L656
                local_processed_tokens += n_tokens.item()
                if local_processed_tokens > learning_rate_schedule:
                    num_processed_tokens += local_processed_tokens
                    local_processed_tokens = 0
                    progress = num_processed_tokens / total_num_processed_tokens_in_training
                    optimizer.param_groups[0]['lr'] = initial_lr * (1. -
                                                                    progress)

            train_loss = sum_loss / N
            train_acc = correct / N
            # end training phase

            val_loss, val_acc = evaluation(model, self.device, val_data_loader)

            progress = num_processed_tokens / total_num_processed_tokens_in_training  # approximated progress
            self.logger.info(
                '\rProgress: {:.1f}% Avg. train loss: {:.4f}, train acc: {:.1f}%, '
                'Avg. val loss: {:.4f}, val acc: {:.1f}%'.format(
                    progress * 100., train_loss, train_acc * 100, val_loss,
                    val_acc * 100))

            if self.metric == 'loss':
                trial.report(val_loss, epoch)
            else:
                trial.report(val_acc, epoch)

            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

            # validation
            is_saved_model = False
            if self.metric == 'loss':
                if best_val_loss > val_loss:
                    best_val_loss = val_loss
                    best_val_acc = val_acc
                    is_saved_model = True
            else:
                if best_val_acc < val_acc:
                    best_val_loss = val_loss
                    best_val_acc = val_acc
                    is_saved_model = True

            if is_saved_model:
                torch.save(model.state_dict(), save_fname)

        trial.set_user_attr('val_loss', best_val_loss)
        trial.set_user_attr('val_acc', best_val_acc)
        trial.set_user_attr('model_path', save_fname)

        if self.metric == 'loss':
            return best_val_loss
        else:
            return best_val_acc
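
For context, an objective like this is passed directly to an Optuna study. A minimal usage sketch, assuming hydra_cfg and logger are built elsewhere (e.g. by Hydra and the standard logging setup); the trial count is a placeholder:

study = optuna.create_study(
    direction='minimize' if hydra_cfg['parameters']['metric'] == 'loss'
    else 'maximize')
study.optimize(Objective(hydra_cfg, logger), n_trials=100)
logger.info('Best params: {}\n'.format(study.best_trial.params))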