Example #1
    def __preprocess(self):
        """ preprocess the data to list of list token idx """
        print('\nProcessing data ... ')

        if self.tokenizer_dir:
            self.__src_tokenizer = load_pkl(
                get_file_path(data_dir, 'tokenizer', self.tokenizer_dir,
                              'tokenizer.pkl'))
            self.__tar_tokenizer = self.__src_tokenizer

        elif self.M.checkpoint_params['load_model']:
            load_model_params = self.M.checkpoint_params['load_model']

            tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                                load_model_params[0],
                                                load_model_params[1],
                                                'tokenizer.pkl')
            self.__src_tokenizer = self.__tar_tokenizer = read_cache(
                tokenizer_path)

        else:
            self.__src_tokenizer = utils.pipeline(
                self.M.tokenizer_pl,
                self.__tokenizer_data_src,
                self.__tokenizer_data_tar,
                self.M.data_params,
            )
            self.__tar_tokenizer = self.__src_tokenizer
            del self.__tokenizer_data_src
            del self.__tokenizer_data_tar

        params = {
            **self.M.data_params,
            'tokenizer': self.__src_tokenizer,
            'src_tokenizer': self.__src_tokenizer,
            'tar_tokenizer': self.__tar_tokenizer,
        }

        self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__train_src, self.__train_tar, params)

        self.__val_src_encode, self.__val_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__val_src, self.__val_tar, params)

        self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__test_src, self.__test_tar, params)

        # get vocabulary size
        self.__src_vocab_size = self.__src_tokenizer.vocab_size
        self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

        print('\nFinish preprocessing ')
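Both branches above push the raw data through `utils.pipeline`, which applies a configured list of processing steps (`tokenizer_pl`, `encode_pipeline`) to the source/target sentences and the shared params. The repository's actual helper is not shown on this page; the snippet below is only a minimal sketch of what such a runner might look like, assuming each step is a callable that receives and returns a shared state dict (the names `run_pipeline` and `lowercase` are made up for illustration).

# minimal sketch of a pipeline runner in the spirit of utils.pipeline (hypothetical, not the repo's code)
def run_pipeline(steps, src, tar, params):
    """Apply each step to a shared state and return the processed src/tar."""
    state = {'src': src, 'tar': tar, **params}
    for step in steps:
        state = step(state)      # every step takes the state dict and returns it (possibly modified)
    return state['src'], state['tar']

# example step: lower-case both sides
def lowercase(state):
    state['src'] = [s.lower() for s in state['src']]
    state['tar'] = [t.lower() for t in state['tar']]
    return state

src, tar = run_pipeline([lowercase], ['Hello World'], ['BONJOUR'], {})   # -> ['hello world'], ['bonjour']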
Example #2
    def __preprocess(self):
        """ preprocess the data to list of list token idx """
        print('\nProcessing data ... ')

        load_model_params = Model.checkpoint_params['load_model']
        if load_model_params:
            tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                                load_model_params[0],
                                                load_model_params[1],
                                                'tokenizer.pkl')
            self.__src_tokenizer = self.__tar_tokenizer = read_cache(
                tokenizer_path)

            self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
                Model.encode_pipeline, self.__train_src, self.__train_tar, {
                    **Model.data_params,
                    'tokenizer': self.__src_tokenizer,
                    'src_tokenizer': self.__src_tokenizer,
                    'tar_tokenizer': self.__tar_tokenizer,
                })

        else:
            self.__train_src_encode, self.__train_tar_encode, self.__src_tokenizer, self.__tar_tokenizer = utils.pipeline(
                Model.preprocess_pipeline,
                self.__train_src,
                self.__train_tar,
                Model.data_params,
            )

        params = {
            **Model.data_params,
            'tokenizer': self.__src_tokenizer,
            'src_tokenizer': self.__src_tokenizer,
            'tar_tokenizer': self.__tar_tokenizer,
        }

        self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
            Model.encode_pipeline, self.__test_src, self.__test_tar, params)

        # get vocabulary size
        self.__src_vocab_size = self.__src_tokenizer.vocab_size
        self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

        print('\nFinish preprocessing ')
Example #3
    def log(self, kwargs):
        string = '\n'.join('{}: {}'.format(key, value)
                           for key, value in kwargs.items())
        data = (self.model.name, self.model.TIME, string,
                self.model.data_params, self.model.model_params,
                self.model.train_params, self.__train_time,
                self.__test_train_time, self.__test_test_time)

        string = '\n---------------------------------------------------' \
                 '\nmodel_name: {}\nmodel_time: {}\n{}\n' \
                 'data_params: {}\nmodel_params: {}\ntrain_params: {}\n' \
                 'train_time: {}\ntest_train_time: {}\ntest_test_time: {}\n\n'.format(*data)

        print(string)

        with open(
                os.path.join(create_dir_in_root('runtime', 'log'),
                             '{}.log'.format(self.model.name)), 'ab') as f:
            f.write(string.encode('utf-8'))
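`log` simply pretty-prints the supplied key/value pairs together with the model's name, timestamp and parameter dicts, then appends everything to `runtime/log/<model_name>.log`. A typical call, from inside the class, would look like this (the metric names and values are made up for illustration):

# hypothetical usage of log(): any dict of metrics can be passed
self.log({
    'val_loss': 0.42,    # illustrative numbers only
    'val_bleu': 23.7,
})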
Example #4
    def train(self, train_x, train_y, val_x=None, val_y=None):
        """
        TODO
            you need to override this train functions

        :params: train_x (tuple): a tuple with two elements, each one is a np.array
                                    (encoder_input, decoder_input)
                                    e.g., (train_src_encode, train_tar_encode[:, :-1])
        :params: train_y (np.array): decoder_output (ground_truth)
                                    train_tar_encode[:, 1:]
        :params: val_x (tuple): Use for validation, could be None.
                                If None, then no validation; if it is not None, it would be the same as train_X
        :params: val_y (np.array): Use for validation, could be None.
                                If None, then no validation; if it is not None, it would be the same as train_y

        no returns
        """

        # if we want to load a trained model
        if self.checkpoint_params['load_model']:
            model_dir = create_dir_in_root(
                *(['runtime', 'models'] +
                  self.checkpoint_params['load_model']))
            self.load_model(model_dir)

        if not self.__finish_train:
            # TODO
            #   you should write down the train process here
            #   This process should also provide the following functions:
            #       calculate the loss
            #       update model weights
            #       early stop
            #       validation
            #       something like tensorboard (for saving files for tensorboard, you should save it to self.tb_dir)
            #       print out the results to console for each epoch
            #       save the best model to self.model_dir
            # ...
            #
            # for step in range(steps)
            # ...

            # load the best model so that it could be tested
            # from torchtext.data import Field, BucketIterator, TabularDataset
            # import load.jr_en as loader
            # JP_TEXT = Field(tokenize=jr_en.jr_tokenizer)
            # EN_TEXT = Field(tokenize=jr_en.en_tokenizer)
            # print("Preparing to load data to batchify data")
            # dataLoader = loader.Loader()
            # jr_data, en_data = dataLoader.data()
            # print("Successfully splited the data")
            # JP_TEXT.build_vocab(jr_data, val)
            # EN_TEXT.build_vocab(en_data, val)

            from torchtext.data import Field
            from load import jr_en  # assumed import: the jr/en tokenizer module referenced by the commented-out code above

            JP_TEXT = Field(tokenize=jr_en.jr_tokenizer)
            EN_TEXT = Field(tokenize=jr_en.en_tokenizer)

            device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')

            # batchify the data
            def batchify(data, batchsize, TEXT):
                data = TEXT.numericalize([data.examples[0].text])
                # Divide the dataset into batchsize parts.
                nbatch = data.size(0) // batchsize
                # Trim off any extra elements that wouldn't cleanly fit (remainders).
                data = data.narrow(0, 0, nbatch * batchsize)
                # Evenly divide the data across the batchsize batches.
                data = data.view(batchsize, -1).t().contiguous()
                return data.to(device)

            jr_data = batchify(train_x, self.train_params['batch_size'],
                               JP_TEXT)
            en_data = batchify(train_y, self.train_params['batch_size'],
                               EN_TEXT)  # was train_x in the original; train_y holds the target side
            emb_dim = self.model_params["emb_dim"]

            def get_batch(source, i):
                seq_len = emb_dim
                data = source[i:i + seq_len]
                target = source[i + 1:i + 1 + seq_len].view(-1)
                return data, target

            import time
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.SGD(self.model.parameters(),
                                        lr=self.train_params['learning_rate'])
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                        1.0,
                                                        gamma=0.95)
            for epoch in range(1, self.checkpoint_params["epoch"] + 1):
                #     optimizer.zero_grad()
                #     output = self.model(train_x)
                #     loss = criterion(output.view(-1, self.checkpoint_params["epoch"]), train_y)
                #     loss.backward()
                #     torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
                #     optimizer.step()
                #
                # # load the best model so that it could be tested
                # self.load_model()
                #
                # self.__finish_train = True
                self.model.train()  # turn on train mode
                data, targets = get_batch(jr_data, epoch)
                total_loss = 0.
                start_time = time.time()
                optimizer.zero_grad()
                output = self.model(data)
                # flatten model output to (num_tokens, vocab_size) before the loss
                loss = criterion(output.view(-1, output.size(-1)), targets)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
                optimizer.step()

                total_loss += loss.item()
                log_interval = 200
                # if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | '
                      'lr {:02.2f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, 1,
                          len(jr_data) // emb_dim,
                          scheduler.get_lr()[0], elapsed * 1000 / log_interval,
                          cur_loss, math.exp(cur_loss)))
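The docstring above fixes the expected input format; assuming `train_src_encode` and `train_tar_encode` come from the preprocessing shown in Examples #1 and #2, a caller would typically build the teacher-forcing pairs and invoke `train` like this (illustrative only):

# decoder input = target shifted right, decoder output = target shifted left (per the docstring)
train_x = (train_src_encode, train_tar_encode[:, :-1])
train_y = train_tar_encode[:, 1:]

model.train(train_x, train_y)   # validation data is optional
# with validation:
# model.train(train_x, train_y,
#             val_x=(val_src_encode, val_tar_encode[:, :-1]),
#             val_y=val_tar_encode[:, 1:])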
Example #5
import os
import sys

# assumed: the script lives two directory levels below the project root
sub_sub_root_dir = os.path.split(os.path.abspath(__file__))[0]
sub_root_dir = os.path.split(sub_sub_root_dir)[0]
root_dir = os.path.split(sub_root_dir)[0]

sys.path.append(sub_sub_root_dir)
sys.path.append(sub_root_dir)
sys.path.append(root_dir)

import shutil
from lib import utils
"""
Clean up stale model files and tensorboard files
"""

save_best_model_num = 1

model_dir = utils.create_dir_in_root('runtime', 'models')
tb_dir = utils.create_dir_in_root('runtime', 'tensorboard')
tokenizer_dir = utils.create_dir_in_root('runtime', 'tokenizer')

for model_name in os.listdir(model_dir):
    tmp_model_dir = os.path.join(model_dir, model_name)
    print(f'\nchecking {tmp_model_dir} ...')

    for _date in os.listdir(tmp_model_dir):
        date_dir = os.path.join(tmp_model_dir, _date)
        if not os.path.isdir(date_dir):
            continue

        model_list = os.listdir(date_dir)

        print(f'\tchecking {_date}')
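The example is cut off here; the remaining loop body presumably keeps only the newest `save_best_model_num` checkpoints per run and removes the rest. Purely as a sketch of that pruning step, assuming checkpoints can be ordered by modification time (the original script may rank them differently):

        # hypothetical pruning step: keep the newest `save_best_model_num` entries, delete the rest
        model_paths = sorted(
            (os.path.join(date_dir, name) for name in model_list),
            key=os.path.getmtime,
            reverse=True,
        )
        for old_path in model_paths[save_best_model_num:]:
            if os.path.isdir(old_path):
                shutil.rmtree(old_path)   # e.g. SavedModel directories
            else:
                os.remove(old_path)
            print(f'\t\tremoved {old_path}')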
Example #6
    def create_dir(self):
        # create tensorboard path
        self.tb_dir_word_translate = utils.create_dir_in_root(
            'runtime', 'tensorboard', self.name, 'word_translate', self.TIME)
        self.tb_dir_cdlm_translate = utils.create_dir_in_root(
            'runtime', 'tensorboard', self.name, 'cdlm_translate', self.TIME)
        self.tb_dir_cdlm_ner = utils.create_dir_in_root(
            'runtime', 'tensorboard', self.name, 'cdlm_ner', self.TIME)
        self.tb_dir_cdlm_pos = utils.create_dir_in_root(
            'runtime', 'tensorboard', self.name, 'cdlm_pos', self.TIME)
        self.tb_dir_cdlm_synonym = utils.create_dir_in_root(
            'runtime', 'tensorboard', self.name, 'cdlm_synonym', self.TIME)

        # create model path
        self.model_dir_word_translate = utils.create_dir_in_root(
            'runtime', 'models', self.name, 'word_translate', self.TIME)
        self.model_dir_cdlm_translate = utils.create_dir_in_root(
            'runtime', 'models', self.name, 'cdlm_translate', self.TIME)
        self.model_dir_cdlm_ner = utils.create_dir_in_root(
            'runtime', 'models', self.name, 'cdlm_ner', self.TIME)
        self.model_dir_cdlm_pos = utils.create_dir_in_root(
            'runtime', 'models', self.name, 'cdlm_pos', self.TIME)
        self.model_dir_cdlm_synonym = utils.create_dir_in_root(
            'runtime', 'models', self.name, 'cdlm_synonym', self.TIME)

        self.checkpoint_path_word_translate = os.path.join(
            self.model_dir_word_translate,
            self.name + self.checkpoint_params['extend_name'])
        self.checkpoint_path_cdlm_translate = os.path.join(
            self.model_dir_cdlm_translate,
            self.name + self.checkpoint_params['extend_name'])
        self.checkpoint_path_cdlm_ner = os.path.join(
            self.model_dir_cdlm_ner,
            self.name + self.checkpoint_params['extend_name'])
        self.checkpoint_path_cdlm_pos = os.path.join(
            self.model_dir_cdlm_pos,
            self.name + self.checkpoint_params['extend_name'])
        self.checkpoint_path_cdlm_synonym = os.path.join(
            self.model_dir_cdlm_synonym,
            self.name + self.checkpoint_params['extend_name'])

        self.tokenizer_dir = utils.create_dir_in_root('runtime', 'tokenizer',
                                                      self.name, self.TIME)
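Since the five task variants only differ by their name, the same directories and checkpoint paths could also be built in a loop; the version below is an illustrative refactor with the same behavior, not the original code.

    # equivalent loop-based refactor of create_dir (illustrative only)
    def create_dir(self):
        tasks = ['word_translate', 'cdlm_translate', 'cdlm_ner',
                 'cdlm_pos', 'cdlm_synonym']
        for task in tasks:
            tb_dir = utils.create_dir_in_root(
                'runtime', 'tensorboard', self.name, task, self.TIME)
            model_dir = utils.create_dir_in_root(
                'runtime', 'models', self.name, task, self.TIME)
            checkpoint_path = os.path.join(
                model_dir, self.name + self.checkpoint_params['extend_name'])

            setattr(self, 'tb_dir_' + task, tb_dir)
            setattr(self, 'model_dir_' + task, model_dir)
            setattr(self, 'checkpoint_path_' + task, checkpoint_path)

        self.tokenizer_dir = utils.create_dir_in_root('runtime', 'tokenizer',
                                                      self.name, self.TIME)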