def __init__(self, encoder_params: AttributeDict, decoder_params: AttributeDict):
    """Sequence-to-sequence model wrapping ``nn.Transformer``.

    Source and target token embeddings are built from the respective
    parameter dicts; a final linear layer projects decoder output onto
    the target vocabulary.

    Args:
        encoder_params: encoder-side hyper-parameters; also supplies all
            shared model dimensions (d_model, n_head, layer counts, ...).
        decoder_params: decoder-side hyper-parameters; supplies the
            target vocabulary size for the output projection.
    """
    super().__init__()
    # NOTE(review): every shared hyper-parameter, including the *decoder*
    # layer count, is read from encoder_params — confirm decoder_params is
    # not meant to supply num_decoder_layer.
    self.d_model = encoder_params.embedding_dim
    self.n_head = encoder_params.n_head
    self.num_encoder_layers = encoder_params.num_encoder_layer
    self.num_decoder_layers = encoder_params.num_decoder_layer
    self.dim_feedforward = encoder_params.dim_feedforward
    self.dropout = encoder_params.dropout_prob
    self.device = encoder_params.get('device', 'cpu')
    self.max_seq_len = encoder_params.max_seq_len
    # separate embedding tables for source and target vocabularies
    self.src_embedding = Embeddings(params=encoder_params)
    self.tgt_embedding = Embeddings(params=decoder_params)
    # NOTE(review): 'transfomrer' is a typo for 'transformer'; left as-is
    # because other methods of this class may reference the attribute
    # under this exact name.
    self.transfomrer = nn.Transformer(
        d_model=self.d_model,
        nhead=self.n_head,
        num_encoder_layers=self.num_encoder_layers,
        num_decoder_layers=self.num_decoder_layers,
        dim_feedforward=self.dim_feedforward,
        dropout=self.dropout)
    # project decoder hidden states onto the target vocabulary
    self.proj_vocab_layer = nn.Linear(
        in_features=self.d_model, out_features=decoder_params.vocab_size)
    # NOTE(review): '_initailze' is a typo for '_initialize'; the method is
    # defined elsewhere under this name, so the reference must match it.
    self.apply(self._initailze)
def __init__(self, params: AttributeDict):
    """Transformer encoder stack: token embedding plus positional
    encoding, followed by ``num_layers`` cloned encoder layers.

    ``d_model``, ``num_heads`` and ``vocab_size`` must be present in
    *params*; every other hyper-parameter falls back to a
    Transformer-base style default.
    """
    super().__init__()
    # required hyper-parameters (read directly, no defaults)
    self.d_model = params.d_model
    self.num_heads = params.num_heads
    self.vocab_size = params.vocab_size
    # optional hyper-parameters with defaults
    opt = params.get
    self.num_layers = opt('num_layers', 6)
    self.dim_feed_forward = opt('dim_feed_forward', 2048)
    self.dropout_prob = opt('dropout_prob', 0.1)
    self.pe_dropout_prob = opt('pe_dropout_prob', 0.1)
    self.activation = opt('activation', 'relu')
    self.max_seq_len = opt('max_seq_len', 512)
    self.device = opt('device', 'cpu')
    # token embedding followed by positional information
    self.embedding = nn.Embedding(self.vocab_size, self.d_model)
    self.positional_encoding = PositionalEncoding(
        self.d_model, self.pe_dropout_prob, self.max_seq_len)
    # one prototype layer is built here; the stack clones it num_layers times
    prototype_layer = TransformerEncoderLayer(
        d_model=self.d_model,
        nhead=self.num_heads,
        dim_feedforward=self.dim_feed_forward,
        dropout=self.dropout_prob,
        activation=self.activation)
    final_norm = nn.LayerNorm(self.d_model)
    self.encoder_stack = _TransformerEncoder(prototype_layer,
                                             self.num_layers, final_norm)
    self._init_parameter()
def main():
    """Train a ConvE model and checkpoint it after every epoch."""
    args = parse_args()
    setup_logger(args)
    checkpoint_path = f'checkpoint-{args.name}'
    os.makedirs(checkpoint_path, exist_ok=True)

    def _load_pickle(path):
        # small helper: read one pickled dataset as an AttributeDict
        with open(path, 'rb') as fin:
            return AttributeDict(pickle.load(fin))

    train_data = _load_pickle(args.train_path)
    valid_data = _load_pickle(args.valid_path)

    # always use training data dictionaries
    for attr in ('e_to_index', 'index_to_e', 'r_to_index', 'index_to_r'):
        setattr(valid_data, attr, getattr(train_data, attr))

    conv_e = ConvE(num_e=len(train_data.e_to_index),
                   num_r=len(train_data.r_to_index)).cuda()
    criterion = StableBCELoss()
    optimizer = optim.Adam(conv_e.parameters(), lr=0.003)

    for epoch in trange(args.epochs):
        train(epoch, train_data, conv_e, criterion, optimizer, args)
        # evaluate on both splits each epoch
        valid(epoch, train_data, conv_e, args.batch_size, 'train')
        valid(epoch, valid_data, conv_e, args.batch_size, 'valid')
        ckpt_file = '{}/checkpoint_{}.model'.format(checkpoint_path,
                                                    str(epoch + 1).zfill(2))
        with open(ckpt_file, 'wb') as fout:
            torch.save(conv_e, fout)
def __init__(self, encoder_params: AttributeDict):
    """GRU encoder: embedding lookup feeding a batch-first GRU
    (optionally bidirectional and multi-layer)."""
    super().__init__()
    # mandatory sizes
    self.vocab_size = encoder_params.vocab_size
    self.embedding_dim = encoder_params.embedding_dim
    self.hidden_size = encoder_params.hidden_size
    # optional settings with conservative defaults
    self.bidirectional = encoder_params.get('bidirectional', False)
    self.num_layers = encoder_params.get('num_layers', 1)
    self.dropout_prob = encoder_params.get('dropout_prob', 0.0)
    self.device = encoder_params.get('device', 'cpu')
    # padding positions all map to the PAD embedding row
    self.embedding_lookup = nn.Embedding(self.vocab_size,
                                         self.embedding_dim,
                                         padding_idx=PAD_TOKEN_ID)
    gru_config = dict(input_size=self.embedding_dim,
                      hidden_size=self.hidden_size,
                      num_layers=self.num_layers,
                      bidirectional=self.bidirectional,
                      dropout=self.dropout_prob,
                      batch_first=True)
    self.rnn = nn.GRU(**gru_config)
def train(self, train_params: AttributeDict, loss_func, optimizer):
    """Train ``self.model`` for ``n_epochs`` epochs, then save a checkpoint.

    Args:
        train_params: training-specific parameters, merged over
            ``self.common_params`` (training values win).
        loss_func: loss function forwarded to ``_train_model``.
        optimizer: optimizer whose state is saved in the checkpoint.
    """
    # Merge common and train params (train params take precedence)
    params = AttributeDict(self.common_params.copy())
    params.update(train_params)
    self._set_mode(Estimator.Mode.TRAIN)
    encoder_params = params.encoder_params
    decoder_params = params.decoder_params
    src_corpus_file_path = os.path.join(self.data_set_dir,
                                        params.src_corpus_filename)
    tgt_corpus_file_path = os.path.join(self.data_set_dir,
                                        params.tgt_corpus_filename)
    data_loader = self._prepare_data_loader(src_corpus_file_path,
                                            tgt_corpus_file_path, params,
                                            encoder_params.max_seq_len,
                                            decoder_params.max_seq_len)
    # pre-initialize so the checkpoint below is valid even when n_epochs == 0
    epoch = 0
    avg_loss = 0.
    for epoch in range(params.n_epochs):
        avg_loss = self._train_model(data_loader, params, self.model,
                                     loss_func, optimizer, self.device,
                                     epoch + 1)
    # fix: read the save directory from the merged `params` for consistency
    # (was train_params.model_save_directory — same value, but every other
    # setting in this method goes through `params`)
    save_dir_path = os.path.join(params.model_save_directory,
                                 get_checkpoint_dir_path(epoch + 1))
    # exist_ok avoids the check-then-create race of the original
    os.makedirs(save_dir_path, exist_ok=True)
    # save checkpoint for last epoch
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss
        }, os.path.join(save_dir_path, 'checkpoint.tar'))
def __init__(self, decoder_params: AttributeDict):
    """GRU decoder producing log-probabilities over the target vocabulary."""
    super().__init__()
    # mandatory sizes
    self.vocab_size = decoder_params.vocab_size
    self.embedding_dim = decoder_params.embedding_dim
    self.hidden_size = decoder_params.hidden_size
    self.max_seq_len = decoder_params.max_seq_len
    # optional settings with conservative defaults
    self.num_layers = decoder_params.get('num_layers', 1)
    self.dropout_prob = decoder_params.get('dropout_prob', 0.0)
    self.device = decoder_params.get('device', 'cpu')
    self.beam_size = decoder_params.get('beam_size', 1)
    # padding positions all map to the PAD embedding row
    self.embedding_lookup = nn.Embedding(self.vocab_size,
                                         self.embedding_dim,
                                         padding_idx=PAD_TOKEN_ID)
    # decoding is always uni-directional
    gru_config = dict(input_size=self.embedding_dim,
                      hidden_size=self.hidden_size,
                      num_layers=self.num_layers,
                      bidirectional=False,
                      dropout=self.dropout_prob,
                      batch_first=True)
    self.rnn = nn.GRU(**gru_config)
    # map hidden states to vocabulary scores, normalized via log-softmax
    self.linear_transform = nn.Linear(self.hidden_size, self.vocab_size)
    self.decoder_output_func = nn.functional.log_softmax
def eval(self, eval_params: AttributeDict, loss_func):
    """Evaluate a checkpointed model and print its loss and BLEU score."""
    self._set_mode(Estimator.Mode.EVAL)
    # evaluation params override the common ones
    params = AttributeDict(self.common_params.copy())
    params.update(eval_params)
    encoder_params = params.encoder_params
    decoder_params = params.decoder_params
    # restore model weights from the configured checkpoint
    checkpoint = self._load_checkpoint(params)
    self.model.load_state_dict(checkpoint['model_state_dict'])
    src_corpus_file_path = os.path.join(self.data_set_dir,
                                        params.src_corpus_filename)
    tgt_corpus_file_path = os.path.join(self.data_set_dir,
                                        params.tgt_corpus_filename)
    data_loader = self._prepare_data_loader(src_corpus_file_path,
                                            tgt_corpus_file_path, params,
                                            encoder_params.max_seq_len,
                                            decoder_params.max_seq_len)
    avg_loss, bleu_score = self._eval_model(data_loader, params, self.model,
                                            loss_func, self.device,
                                            self.tgt_id2word)
    print(f'Avg loss: {avg_loss:05.3f}, BLEU score: {bleu_score}')
def preprocess_valid(train_path, valid_path):
    """Filter a validation split down to entities/relations seen in training.

    Reads the pickled training data for its index dictionaries, drops
    every (subject, relation, object) whose subject, relation or object
    is unknown to the training vocabulary, and writes the surviving
    ``{'x': [(s, r), ...], 'y': [[o, ...], ...]}`` next to *valid_path*
    with a ``.pkl`` extension.

    Args:
        train_path: path to the pickled training data (supplies
            ``e_to_index`` / ``r_to_index``).
        valid_path: raw validation file readable by ``read_data``.
    """
    with open(train_path, 'rb') as f:
        train_data = AttributeDict(pickle.load(f))

    x, y = [], []
    s_dict = read_data(valid_path)
    for s, ro in s_dict.items():
        # membership test replaces the original try/except KeyError probe;
        # equivalent for the dict-like index maps used here
        if s not in train_data.e_to_index:
            continue
        for r, objects in ro.items():
            if r not in train_data.r_to_index:
                continue
            # sometimes an entity only occurs as an object
            filtered_objects = [o for o in objects
                                if o in train_data.e_to_index]
            # note: a pair is kept even when filtered_objects is empty,
            # matching the original behavior
            x.append((s, r))
            y.append(filtered_objects)

    data = {
        'x': x,
        'y': y,
    }
    save_file_path = os.path.splitext(valid_path)[0] + '.pkl'
    # fix: close the output file deterministically
    # (was pickle.dump(data, open(save_file_path, 'wb')) — leaked handle)
    with open(save_file_path, 'wb') as f:
        pickle.dump(data, f)
def __init__(self, device: str, common_params: AttributeDict):
    """Set up tokenizers, vocabularies, embedding weights and the model.

    Args:
        device: device string ('cpu', 'cuda', ...) propagated into the
            encoder and decoder parameter dicts.
        common_params: shared configuration; its nested encoder/decoder
            param dicts are wrapped and mutated in place here.
    """
    self.device = device
    self.common_params = common_params
    # wrap the nested param dicts so they support attribute access, and
    # push the wrapped versions back into common_params (in-place mutation)
    encoder_params = AttributeDict(self.common_params.encoder_params)
    decoder_params = AttributeDict(self.common_params.decoder_params)
    self.common_params.encoder_params = encoder_params
    self.common_params.decoder_params = decoder_params
    encoder_params.device = self.device
    decoder_params.device = self.device
    self.mode = None
    self.base_dir = os.getcwd()
    self.data_set_dir = os.path.join(self.base_dir, 'dataset')
    # the tokenizer config entries are classes; instantiate them here
    self.src_tokenizer = common_params.src_tokenizer()
    self.tgt_tokenizer = common_params.tgt_tokenizer()
    self.src_vocab_file_path = os.path.join(
        self.data_set_dir, common_params.src_vocab_filename)
    self.tgt_vocab_file_path = os.path.join(
        self.data_set_dir, common_params.tgt_vocab_filename)
    # NOTE(review): .get(..., None) followed by os.path.join raises
    # TypeError when the embedding filename is absent — confirm these keys
    # are always validated (e.g. by check_params) before this runs.
    self.src_word_embedding_file_path = os.path.join(
        self.data_set_dir,
        common_params.get('src_word_embedding_filename', None))
    self.tgt_word_embedding_file_path = os.path.join(
        self.data_set_dir,
        common_params.get('tgt_word_embedding_filename', None))
    self.src_word2id, self.src_id2word, self.src_embedding_weight = self._build_vocab(
        self.src_vocab_file_path,
        self.src_word_embedding_file_path,
    )
    # vocab sizes default to the loaded vocabulary size unless the config
    # overrides them explicitly
    if encoder_params.get('vocab_size', None) is None:
        encoder_params.vocab_size = len(self.src_word2id)
    self.tgt_word2id, self.tgt_id2word, self.tgt_embedding_weight = self._build_vocab(
        self.tgt_vocab_file_path, self.tgt_word_embedding_file_path)
    if decoder_params.get('vocab_size', None) is None:
        decoder_params.vocab_size = len(self.tgt_word2id)
    # build the model last: it reads the vocab sizes set above
    self.model: nn.Module = self._build_model(self.common_params, self.device)
def check_params(config: AttributeDict):
    """Validate the training configuration, raising AssertionError with a
    descriptive message on the first missing or malformed entry."""
    assert isinstance(config.get('learning_rate'), float), \
        'learning_rate should be float value.'
    known_tokenizers = [MecabTokenizer, NltkTokenizer]
    assert config.get('src_tokenizer', '') in known_tokenizers, \
        'src_tokenizer should be one of following [MecabTokenizer, NltkTokenizer]'
    assert config.get('tgt_tokenizer', '') in known_tokenizers, \
        'tgt_tokenizer should be one of following [MecabTokenizer, NltkTokenizer]'
    # every file name below is mandatory
    for key in ('src_vocab_filename', 'tgt_vocab_filename',
                'src_word_embedding_filename', 'tgt_word_embedding_filename',
                'src_corpus_filename', 'tgt_corpus_filename'):
        assert config.get(key, None) is not None, f'{key} must not be None'
    for key in ('encoder', 'decoder'):
        assert config.get(key, None) is not None, f'{key} should not be None'
def check_params(config: AttributeDict):
    """Validate the evaluation configuration, raising AssertionError with a
    descriptive message on the first missing or malformed entry."""
    known_tokenizers = [MecabTokenizer, NltkTokenizer]
    assert config.get('src_tokenizer', '') in known_tokenizers, \
        'src_tokenizer should be one of following [MecabTokenizer, NltkTokenizer]'
    assert config.get('tgt_tokenizer', '') in known_tokenizers, \
        'tgt_tokenizer should be one of following [MecabTokenizer, NltkTokenizer]'
    # every file name below is mandatory
    for key in ('src_vocab_filename', 'tgt_vocab_filename',
                'src_word_embedding_filename', 'tgt_word_embedding_filename',
                'src_corpus_filename', 'tgt_corpus_filename'):
        assert config.get(key, None) is not None, f'{key} must not be None'
    for key in ('encoder', 'decoder'):
        assert config.get(key, None) is not None, f'{key} should not be None'
    # fix: the message previously said 'model_path' although the key being
    # checked is 'checkpoint_path' — make the error name the actual key
    assert config.get('checkpoint_path', None) is not None, \
        'checkpoint_path should not be None'
from __future__ import print_function from __future__ import unicode_literals from module import GruEncoder, GruDecoder from module.transformer import Transformer from module.tokenizer import NltkTokenizer from util import AttributeDict train_params = AttributeDict({ "n_epochs": 5, "batch_size": 64, "learning_rate": 1e-4, "src_tokenizer": NltkTokenizer, "tgt_tokenizer": NltkTokenizer, "src_vocab_filename": "src_vocab.txt", "src_word_embedding_filename": "src_word_embedding.npy", "tgt_vocab_filename": "tgt_vocab.txt", "tgt_word_embedding_filename": "tgt_word_embedding.npy", "src_corpus_filename": "korean-english-park.train.ko", "tgt_corpus_filename": "korean-english-park.train.en", "encoder": Transformer, "decoder": Transformer, "model_save_directory": "kor2eng-gru-gru" }) eval_params = AttributeDict({ "batch_size": 64, "src_tokenizer": NltkTokenizer, "tgt_tokenizer": NltkTokenizer,
# Shared configuration for the Transformer-based Seq2Seq translation model.
common_params = AttributeDict({
    "model": Seq2Seq,
    # Mecab tokenization on both sides, with fasttext vocab/embedding files
    "src_tokenizer": MecabTokenizer,
    "tgt_tokenizer": MecabTokenizer,
    "src_vocab_filename": "kor-mecab-fasttext",
    "tgt_vocab_filename": "eng-mecab-fasttext",
    "src_word_embedding_filename": "kor-mecab-fasttext-512d.npy",
    "tgt_word_embedding_filename": "eng-mecab-fasttext-512d.npy",
    # encoder and decoder use identical Transformer-base style dimensions;
    # embedding_dim and d_model are both 512 and presumably must match —
    # verify against the model implementation
    "encoder_params": {
        "model": TransformerEncoder,
        "embedding_dim": 512,
        "d_model": 512,
        "num_heads": 8,
        "num_layers": 6,
        "dim_feed_forward": 2048,
        "pe_dropout_prob": 0.1,
        "dropout_prob": 0.1,
        "activation": "relu",
        "max_seq_len": 128,
    },
    "decoder_params": {
        "model": TransformerDecoder,
        "embedding_dim": 512,
        "d_model": 512,
        "num_heads": 8,
        "num_layers": 6,
        "dim_feed_forward": 2048,
        "pe_dropout_prob": 0.1,
        "dropout_prob": 0.1,
        "activation": "relu",
        "max_seq_len": 128,
    }
})