def __init__(self, encoder_params: AttributeDict, decoder_params: AttributeDict):
    super().__init__()
    self.d_model = encoder_params.embedding_dim
    self.n_head = encoder_params.n_head
    self.num_encoder_layers = encoder_params.num_encoder_layer
    self.num_decoder_layers = encoder_params.num_decoder_layer
    self.dim_feedforward = encoder_params.dim_feedforward
    self.dropout = encoder_params.dropout_prob
    self.device = encoder_params.get('device', 'cpu')
    self.max_seq_len = encoder_params.max_seq_len

    self.src_embedding = Embeddings(params=encoder_params)
    self.tgt_embedding = Embeddings(params=decoder_params)
    self.transformer = nn.Transformer(
        d_model=self.d_model,
        nhead=self.n_head,
        num_encoder_layers=self.num_encoder_layers,
        num_decoder_layers=self.num_decoder_layers,
        dim_feedforward=self.dim_feedforward,
        dropout=self.dropout)
    self.proj_vocab_layer = nn.Linear(
        in_features=self.d_model, out_features=decoder_params.vocab_size)

    self.apply(self._initialize)
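# Hedged sketch (illustrative, not this repo's forward pass): the module built above
# wraps torch.nn.Transformer and projects its output to the target vocabulary with a
# Linear layer. All sizes and tensor names below are assumptions chosen for the example;
# nn.Transformer expects (seq_len, batch, d_model) inputs by default.
import torch
import torch.nn as nn

d_model, n_head, vocab_size = 512, 8, 1000
transformer = nn.Transformer(d_model=d_model, nhead=n_head,
                             num_encoder_layers=6, num_decoder_layers=6,
                             dim_feedforward=2048, dropout=0.1)
proj_vocab_layer = nn.Linear(d_model, vocab_size)

src = torch.rand(20, 4, d_model)   # (src_len, batch, d_model) -- already-embedded source
tgt = torch.rand(15, 4, d_model)   # (tgt_len, batch, d_model) -- already-embedded target
tgt_mask = transformer.generate_square_subsequent_mask(tgt.size(0))

out = transformer(src, tgt, tgt_mask=tgt_mask)   # (tgt_len, batch, d_model)
logits = proj_vocab_layer(out)                   # (tgt_len, batch, vocab_size)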
def __init__(self, params: AttributeDict):
    super().__init__()
    # mandatory parameters
    self.d_model = params.d_model
    self.num_heads = params.num_heads
    self.vocab_size = params.vocab_size

    # optional parameters
    self.num_layers = params.get('num_layers', 6)
    self.dim_feed_forward = params.get('dim_feed_forward', 2048)
    self.dropout_prob = params.get('dropout_prob', 0.1)
    self.pe_dropout_prob = params.get('pe_dropout_prob', 0.1)
    self.activation = params.get('activation', 'relu')
    self.max_seq_len = params.get('max_seq_len', 512)
    self.device = params.get('device', 'cpu')

    self.embedding = nn.Embedding(self.vocab_size, self.d_model)
    self.positional_encoding = PositionalEncoding(self.d_model,
                                                  self.pe_dropout_prob,
                                                  self.max_seq_len)
    encoder = TransformerEncoderLayer(
        d_model=self.d_model,
        nhead=self.num_heads,
        dim_feedforward=self.dim_feed_forward,
        dropout=self.dropout_prob,
        activation=self.activation)
    # this encoder layer is cloned num_layers times inside the encoder stack
    norm = nn.LayerNorm(self.d_model)
    self.encoder_stack = _TransformerEncoder(encoder, self.num_layers, norm)

    self._init_parameter()
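# Hedged sketch (illustrative): the stock PyTorch equivalent of the stack built above --
# one TransformerEncoderLayer cloned num_layers times by nn.TransformerEncoder, with a
# final LayerNorm. The repo's _TransformerEncoder is assumed to behave analogously; the
# sizes below are example values only.
import torch
import torch.nn as nn

d_model, num_heads, num_layers = 512, 8, 6
layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads,
                                   dim_feedforward=2048, dropout=0.1,
                                   activation='relu')
encoder_stack = nn.TransformerEncoder(layer, num_layers, nn.LayerNorm(d_model))

x = torch.rand(50, 4, d_model)   # (seq_len, batch, d_model)
encoded = encoder_stack(x)       # same shape: (seq_len, batch, d_model)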
def __init__(self, encoder_params: AttributeDict):
    super().__init__()
    self.vocab_size = encoder_params.vocab_size
    self.embedding_dim = encoder_params.embedding_dim
    self.hidden_size = encoder_params.hidden_size
    self.bidirectional = encoder_params.get('bidirectional', False)
    self.num_layers = encoder_params.get('num_layers', 1)
    self.dropout_prob = encoder_params.get('dropout_prob', 0.0)
    self.device = encoder_params.get('device', 'cpu')

    self.embedding_lookup = nn.Embedding(self.vocab_size,
                                         self.embedding_dim,
                                         padding_idx=PAD_TOKEN_ID)
    self.rnn = nn.GRU(input_size=self.embedding_dim,
                      hidden_size=self.hidden_size,
                      batch_first=True,
                      num_layers=self.num_layers,
                      bidirectional=self.bidirectional,
                      dropout=self.dropout_prob)
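# Hedged sketch (illustrative): shape behaviour of the embedding + GRU pair built above.
# PAD_TOKEN_ID is taken to be 0 and all sizes are example assumptions.
import torch
import torch.nn as nn

vocab_size, embedding_dim, hidden_size = 1000, 256, 512
embedding_lookup = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size,
             batch_first=True, num_layers=1, bidirectional=True)

tokens = torch.randint(0, vocab_size, (4, 30))   # (batch, seq_len)
outputs, hidden = rnn(embedding_lookup(tokens))
# outputs: (batch, seq_len, hidden_size * 2) because bidirectional=True
# hidden:  (num_layers * 2, batch, hidden_size)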
def __init__(self, device: str, common_params: AttributeDict):
    self.device = device
    self.common_params = common_params

    encoder_params = AttributeDict(self.common_params.encoder_params)
    decoder_params = AttributeDict(self.common_params.decoder_params)
    self.common_params.encoder_params = encoder_params
    self.common_params.decoder_params = decoder_params
    encoder_params.device = self.device
    decoder_params.device = self.device

    self.mode = None
    self.base_dir = os.getcwd()
    self.data_set_dir = os.path.join(self.base_dir, 'dataset')

    self.src_tokenizer = common_params.src_tokenizer()
    self.tgt_tokenizer = common_params.tgt_tokenizer()

    self.src_vocab_file_path = os.path.join(
        self.data_set_dir, common_params.src_vocab_filename)
    self.tgt_vocab_file_path = os.path.join(
        self.data_set_dir, common_params.tgt_vocab_filename)
    self.src_word_embedding_file_path = os.path.join(
        self.data_set_dir,
        common_params.get('src_word_embedding_filename', None))
    self.tgt_word_embedding_file_path = os.path.join(
        self.data_set_dir,
        common_params.get('tgt_word_embedding_filename', None))

    self.src_word2id, self.src_id2word, self.src_embedding_weight = self._build_vocab(
        self.src_vocab_file_path,
        self.src_word_embedding_file_path,
    )
    if encoder_params.get('vocab_size', None) is None:
        encoder_params.vocab_size = len(self.src_word2id)

    self.tgt_word2id, self.tgt_id2word, self.tgt_embedding_weight = self._build_vocab(
        self.tgt_vocab_file_path, self.tgt_word_embedding_file_path)
    if decoder_params.get('vocab_size', None) is None:
        decoder_params.vocab_size = len(self.tgt_word2id)

    self.model: nn.Module = self._build_model(self.common_params, self.device)
def __init__(self, decoder_params: AttributeDict):
    super().__init__()
    self.vocab_size = decoder_params.vocab_size
    self.embedding_dim = decoder_params.embedding_dim
    self.hidden_size = decoder_params.hidden_size
    self.max_seq_len = decoder_params.max_seq_len
    self.num_layers = decoder_params.get('num_layers', 1)
    self.dropout_prob = decoder_params.get('dropout_prob', 0.0)
    self.device = decoder_params.get('device', 'cpu')
    self.beam_size = decoder_params.get('beam_size', 1)

    self.embedding_lookup = nn.Embedding(self.vocab_size,
                                         self.embedding_dim,
                                         padding_idx=PAD_TOKEN_ID)
    self.rnn = nn.GRU(input_size=self.embedding_dim,
                      hidden_size=self.hidden_size,
                      batch_first=True,
                      bidirectional=False,
                      num_layers=self.num_layers,
                      dropout=self.dropout_prob)
    self.linear_transform = nn.Linear(self.hidden_size, self.vocab_size)
    self.decoder_output_func = nn.functional.log_softmax
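# Hedged sketch (illustrative, not this repo's decoding loop): one greedy step with the
# kind of pieces initialised above -- embed the previous token, advance the GRU, project
# to the vocabulary, and take the arg-max of the log-probabilities. The SOS token id,
# PAD token id, and all sizes are assumptions made for the example.
import torch
import torch.nn as nn

vocab_size, embedding_dim, hidden_size, batch = 1000, 256, 512, 4
embedding_lookup = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
rnn = nn.GRU(embedding_dim, hidden_size, batch_first=True)
linear_transform = nn.Linear(hidden_size, vocab_size)

prev_token = torch.full((batch, 1), 1, dtype=torch.long)   # assume SOS token id == 1
hidden = torch.zeros(1, batch, hidden_size)                # e.g. the encoder's final state

output, hidden = rnn(embedding_lookup(prev_token), hidden)            # (batch, 1, hidden_size)
log_probs = nn.functional.log_softmax(linear_transform(output), dim=-1)
next_token = log_probs.argmax(dim=-1)                                 # (batch, 1)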
def check_params(config: AttributeDict):
    assert isinstance(config.get('learning_rate'), float), \
        'learning_rate should be a float value.'
    assert config.get('src_tokenizer', '') in [
        MecabTokenizer, NltkTokenizer
    ], 'src_tokenizer should be one of the following: [MecabTokenizer, NltkTokenizer]'
    assert config.get('tgt_tokenizer', '') in [
        MecabTokenizer, NltkTokenizer
    ], 'tgt_tokenizer should be one of the following: [MecabTokenizer, NltkTokenizer]'
    assert config.get('src_vocab_filename', None) is not None, \
        'src_vocab_filename must not be None'
    assert config.get('tgt_vocab_filename', None) is not None, \
        'tgt_vocab_filename must not be None'
    assert config.get('src_word_embedding_filename', None) is not None, \
        'src_word_embedding_filename must not be None'
    assert config.get('tgt_word_embedding_filename', None) is not None, \
        'tgt_word_embedding_filename must not be None'
    assert config.get('src_corpus_filename', None) is not None, \
        'src_corpus_filename must not be None'
    assert config.get('tgt_corpus_filename', None) is not None, \
        'tgt_corpus_filename must not be None'
    assert config.get('encoder', None) is not None, \
        'encoder should not be None'
    assert config.get('decoder', None) is not None, \
        'decoder should not be None'
def check_params(config: AttributeDict):
    assert config.get('src_tokenizer', '') in [
        MecabTokenizer, NltkTokenizer
    ], 'src_tokenizer should be one of the following: [MecabTokenizer, NltkTokenizer]'
    assert config.get('tgt_tokenizer', '') in [
        MecabTokenizer, NltkTokenizer
    ], 'tgt_tokenizer should be one of the following: [MecabTokenizer, NltkTokenizer]'
    assert config.get('src_vocab_filename', None) is not None, \
        'src_vocab_filename must not be None'
    assert config.get('tgt_vocab_filename', None) is not None, \
        'tgt_vocab_filename must not be None'
    assert config.get('src_word_embedding_filename', None) is not None, \
        'src_word_embedding_filename must not be None'
    assert config.get('tgt_word_embedding_filename', None) is not None, \
        'tgt_word_embedding_filename must not be None'
    assert config.get('src_corpus_filename', None) is not None, \
        'src_corpus_filename must not be None'
    assert config.get('tgt_corpus_filename', None) is not None, \
        'tgt_corpus_filename must not be None'
    assert config.get('encoder', None) is not None, \
        'encoder should not be None'
    assert config.get('decoder', None) is not None, \
        'decoder should not be None'
    assert config.get('checkpoint_path', None) is not None, \
        'checkpoint_path should not be None'
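# Hedged example (illustrative only): a config that should satisfy the checks above.
# AttributeDict, MecabTokenizer, and NltkTokenizer come from this repo and are assumed
# to be importable in scope; every filename, value, and the empty encoder/decoder
# placeholders below are assumptions invented for the example, not taken from the source.
example_config = AttributeDict({
    'learning_rate': 1e-4,
    'src_tokenizer': MecabTokenizer,
    'tgt_tokenizer': NltkTokenizer,
    'src_vocab_filename': 'src_vocab.txt',
    'tgt_vocab_filename': 'tgt_vocab.txt',
    'src_word_embedding_filename': 'src_embedding.npy',
    'tgt_word_embedding_filename': 'tgt_embedding.npy',
    'src_corpus_filename': 'train.src',
    'tgt_corpus_filename': 'train.tgt',
    'encoder': {},   # nested encoder params (see the encoder __init__ above)
    'decoder': {},   # nested decoder params (see the decoder __init__ above)
    'checkpoint_path': 'checkpoints/model.pth',
})
check_params(example_config)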