def initialize_model(opt, src, tgt, train):
    """Build vocabularies from the training data and assemble a Seq2seq model.

    Args:
        opt: option namespace carrying hyperparameters (src_vocab, tgt_vocab,
            hidden_size, max_len, embedding_size, dropout rates, n_layers,
            bidirectional, rnn_cell, attention_method, full_focus).
        src, tgt: field objects exposing ``build_vocab``, ``vocab`` and (for
            ``tgt``) ``eos_id``/``sos_id``.
        train: iterator whose ``.dataset`` supplies the vocabulary examples.

    Returns:
        Tuple of ``(seq2seq, input_vocab, output_vocab)``.
    """
    # Vocabularies come from the training split only.
    src.build_vocab(train.dataset, max_size=opt.src_vocab)
    tgt.build_vocab(train.dataset, max_size=opt.tgt_vocab)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # A bidirectional encoder concatenates both directions, so the decoder
    # must consume twice the encoder hidden size.
    enc_hidden = opt.hidden_size
    dec_hidden = enc_hidden * 2 if opt.bidirectional else enc_hidden

    encoder = EncoderRNN(
        len(src.vocab), opt.max_len, enc_hidden, opt.embedding_size,
        dropout_p=opt.dropout_p_encoder,
        n_layers=opt.n_layers,
        bidirectional=opt.bidirectional,
        rnn_cell=opt.rnn_cell,
        variable_lengths=True)
    decoder = DecoderRNN(
        len(tgt.vocab), opt.max_len, dec_hidden,
        dropout_p=opt.dropout_p_decoder,
        n_layers=opt.n_layers,
        attention_method=opt.attention_method,
        full_focus=opt.full_focus,
        bidirectional=opt.bidirectional,
        rnn_cell=opt.rnn_cell,
        eos_id=tgt.eos_id,
        sos_id=tgt.sos_id)

    seq2seq = Seq2seq(encoder, decoder)
    seq2seq.to(device)

    return seq2seq, input_vocab, output_vocab
def initialize_model(parameters, src, tgt, train,
                     max_vocab_size=50000, init_range=0.08):
    """Build vocabularies and construct a Seq2seq model from a parameter dict.

    Args:
        parameters: dict with keys ``hidden_size``, ``max_len``,
            ``embedding_size``, ``rnn_cell``, ``attention_method``,
            ``full_focus``.
        src, tgt: field objects exposing ``build_vocab``, ``vocab`` and (for
            ``tgt``) ``eos_id``/``sos_id``.
        train: iterator whose ``.dataset`` supplies the vocabulary examples.
        max_vocab_size: cap on both source and target vocabulary sizes
            (previously hard-coded to 50000; default preserves behavior).
        init_range: half-width of the uniform weight-initialization interval
            (previously hard-coded to 0.08; default preserves behavior).

    Returns:
        Tuple of ``(seq2seq, output_vocab)``.
    """
    # Vocabularies come from the training split only.
    src.build_vocab(train.dataset, max_size=max_vocab_size)
    tgt.build_vocab(train.dataset, max_size=max_vocab_size)
    output_vocab = tgt.vocab

    hidden_size = parameters['hidden_size']
    encoder = EncoderRNN(
        len(src.vocab), parameters['max_len'], hidden_size,
        parameters['embedding_size'],
        rnn_cell=parameters['rnn_cell'],
        variable_lengths=True)
    decoder = DecoderRNN(
        len(tgt.vocab), parameters['max_len'], hidden_size,
        attention_method=parameters['attention_method'],
        full_focus=parameters['full_focus'],
        rnn_cell=parameters['rnn_cell'],
        eos_id=tgt.eos_id,
        sos_id=tgt.sos_id)

    seq2seq = Seq2seq(encoder, decoder)
    seq2seq.to(device)

    # Classic seq2seq initialization: every parameter drawn uniformly
    # from [-init_range, init_range].
    for param in seq2seq.parameters():
        param.data.uniform_(-init_range, init_range)

    return seq2seq, output_vocab
def get_baseline_model(src, tgt, max_len=50, hidden_size=50,
                       embedding_size=100, rnn_cell='gru'):
    """Construct a small, unregularized Seq2seq baseline model.

    Note: weights are left at the framework defaults — no uniform
    re-initialization is performed here.

    Args:
        src, tgt: field objects exposing ``vocab`` and (for ``tgt``)
            ``eos_id``/``sos_id``.
        max_len: maximum sequence length for encoder and decoder.
        hidden_size: RNN hidden state size (shared by encoder and decoder).
        embedding_size: encoder embedding dimension.
        rnn_cell: recurrent cell type (previously hard-coded to ``'gru'``;
            default preserves behavior).

    Returns:
        An uninitialized-on-device ``Seq2seq`` model.
    """
    encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                         embedding_size, rnn_cell=rnn_cell)
    decoder = DecoderRNN(len(tgt.vocab), max_len, hidden_size,
                         rnn_cell=rnn_cell,
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    return Seq2seq(encoder, decoder)
def initialize_model(opt, src, tgt, train):
    """Build vocabularies, assemble a Seq2seq model, and enable multi-GPU.

    Args:
        opt: option namespace carrying hyperparameters (src_vocab, tgt_vocab,
            hidden_size, max_len, embedding_size, dropout rates, n_layers,
            bidirectional, rnn_cell, attention_method, full_focus).
        src, tgt: field objects exposing ``build_vocab``, ``vocab`` and (for
            ``tgt``) ``eos_id``/``sos_id``.
        train: iterator whose ``.dataset`` supplies the vocabulary examples.

    Returns:
        Tuple of ``(seq2seq, input_vocab, output_vocab)``; the model is
        wrapped in ``DataParallel`` when more than one GPU is visible.
    """
    # Vocabularies come from the training split only.
    src.build_vocab(train.dataset, max_size=opt.src_vocab)
    tgt.build_vocab(train.dataset, max_size=opt.tgt_vocab)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # A bidirectional encoder doubles the feature width the decoder sees.
    enc_hidden = opt.hidden_size
    dec_hidden = enc_hidden * 2 if opt.bidirectional else enc_hidden

    encoder = EncoderRNN(
        len(src.vocab), opt.max_len, enc_hidden, opt.embedding_size,
        dropout_p=opt.dropout_p_encoder,
        n_layers=opt.n_layers,
        bidirectional=opt.bidirectional,
        rnn_cell=opt.rnn_cell,
        variable_lengths=True)
    decoder = DecoderRNN(
        len(tgt.vocab), opt.max_len, dec_hidden,
        dropout_p=opt.dropout_p_decoder,
        n_layers=opt.n_layers,
        attention_method=opt.attention_method,
        full_focus=opt.full_focus,
        bidirectional=opt.bidirectional,
        rnn_cell=opt.rnn_cell,
        eos_id=tgt.eos_id,
        sos_id=tgt.sos_id)

    seq2seq = Seq2seq(encoder, decoder)

    # Replicate across all visible GPUs when more than one is available.
    n_gpus = torch.cuda.device_count()
    if n_gpus > 1:
        logging.info("Using {} GPUs".format(n_gpus))
        seq2seq = torch.nn.DataParallel(seq2seq)
    seq2seq.to(device)

    return seq2seq, input_vocab, output_vocab
else: # build vocabulary src.build_vocab(train, max_size=opt.src_vocab) tgt.build_vocab(train, max_size=opt.tgt_vocab) input_vocab = src.vocab output_vocab = tgt.vocab # Initialize model hidden_size = opt.hidden_size decoder_hidden_size = hidden_size * 2 if opt.bidirectional else hidden_size encoder = EncoderRNN(len(src.vocab), max_len, hidden_size, opt.embedding_size, dropout_p=opt.dropout_p_encoder, n_layers=opt.n_layers, bidirectional=opt.bidirectional, rnn_cell=opt.rnn_cell, variable_lengths=True) decoder = DecoderRNN(len(tgt.vocab), max_len, decoder_hidden_size, dropout_p=opt.dropout_p_decoder, n_layers=opt.n_layers, use_attention=opt.attention, attention_method=opt.attention_method, full_focus=opt.full_focus, bidirectional=opt.bidirectional, rnn_cell=opt.rnn_cell, eos_id=tgt.eos_id,
def initialize_model(opt, src, tgt, train):
    """Build vocabularies, assemble a Seq2seq model, and initialize weights.

    Weight initialization: if ``opt.param_init > 0`` the encoder and decoder
    are uniform-initialized in ``[-param_init, param_init]`` (recurrent
    biases zeroed); if ``opt.param_init_glorot`` is set, matrix-shaped
    parameters are then re-initialized with Xavier-uniform (overwriting the
    uniform init for those parameters — this matches the original ordering).

    Args:
        opt: option namespace carrying hyperparameters (vocab sizes,
            hidden/embedding sizes, dropout rates, n_layers, bidirectional,
            rnn_cell, attention settings, positioning-generator settings,
            param_init, param_init_glorot).
        src, tgt: field objects exposing ``build_vocab``, ``vocab`` and (for
            ``tgt``) ``eos_id``/``sos_id``.
        train: iterator whose ``.dataset`` supplies the vocabulary examples.

    Returns:
        Tuple of ``(seq2seq, input_vocab, output_vocab)``; the model is
        wrapped in ``DataParallel`` when more than one GPU is visible.
    """
    # Vocabularies come from the training split only.
    src.build_vocab(train.dataset, max_size=opt.src_vocab)
    tgt.build_vocab(train.dataset, max_size=opt.tgt_vocab)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # A bidirectional encoder doubles the feature width the decoder sees.
    hidden_size = opt.hidden_size
    decoder_hidden_size = hidden_size * 2 if opt.bidirectional else hidden_size

    encoder = EncoderRNN(
        len(src.vocab), opt.max_len, hidden_size, opt.embedding_size,
        dropout_p=opt.dropout_p_encoder,
        n_layers=opt.n_layers,
        bidirectional=opt.bidirectional,
        rnn_cell=opt.rnn_cell,
        variable_lengths=True)
    decoder = DecoderRNN(
        len(tgt.vocab), opt.max_len, decoder_hidden_size,
        dropout_p=opt.dropout_p_decoder,
        n_layers=opt.n_layers,
        use_attention=opt.attention,
        attention_method=opt.attention_method,
        use_positional_attention=opt.positional_attention,
        bidirectional=opt.bidirectional,
        rnn_cell=opt.rnn_cell,
        eos_id=tgt.eos_id,
        sos_id=tgt.sos_id,
        positioning_generator_size=opt.positioning_generator_size,
        attention_mixer=opt.attention_mixer)

    def uniform_weights_init(m):
        """Uniform-init weights; zero recurrent biases.

        BUGFIX: the original tested ``isinstance(m, nn.LSTM)`` only, so GRU
        (or vanilla RNN) recurrent weights selected via ``opt.rnn_cell`` were
        never initialized. ``nn.RNNBase`` covers LSTM, GRU, and RNN alike.
        """
        if isinstance(m, nn.RNNBase):
            for name, param in m.named_parameters():
                if 'bias' in name:
                    nn.init.constant_(param, 0.0)
                elif 'weight' in name:
                    nn.init.uniform_(param, -opt.param_init, opt.param_init)
        if isinstance(m, (nn.Linear, nn.Embedding)):
            nn.init.uniform_(m.weight, -opt.param_init, opt.param_init)

    if opt.param_init > 0.0:
        encoder.apply(uniform_weights_init)
        decoder.apply(uniform_weights_init)

    seq2seq = Seq2seq(encoder, decoder)

    # Replicate across all visible GPUs when more than one is available.
    if torch.cuda.device_count() > 1:
        logging.info("Using {} GPUs".format(torch.cuda.device_count()))
        seq2seq = nn.DataParallel(seq2seq)

    # Optional Xavier (Glorot) initialization for matrix-shaped parameters;
    # applied after (and therefore on top of) the uniform init above.
    if opt.param_init_glorot:
        for p in seq2seq.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    seq2seq.to(device)

    return seq2seq, input_vocab, output_vocab