def __init__(self, generator, criterion, input_size, output_size, trg_word_emb=None,
             bowMapper=None, loss_norm='tokens', bow_crit=None, emb_crit=None,
             rl_crit=None, chunk_size=5, device_ids=None):
    super(MultiGPULossCompute, self).__init__()
    if bow_crit is not None or emb_crit is not None:
        wlog('using the bow loss')
    self.init_lambda, self.max_lambda, self.warmup_steps = 0., 1., 20000
    self.lambda_step = (self.max_lambda - self.init_lambda) / self.warmup_steps
    self.decay_factor = self.max_lambda * (self.warmup_steps ** 0.5)
    self.loss_norm = loss_norm
    self.chunk_size = chunk_size    # time-step chunk size used when computing the loss piecewise
    self.device_ids = device_ids
    self.rl_crit_single = rl_crit
    # self.trg_word_emb = trg_word_emb.we
    self.output_size = output_size
    self.generator = generator
    self.criterion = criterion
    self.bowMapper = nn.parallel.replicate(
        bowMapper, devices=self.device_ids) if bowMapper is not None else None
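# A minimal sketch (an assumption, not code from this class) of how the warm-up
# constants above are commonly combined: linear growth from init_lambda to
# max_lambda over warmup_steps, then inverse-sqrt decay, mirroring the Noam
# learning-rate schedule. `current_lambda` is a hypothetical helper name.
def current_lambda(step, init_lambda=0., lambda_step=1. / 20000,
                   decay_factor=20000 ** 0.5):
    assert step >= 1
    return min(init_lambda + step * lambda_step, decay_factor * step ** -0.5)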
def __init__(self, output_size):
    super(BOWLossCriterion, self).__init__()
    weight = tc.ones(output_size, requires_grad=False)
    weight[PAD] = 0     # do not predict padding, same effect as ignore_index
    self.crit = nn.NLLLoss(weight, ignore_index=PAD, reduction='sum')
    wlog('using the bag of words loss')
    self.sigmoid = nn.Sigmoid()
def __init__(self, dropout_prob, n_embed, max_len=MAX_SEQ_SIZE):
    super(PositionalEncoding, self).__init__()
    # Compute the positional encodings once in log space.
    pe = tc.zeros(max_len, n_embed)
    position = tc.arange(0., max_len).unsqueeze(1)
    div_term = tc.exp(tc.arange(0., n_embed, 2) * -(math.log(10000.0) / n_embed))
    # keep dim 0 for padding token position encoding zero vector
    #inter_term = position.float() * div_term
    #pe[1:, 0::2] = tc.sin(inter_term)[1:]
    #pe[1:, 1::2] = tc.cos(inter_term)[1:]
    # [max_len, 1] * [n_embed/2] -> [max_len, n_embed/2]
    pe[:, 0::2] = tc.sin(position * div_term)
    pe[:, 1::2] = tc.cos(position * div_term)
    pe = pe.unsqueeze(0)    # [max_len, n_embed] -> [1, max_len, n_embed]
    self.register_buffer('pe', pe)
    self.n_embed = n_embed
    wlog('\t pe: {}'.format(pe.size()))
    self.dropout = None
    if dropout_prob is not None and 0. < dropout_prob <= 1.0:
        wlog('\t with emb dropout prob = {} ...'.format(dropout_prob))
        self.dropout = nn.Dropout(p=dropout_prob)
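# Worked form of the buffer built above (a note, not used by the class): since
# div_term[i] = exp(-log(10000) * 2i / n_embed) = 10000^(-2i / n_embed), the
# registered buffer is the standard Transformer sinusoidal encoding
#     pe[0, pos, 2i]     = sin(pos / 10000^(2i / n_embed))
#     pe[0, pos, 2i + 1] = cos(pos / 10000^(2i / n_embed))
# e.g. at pos = 0 every sin channel is 0 and every cos channel is 1.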
def build_decoder(trg_emb):
    if wargs.encoder_type == 'gru':
        from models.gru_decoder import StackedGRUDecoder
        return StackedGRUDecoder(trg_emb=trg_emb,
                                 enc_hid_size=wargs.d_enc_hid,
                                 dec_hid_size=wargs.d_dec_hid,
                                 n_layers=wargs.n_dec_layers,
                                 attention_type=wargs.attention_type,
                                 rnn_dropout_prob=wargs.rnn_dropout,
                                 out_dropout_prob=wargs.output_dropout)
    if wargs.decoder_type == 'att':
        from models.self_att_model import SelfAttDecoder, SelfAttDecoderLayer, \
            PositionwiseFeedForward, clones
        from models.attention import MultiHeadedAttention
        c = copy.deepcopy
        attn = MultiHeadedAttention(h=wargs.n_head, d_model=wargs.d_model,
                                    dropout=wargs.att_dropout)
        wlog('clones -> {}'.format(2))
        ff = PositionwiseFeedForward(d_model=wargs.d_model, d_ff=wargs.d_ff_filter,
                                     dropout=wargs.relu_dropout)
        return SelfAttDecoder(trg_emb=trg_emb,
                              layer=SelfAttDecoderLayer(wargs.d_model, c(attn), c(attn), c(ff),
                                                        dropout=wargs.residual_dropout),
                              N=wargs.n_enc_layers)
def __init__(self, features, eps=1e-5):
    super(LayerNorm, self).__init__()
    self.a_2 = nn.Parameter(tc.ones(features))
    wlog('*Ones init a in layernorm {}'.format(self.a_2.size()))
    self.b_2 = nn.Parameter(tc.zeros(features))
    wlog('*Zeros init b in layernorm {}'.format(self.b_2.size()))
    self.eps = eps
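# A minimal sketch of the usual forward for this LayerNorm (an assumption; the
# real forward is defined elsewhere in the class): normalize over the last
# dimension, then apply the learned gain a_2 and bias b_2.
#     def forward(self, x):
#         mean = x.mean(-1, keepdim=True)
#         std = x.std(-1, keepdim=True)
#         return self.a_2 * (x - mean) / (std + self.eps) + self.b_2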
def Linear(in_features, out_features, bias=True):
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    wlog('*Xavier init linear weight {}'.format(m.weight.size()))
    if bias is True:
        nn.init.constant_(m.bias, 0.)
        wlog('*Zeros init linear bias {}'.format(m.bias.size()))
    return m
def __init__(self, output_size, label_smoothing=0.1):
    super(LabelSmoothingCriterion, self).__init__()
    assert 0. < label_smoothing <= 1., 'label smoothing value should be in (0, 1]'
    wlog('NLL loss with label_smoothing: {}'.format(label_smoothing))
    # all non-true labels are uniformly set to low confidence
    self.smoothing_value = label_smoothing / (output_size - 2)
    one_hot = tc.full((output_size, ), self.smoothing_value)
    one_hot[PAD] = 0.
    self.register_buffer('one_hot', one_hot.unsqueeze(0))
    self.confidence = 1.0 - label_smoothing
    self.label_smoothing = label_smoothing
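# Sketch (an assumption, not part of the class) of how the buffers above usually
# turn gold indices into a smoothed target distribution: the gold token gets
# `confidence`, every other non-PAD token gets `smoothing_value`, PAD stays 0.
#     truth = self.one_hot.repeat(gold.size(0), 1)          # [n_tokens, output_size]
#     truth.scatter_(1, gold.unsqueeze(1), self.confidence)
#     truth.masked_fill_((gold == PAD).unsqueeze(1), 0.)    # ignore padded positions
#     loss = F.kl_div(log_probs, truth, reduction='sum')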
def __init__(self, input_size, output_size, trg_word_emb=None, loss_norm='tokens',
             label_smoothing=0., emb_loss=False, bow_loss=False):
    super(Classifier, self).__init__()
    if emb_loss is True:
        assert trg_word_emb is not None, 'embedding loss needs target embedding'
        self.trg_word_emb = trg_word_emb.we
        self.euclidean_dist = nn.PairwiseDistance(p=2, eps=1e-06, keepdim=True)
    self.emb_loss = emb_loss
    if bow_loss is True:
        wlog('using the bag of words loss')
        self.sigmoid = nn.Sigmoid()
        #self.ctx_map_vocab = Linear(2 * input_size, output_size, bias=True)
        #self.softmax = MaskSoftmax()
    self.bow_loss = bow_loss
    self.map_vocab = Linear(input_size, output_size, bias=True)
    nn.init.normal_(self.map_vocab.weight, mean=0, std=input_size ** -0.5)
    if wargs.proj_share_weight is True:
        assert input_size == wargs.d_trg_emb
        wlog('copying weights of target word embedding into classifier')
        self.map_vocab.weight = trg_word_emb.we.weight
    self.log_prob = MyLogSoftmax(wargs.self_norm_alpha)
    assert 0. <= label_smoothing <= 1., 'label smoothing value should be in [0, 1]'
    wlog('NLL loss with label_smoothing: {}'.format(label_smoothing))
    if label_smoothing == 0. or self.bow_loss is True:
        weight = tc.ones(output_size, requires_grad=False)
        weight[PAD] = 0     # do not predict padding, same effect as ignore_index
        self.criterion = nn.NLLLoss(weight, ignore_index=PAD, reduction='sum')
        #self.criterion = nn.NLLLoss(weight, ignore_index=PAD, size_average=False)
    if 0. < label_smoothing <= 1.:
        # all non-true labels are uniformly set to low confidence
        self.smoothing_value = label_smoothing / (output_size - 2)
        one_hot = tc.full((output_size, ), self.smoothing_value)
        one_hot[PAD] = 0.
        self.register_buffer('one_hot', one_hot.unsqueeze(0))
        self.confidence = 1.0 - label_smoothing
    self.output_size = output_size
    self.softmax = MaskSoftmax()
    self.loss_norm = loss_norm
    self.label_smoothing = label_smoothing
def __init__(self, n_vocab, n_embed=512, emb_dropout=0.,
             position_encoding=False, prefix='WordEmbedding'):
    super(WordEmbedding, self).__init__()
    wlog('WordEmbedding_{}'.format(prefix))
    self.position_encoding = position_encoding
    self.we = nn.Embedding(n_vocab, n_embed, padding_idx=PAD)
    nn.init.normal_(self.we.weight, mean=0, std=n_embed ** -0.5)
    nn.init.constant_(self.we.weight[PAD], 0)
    self.n_embed = n_embed
    if position_encoding is True:
        wlog('with position emb ...')
        #self.pe = PositionalEncoding(emb_dropout, n_embed)
        #self.spe = PositionalEmbedding(MAX_SEQ_SIZE, n_embed, PAD, left_pad=False, learned=False)
    wlog('with emb dropout prob = {} ...'.format(emb_dropout))
    self.emb_dropout = emb_dropout
def __init__(self, n_vocab, n_embed=512, emb_dropout=0.,
             position_encoding=False, prefix='WordEmbedding'):
    super(WordEmbedding, self).__init__()
    wlog('WordEmbedding_{}'.format(prefix))
    self.position_encoding = position_encoding
    self.we = nn.Embedding(n_vocab, n_embed, padding_idx=PAD)
    self.n_vocab = n_vocab
    nn.init.normal_(self.we.weight, mean=0, std=n_embed ** -0.5)
    wlog('*Normal init word embedding weight {}'.format(self.we.weight.size()))
    nn.init.constant_(self.we.weight[PAD], 0)
    self.n_embed = n_embed
    if position_encoding is True:
        wlog('with position emb ...')
        self.spe = PositionalEncoding(emb_dropout, n_embed)
    self.emb_dropout = emb_dropout
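# A minimal sketch of the usual forward for this embedding (an assumption; the
# real forward lives elsewhere in the class): scale by sqrt(n_embed) and, when
# position_encoding is enabled, add the sinusoidal encoding (with its dropout).
#     def forward(self, x):
#         emb = self.we(x) * math.sqrt(self.n_embed)
#         if self.position_encoding is True:
#             emb = self.spe(emb)
#         return emb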
def domain_out(model, src_input):
    batch_count = len(src_input)
    point_every, number_every = int(math.ceil(batch_count / 100)), int(math.ceil(batch_count / 10))
    total_domain = []
    sent_no, words_cnt = 0, 0
    fd_attent_matrixs, trgs = None, None
    trans_start = time.time()
    for bid in range(batch_count):
        src, srcm = src_input[bid][1], src_input[bid][4]
        domain = model(src, None, srcm, None, 'IN')
        total_domain.append(domain)
        if numpy.mod(sent_no + 1, point_every) == 0: wlog('.', False)
        if numpy.mod(sent_no + 1, number_every) == 0: wlog('{}'.format(sent_no + 1), False)
        sent_no += 1
    wlog('Done ...')
    return total_domain
from_pretrain_model = True
train_en_de_model = False
rl_loss = False
rl_loss_lambda = 0.1

task = 'wenmt'
# task = 'zh_en_nmt'
# task = 'back_trans_nmt'
# task = 'en_de_nmt'

def getdir():
    return os.path.abspath('.') + '/'

wlog('work dir change:{}'.format(os.chdir(sys.path[0] + '/../')))
wlog('chdir then dir{}'.format(os.getcwd()))
wlog('working on {}'.format(getdir()))
''' directory to save model, validation output and test output '''
work_dir = getdir()
word2vec_weight = work_dir + 'w2v_embedding.pt'

''' reinforce CE '''
use_reinfore_ce = True    # standard ce exp0
''' 1: reinforce ce. 2: reinforce auto ml ce, 2|v| vector. 3: small matrix auto ml ce.
    4: use word2vec embedding for distance. 5: cat the embedding then map to a scalar '''
reinfore_type = 4 if use_reinfore_ce else 0

from datetime import datetime
TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
illustration = 'training_ende_model_exp6_withsmooth'
def __init__(self, trg_emb, n_layers=6, d_model=512, n_head=8, d_ff_filter=1024,
             att_dropout=0.3, residual_dropout=0., relu_dropout=0.,
             self_attn_type='scaled-dot', proj_share_weight=False,
             decoder_normalize_before=False):
    wlog('Transformer decoder ========================= ')
    wlog('\ttrg_word_emb: {}'.format(trg_emb.we.weight.size()))
    wlog('\tn_layers: {}'.format(n_layers))
    wlog('\tn_head: {}'.format(n_head))
    wlog('\td_model: {}'.format(d_model))
    wlog('\td_ffn_filter: {}'.format(d_ff_filter))
    wlog('\tatt_dropout: {}'.format(att_dropout))
    wlog('\tresidual_dropout: {}'.format(residual_dropout))
    wlog('\trelu_dropout: {}'.format(relu_dropout))
    wlog('\tproj_share_weight: {}'.format(proj_share_weight))
    super(SelfAttDecoder, self).__init__()
    self.layer_stack = nn.ModuleList([
        SelfAttDecoderLayer(d_model, n_head, d_ff_filter,
                            att_dropout=att_dropout,
                            residual_dropout=residual_dropout,
                            relu_dropout=relu_dropout,
                            self_attn_type=self_attn_type,
                            decoder_normalize_before=decoder_normalize_before)
        for _ in range(n_layers)])
    self.trg_word_emb = trg_emb
    if decoder_normalize_before is True:
        self.layer_norm = nn.LayerNorm(d_model, elementwise_affine=True)
    self.decoder_normalize_before = decoder_normalize_before
def main():
    #if wargs.ss_type is not None: assert wargs.model == 1, 'Only rnnsearch supports scheduled sampling'
    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    src = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_src_suffix))
    trg = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_trg_suffix))
    vocabs = {}
    wlog('\nPreparing source vocabulary from {} ... '.format(src))
    src_vocab = extract_vocab(src, wargs.src_vcb, wargs.n_src_vcb_plan,
                              wargs.max_seq_len, char=wargs.src_char)
    wlog('\nPreparing target vocabulary from {} ... '.format(trg))
    trg_vocab = extract_vocab(trg, wargs.trg_vcb, wargs.n_trg_vcb_plan, wargs.max_seq_len)
    n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(n_src_vcb, n_trg_vcb))
    vocabs['src'], vocabs['trg'] = src_vocab, trg_vocab

    wlog('\nPreparing training set from {} and {} ... '.format(src, trg))
    trains = {}
    train_src_tlst, train_trg_tlst = wrap_data(wargs.dir_data, wargs.train_prefix,
                                               wargs.train_src_suffix, wargs.train_trg_suffix,
                                               src_vocab, trg_vocab, shuffle=True,
                                               sort_k_batches=wargs.sort_k_batches,
                                               max_seq_len=wargs.max_seq_len, char=wargs.src_char)
    ''' list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding '''
    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size,
                        batch_type=wargs.batch_type, bow=wargs.trg_bow, batch_sort=False)
    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))

    batch_valid = None
    if wargs.val_prefix is not None:
        val_src_file = os.path.join(wargs.val_tst_dir, '{}.{}'.format(wargs.val_prefix, wargs.val_src_suffix))
        val_trg_file = os.path.join(wargs.val_tst_dir, '{}.{}'.format(wargs.val_prefix, wargs.val_ref_suffix))
        wlog('\nPreparing validation set from {} and {} ... '.format(val_src_file, val_trg_file))
        valid_src_tlst, valid_trg_tlst = wrap_data(wargs.val_tst_dir, wargs.val_prefix,
                                                   wargs.val_src_suffix, wargs.val_ref_suffix,
                                                   src_vocab, trg_vocab, shuffle=False,
                                                   max_seq_len=wargs.dev_max_seq_len,
                                                   char=wargs.src_char)
        batch_valid = Input(valid_src_tlst, valid_trg_tlst, 1, batch_sort=False)

    batch_tests = None
    if wargs.tests_prefix is not None:
        assert isinstance(wargs.tests_prefix, list), 'Test files should be a list.'
        init_dir(wargs.dir_tests)
        batch_tests = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix, wargs.val_src_suffix)
            wlog('\nPreparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = wrap_tst_data(test_file, src_vocab, char=wargs.src_char)
            batch_tests[prefix] = Input(test_src_tlst, None, 1, batch_sort=False)
    wlog('\n## Finished preparing the dataset ##\n')

    src_emb = WordEmbedding(n_src_vcb, wargs.d_src_emb, wargs.input_dropout,
                            wargs.position_encoding, prefix='Src')
    trg_emb = WordEmbedding(n_trg_vcb, wargs.d_trg_emb, wargs.input_dropout,
                            wargs.position_encoding, prefix='Trg')
    # share the embedding matrix; requires preprocessing with share_vocab
    if wargs.embs_share_weight:
        if n_src_vcb != n_trg_vcb:
            raise AssertionError('The `-share_vocab` option must be set during '
                                 'preprocessing if you use shared embeddings!')
        src_emb.we.weight = trg_emb.we.weight

    nmtModel = build_NMT(src_emb, trg_emb)

    if not wargs.copy_attn:
        classifier = Classifier(wargs.d_model if wargs.decoder_type == 'att' else 2 * wargs.d_enc_hid,
                                n_trg_vcb, trg_emb,
                                loss_norm=wargs.loss_norm,
                                label_smoothing=wargs.label_smoothing,
                                emb_loss=wargs.emb_loss,
                                bow_loss=wargs.bow_loss)
        nmtModel.decoder.classifier = classifier

    if wargs.gpu_id is not None:
        wlog('push model onto GPU {} ... '.format(wargs.gpu_id), 0)
        #nmtModel = nn.DataParallel(nmtModel, device_ids=wargs.gpu_id)
        nmtModel.to(tc.device('cuda'))
    else:
        wlog('push model onto CPU ... ', 0)
        nmtModel.to(tc.device('cpu'))
    wlog('done.')

    if wargs.pre_train is not None:
        assert os.path.exists(wargs.pre_train)
        from tools.utils import load_model
        _dict = load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        elif len(_dict) == 4:
            model_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            else:
                init_params(param, name, init_D=wargs.param_init_D, a=float(wargs.u_gain))
        wargs.start_epoch = eid + 1
    else:
        optim = Optim(wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm)
        #for n, p in nmtModel.named_parameters():
        #    # bias can not be initialized uniformly
        #    if wargs.encoder_type != 'att' and wargs.decoder_type != 'att':
        #        init_params(p, n, init_D=wargs.param_init_D, a=float(wargs.u_gain))

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('parameters number: {}/{}'.format(pcnt1, pcnt2))

    wlog('\n' + '*' * 30 + ' trainable parameters ' + '*' * 30)
    for n, p in nmtModel.named_parameters():
        if p.requires_grad:
            wlog('{:60} : {}'.format(n, p.size()))

    optim.init_optimizer(nmtModel.parameters())

    trainer = Trainer(nmtModel, batch_train, vocabs, optim, batch_valid, batch_tests)
    trainer.train()
def main():
    # Check if CUDA is available
    if cuda.is_available():
        wlog('CUDA is available, specify device by gpu_id argument (i.e. gpu_id=[3])')
    else:
        wlog('Warning: CUDA is not available, try CPU')
    if wargs.gpu_id:
        cuda.set_device(wargs.gpu_id[0])
        wlog('Using GPU {}'.format(wargs.gpu_id[0]))

    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    '''
    train_srcD_file = wargs.dir_data + 'train.10k.zh5'
    wlog('\nPreparing source vocabulary from {} ... '.format(train_srcD_file))
    src_vocab = extract_vocab(train_srcD_file, wargs.src_dict, wargs.src_dict_size)
    train_trgD_file = wargs.dir_data + 'train.10k.en5'
    wlog('\nPreparing target vocabulary from {} ... '.format(train_trgD_file))
    trg_vocab = extract_vocab(train_trgD_file, wargs.trg_dict, wargs.trg_dict_size)
    train_src_file = wargs.dir_data + 'train.10k.zh0'
    train_trg_file = wargs.dir_data + 'train.10k.en0'
    wlog('\nPreparing training set from {} and {} ... '.format(train_src_file, train_trg_file))
    train_src_tlst, train_trg_tlst = wrap_data(train_src_file, train_trg_file, src_vocab, trg_vocab)
    #list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding
    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))
    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))
    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)
    '''

    src = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_src_suffix))
    trg = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_trg_suffix))
    vocabs = {}
    wlog('\nPreparing source vocabulary from {} ... '.format(src))
    src_vocab = extract_vocab(src, wargs.src_dict, wargs.src_dict_size)
    wlog('\nPreparing target vocabulary from {} ... '.format(trg))
    trg_vocab = extract_vocab(trg, wargs.trg_dict, wargs.trg_dict_size)
    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))
    vocabs['src'], vocabs['trg'] = src_vocab, trg_vocab

    wlog('\nPreparing training set from {} and {} ... '.format(src, trg))
    trains = {}
    train_src_tlst, train_trg_tlst = wrap_data(wargs.dir_data, wargs.train_prefix,
                                               wargs.train_src_suffix, wargs.train_trg_suffix,
                                               src_vocab, trg_vocab,
                                               max_seq_len=wargs.max_seq_len)
    ''' list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding '''
    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size, batch_sort=True)
    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))

    batch_valid = None
    if wargs.val_prefix is not None:
        val_src_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix, wargs.val_src_suffix)
        val_trg_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix, wargs.val_ref_suffix)
        wlog('\nPreparing validation set from {} and {} ... '.format(val_src_file, val_trg_file))
        valid_src_tlst, valid_trg_tlst = wrap_data(wargs.val_tst_dir, wargs.val_prefix,
                                                   wargs.val_src_suffix, wargs.val_ref_suffix,
                                                   src_vocab, trg_vocab, shuffle=False,
                                                   sort_data=False,
                                                   max_seq_len=wargs.dev_max_seq_len)
        batch_valid = Input(valid_src_tlst, valid_trg_tlst, 1, volatile=True, batch_sort=False)

    batch_tests = None
    if wargs.tests_prefix is not None:
        assert isinstance(wargs.tests_prefix, list), 'Test files should be a list.'
        init_dir(wargs.dir_tests)
        batch_tests = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix, wargs.val_src_suffix)
            wlog('\nPreparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = wrap_tst_data(test_file, src_vocab)
            batch_tests[prefix] = Input(test_src_tlst, None, 1, volatile=True, batch_sort=False)
    wlog('\n## Finished preparing the dataset ##\n')

    nmtModel = NMT(src_vocab_size, trg_vocab_size)

    if wargs.pre_train is not None:
        assert os.path.exists(wargs.pre_train)
        _dict = _load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 4:
            model_dict, eid, bid, optim = _dict
        elif len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            else:
                init_params(param, name, True)
        wargs.start_epoch = eid + 1
    else:
        for n, p in nmtModel.named_parameters():
            init_params(p, n, True)
        optim = Optim(wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm,
                      learning_rate_decay=wargs.learning_rate_decay,
                      start_decay_from=wargs.start_decay_from,
                      last_valid_bleu=wargs.last_valid_bleu)

    optim.init_optimizer(nmtModel.parameters())

    if wargs.gpu_id:
        wlog('Push model onto GPU {} ... '.format(wargs.gpu_id), 0)
        nmtModel.cuda()
    else:
        wlog('Push model onto CPU ... ', 0)
        nmtModel.cpu()
    wlog('done.')

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))

    trainer = Trainer(nmtModel, src_vocab.idx2key, trg_vocab.idx2key, optim, trg_vocab_size,
                      valid_data=batch_valid, tests_data=batch_tests)

    # add 1000 to train
    train_all_chunks = (train_src_tlst, train_trg_tlst)
    dh = DataHisto(train_all_chunks)

    '''
    dev_src0 = wargs.dir_data + 'dev.1k.zh0'
    dev_trg0 = wargs.dir_data + 'dev.1k.en0'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src0, dev_trg0))
    dev_src0, dev_trg0 = wrap_data(dev_src0, dev_trg0, src_vocab, trg_vocab)
    wlog(len(train_src_tlst))
    dev_src1 = wargs.dir_data + 'dev.1k.zh1'
    dev_trg1 = wargs.dir_data + 'dev.1k.en1'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src1, dev_trg1))
    dev_src1, dev_trg1 = wrap_data(dev_src1, dev_trg1, src_vocab, trg_vocab)
    dev_src2 = wargs.dir_data + 'dev.1k.zh2'
    dev_trg2 = wargs.dir_data + 'dev.1k.en2'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src2, dev_trg2))
    dev_src2, dev_trg2 = wrap_data(dev_src2, dev_trg2, src_vocab, trg_vocab)
    dev_src3 = wargs.dir_data + 'dev.1k.zh3'
    dev_trg3 = wargs.dir_data + 'dev.1k.en3'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src3, dev_trg3))
    dev_src3, dev_trg3 = wrap_data(dev_src3, dev_trg3, src_vocab, trg_vocab)
    dev_src4 = wargs.dir_data + 'dev.1k.zh4'
    dev_trg4 = wargs.dir_data + 'dev.1k.en4'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src4, dev_trg4))
    dev_src4, dev_trg4 = wrap_data(dev_src4, dev_trg4, src_vocab, trg_vocab)
    wlog(len(dev_src4+dev_src3+dev_src2+dev_src1+dev_src0))
    batch_dev = Input(dev_src4+dev_src3+dev_src2+dev_src1+dev_src0,
                      dev_trg4+dev_trg3+dev_trg2+dev_trg1+dev_trg0, wargs.batch_size)
    '''

    batch_dev = None
    assert wargs.dev_prefix is not None, 'Requires a development set for tuning.'
    dev_src_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.dev_prefix, wargs.val_src_suffix)
    dev_trg_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.dev_prefix, wargs.val_ref_suffix)
    wlog('\nPreparing dev set from {} and {} ... '.format(dev_src_file, dev_trg_file))
    valid_src_tlst, valid_trg_tlst = wrap_data(wargs.val_tst_dir, wargs.dev_prefix,
                                               wargs.val_src_suffix, wargs.val_ref_suffix,
                                               src_vocab, trg_vocab, shuffle=True,
                                               sort_data=True,
                                               max_seq_len=wargs.dev_max_seq_len)
    batch_dev = Input(valid_src_tlst, valid_trg_tlst, wargs.batch_size, batch_sort=True)

    trainer.train(dh, batch_dev, 0, merge=True, name='DH_{}'.format('dev'))
    '''
def main():
    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    vocab_data = {}
    train_srcD_file = wargs.src_vocab_from
    wlog('\nPreparing source vocabulary from {} ... '.format(train_srcD_file))
    src_vocab = extract_vocab(train_srcD_file, wargs.src_dict, wargs.src_dict_size)
    vocab_data['src'] = src_vocab

    train_trgD_file = wargs.trg_vocab_from
    wlog('\nPreparing target vocabulary from {} ... '.format(train_trgD_file))
    trg_vocab = extract_vocab(train_trgD_file, wargs.trg_dict, wargs.trg_dict_size)
    vocab_data['trg'] = trg_vocab

    train_src_file = wargs.train_src
    train_trg_file = wargs.train_trg
    wlog('\nPreparing training set from {} and {} ... '.format(train_src_file, train_trg_file))
    train_src_tlst, train_trg_tlst = wrap_data(train_src_file, train_trg_file,
                                               src_vocab, trg_vocab,
                                               max_seq_len=wargs.max_seq_len)
    ''' list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding '''

    '''
    devs = {}
    dev_src = wargs.val_tst_dir + wargs.val_prefix + '.src'
    dev_trg = wargs.val_tst_dir + wargs.val_prefix + '.ref0'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src, dev_trg))
    dev_src, dev_trg = wrap_data(dev_src, dev_trg, src_vocab, trg_vocab)
    devs['src'], devs['trg'] = dev_src, dev_trg
    '''

    valid_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix, wargs.val_src_suffix)
    wlog('\nPreparing validation set from {} ... '.format(valid_file))
    valid_src_tlst, valid_src_lens = val_wrap_data(valid_file, src_vocab)

    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))
    src_vocab_size, trg_vocab_size = vocab_data['src'].size(), vocab_data['trg'].size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))

    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)
    batch_valid = Input(valid_src_tlst, None, 1, volatile=True)

    tests_data = None
    if wargs.tests_prefix is not None:
        init_dir(wargs.dir_tests)
        tests_data = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix, wargs.val_src_suffix)
            wlog('Preparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = val_wrap_data(test_file, src_vocab)
            tests_data[prefix] = Input(test_src_tlst, None, 1, volatile=True)

    '''
    # lookup_table on cpu to save memory
    src_lookup_table = nn.Embedding(wargs.src_dict_size + 4, wargs.src_wemb_size, padding_idx=utils.PAD).cpu()
    trg_lookup_table = nn.Embedding(wargs.trg_dict_size + 4, wargs.trg_wemb_size, padding_idx=utils.PAD).cpu()
    wlog('Lookup table on CPU ... ')
    wlog(src_lookup_table)
    wlog(trg_lookup_table)
    '''

    sv = vocab_data['src'].idx2key
    tv = vocab_data['trg'].idx2key

    nmtModel = NMT(src_vocab_size, trg_vocab_size)
    #classifier = Classifier(wargs.out_size, trg_vocab_size,
    #                        nmtModel.decoder.trg_lookup_table if wargs.copy_trg_emb is True else None)

    if wargs.pre_train:
        assert os.path.exists(wargs.pre_train)
        _dict = _load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 4:
            model_dict, eid, bid, optim = _dict
        elif len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            else:
                init_params(param, name, True)
        wargs.start_epoch = eid + 1
        #tor = Translator(nmtModel, sv, tv)
        #tor.trans_tests(tests_data, eid, bid)
    else:
        for n, p in nmtModel.named_parameters():
            init_params(p, n, True)
        #for n, p in classifier.named_parameters(): init_params(p, n, True)
        optim = Optim(wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm,
                      learning_rate_decay=wargs.learning_rate_decay,
                      start_decay_from=wargs.start_decay_from,
                      last_valid_bleu=wargs.last_valid_bleu)

    if wargs.gpu_id:
        nmtModel.cuda()
        #classifier.cuda()
        wlog('Push model onto GPU[{}] ... '.format(wargs.gpu_id[0]))
    else:
        nmtModel.cpu()
        #classifier.cpu()
        wlog('Push model onto CPU ... ')

    #nmtModel.classifier = classifier
    #nmtModel.decoder.map_vocab = classifier.map_vocab
    '''
    nmtModel.src_lookup_table = src_lookup_table
    nmtModel.trg_lookup_table = trg_lookup_table
    print nmtModel.src_lookup_table.weight.data.is_cuda
    nmtModel.classifier.init_weights(nmtModel.trg_lookup_table)
    '''

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))

    optim.init_optimizer(nmtModel.parameters())

    #tor = Translator(nmtModel, sv, tv, wargs.search_mode)
    #tor.trans_tests(tests_data, pre_dict['epoch'], pre_dict['batch'])

    trainer = Trainer(nmtModel, batch_train, vocab_data, optim, batch_valid, tests_data)
    trainer.train()
import torch as tc
from torch import cuda

import wargs
from tools.inputs_handler import *
from tools.inputs import Input
from tools.optimizer import Optim
from models.losser import Classifier
from models.embedding import WordEmbedding
from models.model_builder import build_NMT
from tools.utils import init_dir, wlog

# Check if CUDA is available
if cuda.is_available():
    wlog('CUDA is available, specify device by gpu_id argument (i.e. gpu_id=[0, 1, 2])')
else:
    wlog('Warning: CUDA is not available, train on CPU')

if wargs.gpu_id is not None:
    #cuda.set_device(wargs.gpu_id[0])
    device = tc.device('cuda:{}'.format(wargs.gpu_id[0]) if cuda.is_available() else 'cpu')
    wlog('Set device {}, will use {} GPUs {}'.format(wargs.gpu_id[0], len(wargs.gpu_id), wargs.gpu_id))

from trainer import *

import torch.backends.cudnn as cudnn
cudnn.benchmark = True
cudnn.enabled = True
import torch as tc
from torch import cuda
import math

import wargs
from tools.inputs import Input
from tools.utils import init_dir, wlog, _load_model
from tools.optimizer import Optim
from inputs_handler import *

# Check if CUDA is available
if cuda.is_available():
    wlog('CUDA is available, specify device by gpu_id argument (i.e. gpu_id=[3])')
else:
    wlog('Warning: CUDA is not available, train on CPU')

if wargs.gpu_id:
    cuda.set_device(wargs.gpu_id[0])
    wlog('Using GPU {}'.format(wargs.gpu_id[0]))

from models.rnnsearch import *
from models.losser import *
from trainer import *
from translate import Translator

import torch.backends.cudnn as cudnn
cudnn.benchmark = True
cudnn.enabled = True
from models.losser import *
#from tools.tsne import *

def encoder_state(model, input):
    batch_count = len(input)
    point_every, number_every = int(math.ceil(batch_count / 100)), int(math.ceil(batch_count / 10))
    total_state = []
    sent_no, words_cnt = 0, 0
    fd_attent_matrixs, trgs = None, None
    for bid in range(batch_count):
        src = input[bid][1]
        state, _, _ = model.init(src, "common")
        total_state.append(state.data)
        if numpy.mod(sent_no + 1, point_every) == 0: wlog('.', False)
        if numpy.mod(sent_no + 1, number_every) == 0: wlog('{}'.format(sent_no + 1), False)
        sent_no += 1
    wlog('Done ...')
    return total_state

if __name__ == "__main__":
    A = argparse.ArgumentParser(prog='NMT translator ... ')
    A.add_argument('--model-file', dest='model_file', help='model file')
    Word2VecDistanceCriterion, HighDimDistance, CosineDistance
from models.embedding import WordEmbedding
from models.model_builder import build_NMT
from tools.utils import init_dir, wlog

device_ids = wargs.gpu_ids
writer = None
if wargs.use_tensorboard:
    writer = SummaryWriter(wargs.tensorboard_dir)

if device_ids is not None:
    device = tc.device('cuda:{}'.format(device_ids[0]) if cuda.is_available() else 'cpu')
    wlog('Set device {}, will use {} GPUs {}'.format(device_ids[0], len(device_ids), device_ids))

# Check if CUDA is available
if cuda.is_available():
    wlog('CUDA is available, specify device by gpu_ids argument (i.e. gpu_ids=[0, 1, 2])')
else:
    if len(device_ids) > 1:
        wlog('Can not train on multi-gpus, device count: {}'.format(cuda.device_count()))
        sys.exit(0)
    else:
        wlog('Warning: CUDA is not available, train on CPU')
def main():
    # if wargs.ss_type is not None: assert wargs.model == 1, 'Only rnnsearch supports scheduled sampling'
    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    src = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_src_suffix))
    trg = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_trg_suffix))
    src, trg = os.path.abspath(src), os.path.abspath(trg)

    vocabs = {}
    if wargs.share_vocab is False:
        wlog('\nPreparing source vocabulary from {} ... '.format(src))
        src_vocab = extract_vocab(src, wargs.src_vcb, wargs.n_src_vcb_plan,
                                  wargs.max_seq_len, char=wargs.src_char)
        wlog('\nPreparing target vocabulary from {} ... '.format(trg))
        trg_vocab = extract_vocab(trg, wargs.trg_vcb, wargs.n_trg_vcb_plan, wargs.max_seq_len)
        n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
        wlog('Vocabulary size: |source|={}, |target|={}'.format(n_src_vcb, n_trg_vcb))
    else:
        wlog('\nPreparing the shared vocabulary from \n\t{}\n\t{}'.format(src, trg))
        trg_vocab = src_vocab = extract_vocab(src, wargs.src_vcb, wargs.n_src_vcb_plan,
                                              wargs.max_seq_len, share_vocab=True, trg_file=trg)
        n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
        wlog('Shared vocabulary size: |vocab|={}'.format(src_vocab.size()))
    vocabs['src'], vocabs['trg'] = src_vocab, trg_vocab

    wlog('\nPreparing training set from {} and {} ... '.format(src, trg))
    trains = {}
    train_src_tlst, train_trg_tlst = wrap_data(wargs.dir_data, wargs.train_prefix,
                                               wargs.train_src_suffix, wargs.train_trg_suffix,
                                               src_vocab, trg_vocab, shuffle=True,
                                               sort_k_batches=wargs.sort_k_batches,
                                               max_seq_len=wargs.max_seq_len, char=wargs.src_char)
    ''' list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding '''
    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size,
                        batch_type=wargs.batch_type, bow=wargs.trg_bow,
                        batch_sort=False, gpu_ids=device_ids)
    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))

    batch_valid = None
    if wargs.val_prefix is not None:
        val_src_file = os.path.join(wargs.val_tst_dir, '{}.{}'.format(wargs.val_prefix, wargs.val_src_suffix))
        val_trg_file = os.path.join(wargs.val_tst_dir, '{}.{}'.format(wargs.val_prefix, wargs.val_ref_suffix))
        val_src_file, val_trg_file = os.path.abspath(val_src_file), os.path.abspath(val_trg_file)
        wlog('\nPreparing validation set from {} and {} ... '.format(val_src_file, val_trg_file))
        valid_src_tlst, valid_trg_tlst = wrap_data(wargs.val_tst_dir, wargs.val_prefix,
                                                   wargs.val_src_suffix, wargs.val_ref_suffix,
                                                   src_vocab, trg_vocab, shuffle=False,
                                                   max_seq_len=wargs.dev_max_seq_len,
                                                   char=wargs.src_char)
        batch_valid = Input(valid_src_tlst, valid_trg_tlst, batch_size=wargs.valid_batch_size,
                            batch_sort=False, gpu_ids=device_ids)

    batch_tests = None
    if wargs.tests_prefix is not None:
        assert isinstance(wargs.tests_prefix, list), 'Test files should be a list.'
        init_dir(wargs.dir_tests)
        batch_tests = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix, wargs.val_src_suffix)
            test_file = os.path.abspath(test_file)
            wlog('\nPreparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = wrap_tst_data(test_file, src_vocab, char=wargs.src_char)
            batch_tests[prefix] = Input(test_src_tlst, None, batch_size=wargs.test_batch_size,
                                        batch_sort=False, gpu_ids=device_ids)
    wlog('\n## Finished preparing the dataset ##\n')

    src_emb = WordEmbedding(n_src_vcb, wargs.d_src_emb, wargs.input_dropout,
                            wargs.position_encoding, prefix='Src')
    trg_emb = WordEmbedding(n_trg_vcb, wargs.d_trg_emb, wargs.input_dropout,
                            wargs.position_encoding, prefix='Trg')
    # share the embedding matrix between the source and target
    if wargs.share_vocab is True:
        src_emb.we.weight = trg_emb.we.weight

    nmtModel = build_NMT(src_emb, trg_emb)

    if device_ids is not None:
        wlog('push model onto GPU {} ... '.format(device_ids[0]), 0)
        nmtModel_par = nn.DataParallel(nmtModel, device_ids=device_ids)
        nmtModel_par.to(device)
    else:
        wlog('push model onto CPU ... ', 0)
        nmtModel.to(tc.device('cpu'))
    wlog('done.')

    if wargs.pre_train is not None:
        wlog(wargs.pre_train)
        assert os.path.exists(wargs.pre_train)
        from tools.utils import load_model
        _dict = load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 5:
            # model_dict, e_idx, e_bidx, n_steps, optim = _dict['model'], _dict['epoch'], _dict['batch'], _dict['steps'], _dict['optim']
            model_dict, e_idx, e_bidx, n_steps, optim = _dict
        elif len(_dict) == 4:
            model_dict, e_idx, e_bidx, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                # wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    # wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    # wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            else:
                init_params(param, name, init_D=wargs.param_init_D, a=float(wargs.u_gain))
        # wargs.start_epoch = e_idx + 1
        # # do not restart the step counter
        # optim.n_current_steps = 0
    else:
        optim = Optim(wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm)
        for n, p in nmtModel.named_parameters():
            # bias can not be initialized uniformly
            if 'norm' in n:
                wlog('ignore layer norm init ...')
                continue
            if 'emb' in n:
                wlog('ignore word embedding weight init ...')
                continue
            if 'vcb_proj' in n:
                wlog('ignore vcb_proj weight init ...')
                continue
            init_params(p, n, init_D=wargs.param_init_D, a=float(wargs.u_gain))
            # if wargs.encoder_type != 'att' and wargs.decoder_type != 'att':
            #     init_params(p, n, init_D=wargs.param_init_D, a=float(wargs.u_gain))

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('parameters number: {}/{}'.format(pcnt1, pcnt2))
    # wlog('\n' + '*' * 30 + ' trainable parameters ' + '*' * 30)
    # for n, p in nmtModel.named_parameters():
    #     if p.requires_grad: wlog('{:60} : {}'.format(n, p.size()))

    opt_state = None
    if wargs.pre_train:
        opt_state = optim.optimizer.state_dict()

    if wargs.use_reinfore_ce is False:
        criterion = LabelSmoothingCriterion(trg_emb.n_vocab,
                                            label_smoothing=wargs.label_smoothing)
    else:
        word2vec = tc.load(wargs.word2vec_weight)['w2v']
        # criterion = Word2VecDistanceCriterion(word2vec)
        criterion = CosineDistance(word2vec)

    if device_ids is not None:
        wlog('push criterion onto GPU {} ... '.format(device_ids[0]), 0)
        criterion = criterion.to(device)
        wlog('done.')

    # if wargs.reinfore_type == 0 or wargs.reinfore_type == 1:
    #     param = list(nmtModel.parameters())
    # else:
    #     param = list(nmtModel.parameters()) + list(criterion.parameters())
    param = list(nmtModel.parameters())
    optim.init_optimizer(param)

    lossCompute = MultiGPULossCompute(nmtModel.generator, criterion,
                                      wargs.d_model if wargs.decoder_type == 'att' else 2 * wargs.d_enc_hid,
                                      n_trg_vcb, trg_emb, nmtModel.bowMapper,
                                      loss_norm=wargs.loss_norm,
                                      chunk_size=wargs.chunk_size,
                                      device_ids=device_ids)

    trainer = Trainer(nmtModel_par, batch_train, vocabs, optim, lossCompute, nmtModel,
                      batch_valid, batch_tests, writer)
    trainer.train()
    writer.close()
def __init__(self, trg_emb=None):
    super(EMBLossCriterion, self).__init__()
    wlog('using the embedding-based loss')
    assert trg_emb is not None, 'embedding loss needs target embedding'
    self.trg_word_emb = trg_emb.we
from tools.utils import *
from tools.utils import init_dir, wlog, _load_model
from translate import Translator
from inputs_handler import extract_vocab, val_wrap_data, wrap_data
from models.losser import *

if __name__ == "__main__":
    A = argparse.ArgumentParser(prog='NMT translator ... ')
    A.add_argument('--model-file', dest='model_file', help='model file')
    A.add_argument('--test-file', dest='test_file', default=None,
                   help='the input test file path we will translate')
    args = A.parse_args()
    model_file = args.model_file
    wlog('Using model: {}'.format(model_file))

    from models.rnnsearch import *
    src_vocab = extract_vocab(None, wargs.src_dict)
    trg_vocab = extract_vocab(None, wargs.trg_dict)
    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))

    wlog('Start decoding ... init model ... ', 0)
    nmtModel = NMT(src_vocab_size, trg_vocab_size)
    if wargs.gpu_id:
        cuda.set_device(wargs.gpu_id[0])
        nmtModel.cuda()
        wlog('Push model onto GPU[{}] ... '.format(wargs.gpu_id[0]))
def main():
    #if wargs.ss_type is not None: assert wargs.model == 1, 'Only rnnsearch supports scheduled sampling'
    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    src = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_src_suffix))
    trg = os.path.join(wargs.dir_data, '{}.{}'.format(wargs.train_prefix, wargs.train_trg_suffix))
    vocabs = {}
    wlog('\n[o/Subword] Preparing source vocabulary from {} ... '.format(src))
    src_vocab = extract_vocab(src, wargs.src_dict, wargs.src_dict_size,
                              wargs.max_seq_len, char=wargs.src_char)
    wlog('\n[o/Subword] Preparing target vocabulary from {} ... '.format(trg))
    trg_vocab = extract_vocab(trg, wargs.trg_dict, wargs.trg_dict_size, wargs.max_seq_len)
    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))
    vocabs['src'], vocabs['trg'] = src_vocab, trg_vocab

    wlog('\nPreparing training set from {} and {} ... '.format(src, trg))
    trains = {}
    train_src_tlst, train_trg_tlst = wrap_data(wargs.dir_data, wargs.train_prefix,
                                               wargs.train_src_suffix, wargs.train_trg_suffix,
                                               src_vocab, trg_vocab,
                                               max_seq_len=wargs.max_seq_len, char=wargs.src_char)
    ''' list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding '''
    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size, batch_sort=True)
    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))

    batch_valid = None
    if wargs.val_prefix is not None:
        val_src_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix, wargs.val_src_suffix)
        val_trg_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix, wargs.val_ref_suffix)
        wlog('\nPreparing validation set from {} and {} ... '.format(val_src_file, val_trg_file))
        valid_src_tlst, valid_trg_tlst = wrap_data(wargs.val_tst_dir, wargs.val_prefix,
                                                   wargs.val_src_suffix, wargs.val_ref_suffix,
                                                   src_vocab, trg_vocab, shuffle=False,
                                                   sort_data=False,
                                                   max_seq_len=wargs.dev_max_seq_len,
                                                   char=wargs.src_char)
        batch_valid = Input(valid_src_tlst, valid_trg_tlst, 1, volatile=True, batch_sort=False)

    batch_tests = None
    if wargs.tests_prefix is not None:
        assert isinstance(wargs.tests_prefix, list), 'Test files should be a list.'
        init_dir(wargs.dir_tests)
        batch_tests = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix, wargs.val_src_suffix)
            wlog('\nPreparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = wrap_tst_data(test_file, src_vocab, char=wargs.src_char)
            batch_tests[prefix] = Input(test_src_tlst, None, 1, volatile=True, batch_sort=False)
    wlog('\n## Finished preparing the dataset ##\n')

    nmtModel = NMT(src_vocab_size, trg_vocab_size)

    if wargs.pre_train is not None:
        assert os.path.exists(wargs.pre_train)
        _dict = _load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 4:
            model_dict, eid, bid, optim = _dict
        elif len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            else:
                init_params(param, name, True)
        wargs.start_epoch = eid + 1
    else:
        for n, p in nmtModel.named_parameters():
            init_params(p, n, True)
        optim = Optim(wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm,
                      learning_rate_decay=wargs.learning_rate_decay,
                      start_decay_from=wargs.start_decay_from,
                      last_valid_bleu=wargs.last_valid_bleu,
                      model=wargs.model)

    if wargs.gpu_id is not None:
        wlog('Push model onto GPU {} ... '.format(wargs.gpu_id), 0)
        nmtModel.cuda()
    else:
        wlog('Push model onto CPU ... ', 0)
        nmtModel.cpu()
    wlog('done.')

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))

    optim.init_optimizer(nmtModel.parameters())

    trainer = Trainer(nmtModel, batch_train, vocabs, optim, batch_valid, batch_tests)
    trainer.train()
A.add_argument('--beam-size', dest='beam_size', default=wargs.beam_size, help='beam size')
A.add_argument('--len-norm', dest='len_norm', type=int, default=1,
               help='During searching, whether we normalize accumulated loss by length.')
'''
args = A.parse_args()
model_file = args.model_file
'''
search_mode = args.search_mode
beam_size = args.beam_size
lenNorm = args.len_norm
'''

if wargs.share_vocab is False:
    wlog('Starting to load both vocabularies ... ')
    assert os.path.exists(wargs.src_vcb) and os.path.exists(wargs.trg_vcb), 'need vocabulary ...'
    src_vocab = extract_vocab(None, wargs.src_vcb)
    trg_vocab = extract_vocab(None, wargs.trg_vcb)
else:
    wlog('Starting to load the shared vocabulary ... ')
    assert os.path.exists(wargs.src_vcb), 'need shared vocabulary ...'
    trg_vocab = src_vocab = extract_vocab(None, wargs.src_vcb)
n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
wlog('Vocabulary size: |source|={}, |target|={}'.format(n_src_vcb, n_trg_vcb))

model_dict, e_idx, e_bidx, n_steps, optim = load_model(model_file)

from models.embedding import WordEmbedding
src_emb = WordEmbedding(n_src_vcb, wargs.d_src_emb,
                        position_encoding=wargs.position_encoding, prefix='Src')
trg_emb = WordEmbedding(n_trg_vcb, wargs.d_trg_emb,
def main():
    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)

    vocab_data = {}
    train_srcD_file = wargs.src_vocab_from
    wlog('\nPreparing out-of-domain source vocabulary from {} ... '.format(train_srcD_file))
    src_vocab = extract_vocab(train_srcD_file, wargs.src_dict, wargs.src_dict_size)
    #DANN
    train_srcD_file_domain = wargs.src_domain_vocab_from
    wlog('\nPreparing in-domain source vocabulary from {} ... '.format(train_srcD_file_domain))
    src_vocab = updata_vocab(train_srcD_file_domain, src_vocab, wargs.src_dict, wargs.src_dict_size)
    vocab_data['src'] = src_vocab

    train_trgD_file = wargs.trg_vocab_from
    wlog('\nPreparing out-of-domain target vocabulary from {} ... '.format(train_trgD_file))
    trg_vocab = extract_vocab(train_trgD_file, wargs.trg_dict, wargs.trg_dict_size)
    #DANN
    train_trgD_file_domain = wargs.trg_domain_vocab_from
    wlog('\nPreparing in-domain target vocabulary from {} ... '.format(train_trgD_file_domain))
    trg_vocab = updata_vocab(train_trgD_file_domain, trg_vocab, wargs.trg_dict, wargs.trg_dict_size)
    vocab_data['trg'] = trg_vocab

    train_src_file = wargs.train_src
    train_trg_file = wargs.train_trg
    if wargs.fine_tune is False:
        wlog('\nPreparing out-of-domain training set from {} and {} ... '.format(train_src_file, train_trg_file))
        train_src_tlst, train_trg_tlst = wrap_data(train_src_file, train_trg_file,
                                                   vocab_data['src'], vocab_data['trg'],
                                                   max_seq_len=wargs.max_seq_len)
    else:
        wlog('\nNo out-of-domain training set ...')
    #DANN
    train_src_file_domain = wargs.train_src_domain
    train_trg_file_domain = wargs.train_trg_domain
    wlog('\nPreparing in-domain training set from {} and {} ... '.format(train_src_file_domain, train_trg_file_domain))
    train_src_tlst_domain, train_trg_tlst_domain = wrap_data(train_src_file_domain, train_trg_file_domain,
                                                             vocab_data['src'], vocab_data['trg'],
                                                             max_seq_len=wargs.max_seq_len)
    ''' list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding '''

    valid_file = '{}{}.{}'.format(wargs.val_tst_dir, wargs.val_prefix, wargs.val_src_suffix)
    wlog('\nPreparing validation set from {} ... '.format(valid_file))
    valid_src_tlst, valid_src_lens = val_wrap_data(valid_file, src_vocab)

    if wargs.fine_tune is False:
        wlog('Out-of-domain sentence-pairs count in training data: {}'.format(len(train_src_tlst)))
    wlog('In-domain sentence-pairs count in training data: {}'.format(len(train_src_tlst_domain)))
    src_vocab_size, trg_vocab_size = vocab_data['src'].size(), vocab_data['trg'].size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))

    if wargs.fine_tune is False:
        batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)
    else:
        batch_train = None
    batch_valid = Input(valid_src_tlst, None, 1, volatile=True)
    #DANN
    batch_train_domain = Input(train_src_tlst_domain, train_trg_tlst_domain, wargs.batch_size)

    tests_data = None
    if wargs.tests_prefix is not None:
        init_dir(wargs.dir_tests)
        tests_data = {}
        for prefix in wargs.tests_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)
            test_file = '{}{}.{}'.format(wargs.val_tst_dir, prefix, wargs.val_src_suffix)
            wlog('Preparing test set from {} ... '.format(test_file))
            test_src_tlst, _ = val_wrap_data(test_file, src_vocab)
            tests_data[prefix] = Input(test_src_tlst, None, 1, volatile=True)

    sv = vocab_data['src'].idx2key
    tv = vocab_data['trg'].idx2key

    nmtModel = NMT(src_vocab_size, trg_vocab_size)

    if wargs.pre_train is not None:
        assert os.path.exists(wargs.pre_train), 'Requires a pre-trained model'
        _dict = _load_model(wargs.pre_train)
        # initializing parameters of interactive attention model
        class_dict = None
        if len(_dict) == 4:
            model_dict, eid, bid, optim = _dict
        elif len(_dict) == 5:
            model_dict, class_dict, eid, bid, optim = _dict
        for name, param in nmtModel.named_parameters():
            if name in model_dict:
                param.requires_grad = not wargs.fix_pre_params
                param.data.copy_(model_dict[name])
                wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.weight'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.weight'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            elif name.endswith('map_vocab.bias'):
                if class_dict is not None:
                    param.requires_grad = not wargs.fix_pre_params
                    param.data.copy_(class_dict['map_vocab.bias'])
                    wlog('{:7} -> grad {}\t{}'.format('Model', param.requires_grad, name))
            else:
                init_params(param, name, True)
        wargs.start_epoch = eid + 1
    else:
        for n, p in nmtModel.named_parameters():
            init_params(p, n, True)
        optim = Optim(wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm,
                      learning_rate_decay=wargs.learning_rate_decay,
                      start_decay_from=wargs.start_decay_from,
                      last_valid_bleu=wargs.last_valid_bleu)

    if wargs.gpu_id:
        nmtModel.cuda()
        wlog('Push model onto GPU[{}] ... '.format(wargs.gpu_id[0]))
    else:
        nmtModel.cpu()
        wlog('Push model onto CPU ... ')

    wlog(nmtModel)
    wlog(optim)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))

    optim.init_optimizer(nmtModel.parameters())

    trainer = Trainer(nmtModel, batch_train, batch_train_domain, vocab_data, optim,
                      batch_valid, tests_data)
    trainer.train()
A.add_argument('--beam-size', dest='beam_size', default=wargs.beam_size, help='beam size')
A.add_argument('--len-norm', dest='len_norm', type=int, default=1,
               help='During searching, whether we normalize accumulated loss by length.')
'''
args = A.parse_args()
model_file = args.model_file
'''
search_mode = args.search_mode
beam_size = args.beam_size
lenNorm = args.len_norm
'''

if wargs.share_vocab is False:
    wlog('Starting to load both vocabularies ... ')
    assert os.path.exists(wargs.src_vcb) and os.path.exists(wargs.trg_vcb), 'need vocabulary ...'
    src_vocab = extract_vocab(None, wargs.src_vcb)
    trg_vocab = extract_vocab(None, wargs.trg_vcb)
else:
    wlog('Starting to load the shared vocabulary ... ')
    assert os.path.exists(wargs.src_vcb), 'need shared vocabulary ...'
    trg_vocab = src_vocab = extract_vocab(None, wargs.src_vcb)
n_src_vcb, n_trg_vcb = src_vocab.size(), trg_vocab.size()
wlog('Vocabulary size: |source|={}, |target|={}'.format(n_src_vcb, n_trg_vcb))

# wv = KeyedVectors.load('word_vector_en', mmap='r')
# voc = list(wv.vocab)
# weight = tc.zeros(n_trg_vcb, 100)
def main():
    # Check if CUDA is available
    if cuda.is_available():
        wlog('CUDA is available, specify device by gpu_id argument (i.e. gpu_id=[3])')
    else:
        wlog('Warning: CUDA is not available, try CPU')
    if wargs.gpu_id:
        cuda.set_device(wargs.gpu_id[0])
        wlog('Using GPU {}'.format(wargs.gpu_id[0]))

    init_dir(wargs.dir_model)
    init_dir(wargs.dir_valid)
    init_dir(wargs.dir_tests)
    for prefix in wargs.tests_prefix:
        if not prefix == wargs.val_prefix:
            init_dir(wargs.dir_tests + '/' + prefix)

    wlog('Preparing data ... ', 0)

    train_srcD_file = wargs.dir_data + 'train.10k.zh5'
    wlog('\nPreparing source vocabulary from {} ... '.format(train_srcD_file))
    src_vocab = extract_vocab(train_srcD_file, wargs.src_dict, wargs.src_dict_size)

    train_trgD_file = wargs.dir_data + 'train.10k.en5'
    wlog('\nPreparing target vocabulary from {} ... '.format(train_trgD_file))
    trg_vocab = extract_vocab(train_trgD_file, wargs.trg_dict, wargs.trg_dict_size)

    train_src_file = wargs.dir_data + 'train.10k.zh0'
    train_trg_file = wargs.dir_data + 'train.10k.en0'
    wlog('\nPreparing training set from {} and {} ... '.format(train_src_file, train_trg_file))
    train_src_tlst, train_trg_tlst = wrap_data(train_src_file, train_trg_file, src_vocab, trg_vocab)
    #list [torch.LongTensor (sentence), torch.LongTensor, torch.LongTensor, ...], no padding
    wlog('Sentence-pairs count in training data: {}'.format(len(train_src_tlst)))
    src_vocab_size, trg_vocab_size = src_vocab.size(), trg_vocab.size()
    wlog('Vocabulary size: |source|={}, |target|={}'.format(src_vocab_size, trg_vocab_size))

    batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)

    tests_data = None
    if wargs.tests_prefix is not None:
        tests_data = {}
        for prefix in wargs.tests_prefix:
            test_file = wargs.val_tst_dir + prefix + '.src'
            test_src_tlst, _ = val_wrap_data(test_file, src_vocab)
            # we select the best model by the nist03 test data
            if prefix == wargs.val_prefix:
                wlog('\nPreparing model-select set from {} ... '.format(test_file))
                batch_valid = Input(test_src_tlst, None, 1, volatile=True, prefix=prefix)
            else:
                wlog('\nPreparing test set from {} ... '.format(test_file))
                tests_data[prefix] = Input(test_src_tlst, None, 1, volatile=True)

    nmtModel = NMT()
    classifier = Classifier(wargs.out_size, trg_vocab_size)

    if wargs.pre_train:
        model_dict, class_dict, eid, bid, optim = load_pytorch_model(wargs.pre_train)
        if isinstance(optim, list):
            _, _, optim = optim
        # initializing parameters of interactive attention model
        for p in nmtModel.named_parameters():
            p[1].data = model_dict[p[0]]
        for p in classifier.named_parameters():
            p[1].data = class_dict[p[0]]
        #wargs.start_epoch = eid + 1
    else:
        for p in nmtModel.parameters():
            init_params(p, uniform=True)
        for p in classifier.parameters():
            init_params(p, uniform=True)
        optim = Optim(wargs.opt_mode, wargs.learning_rate, wargs.max_grad_norm,
                      learning_rate_decay=wargs.learning_rate_decay,
                      start_decay_from=wargs.start_decay_from,
                      last_valid_bleu=wargs.last_valid_bleu)

    if wargs.gpu_id:
        wlog('Push model onto GPU ... ')
        nmtModel.cuda()
        classifier.cuda()
    else:
        wlog('Push model onto CPU ... ')
        nmtModel.cpu()
        classifier.cpu()

    nmtModel.classifier = classifier
    wlog(nmtModel)
    pcnt1 = len([p for p in nmtModel.parameters()])
    pcnt2 = sum([p.nelement() for p in nmtModel.parameters()])
    wlog('Parameters number: {}/{}'.format(pcnt1, pcnt2))
    optim.init_optimizer(nmtModel.parameters())

    #tor = Translator(nmtModel, src_vocab.idx2key, trg_vocab.idx2key)
    #tor.trans_tests(tests_data, pre_dict['epoch'], pre_dict['batch'])

    trainer = Trainer(nmtModel, src_vocab.idx2key, trg_vocab.idx2key, optim, trg_vocab_size)

    dev_src0 = wargs.dir_data + 'dev.1k.zh0'
    dev_trg0 = wargs.dir_data + 'dev.1k.en0'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src0, dev_trg0))
    dev_src0, dev_trg0 = wrap_data(dev_src0, dev_trg0, src_vocab, trg_vocab)
    wlog(len(train_src_tlst))

    # add 1000 to train
    train_all_chunks = (train_src_tlst, train_trg_tlst)
    dh = DataHisto(train_all_chunks)

    dev_src1 = wargs.dir_data + 'dev.1k.zh1'
    dev_trg1 = wargs.dir_data + 'dev.1k.en1'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src1, dev_trg1))
    dev_src1, dev_trg1 = wrap_data(dev_src1, dev_trg1, src_vocab, trg_vocab)

    dev_src2 = wargs.dir_data + 'dev.1k.zh2'
    dev_trg2 = wargs.dir_data + 'dev.1k.en2'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src2, dev_trg2))
    dev_src2, dev_trg2 = wrap_data(dev_src2, dev_trg2, src_vocab, trg_vocab)

    dev_src3 = wargs.dir_data + 'dev.1k.zh3'
    dev_trg3 = wargs.dir_data + 'dev.1k.en3'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src3, dev_trg3))
    dev_src3, dev_trg3 = wrap_data(dev_src3, dev_trg3, src_vocab, trg_vocab)

    dev_src4 = wargs.dir_data + 'dev.1k.zh4'
    dev_trg4 = wargs.dir_data + 'dev.1k.en4'
    wlog('\nPreparing dev set for tuning from {} and {} ... '.format(dev_src4, dev_trg4))
    dev_src4, dev_trg4 = wrap_data(dev_src4, dev_trg4, src_vocab, trg_vocab)

    wlog(len(dev_src4 + dev_src3 + dev_src2 + dev_src1 + dev_src0))
    dev_input = Input(dev_src4 + dev_src3 + dev_src2 + dev_src1 + dev_src0,
                      dev_trg4 + dev_trg3 + dev_trg2 + dev_trg1 + dev_trg0, wargs.batch_size)
    trainer.train(dh, dev_input, 0, batch_valid, tests_data, merge=True, name='DH_{}'.format('dev'))

    '''
    chunk_size = 1000
    rand_ids = tc.randperm(len(train_src_tlst))[:chunk_size * 1000]
    rand_ids = rand_ids.split(chunk_size)
    #train_chunks = [(dev_src, dev_trg)]
    train_chunks = []
    for k in range(len(rand_ids)):
        rand_id = rand_ids[k]
        chunk_src_tlst = [train_src_tlst[i] for i in rand_id]
        chunk_trg_tlst = [train_trg_tlst[i] for i in rand_id]
        #wlog('Sentence-pairs count in training data: {}'.format(len(src_samples_train)))
        #batch_train = Input(train_src_tlst, train_trg_tlst, wargs.batch_size)
        #batch_train = Input(src_samples_train, trg_samples_train, wargs.batch_size)
        train_chunks.append((chunk_src_tlst, chunk_trg_tlst))

    chunk_D0 = train_chunks[0]
    dh = DataHisto(chunk_D0)
    c0_input = Input(chunk_D0[0], chunk_D0[1], wargs.batch_size)
    trainer.train(dh, c0_input, 0, batch_valid, tests_data, merge=False, name='DH_{}'.format(0))
    for k in range(1, len(train_chunks)):
        wlog('*' * 30, False)
        wlog(' Next Data {} '.format(k), False)
        wlog('*' * 30)
        chunk_Dk = train_chunks[k]
        ck_input = Input(chunk_Dk[0], chunk_Dk[1], wargs.batch_size)
        trainer.train(dh, ck_input, k, batch_valid, tests_data, merge=True, name='DH_{}'.format(k))
        dh.add_batch_data(chunk_Dk)
    '''

    if tests_data and wargs.final_test:
        bestModel = NMT()
        classifier = Classifier(wargs.out_size, trg_vocab_size)
        assert os.path.exists(wargs.best_model)
        model_dict = tc.load(wargs.best_model)
        best_model_dict = model_dict['model']
        best_model_dict = {k: v for k, v in best_model_dict.items() if 'classifier' not in k}
        bestModel.load_state_dict(best_model_dict)
        classifier.load_state_dict(model_dict['class'])
        if wargs.gpu_id:
            wlog('Push NMT model onto GPU ... ')
            bestModel.cuda()
            classifier.cuda()
        else:
            wlog('Push NMT model onto CPU ... ')
            bestModel.cpu()
            classifier.cpu()
        bestModel.classifier = classifier
        tor = Translator(bestModel, src_vocab.idx2key, trg_vocab.idx2key)
        tor.trans_tests(tests_data, model_dict['epoch'], model_dict['batch'])