def __init__(self, path, lang, n_layers=3, d_model=128, head=4, d_ff=512,
             dropout=0.2, lr=0.001, max_len=30):
    super(VanillaTransformer, self).__init__()
    if path:
        self.model = torch.load(path)
    else:
        self.model = make_model(lang.vectors, N=n_layers, d_model=d_model,
                                d_ff=d_ff, h=head, dropout=dropout)
    self.lang = lang
    self.opt = torch.optim.Adam(self.model.parameters(), lr=lr)
    self.max_len = min(30, max_len)
    self.model.cuda()
    self.name = 'VanillaTransformer'
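# Hypothetical usage sketch for the `path` branch above: __init__ expects a file
# written with torch.save on the *whole* module (not just a state_dict). The
# `wrapper` name, save path, and `lang` object here are assumptions for
# illustration only.
wrapper = VanillaTransformer(None, lang)                       # fresh model via make_model
torch.save(wrapper.model, 'vanilla_transformer.pt')            # persist the full nn.Module
reloaded = VanillaTransformer('vanilla_transformer.pt', lang)  # reload via torch.load(path)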
def loadModel(PATH, SRC, TGT):
    state = torch.load(PATH)
    model = transformer.make_model(len(SRC.vocab), len(TGT.vocab))
    model.load_state_dict(state['state_dict'])
    batchSize = state['batchSize']
    epoch = state['epoch']
    return model, batchSize, epoch
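# A minimal sketch of the matching save routine, inferred from the keys read
# above ('state_dict', 'batchSize', 'epoch'); the actual saver is not shown in
# this snippet.
def saveModel(PATH, model, batchSize, epoch):
    torch.save({'state_dict': model.state_dict(),
                'batchSize': batchSize,
                'epoch': epoch}, PATH)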
def __init__(self, config):
    super(TransEncoder, self).__init__(config)
    self.config = config
    self.w2s = SequentialRepr(config, input_dim=config.embed_dim, mode="lstm")
    self.pe = PositionalEncoding(config.hidden_dim, config.dropout)
    self.s2d = make_model(N=config.num_layers,
                          d_model=config.hidden_dim, dropout=config.dropout)
    self.layer_norm = LayerNorm(config.hidden_dim)
    self.add_att = AttNet(config, config.hidden_dim)
    self.fc = nn.Linear(config.hidden_dim * 2, config.num_classes)
def __init__(self, config):
    super(TransEncoder, self).__init__(config)
    self.sent_repr_dim = config.hidden_dim
    self.w2s = SequentialRepr(config, input_dim=config.embed_dim, mode="lstm")
    # self.w2s_tl = SequentialRepr(config,
    #                              input_dim=config.embed_dim, mode="lstm")
    # self.s2d = SequentialRepr(config,
    #                           input_dim=config.hidden_dim, mode="lstm")
    self.pe = PositionalEncoding(self.sent_repr_dim, config.dropout)
    self.s2d = make_model(N=config.num_layers,
                          d_model=self.sent_repr_dim, dropout=config.dropout)
    self.satt_layer = AttNet(config, config.hidden_dim)
    self.datt_layer = AttNet(config, config.hidden_dim * 2)
    self.dropout = nn.Dropout(p=config.dropout)
    self.fc = nn.Linear(config.hidden_dim * 2, config.num_classes)
PATH = 'data_balance.csv'
start_time = time.time()
print("Loading data...")
train_data = build_dataset('./train.csv')
dev_data = build_dataset('./test.csv')
# print(train_data)
train_iter = build_iterator(train_data)
dev_iter = build_iterator(dev_data)
test_iter = dev_iter
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

# train
model = make_model().to(Config.device)
# model = ff().to('cuda')
print('init')
'''
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
'''
for name, w in model.named_parameters():
    if 'embedding' not in name:
        if len(w.size()) < 2:
            continue
        if 'weight' in name:
            nn.init.xavier_normal_(w)
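# A self-contained toy check of the init rule above (the ModuleDict and its
# sizes are hypothetical): embedding weights and 1-D biases are skipped, and
# only 2-D non-embedding weights get Xavier-normal initialization.
import torch.nn as nn

toy = nn.ModuleDict({'embedding': nn.Embedding(10, 4), 'proj': nn.Linear(4, 2)})
for name, w in toy.named_parameters():
    if 'embedding' not in name and len(w.size()) >= 2 and 'weight' in name:
        nn.init.xavier_normal_(w)  # only 'proj.weight' is touched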
    'd_model': args.dmodel,
    'N': args.nstacklayers,
    'h': args.heads,
    'N_dense': args.Ndense,
    'lambda_attention': args.lattn,
    'lambda_distance': args.ldist,
    'leaky_relu_slope': 0.1,
    'dense_output_nonlinearity': 'relu',
    'distance_matrix_kernel': 'exp',
    'dropout': args.dropout,
    'aggregation_type': 'mean'
}

print('Making Model')
model = make_model(**model_params)

if args.pretrain:
    print(f'Loading pretrained weights from: {args.pretrain}')
    pretrained_state_dict = torch.load(args.pretrain)
    model_state_dict = model.state_dict()
    for name, param in pretrained_state_dict.items():
        if 'generator' in name:
            continue
        if isinstance(param, torch.nn.Parameter):
            param = param.data
        model_state_dict[name].copy_(param)

param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Number of parameters:', param_count)

if args.wandb:
    wandb.watch(model, log='all')
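# A more compact alternative to the manual copy loop above (a sketch, not the
# script's own code): drop the generator keys and let load_state_dict handle
# the rest, keeping the randomly initialized generator.
filtered = {k: v for k, v in pretrained_state_dict.items() if 'generator' not in k}
model.load_state_dict(filtered, strict=False)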
            start = time.time()
            tokens = 0
    return total_loss / total_tokens


if __name__ == "__main__":
    dataset = NMTDataset.load_dataset_and_make_vectorizer(
        # "/home/liuxd/home/NLP/PyTorchNLPBook/code4model/data/translation2019zh_train-df_100000.csv"
        "/home/liuxd/home/NLP/PyTorchNLPBook/code4model/data/translation2019zh_train-df_70w.csv"
    )
    src_vocab_size = len(dataset.get_vectorizer().source_vocab)
    tgt_vocab_size = len(dataset.get_vectorizer().target_vocab)
    padding_idx = dataset.get_vectorizer().target_vocab.lookup_token('<MASK>')

    criterion = LabelSmoothing(size=tgt_vocab_size, padding_idx=0, smoothing=0.1)
    criterion.cuda()
    model = make_model(src_vocab_size, tgt_vocab_size, 6)
    model.cuda()
    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 8000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    loss_compute = SimpleLossCompute(model.generator, criterion, model_opt)

    # train
    model.train()
    for epoch in range(10):
        data_iter = generate_nmt_batches(dataset, 16, device="cuda")
        run_epoch(data_iter, model, loss_compute)
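# run_epoch, whose tail appears at the top of this snippet, follows the
# Annotated Transformer; a standard sketch with the periodic timing/logging
# (the `start`/`tokens` bookkeeping above) omitted:
def run_epoch(data_iter, model, loss_compute):
    total_tokens, total_loss = 0, 0
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
    return total_loss / total_tokens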
test_dataset = DL.SNLT_Dataset(split='test', gloss=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

src_vocab = len(train_dataset.gloss_dictionary.idx2word)
trg_vocab = len(train_dataset.dictionary.idx2word)
device = 'cpu'

model_cp = args.model
N_blocks = args.n_blocks
d_model = args.d_model
d_ff = args.d_ff
att_heads = args.att_heads

model = tf.make_model(src_vocab, trg_vocab, N=N_blocks,
                      d_model=d_model, d_ff=d_ff, h=att_heads)
model.load_state_dict(torch.load(model_cp, map_location=torch.device(device)))

score_model(model, test_loader, device, train_dataset.dictionary, verbose=True)

file_path = './models/G2T/NLL/bs128_NLL/generated_corpus.txt'
# write_corpus(pred_corpus, file_path)
        # return loss.data[0] * norm  # TODO
        return loss.item() * norm


# Train the simple copy task.
device = "cuda"
nrof_epochs = 20
batch_size = 32
V = 11             # vocabulary size
sequence_len = 15  # length of the generated sequences
# nrof_batch_train_epoch = 20  # batches per epoch during training
# nrof_batch_valid_epoch = 5   # batches per epoch during validation
nrof_batch_train_epoch = 30  # batches per epoch during training
nrof_batch_valid_epoch = 10  # batches per epoch during validation

criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400, optimizer)
if device == "cuda":
    model.cuda()

for epoch in range(nrof_epochs):
    print(f"\nepoch {epoch}")
    print("train...")
    model.train()
    data_iter = data_gen(V, sequence_len, batch_size, nrof_batch_train_epoch, device)
    loss_compute = SimpleLossCompute(model.generator, criterion, model_opt)
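# NoamOpt above implements the Transformer warmup schedule
#   lr = factor * d_model**(-0.5) * min(step**(-0.5), step * warmup**(-1.5)).
# A plain-function sketch mirroring the call above (factor=1, warmup=400;
# d_model=512 assumes make_model's default):
def noam_rate(step, d_model=512, factor=1, warmup=400):
    return factor * d_model ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))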
    batchSize = state['batchSize']
    epoch = state['epoch']
    return model, batchSize, epoch


"""**Initialize model, optimizer, criterion and iterators**"""

if loadPreTrain or justEvaluate:
    print("Loading pre-trained network")
    model, BATCH_SIZE, previousEpochNb = loadModel(modelSavePath, SRC, TGT)
else:
    print("Initializing network")
    model = transformer.make_model(len(SRC.vocab), len(TGT.vocab), N=6)

model_opt = transformer.NoamOpt(
    model.src_embed[0].d_model, 1, 2000,
    torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                     lr=0.001, betas=(0.9, 0.98), eps=1e-8))
model.cuda()

# criterion = transformer.LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
criterion = nn.CrossEntropyLoss()
# criterion.cuda()

print("Initializing iterators")
# train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=device,
#                         repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
    print('Training end to end model')
else:
    import transformer as tf
    print('Training gloss to text model')

src_vocab = len(train_dataset.gloss_dictionary.idx2word)
trg_vocab = len(train_dataset.dictionary.idx2word)

train_loader = DataLoader(train_dataset, batch_size=args.b_size,
                          shuffle=True, num_workers=args.workers)
dev_loader = DataLoader(dev_dataset, batch_size=args.b_size,
                        shuffle=True, num_workers=args.workers)
test_loader = DataLoader(test_dataset, batch_size=1)

criterion = tf.LabelSmoothing(size=trg_vocab, padding_idx=0, smoothing=0.0)
model = tf.make_model(src_vocab, trg_vocab, N=args.n_blocks,
                      d_model=args.d_model, d_ff=args.d_ff, h=args.att_heads)
if args.checkpoint is not None:
    model.load_state_dict(torch.load(args.checkpoint))
    print('Loaded state_dict into the model before starting training')
model.to(device)
model_opt = tf.NoamOpt(args.d_model, 1, 2000,
                       torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                        betas=(0.9, 0.98), eps=1e-9))

if __name__ == '__main__':
    mp.set_start_method('spawn')
    train_losses = []
    dev_losses = []
    best_loss = None
def main():
    args = parser.parse_args()

    # load dataset
    # sent_pairs = load_dataset_aihub()
    sent_pairs = load_dataset_aihub(path='data/')
    # random.seed(100)
    # random.shuffle(sent_pairs)

    # make dataloader with dataset
    # FIXME: RuntimeError: Internal: unk is not defined.
    inp_lang, out_lang = get_sentencepiece(src_prefix, trg_prefix)
    log.info('loaded input sentencepiece model: {}'.format(src_prefix))
    log.info('loaded output sentencepiece model: {}'.format(trg_prefix))

    # split train/valid sentence pairs
    n_train = int(len(sent_pairs) * 0.8)
    valid_sent_pairs = sent_pairs[n_train:]
    log.info('valid_sent_pairs: {}'.format(len(valid_sent_pairs)))

    # these are used for defining the tokenize method and some reserved words
    SRC = KRENField(pad_token='<pad>')
    TRG = KRENField(pad_token='<pad>')

    # load SRC/TRG
    if not os.path.exists('spm/{}.model'.format(src_prefix)) or \
       not os.path.exists('spm/{}.model'.format(trg_prefix)):
        # build vocabulary
        SRC.build_vocab(train.src)
        TRG.build_vocab(train.trg)
        torch.save(SRC.vocab, 'spm/{}.spm'.format(src_prefix), pickle_module=dill)
        torch.save(TRG.vocab, 'spm/{}.spm'.format(trg_prefix), pickle_module=dill)
        log.info('input vocab was created and saved: spm/{}.spm'.format(src_prefix))
        log.info('output vocab was created and saved: spm/{}.spm'.format(trg_prefix))
    else:
        src_vocab = torch.load('spm/{}.spm'.format(src_prefix), pickle_module=dill)
        trg_vocab = torch.load('spm/{}.spm'.format(trg_prefix), pickle_module=dill)
        SRC.vocab = src_vocab
        TRG.vocab = trg_vocab
        log.info('input vocab was loaded: spm/{}.spm'.format(src_prefix))
        log.info('output vocab was loaded: spm/{}.spm'.format(trg_prefix))

    SRC.tokenize = inp_lang.EncodeAsIds
    TRG.tokenize = out_lang.EncodeAsIds
    SRC.detokenize = inp_lang.DecodeIds
    TRG.detokenize = out_lang.DecodeIds

    # make dataloader from KRENDataset
    # train, valid, test = KRENDataset.splits(sent_pairs, (SRC, TRG), inp_lang, out_lang, encoding_type='ids')
    train, valid, test = KRENDataset.splits(sent_pairs, (SRC, TRG), inp_lang,
                                            out_lang, encoding_type='pieces')
    valid_iter = MyIterator(valid, batch_size=100, device=0, repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)

    # encoding_decoding_test(SRC, sent_pairs[0][0])
    # encoding_decoding_test(TRG, sent_pairs[0][1])

    # fix torch randomness
    fix_torch_randomness()

    # define input/output size
    args.inp_n_words = src_vocab_size
    args.out_n_words = trg_vocab_size
    log.info('inp_n_words: {} out_n_words: {}'.format(args.inp_n_words,
                                                      args.out_n_words))

    # define model
    if args.small_model:
        model = make_model(args.inp_n_words, args.out_n_words, dropout=args.dropout)
    else:
        model = make_model(args.inp_n_words, args.out_n_words, N=N,
                           d_model=args.d_model, d_ff=args.d_ff, h=args.h,
                           dropout=args.dropout)

    # model_name_full_path = './models/model-tmp.bin'
    model_name_full_path = args.modelnm
    checkpoint = torch.load(model_name_full_path)
    state_dict = checkpoint['state_dict']
    model.load_state_dict(state_dict)
    model.cuda()
    model.eval()

    for i, batch in enumerate(valid_iter):
        src = batch.src.transpose(0, 1)[:1]
        src_mask = (src != SRC.vocab.stoi["<pad>"]).unsqueeze(-2)
        print(SRC.detokenize(src.numpy()[0].tolist()))

        print("Input:", end="\t")
        for i in range(src.size(1)):
            sym = SRC.vocab.itos[src[0, i].data.item()]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print('')

        out = greedy_decode(model, src.cuda(), src_mask.cuda(),
                            max_len=60, start_symbol=TRG.vocab.stoi["<s>"])

        print("Translation with TRG:", end="\t")
        for i in range(1, out.size(1)):
            sym = TRG.vocab.itos[out[0, i].data.item()]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print('')

        print("Translation with tokenize:", end="\t")
        out_list = []
        for i in range(1, out.size(1)):
            sym = out[0, i].data.item()
            if sym == TRG.vocab.stoi['</s>']:
                break
            out_list.append(sym)
        print(TRG.detokenize(out_list))

        print("Target:", end="\t")
        for i in range(1, batch.trg.size(0)):
            sym = TRG.vocab.itos[batch.trg.data[i, 0].item()]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print('')
        print('---------------')
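# greedy_decode, as called above, is not defined in this snippet; a standard
# Annotated Transformer sketch, assuming the model exposes encode/decode/
# generator and that a subsequent_mask helper is in scope:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for _ in range(max_len - 1):
        out = model.decode(memory, src_mask, ys,
                           subsequent_mask(ys.size(1)).type_as(src.data))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        ys = torch.cat([ys, next_word.view(1, 1).type_as(src.data)], dim=1)
    return ys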
def __init__(self, params, newFeats=0, behavFeats=0):
    super(highwayNet, self).__init__()
    self.newFeats = newFeats
    self.behavFeats = behavFeats
    self.att_weights = None
    self.use_spatial_attention = True

    ## Unpack arguments
    self.params = params

    ## Use gpu flag
    self.use_cuda = params.use_cuda

    # Flag for maneuver based (True) vs uni-modal decoder (False)
    self.use_maneuvers = params.use_maneuvers

    if params.use_grid == 2:
        self.use_grid_soc = True
        self.use_grid = False
    else:
        self.use_grid = params.use_grid
        self.use_grid_soc = False

    # Transformer architecture related
    self.use_transformer = params.use_transformer
    self.teacher_forcing_ratio = 0.0  # Set to 0: we overfit otherwise

    # RNN-LSTM Seq2seq architecture related
    self.use_bidir = params.use_bidir  # NB: seq2seq always uses a bidir encoder
    self.use_seq2seq = params.use_seq2seq
    if self.use_seq2seq:
        self.use_bidir = True

    # RNN-LSTM with Attention architecture related
    self.use_attention = params.use_attention
    if self.use_attention:
        self.use_bidir = True

    # Flag for train mode (True) vs test mode (False)
    self.train_flag = params.train_flag
    if self.train_flag is False:
        self.teacher_forcing_ratio = 0.0

    ## Sizes of network layers
    self.encoder_size = params.encoder_size
    self.decoder_size = params.decoder_size
    self.in_length = params.in_length
    self.out_length = params.out_length
    self.grid_size = params.grid_size
    self.soc_conv_depth = params.soc_conv_depth
    self.conv_3x1_depth = params.conv_3x1_depth
    self.dyn_embedding_size = params.dyn_embedding_size
    self.input_embedding_size = params.input_embedding_size
    self.num_lat_classes = params.num_lat_classes
    self.num_lon_classes = params.num_lon_classes
    self.soc_embedding_size = (((params.grid_size[0] - 4) + 1) // 2) * self.conv_3x1_depth

    ## Define network weights

    # TRANSFORMER
    if self.use_transformer:
        src_feats = 2 + self.newFeats  # (X,Y) point or (X,Y,A/V)
        tgt_feats = 2                  # (X,Y) point
        tgt_params = 5                 # 5 params for a bivariate Gaussian distribution
        if self.use_grid or self.use_grid_soc:
            src_ngrid = self.in_length  # with soc
        else:
            src_ngrid = 0               # without soc
        if self.use_maneuvers:
            d_lon = self.num_lon_classes
            d_lat = self.num_lat_classes
        else:
            d_lon = 0
            d_lat = 0
        if self.use_grid_soc:
            self.transformer = tsf.make_model(
                src_feats, tgt_feats, tgt_params=tgt_params, src_ngrid=src_ngrid,
                src_lon=d_lon, src_lat=d_lat,
                src_soc_emb_size=self.soc_embedding_size,
                src_grid=self.params.grid_size)
        else:
            self.transformer = tsf.make_model(
                src_feats, tgt_feats, tgt_params=tgt_params, src_ngrid=src_ngrid,
                src_lon=d_lon, src_lat=d_lat, src_grid=self.params.grid_size)
        print("TRANSFORMER:", self.transformer)
        self.batch = tsf.Batch()

    # Input embedding layer
    self.n_feats = 2 + self.newFeats
    self.ip_emb = torch.nn.Linear(self.n_feats, self.input_embedding_size)

    # Spatial Attention Path pipeline: a dedicated LSTM encoder plus 2x conv/pool
    # to process behavioral features
    if self.newFeats > 0 and self.use_spatial_attention:
        self.use_bidir = False  # just to keep the code simple
        # Same pipeline as SOC, but with its own weights
        self.ip_behav_emb = torch.nn.Linear(self.n_feats, self.input_embedding_size)
        self.enc_behav_lstm = torch.nn.LSTM(self.input_embedding_size,
                                            self.encoder_size, 1)
        # Spatial Attention layer
        self.op_att1 = torch.nn.Linear(self.encoder_size, 10)
        self.op_att2 = torch.nn.Linear(10, 1)

    # Encoder LSTM
    if self.use_bidir:
        self.enc_lstm = torch.nn.LSTM(self.input_embedding_size, self.encoder_size, 1,
                                      bidirectional=True)
        self.encoder_ndir = 2
    else:
        self.enc_lstm = torch.nn.LSTM(self.input_embedding_size, self.encoder_size, 1)
        self.encoder_ndir = 1

    # Vehicle dynamics embedding
    self.dyn_emb = torch.nn.Linear(self.encoder_size, self.dyn_embedding_size)

    # Convolutional social pooling layer and social embedding layer
    self.soc_conv = torch.nn.Conv2d(self.encoder_size, self.soc_conv_depth, 3)
    self.soc_conv_3x1 = torch.nn.Conv2d(self.soc_conv_depth, self.conv_3x1_depth, (3, 1))
    self.soc_maxpool = torch.nn.MaxPool2d((2, 1), padding=(1, 0))

    # FC social pooling layer (for comparison):
    # self.soc_fc = torch.nn.Linear(self.soc_conv_depth * self.grid_size[0] * self.grid_size[1],
    #                               (((params.grid_size[0] - 4) + 1) // 2) * self.conv_3x1_depth)

    if self.use_seq2seq or self.use_attention:
        # Decoder seq2seq LSTM (Attention builds on top of seq2seq)
        if self.use_maneuvers:
            self.proj_seq2seq = torch.nn.Linear(
                self.soc_embedding_size + self.encoder_ndir * self.dyn_embedding_size
                + self.num_lat_classes + self.num_lon_classes,
                self.decoder_size)
        else:
            self.proj_seq2seq = torch.nn.Linear(
                self.soc_embedding_size + self.encoder_ndir * self.dyn_embedding_size,
                self.decoder_size)
        if self.use_seq2seq:
            self.num_layers = 2  # XXX
        else:
            self.num_layers = 1  # XXX
        self.dec_seq2seq = torch.nn.LSTM(self.decoder_size, self.decoder_size,
                                         num_layers=self.num_layers)
    elif self.use_transformer is False:
        # Legacy Decoder LSTM
        if self.use_maneuvers:
            self.dec_lstm = torch.nn.LSTM(
                self.soc_embedding_size + self.dyn_embedding_size
                + self.num_lat_classes + self.num_lon_classes,
                self.decoder_size)
        else:
            self.dec_lstm = torch.nn.LSTM(
                self.soc_embedding_size + self.dyn_embedding_size,
                self.decoder_size)

    if self.use_attention:
        self.attn_densor1 = torch.nn.Linear(
            self.encoder_ndir * self.encoder_size + self.decoder_size, 10)
        self.attn_densor2 = torch.nn.Linear(10, 1)

    # Output layers:
    if self.use_transformer is False:
        self.op = torch.nn.Linear(self.decoder_size, 5)
    self.op_lat = torch.nn.Linear(
        self.soc_embedding_size + self.dyn_embedding_size, self.num_lat_classes)
    self.op_lon = torch.nn.Linear(
        self.soc_embedding_size + self.dyn_embedding_size, self.num_lon_classes)

    # Activations:
    self.leaky_relu = torch.nn.LeakyReLU(0.1)
    self.relu = torch.nn.ReLU()
    self.softmax = torch.nn.Softmax(dim=1)
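# A quick shape check of the social-pooling stack above (all sizes are
# hypothetical: grid_size=(13, 3), encoder_size=64, soc_conv_depth=16,
# conv_3x1_depth=16), confirming the soc_embedding_size formula:
import torch

x = torch.randn(1, 64, 13, 3)                      # (batch, encoder_size, *grid_size)
y = torch.nn.Conv2d(64, 16, 3)(x)                  # -> (1, 16, 11, 1)
y = torch.nn.Conv2d(16, 16, (3, 1))(y)             # -> (1, 16, 9, 1)
y = torch.nn.MaxPool2d((2, 1), padding=(1, 0))(y)  # -> (1, 16, 5, 1)
# 16 * 5 == 80 == (((13 - 4) + 1) // 2) * conv_3x1_depth, i.e. soc_embedding_size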
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
dev_dataset = NMTDataset(os.path.join(args.data, 'dev.en'),
                         os.path.join(args.data, 'dev.zh'), src_sp, trg_sp)
dev_dataloader = DataLoader(dev_dataset,
                            batch_size=args.test_batch_size,
                            shuffle=False,
                            collate_fn=collate_fn)
test_dataset = NMTDataset(os.path.join(args.data, 'test.en'),
                          os.path.join(args.data, 'test.zh'), src_sp, trg_sp)
test_dataloader = DataLoader(test_dataset,
                             batch_size=args.test_batch_size,
                             shuffle=False,
                             collate_fn=collate_fn)

model = make_model(src_vocab=train_dataset.src_vocabs_size,
                   tgt_vocab=train_dataset.trg_vocabs_size,
                   N=args.layers, d_model=args.d_model, d_ff=args.d_ff,
                   h=args.heads, dropout=args.dropout)
model = model.cuda()
print('total #parameters: {}'.format(sum(p.numel() for p in model.parameters())))

writer = SummaryWriter(args.output_dir)
train(train_dataloader, dev_dataloader, model, args, writer, trg_sp)
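# The collate_fn passed to the loaders above is not shown; a minimal
# hypothetical sketch that pads variable-length id sequences into a rectangular
# batch (assumes each dataset item is a (src_ids, trg_ids) pair and pad id 0):
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    src, trg = zip(*batch)
    src = pad_sequence([torch.as_tensor(s) for s in src],
                       batch_first=True, padding_value=0)
    trg = pad_sequence([torch.as_tensor(t) for t in trg],
                       batch_first=True, padding_value=0)
    return src, trg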
def main():
    args = parser.parse_args()

    if args.multi_gpu:
        ngpus_per_node = torch.cuda.device_count()
    else:
        ngpus_per_node = 1
    args.world_size = ngpus_per_node

    global best_acc1
    args.gpu = args.local_rank
    torch.cuda.set_device(args.gpu)
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=args.world_size, rank=args.gpu)

    # load dataset
    # sent_pairs = load_dataset_aihub(path='data/')
    sent_pairs = load_dataset_aihub()
    print_log('GPU#{} seeding with {}'.format(args.gpu, args.gpu))

    # make dataloader with dataset
    # FIXME: RuntimeError: Internal: unk is not defined.
    inp_lang, out_lang = get_sentencepiece(src_prefix, trg_prefix)
    print_log('loaded input sentencepiece model: {}'.format(src_prefix))
    print_log('loaded output sentencepiece model: {}'.format(trg_prefix))

    # split train/valid sentence pairs
    n_train = int(len(sent_pairs) * 0.8)
    n_split = int(n_train * 1. / args.world_size)
    print_log(n_split * args.gpu, n_split * (args.gpu + 1))

    train_sent_pairs = sent_pairs[:n_train]
    print_log('train_sent_pairs before split: {}'.format(len(train_sent_pairs)))

    # split train dataset by GPU
    train_sent_pairs = train_sent_pairs[n_split * args.gpu:n_split * (args.gpu + 1)]
    train_sent_pairs = sorted(train_sent_pairs, key=lambda x: (len(x[0]), len(x[1])))
    print_log('train_sent_pairs after split: {} --> GPU:{}'.format(
        len(train_sent_pairs), args.gpu))

    valid_sent_pairs = sent_pairs[n_train:]
    print_log('valid_sent_pairs: {}'.format(len(valid_sent_pairs)))

    # these are used for defining the tokenize method and some reserved words
    SRC = KRENField(pad_token='<pad>')
    TRG = KRENField(pad_token='<pad>')
    SRC.decode = inp_lang.DecodeIds
    TRG.decode = out_lang.DecodeIds
    SRC.encode = inp_lang.EncodeAsIds
    TRG.encode = out_lang.EncodeAsIds

    # load SRC/TRG
    if not os.path.exists('spm/{}.model'.format(src_prefix)) or \
       not os.path.exists('spm/{}.model'.format(trg_prefix)):
        # build vocabulary
        SRC.build_vocab(train.src)
        TRG.build_vocab(train.trg)
        torch.save(SRC.vocab, 'spm/{}.spm'.format(src_prefix), pickle_module=dill)
        torch.save(TRG.vocab, 'spm/{}.spm'.format(trg_prefix), pickle_module=dill)
        print_log('input vocab was created and saved: spm/{}.spm'.format(src_prefix))
        print_log('output vocab was created and saved: spm/{}.spm'.format(trg_prefix))
    else:
        src_vocab = torch.load('spm/{}.spm'.format(src_prefix), pickle_module=dill)
        trg_vocab = torch.load('spm/{}.spm'.format(trg_prefix), pickle_module=dill)
        SRC.vocab = src_vocab
        TRG.vocab = trg_vocab
        print_log('input vocab was loaded: spm/{}.spm'.format(src_prefix))
        print_log('output vocab was loaded: spm/{}.spm'.format(trg_prefix))

    # make dataloader from KRENDataset
    train, valid, test = KRENDataset.splits(sent_pairs, (SRC, TRG), inp_lang,
                                            out_lang, encoding_type='pieces')
    # output -> ['<s>', '▁', 'Central', '▁Asian', '▁c', 'u', 'is', ...
    #            '▁yesterday', '.', '</s>']

    train_iter = MyIterator(train, batch_size=args.train_batch_size, device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(valid, batch_size=args.valid_batch_size, device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)

    # fix torch randomness
    fix_torch_randomness()

    # define input/output size
    args.inp_n_words = src_vocab_size
    args.out_n_words = trg_vocab_size
    print_log('inp_n_words: {} out_n_words: {}'.format(args.inp_n_words,
                                                       args.out_n_words))

    # define model
    model = make_model(args.inp_n_words, args.out_n_words, N=N,
                       d_model=args.d_model, d_ff=args.d_ff, h=args.h,
                       dropout=args.dropout)
    print_log('number of model parameters: {}'.format(get_number_of_params(model)))
    model.cuda()

    optimizer = get_std_opt(model, args.fp16)

    # initialize model and optimizer for amp
    model, optimizer = amp.initialize(
        model, optimizer,
        opt_level=args.opt_level,
        # keep_batchnorm_fp32=args.keep_batchnorm_fp32,
        # loss_scale=args.loss_scale
    )
    # optimizer.optimizer = opt

    if args.fp16:
        model = DDP(model, delay_allreduce=True)
    else:
        model = DDP(model, device_ids=[args.gpu])

    # define criterion
    criterion = LabelSmoothing(size=args.out_n_words, padding_idx=0, smoothing=0.1)
    criterion.cuda()

    # initial best loss
    best_val_loss = np.inf

    # initialize visdom graph
    # vis_train = Visdom()
    # vis_valid = Visdom()
    # train_loss_list = []
    # valid_loss_list = []

    if args.gpu == 0:
        randidx = '{}'.format(np.random.randint(0, 10000)).zfill(4)
        model_name = 'transformer-s{}-t{}-b{}-n{}-md{}-ff{}-h{}-r{}.bin'.format(
            args.inp_n_words,       # s  : source vocab count
            args.out_n_words,       # t  : target vocab count
            args.train_batch_size,  # b  : batch size
            args.N,                 # n  : number of layers
            args.d_model,           # md : d_model
            args.d_ff,              # ff : d_ff
            args.h,                 # h  : number of attention heads
            randidx)                # r  : random number
    else:
        model_name = 'a.bin'
    print_log('model name to be saved: {}'.format(
        os.path.join(args.model_path, model_name)))

    for epoch in range(args.epochs):
        train_losses = train_epoch((rebatch(pad_id, b) for b in train_iter),
                                   model, criterion, optimizer, args.gpu,
                                   epoch, args.fp16)
        valid_loss = valid_epoch((rebatch(pad_id, b) for b in valid_iter),
                                 model, criterion, optimizer, epoch, args.fp16)

        sum_of_weight = sum([p[1].data.sum() for p in model.named_parameters()])
        print_log('GPU{} -> sum_of_weight={:.4f}'.format(args.gpu, sum_of_weight))

        if args.gpu == 0:
            if valid_loss >= best_val_loss:
                print_log('Try again. Current best is still {:.4f} (< {:.4f})'.format(
                    best_val_loss, valid_loss))
            else:
                print_log('New record. from {:.4f} to {:.4f}'.format(
                    best_val_loss, valid_loss))
                best_val_loss = valid_loss
                save_model(args, model, optimizer, epoch, valid_loss,
                           model_name=model_name)

        # block all ranks until they reach this point
        torch.distributed.barrier()
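# rebatch, used in the epoch loops above, follows the Annotated Transformer; a
# sketch, assuming the torchtext-style iterator yields time-major tensors and
# Batch is the usual mask-building wrapper:
def rebatch(pad_idx, batch):
    "Transpose torchtext's (seq_len, batch) tensors into (batch, seq_len)."
    src, trg = batch.src.transpose(0, 1), batch.trg.transpose(0, 1)
    return Batch(src, trg, pad_idx)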
def main(args):
    src, tgt = load_data(args.path)

    src_vocab = Vocab(init_token='<sos>', eos_token='<eos>',
                      pad_token='<pad>', unk_token='<unk>')
    src_vocab.load(os.path.join(args.path, 'vocab.en'))
    tgt_vocab = Vocab(init_token='<sos>', eos_token='<eos>',
                      pad_token='<pad>', unk_token='<unk>')
    tgt_vocab.load(os.path.join(args.path, 'vocab.de'))

    sos_idx = 0
    eos_idx = 1
    pad_idx = 2
    max_length = 50

    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    # Set hyperparameters
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = make_model(src_vocab_size, tgt_vocab_size).to(device)
    optimizer = get_std_opt(model)
    criterion = LabelSmoothing(size=tgt_vocab_size, padding_idx=pad_idx, smoothing=0.1)
    train_criterion = SimpleLossCompute(model.generator, criterion, optimizer)
    valid_criterion = SimpleLossCompute(model.generator, criterion, None)
    print('Using device:', device)

    if not args.test:
        train_loader = get_loader(src['train'], tgt['train'], src_vocab, tgt_vocab,
                                  batch_size=args.batch_size, shuffle=True)
        valid_loader = get_loader(src['valid'], tgt['valid'], src_vocab, tgt_vocab,
                                  batch_size=args.batch_size)

        best_loss = 987654321
        for epoch in range(args.epochs):
            train_total_loss, valid_total_loss = 0.0, 0.0
            start = time.time()
            total_tokens = 0
            tokens = 0
            model.train()

            # Train
            for src_batch, tgt_batch in train_loader:
                src_batch = torch.tensor(src_batch).to(device)
                tgt_batch = torch.tensor(tgt_batch).to(device)
                batch = Batch(src_batch, tgt_batch, pad_idx)
                prediction = model(batch.src, batch.trg,
                                   batch.src_mask, batch.trg_mask)
                loss = train_criterion(prediction, batch.trg_y, batch.ntokens)
                train_total_loss += loss
                total_tokens += batch.ntokens
                tokens += batch.ntokens

            # Valid
            model.eval()
            for src_batch, tgt_batch in valid_loader:
                src_batch = torch.tensor(src_batch).to(device)
                tgt_batch = torch.tensor(tgt_batch).to(device)
                batch = Batch(src_batch, tgt_batch, pad_idx)
                prediction = model(batch.src, batch.trg,
                                   batch.src_mask, batch.trg_mask)
                loss = valid_criterion(prediction, batch.trg_y, batch.ntokens)
                valid_total_loss += loss
                total_tokens += batch.ntokens
                tokens += batch.ntokens

            if valid_total_loss.item() < best_loss:
                best_loss = valid_total_loss.item()
                best_model_state = model.state_dict()
                best_optimizer_state = optimizer.optimizer.state_dict()

            elapsed = time.time() - start
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                  + "|| [" + str(epoch) + "/" + str(args.epochs)
                  + "], train_loss = " + str(train_total_loss.item())
                  + ", valid_loss = " + str(valid_total_loss.item())
                  + ", Tokens per Sec = " + str(tokens.item() / elapsed))
            tokens = 0
            start = time.time()

            if epoch % 100 == 0:
                # Save model
                torch.save({'epoch': args.epochs,
                            'model_state_dict': best_model_state,
                            'optimizer_state': best_optimizer_state,
                            'loss': best_loss},
                           args.model_dir + "/intermediate.pt")
                print("Model saved")

        # Save model
        torch.save({'epoch': args.epochs,
                    'model_state_dict': best_model_state,
                    'optimizer_state': best_optimizer_state,
                    'loss': best_loss},
                   args.model_dir + "/best.pt")
        print("Model saved")
    else:
        # Load the model
        checkpoint = torch.load(args.model_dir + "/" + args.model_name,
                                map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.optimizer.load_state_dict(checkpoint['optimizer_state'])
        model.eval()
        print("Model loaded")

        # Test
        test_loader = get_loader(src['test'], tgt['test'], src_vocab, tgt_vocab,
                                 batch_size=args.batch_size)
        pred = []
        for src_batch, tgt_batch in test_loader:
            src_batch = torch.tensor(src_batch).to(device)
            tgt_batch = torch.tensor(tgt_batch).to(device)
            batch = Batch(src_batch, tgt_batch, pad_idx)

            # Get pred_batch
            memory = model.encode(batch.src, batch.src_mask)
            pred_batch = torch.ones(src_batch.size(0), 1)\
                .fill_(sos_idx).type_as(batch.src.data).to(device)
            for i in range(max_length - 1):
                out = model.decode(
                    memory, batch.src_mask, Variable(pred_batch),
                    Variable(Batch.make_std_mask(pred_batch, pad_idx)
                             .type_as(batch.src.data)))
                prob = model.generator(out[:, -1])
                prob.index_fill_(1, torch.tensor([sos_idx, pad_idx]).to(device),
                                 -float('inf'))
                _, next_word = torch.max(prob, dim=1)
                pred_batch = torch.cat([pred_batch, next_word.unsqueeze(-1)], dim=1)
            pred_batch = torch.cat(
                [pred_batch,
                 torch.ones(src_batch.size(0), 1).fill_(eos_idx)
                 .type_as(batch.src.data).to(device)],
                dim=1)

            # Every sentence in pred_batch should start with the <sos> token
            # (index: 0) and end with the <eos> token (index: 1).
            # Every <pad> token (index: 2) should come after the <eos> token.
            # Example of pred_batch:
            # [[0, 5, 6, 7, 1],
            #  [0, 4, 9, 1, 2],
            #  [0, 6, 1, 2, 2]]
            pred += seq2sen(pred_batch.tolist(), tgt_vocab)

        with open('results/pred.txt', 'w', encoding='utf-8') as f:
            for line in pred:
                f.write('{}\n'.format(line))

        os.system('bash scripts/bleu.sh results/pred.txt multi30k/test.de.atok')
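# Batch.make_std_mask, used in the decode loop above, is the Annotated
# Transformer's standard target mask (padding mask combined with a causal
# mask); a sketch, including the subsequent_mask helper it relies on (in the
# source, make_std_mask is a @staticmethod on Batch):
import torch

def subsequent_mask(size):
    "Mask out positions to the right of each target position."
    attn_shape = (1, size, size)
    mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return mask == 0

def make_std_mask(tgt, pad):
    "Hide both padding and future words."
    tgt_mask = (tgt != pad).unsqueeze(-2)
    tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)
    return tgt_mask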
    for i in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))
        print(data)
        data = data.type(torch.LongTensor)
        data[:, 0] = 1
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield Batch(src, tgt, 0)


# Train the simple copy task.
V = 11
MODEL_SIZE = 10
Heads = 2
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2, h=Heads, d_model=MODEL_SIZE, dropout=0.0)
for p in model.parameters():
    nn.init.ones_(p)
model_opt = NoamOpt(
    model.src_embed[0].d_model, 1, 400,
    torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9),
)

nepoch = 5         # 10
batch_size = 2     # 30
nbatches = 2       # 20
nbatches_eval = 5
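# Batch, yielded by the generator above, is assumed to be the Annotated
# Transformer's wrapper; a sketch showing the fields the training code relies
# on (src_mask, trg, trg_y, trg_mask, ntokens), with make_std_mask as in the
# earlier sketch:
class Batch:
    "Hold a batch with masks, shifting the target for teacher forcing."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]    # decoder input
            self.trg_y = trg[:, 1:]   # decoder target
            self.trg_mask = make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()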
def __init__(self, params):
    super(highwayNet, self).__init__()

    ## Unpack arguments
    self.params = params

    ## Use gpu flag
    self.use_cuda = params.use_cuda

    # Flag for maneuver based (True) vs uni-modal decoder (False)
    self.use_maneuvers = params.use_maneuvers

    if params.use_grid == 2:
        self.use_grid_soc = True
        self.use_grid = False
    else:
        self.use_grid = params.use_grid
        self.use_grid_soc = False

    # Transformer architecture related
    self.use_transformer = params.use_transformer
    self.teacher_forcing_ratio = 0.0  # TODO: ultimately set it in [0.9, 1.0]

    # RNN-LSTM Seq2seq architecture related
    self.use_bidir = params.use_bidir  # NB: seq2seq always uses a bidir encoder
    self.use_seq2seq = params.use_seq2seq
    if self.use_seq2seq:
        self.use_bidir = True

    # RNN-LSTM with Attention architecture related
    self.use_attention = params.use_attention
    if self.use_attention:
        self.use_bidir = True

    # Flag for train mode (True) vs test mode (False)
    self.train_flag = params.train_flag
    if self.train_flag is False:
        self.teacher_forcing_ratio = 0.0

    ## Sizes of network layers
    self.encoder_size = params.encoder_size
    self.decoder_size = params.decoder_size
    self.in_length = params.in_length
    self.out_length = params.out_length
    self.grid_size = params.grid_size
    self.soc_conv_depth = params.soc_conv_depth
    self.conv_3x1_depth = params.conv_3x1_depth
    self.dyn_embedding_size = params.dyn_embedding_size
    self.input_embedding_size = params.input_embedding_size
    self.num_lat_classes = params.num_lat_classes
    self.num_lon_classes = params.num_lon_classes
    self.soc_embedding_size = (((params.grid_size[0] - 4) + 1) // 2) * self.conv_3x1_depth

    ## Define network weights

    # TRANSFORMER
    if self.use_transformer:
        src_feats = tgt_feats = 2  # (X,Y) point
        tgt_params = 5             # 5 params for a bivariate Gaussian distribution
        if self.use_grid or self.use_grid_soc:
            src_ngrid = self.in_length  # with soc
        else:
            src_ngrid = 0               # without soc
        if self.use_maneuvers:
            d_lon = self.num_lon_classes
            d_lat = self.num_lat_classes
        else:
            d_lon = 0
            d_lat = 0
        if self.use_grid_soc:
            self.transformer = tsf.make_model(
                src_feats, tgt_feats, tgt_params=tgt_params, src_ngrid=src_ngrid,
                src_lon=d_lon, src_lat=d_lat,
                src_soc_emb_size=self.soc_embedding_size)
        else:
            self.transformer = tsf.make_model(
                src_feats, tgt_feats, tgt_params=tgt_params, src_ngrid=src_ngrid,
                src_lon=d_lon, src_lat=d_lat)
        print("TRANSFORMER:", self.transformer)
        self.batch = tsf.Batch()

    # Input embedding layer
    self.ip_emb = torch.nn.Linear(2, self.input_embedding_size)

    # Encoder LSTM
    if self.use_bidir:
        self.enc_lstm = torch.nn.LSTM(self.input_embedding_size, self.encoder_size, 1,
                                      bidirectional=True)
        self.encoder_ndir = 2
    else:
        self.enc_lstm = torch.nn.LSTM(self.input_embedding_size, self.encoder_size, 1)
        self.encoder_ndir = 1

    # Vehicle dynamics embedding
    self.dyn_emb = torch.nn.Linear(self.encoder_size, self.dyn_embedding_size)

    # Convolutional social pooling layer and social embedding layer
    self.soc_conv = torch.nn.Conv2d(self.encoder_size, self.soc_conv_depth, 3)
    self.conv_3x1 = torch.nn.Conv2d(self.soc_conv_depth, self.conv_3x1_depth, (3, 1))
    self.soc_maxpool = torch.nn.MaxPool2d((2, 1), padding=(1, 0))
    self.grid_emb = torch.nn.Linear(5, self.input_embedding_size)

    # FC social pooling layer (for comparison):
    # self.soc_fc = torch.nn.Linear(self.soc_conv_depth * self.grid_size[0] * self.grid_size[1],
    #                               (((params.grid_size[0] - 4) + 1) // 2) * self.conv_3x1_depth)

    if self.use_seq2seq or self.use_attention:
        # Decoder seq2seq LSTM (Attention builds on top of seq2seq)
        if self.use_maneuvers:
            self.proj_seq2seq = torch.nn.Linear(
                self.encoder_ndir * self.soc_embedding_size
                + self.encoder_ndir * self.dyn_embedding_size
                + self.num_lat_classes + self.num_lon_classes,
                self.decoder_size)
        else:
            self.proj_seq2seq = torch.nn.Linear(
                self.encoder_ndir * self.soc_embedding_size
                + self.encoder_ndir * self.dyn_embedding_size,
                self.decoder_size)
        if self.use_seq2seq:
            self.num_layers = 2  # XXX
        else:
            self.num_layers = 1  # XXX
        self.dec_seq2seq = torch.nn.LSTM(self.decoder_size, self.decoder_size,
                                         num_layers=self.num_layers)
    elif self.use_transformer is False:
        # Legacy Decoder LSTM
        if self.use_maneuvers:
            self.dec_lstm = torch.nn.LSTM(
                self.soc_embedding_size + self.dyn_embedding_size
                + self.num_lat_classes + self.num_lon_classes,
                self.decoder_size)
        else:
            self.dec_lstm = torch.nn.LSTM(
                self.soc_embedding_size + self.dyn_embedding_size,
                self.decoder_size)

    if self.use_attention:
        self.attn_densor1 = torch.nn.Linear(
            self.encoder_ndir * self.encoder_size + self.decoder_size, 10)
        self.attn_densor2 = torch.nn.Linear(10, 1)

    # Output layers:
    if self.use_transformer is False:
        self.op = torch.nn.Linear(self.decoder_size, 5)
    self.op_lat = torch.nn.Linear(
        self.soc_embedding_size + self.dyn_embedding_size, self.num_lat_classes)
    self.op_lon = torch.nn.Linear(
        self.soc_embedding_size + self.dyn_embedding_size, self.num_lon_classes)

    # Activations:
    self.leaky_relu = torch.nn.LeakyReLU(0.1)
    self.relu = torch.nn.ReLU()
    self.softmax = torch.nn.Softmax(dim=1)

    # self.conv1 = torch.nn.Conv2d(self.in_length, 64, 3)   # => [64, 11, 1]
    self.conv1 = torch.nn.Conv2d(16, 64, 3)                  # => [64, 11, 1]
    self.conv2 = torch.nn.Conv2d(64, 16, (3, 1))             # => [16, 9, 1]
    self.maxpool = torch.nn.MaxPool2d((2, 1), padding=(1, 0))  # => [16, 5, 1]
    self.proj_grid = nn.Linear(self.encoder_size, self.soc_embedding_size)
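# The 5 per-point output parameters above (tgt_params = 5, and self.op's output
# size) parameterize a bivariate Gaussian over (X, Y); a hypothetical activation
# sketch in the style of convolutional social pooling (the name and the exact
# transforms are assumptions, not this repo's code):
import torch

def output_activation(x):
    "Constrain raw outputs to [mu_x, mu_y, sigma_x > 0, sigma_y > 0, rho in (-1, 1)]."
    mu_x, mu_y, sig_x, sig_y, rho = x.chunk(5, dim=-1)
    return torch.cat([mu_x, mu_y, torch.exp(sig_x), torch.exp(sig_y),
                      torch.tanh(rho)], dim=-1)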