def prepare_dataloaders_from_bpe_files(opt, device):
    batch_size = opt.batch_size
    MIN_FREQ = 2
    if not opt.embs_share_weight:
        raise Exception("err")

    data = pickle.load(open(opt.data_pkl, 'rb'))
    MAX_LEN = data['settings'].max_len
    field = data['vocab']
    fields = (field, field)

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

    train = TranslationDataset(fields=fields,
                               path=opt.train_path,
                               exts=('.src', '.trg'),
                               filter_pred=filter_examples_with_length)
    val = TranslationDataset(fields=fields,
                             path=opt.val_path,
                             exts=('.src', '.trg'),
                             filter_pred=filter_examples_with_length)

    opt.max_token_seq_len = MAX_LEN + 2
    opt.src_pad_idx = opt.trg_pad_idx = field.vocab.stoi[Constants.PAD_WORD]
    opt.src_vocab_size = opt.trg_vocab_size = len(field.vocab)

    train_iterator = BucketIterator(train, batch_size=batch_size, device=device, train=True)
    val_iterator = BucketIterator(val, batch_size=batch_size, device=device)

    return train_iterator, val_iterator
def prepare_dataloaders(opt, device):
    batch_size = opt.batch_size
    data = pickle.load(open(opt.data_pkl, 'rb'))

    opt.max_token_seq_len = data['settings'].max_len
    opt.src_pad_idx = data['vocab']['src'].vocab.stoi[Constants.PAD_WORD]
    opt.trg_pad_idx = data['vocab']['trg'].vocab.stoi[Constants.PAD_WORD]
    opt.src_vocab_size = len(data['vocab']['src'].vocab)
    opt.trg_vocab_size = len(data['vocab']['trg'].vocab)

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert data['vocab']['src'].vocab.stoi == data['vocab']['trg'].vocab.stoi, \
            'To share word embeddings, the src/trg word2idx tables must be the same.'

    fields = {'src': data['vocab']['src'], 'trg': data['vocab']['trg']}

    train = Dataset(examples=data['train'], fields=fields)
    val = Dataset(examples=data['valid'], fields=fields)

    train_iterator = BucketIterator(train, batch_size=batch_size, device=device, train=True)
    val_iterator = BucketIterator(val, batch_size=batch_size, device=device)

    return train_iterator, val_iterator
def buildDataSets():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Model parameter
    MAX_SEQ_LEN = 16
    PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    # Fields
    label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.int8)
    text_field = Field(use_vocab=False,
                       tokenize=tokenizer.encode,
                       lower=False,
                       include_lengths=False,
                       batch_first=True,
                       fix_length=MAX_SEQ_LEN,
                       pad_token=PAD_INDEX,
                       unk_token=UNK_INDEX)
    fields = {'label': ('label', label_field), 'text': ('text', text_field)}

    # TabularDataset
    train, valid, test = TabularDataset.splits(path='memesData/data',
                                               train='train.jsonl',
                                               validation='dev_unseen.jsonl',
                                               test='dev_seen.jsonl',
                                               format='JSON',
                                               fields=fields)

    # Iterators
    train_iter = BucketIterator(train,
                                batch_size=8,
                                sort_key=lambda x: len(x.text),
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    valid_iter = BucketIterator(valid,
                                batch_size=8,
                                sort_key=lambda x: len(x.text),
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    test_iter = Iterator(test, batch_size=8, train=False, shuffle=False, sort=False)

    return train_iter, valid_iter, test_iter
def train_model(config_path: str):
    writer = SummaryWriter()
    config = read_training_pipeline_params(config_path)
    logger.info("pretrained_emb {b}", b=config.net_params.pretrained_emb)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Device is {device}", device=device)

    SRC, TRG, dataset = get_dataset(config.dataset_path, False)
    train_data, valid_data, test_data = split_data(dataset, **config.split_ration.__dict__)
    SRC.build_vocab(train_data, min_freq=3)
    TRG.build_vocab(train_data, min_freq=3)
    torch.save(SRC.vocab, config.src_vocab_name)
    torch.save(TRG.vocab, config.trg_vocab_name)
    logger.info("Vocab saved")
    print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
    print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=device,
        sort_key=_len_sort_key,
    )

    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)
    config_encoder = BertConfig(vocab_size=INPUT_DIM)
    config_decoder = BertConfig(vocab_size=OUTPUT_DIM)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
    model = EncoderDecoderModel(config=config)

    config_encoder = model.config.encoder
    config_decoder = model.config.decoder
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
    model = EncoderDecoderModel(config=config)

    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        num_train_epochs=10,
        save_steps=3000,
        seed=0,
        load_best_model_at_end=True,
    )
    # args.place_model_on_device = device
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_iterator,
        eval_dataset=valid_iterator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()
    model.save_pretrained("bert2bert")
def data_iter(text_field, label_field, train_bs=32, eval_bs=128, is_char_token=False):
    train = MyDataset("data/tnews/train.json", text_field=text_field, label_field=label_field, test=False)
    valid = MyDataset("data/tnews/dev.json", text_field=text_field, label_field=label_field, test=False)
    test = MyDataset("data/tnews/test.json", text_field=text_field, label_field=None, test=True)

    # For char-level input there are no pretrained word vectors.
    word_embeddings = None
    if not is_char_token:
        word_embeddings = text_field.vocab.vectors

    # device=-1 means the dataset iterators run on CPU.
    device = 0 if torch.cuda.is_available() else -1
    train_iter = BucketIterator(
        dataset=train,
        batch_size=train_bs,
        shuffle=True,
        train=True,
        sort_key=lambda x: len(x.text),
        sort=False,
        device=device)
    val_iter = BucketIterator(valid, eval_bs, train=False, sort_key=lambda x: len(x.text), sort=False, device=device)
    test_iter = Iterator(test, 128, shuffle=False, train=False, sort=False, device=device)

    return train_iter, val_iter, test_iter, word_embeddings
def create_iterators(data_file_location, split_to_train_and_test=True):
    if split_to_train_and_test:
        train, test = TabularDataset(path=data_file_location,
                                     format="TSV",
                                     fields=fields,
                                     skip_header=True).split()
        train_iter = BucketIterator(train,
                                    batch_size=Parameters.BATCH_SIZE,
                                    device=Parameters.DEVICE,
                                    train=True,
                                    shuffle=True)
        test_iter = BucketIterator(test,
                                   batch_size=Parameters.BATCH_SIZE,
                                   device=Parameters.DEVICE,
                                   train=True,
                                   shuffle=True,
                                   sort=False)
        answer = train_iter, test_iter
    else:
        dataset = TabularDataset(path=data_file_location, format="TSV", fields=fields, skip_header=True)
        answer = BucketIterator(dataset,
                                batch_size=Parameters.BATCH_SIZE,
                                device=Parameters.DEVICE,
                                train=True,
                                shuffle=True,
                                sort=False)
    print("Finished dataset preparation")
    return answer
def TokenBucket(train, batch_size, device="cuda:0", key=lambda x: max(len(x.word[0]), 5)):
    def batch_size_fn(x, _, size):
        return size + key(x)

    return BucketIterator(
        train,
        train=True,
        sort=False,
        sort_within_batch=True,
        shuffle=True,
        batch_size=batch_size,
        sort_key=lambda x: key(x),
        repeat=True,
        batch_size_fn=batch_size_fn,
        device=device,
    )
def make(config, device, train_data, valid_data, test_data, TTX, TRG, ASR):
    # Make the data
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        sort_key=lambda x: len(x.true_text),
        batch_size=config.batch_size,
        device=device)

    # Make the model
    model = make_model(config, device, TTX, TRG, ASR)
    print(f'The model has {count_parameters(model):,} trainable parameters')
    model.apply(initialize_weights)

    # Make the loss and optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    if config['decode_trg']:
        ignore_index = TRG.vocab.stoi[TRG.pad_token]
    else:
        ignore_index = TTX.vocab.stoi[TTX.pad_token]
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

    return model, train_iterator, valid_iterator, test_iterator, criterion, optimizer
TEXT = Field(batch_first=True,
             use_vocab=False,
             tokenize=tokenize_and_cut,
             preprocessing=tokenizer.convert_tokens_to_ids,
             init_token=init_token_idx,
             eos_token=eos_token_idx,
             pad_token=pad_token_idx,
             unk_token=unk_token_idx)
LABEL = LabelField(dtype=torch.long, use_vocab=False)
fields = [('data', TEXT), ('label', LABEL)]

train, valid, test = TabularDataset.splits(path=source_folder,
                                           train='train.csv',
                                           validation='validation.csv',
                                           test='test.csv',
                                           format='CSV',
                                           fields=fields,
                                           skip_header=True)

train_generator, val_generator, test_generator = BucketIterator.splits(
    (train, valid, test),
    batch_size=batch_size,
    device=device,
    sort=False)

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

all_statedict_path = glob.glob('/root/logs/*.pth')
for state_dict_path in all_statedict_path:
    print(state_dict_path)
    epoch_loss = 0
    epoch_acc = 0
    model = phobert_lstm(phobert_path=phobert_path,
                         state_dict_path=state_dict_path,
                         hidden_dim=hidden_dim,
                         num_classes=num_classes,
test='test.csv', format='csv', fields=fields)

len(train_data), len(test_data)
print(vars(train_data.examples[0]))

Texto.build_vocab(train_data, max_size=10000, min_freq=1, vectors="glove.6B.100d")
Texto.vocab.freqs.most_common(25)
Texto.vocab.itos[:10]

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=2,
    device=device
)


class RNN_LSTM(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers)
        self.fc_out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size).to(device)
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import flor

flor.flags.NAME = 'kaggle-nlp-disasters-rnn'
flor.flags.REPLAY = False

device = torch.device(('cuda:0' if torch.cuda.is_available() else 'cpu'))
device

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)
fields = [('words', text_field), ('target', label_field)]
fields_test = [('words', text_field)]

(train, valid) = TabularDataset.splits(path='data',
                                       train='train_rnn.csv',
                                       validation='valid_rnn.csv',
                                       format='CSV',
                                       fields=fields,
                                       skip_header=True)
test = TabularDataset(path='data/test_rnn.csv', format='CSV', fields=fields_test, skip_header=True)

train_iter = BucketIterator(train,
                            batch_size=200,
                            sort_key=(lambda x: len(x.words)),
                            device=device,
                            sort=True,
                            sort_within_batch=True)
valid_iter = BucketIterator(valid,
                            batch_size=200,
                            sort_key=(lambda x: len(x.words)),
                            device=device,
                            sort=True,
                            sort_within_batch=True)
test_iter = BucketIterator(test,
                           batch_size=200,
                           sort_key=(lambda x: len(x.words)),
                           device=device,
                           sort=True,
                           sort_within_batch=True)

text_field.build_vocab(train, min_freq=5)


class LSTM(nn.Module):
    def __init__(self, dimension=128):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(text_field.vocab), 300)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=300,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.5)
        self.fc = nn.Linear((2 * dimension), 1)

    def forward(self, text, text_len):
def make_iter(self, train, validate, test, batch_size, device):
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train, validate, test),
        batch_size=batch_size,
        device=device)
    print('dataset initializing done')
    return train_iterator, valid_iterator, test_iterator
def train_model(config_path: str):
    writer = SummaryWriter()
    config = read_training_pipeline_params(config_path)
    logger.info("pretrained_emb {b}", b=config.net_params.pretrained_emb)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Device is {device}", device=device)

    SRC, TRG, dataset = get_dataset(config.dataset_path, config.net_params.transformer)
    train_data, valid_data, test_data = split_data(dataset, **config.split_ration.__dict__)
    if config.net_params.pretrained_emb:
        src_vectors = torchtext.vocab.FastText(language='ru')
    SRC.build_vocab(train_data, min_freq=3)
    if config.net_params.pretrained_emb:
        SRC.vocab.load_vectors(src_vectors)
    TRG.build_vocab(train_data, min_freq=3)
    torch.save(SRC.vocab, config.src_vocab_name)
    torch.save(TRG.vocab, config.trg_vocab_name)
    logger.info("Vocab saved")
    print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
    print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=device,
        sort_key=_len_sort_key,
    )
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)

    if config.net_params.attention:
        Encoder = network_gru_attention.Encoder
        Decoder = network_gru_attention.Decoder
        Seq2Seq = network_gru_attention.Seq2Seq
        Attention = network_gru_attention.Attention
        attn = Attention(config.net_params.HID_DIM, config.net_params.HID_DIM)
        enc = Encoder(INPUT_DIM, config.net_params.ENC_EMB_DIM,
                      config.net_params.HID_DIM, config.net_params.HID_DIM,
                      config.net_params.ENC_DROPOUT)
        dec = Decoder(OUTPUT_DIM, config.net_params.DEC_EMB_DIM,
                      config.net_params.HID_DIM, config.net_params.HID_DIM,
                      config.net_params.DEC_DROPOUT, attn)
        model = Seq2Seq(enc, dec, device)

    if config.net_params.transformer:
        logger.info("Transformer lets go")
        Encoder = network_transformer.Encoder
        Decoder = network_transformer.Decoder
        Seq2Seq = network_transformer.Seq2Seq
        SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
        TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
        HID_DIM = 512
        ENC_LAYERS = 6
        DEC_LAYERS = 6
        ENC_HEADS = 8
        DEC_HEADS = 8
        ENC_PF_DIM = 2048
        DEC_PF_DIM = 2048
        ENC_DROPOUT = 0.1
        DEC_DROPOUT = 0.1
        enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device)
        dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, device)
        model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device)

    if not config.net_params.attention and not config.net_params.transformer:
        Encoder = my_network.Encoder
        Decoder = my_network.Decoder
        Seq2Seq = my_network.Seq2Seq
        enc = Encoder(INPUT_DIM, config.net_params.ENC_EMB_DIM,
                      config.net_params.HID_DIM, config.net_params.N_LAYERS,
                      config.net_params.ENC_DROPOUT)
        dec = Decoder(OUTPUT_DIM, config.net_params.DEC_EMB_DIM,
                      config.net_params.HID_DIM, config.net_params.N_LAYERS,
                      config.net_params.DEC_DROPOUT)
        model = Seq2Seq(enc, dec, device)

    model.apply(init_weights)
    if config.net_params.pretrained_emb:
        model.encoder.tok_embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(SRC.vocab.vectors))
    model.to(device)

    PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
    optimizer = optim.Adam(model.parameters(), config.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, **config.lr_scheduler.__dict__)

    train_history = []
    valid_history = []
    best_valid_loss = float('inf')
    print("Let's go")
    # for p in model.encoder.parameters():
    #     p.requires_grad = True
    # for p in model.decoder.parameters():
    #     p.requires_grad = True

    for epoch in range(config.N_EPOCHS):
        start_time = time.time()

        train_loss = train(model, train_iterator, optimizer, criterion,
                           config.CLIP, train_history, valid_history)
        valid_loss = evaluate(model, valid_iterator, criterion)
        lr_scheduler.step(valid_loss)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), config.model_out_name)

        train_history.append(train_loss)
        valid_history.append(valid_loss)
        writer.add_scalar('train loss', train_history[-1], epoch)
        writer.add_scalar('valid loss', valid_history[-1], epoch)
        writer.add_scalar('learning rate', lr_scheduler._last_lr[0], epoch)

        print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')

    for idx, batch in enumerate(valid_iterator):
        if idx > 3:
            break
        src = batch.src[:, idx:idx + 1]
        trg = batch.trg[:, idx:idx + 1]
        generate_translation(src, trg, model, TRG.vocab, SRC.vocab, config.net_params.transformer)

    get_bleu(model, test_iterator, TRG, config.net_params.transformer)
def data_preprocessing():
    SEED = 1234
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    # import de_core_news_sm, en_core_web_sm
    # spacy_de = de_core_news_sm.load()
    # spacy_en = en_core_web_sm.load()
    # spacy_de = spacy.load('de_core_news_sm')
    # spacy_en = spacy.load('en_core_web_sm')

    # A Field object specifies how a column is processed: the tokenizer, lowercasing,
    # start/end/pad tokens, the vocabulary, and so on.
    # We create the SRC and TRG Field objects with the tokenizer functions defined above,
    # add <sos> at the start and <eos> at the end of each sentence, and lowercase all words.
    SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
    TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

    # The splits method loads the training, validation and test sets in one call.
    # exts selects the source and target languages; fields passes the Field objects defined above.
    train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))
    # print(f"Number of training examples: {len(train_data.examples)}")
    # print(f"Number of validation examples: {len(valid_data.examples)}")
    # print(f"Number of testing examples: {len(test_data.examples)}")
    # vars() returns the attribute dictionary of an object.
    # print(vars(train_data.examples[0]))

    # Build the vocabularies, i.e. map every word to an integer id so it can be fed to the model.
    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)
    # print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
    # print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # print(device)
    BATCH_SIZE = 128

    # BucketIterator: unlike the standard iterator, it groups examples of similar length into a batch.
    # In text processing each batch is usually padded to its longest sequence, so when lengths vary
    # a lot, bucketing noticeably reduces the amount of padding.
    # Alternatively, the fix_length parameter of a Field can truncate/pad every example to a fixed length.
    # When a batch is produced, all source sentences must be padded to the same length, and likewise
    # the target sentences; torchtext handles this automatically with dynamic padding, i.e. every
    # sentence in a batch is padded to the length of the longest sentence in that batch.
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)

    return SRC, TRG, device, train_iterator, valid_iterator, test_iterator
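# A minimal usage sketch of the dynamic padding described in the comments above,
# assuming data_preprocessing() from this snippet has been defined; it just inspects a few
# batches to show that each batch is only padded to its own longest sentence, so seq_len
# varies from batch to batch while batch_size stays fixed.
SRC, TRG, device, train_iterator, valid_iterator, test_iterator = data_preprocessing()

for i, batch in enumerate(train_iterator):
    # batch.src / batch.trg are LongTensors of shape [seq_len, batch_size]
    print(i, tuple(batch.src.shape), tuple(batch.trg.shape))
    if i == 2:
        break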
def train():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
    english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"), fields=(german, english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    ### We're ready to define everything we need for training our Seq2Seq model ###

    # Training hyperparameters
    num_epochs = 20
    learning_rate = 0.001
    batch_size = 64

    # Model hyperparameters
    load_model = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_size_encoder = len(german.vocab)
    input_size_decoder = len(english.vocab)
    output_size = len(english.vocab)
    encoder_embedding_size = 300
    decoder_embedding_size = 300
    hidden_size = 1024  # Needs to be the same for both RNNs
    num_layers = 2
    enc_dropout = 0.5
    dec_dropout = 0.5

    # Tensorboard to get nice loss plot
    writer = SummaryWriter(f"runs/loss_plot")
    step = 0

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )

    encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size,
                          num_layers, enc_dropout).to(device)
    decoder_net = Decoder(
        input_size_decoder,
        decoder_embedding_size,
        hidden_size,
        output_size,
        num_layers,
        dec_dropout,
    ).to(device)

    model = Seq2Seq(encoder_net, decoder_net, len(english.vocab), device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: The model has {count_parameters(model):,} trainable parameters")

    pad_idx = english.vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint_2_2.pth.tar"), model, optimizer)

    sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

    for epoch in range(num_epochs):
        print(f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: [Epoch {epoch} / {num_epochs}]")

        checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
        # save_checkpoint(checkpoint)

        model.eval()
        translated_sentence = translate_sentence(model, sentence, german, english, device, max_length=50)
        print(f"Translated example sentence: \n {translated_sentence}")
        model.train()

        for batch_idx, batch in enumerate(train_iterator):
            # Get input and targets and move them to cuda
            inp_data = batch.src.to(device)
            target = batch.trg.to(device)

            # Forward prop
            output = model(inp_data, target)
            # print('\n')
            # print('Input', inp_data.shape)
            # print('Target', target.shape)
            # print('Output', output.shape)
            # print('---------------------')

            # Output is of shape (trg_len, batch_size, output_dim) but CrossEntropyLoss
            # doesn't take input in that form. For example, with MNIST we want the output
            # to be (N, 10) and the targets just (N). Similarly, here we have
            # output_words * batch_size predictions to send into our cost function,
            # so we need to do some reshaping.
            # Let's also remove the start token while we're at it.
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradient issues, makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            # Plot to tensorboard
            writer.add_scalar("Training loss", loss, global_step=step)
            # print("Training loss", loss)
            step += 1

    score = bleu(test_data[1:100], model, german, english, device)
    print(f"Bleu score {score * 100:.2f}")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# In[ ]:


def _len_sort_key(x):
    return len(x.src)


BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=_len_sort_key
)


# ## Building the Model
#
# Next, we'll build the model. Like in previous notebooks, it is made up of an *encoder* and a *decoder*, with the encoder *encoding* the input/source sentence (in Russian) into a *context vector* and the decoder then *decoding* this context vector to output our output/target sentence (in English).
#
# ### Encoder
#
# The Transformer's encoder does not attempt to compress the entire source sentence, $X = (x_1, ... ,x_n)$, into a single context vector, $z$. Instead it produces a sequence of context vectors, $Z = (z_1, ... , z_n)$. So, if our input sequence was 5 tokens long we would have $Z = (z_1, z_2, z_3, z_4, z_5)$. Why do we call this a sequence of context vectors and not a sequence of hidden states? A hidden state at time $t$ in an RNN has only seen tokens $x_t$ and all the tokens before it. However, each context vector here has seen all tokens at all positions within the input sequence.
#
# ![](assets/transformer-encoder.png)
#
# First, the tokens are passed through a standard embedding layer. Next, as the model has no recurrence, it has no idea about the order of the tokens within the sequence. We solve this by using a second embedding layer called a *positional embedding layer*. This is a standard embedding layer where the input is not the token itself but the position of the token within the sequence, starting with the first token, the `<sos>` (start of sequence) token, in position 0. The position embedding has a "vocabulary" size of 100, which means our model can accept sentences up to 100 tokens long. This can be increased if we want to handle longer sentences.
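# A minimal sketch of the token-plus-positional-embedding step described above; the class
# name, the hid_dim/max_len defaults and the sqrt(hid_dim) scaling are illustrative
# assumptions, not the notebook's actual Encoder implementation.
import torch
import torch.nn as nn


class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, vocab_size, hid_dim=256, max_len=100, dropout=0.1):
        super().__init__()
        self.tok_embedding = nn.Embedding(vocab_size, hid_dim)
        self.pos_embedding = nn.Embedding(max_len, hid_dim)  # "vocabulary" of positions 0..max_len-1
        self.dropout = nn.Dropout(dropout)
        self.scale = hid_dim ** 0.5

    def forward(self, src):
        # src: [batch_size, src_len] of token indices
        batch_size, src_len = src.shape
        # one position index per token, replicated across the batch
        pos = torch.arange(0, src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)
        # sum of scaled token embeddings and positional embeddings, then dropout
        return self.dropout(self.tok_embedding(src) * self.scale + self.pos_embedding(pos))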
rootPath = root_path
train_path = 'train.csv'
test_path = 'test.csv'
train_data_pt, devdata = data.TabularDataset.splits(path=rootPath,
                                                    train=train_path,
                                                    test=test_path,
                                                    fields=fieldsMy,
                                                    skip_header=True,
                                                    format='csv')
#print(train_data_pt.review, devdata)
text_field.build_vocab(train_data_pt.review, max_size=3000)

train_iter, dev_iter = BucketIterator.splits((train_data_pt, devdata),
                                             sort_key=lambda x: len(x.review),
                                             batch_sizes=(128, 256),
                                             sort=False,
                                             sort_within_batch=False,
                                             repeat=False)


def save_vocab(path):
    file = open(path, 'w+', encoding='utf-8')
    for i in range(len(text_field.vocab)):
        #print(text_field.vocab.itos[i]+'\n')
        file.write(text_field.vocab.itos[i] + '\n')
    file.close()


save_vocab(root_path + 'vocab.txt')
        x = target[0]
        # autoregression / teacher forcing
        for t in range(1, seq_len):
            output, h, c = self.decoder(x, states, h, c)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < ratio else best_guess
        return outputs


epochs = 60
encoder_vocab = len(german.vocab)
decoder_vocab = len(english.vocab)
embed_encoder = 300
embed_decoder = 300

TrainD, ValD, TestD = BucketIterator.splits((Train, Val, Test),
                                            batch_size=64,
                                            sort_within_batch=True,
                                            sort_key=lambda x: len(x.src),
                                            device=device)

for a, b in TrainD:
    print(a[0].shape, a[1].shape)

encoder_net = Encoder(encoder_vocab, embed_encoder, 256).to(device)
decoder_net = AttentionDecoder(decoder_vocab, embed_decoder, 256, decoder_vocab).to(device)
Sequence_net = Encoder_Decoder(encoder_net, decoder_net).to(device)

pad_index = english.vocab.stoi['<pad>']
optimizer = opt.Adam(Sequence_net.parameters())
loss_f = CrossEntropyLoss(ignore_index=pad_index)

for i in range(epochs):
format='csv', skip_header=True, fields=datafields)

print(f"Number of {data_size} training examples: {len(trn.examples)}")
print(f"Number of {data_size} validation examples: {len(vld.examples)}")
print(f"Number of {data_size} test examples: {len(tst.examples)}")

INPUT.build_vocab(trn)
TARGET.build_vocab(trn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iter, val_iter, test_iter = BucketIterator.splits(
    (trn, vld, tst),
    sort_key=lambda x: len(x.input),
    sort_within_batch=False,
    batch_size=BATCH_SIZE,
    device=device)

""" Build Transformer """


class TransformerModel(nn.Module):
    def __init__(self,
                 model_type,
                 intoken,
                 outtoken,
                 hidden,
                 enc_layers=3,
                 dec_layers=1,
class DataFrameDataset(torchtext.legacy.data.Dataset):
    def __init__(self, df: pd.DataFrame, fields: list):
        super(DataFrameDataset, self).__init__(
            [Example.fromlist(list(r), fields) for i, r in df.iterrows()], fields)


train_dataset, test_dataset = DataFrameDataset(
    df=df, fields=(('text', text_field), ('label', label_field))).split()

with open(data_dir + 'n_labels.pkl', 'rb') as f:
    n_classes = pkl.load(f)

train_iter, test_iter = BucketIterator.splits(datasets=(train_dataset, test_dataset),
                                              batch_sizes=(32, n_classes),
                                              sort=False)


class ModelParam(object):
    def __init__(self, param_dict: dict = dict()):
        self.input_size = param_dict.get('input_size', 0)
        self.vocab_size = param_dict.get('vocab_size')
        self.embedding_dim = param_dict.get('embedding_dim', 300)
        self.target_dim = param_dict.get('target_dim', n_classes)


class MyModel(nn.Module):
    def __init__(self, model_param: ModelParam):
        super().__init__()
        self.embedding = nn.Embedding(model_param.vocab_size,
# TabularDataset
train, valid, test = TabularDataset.splits(path='./data',
                                           train='IMDB_single.csv',
                                           validation='IMDBs.csv',
                                           test='IMDBs.csv',
                                           format='CSV',
                                           fields=fields,
                                           skip_header=True)

# Iterators
train_iter = BucketIterator(train,
                            batch_size=16,
                            sort_key=lambda x: len(x.text),
                            device=device,
                            train=True,
                            sort=True,
                            sort_within_batch=True)
valid_iter = BucketIterator(valid,
                            batch_size=16,
                            sort_key=lambda x: len(x.text),
                            device=device,
                            train=True,
                            sort=True,
                            sort_within_batch=True)
test_iter = Iterator(test,
                     batch_size=16,
                     device=device,
                     train=False,
                     shuffle=False,
            batch_first=True)
TGT = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TGT))
SRC.build_vocab(train_data, min_freq=2)
TGT.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 8
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE)

if __name__ == "__main__":
    syn_data = synthetic_data(8, 2, 1)
    for i, batch in enumerate(syn_data):
        logging.info("batch-src shape {}, batch-src: {}".format(batch.src.shape, batch.src))
        logging.info("batch-tgt shape {}, batch-tgt: {}".format(batch.tgt.shape, batch.tgt))
        logging.info("batch-src-mask shape {}, batch-src-mask: {}".format(batch.src_mask.shape, batch.src_mask))
        logging.info("batch-tgt-mask shape {}, batch-tgt-mask: {}".format(batch.tgt_mask.shape, batch.tgt_mask))
        logging.info("batch-tgt-y shape {}, trg_y: {}".format(batch.tgt_y.shape, batch.tgt_y))
def train_adv(self, train_loader, val_loader, test_loader):
    """ Adversarial training process """
    ## Step1. Train detector and generator
    best_model = os.path.join(self.args.savepath, 'best_model.pt')
    if not os.path.exists(best_model):
        self.forward(train_loader, val_loader, test_loader, maxepoch=3)
    logger.info('Loading pre-trained generator and detector')
    self.model.load_state_dict(
        torch.load(best_model, map_location=lambda storage, loc: storage)['model'])

    ## Step2. Train adv generator and fix detector
    # Load checkpoint
    best_adv_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
    if os.path.exists(best_adv_model):
        logger.info('Loading pre-trained adv generator')
        self.model.load_state_dict(
            torch.load(best_adv_model, map_location=lambda storage, loc: storage)['model'])
    else:
        logger.info('Adversarially train generator -----------------------')
        train = CommentDataset(self.args, train_loader, self.dataname, self.device,
                               self.tokenizer, reverse_label=True)
        val = CommentDataset(self.args, val_loader, self.dataname, self.device,
                             self.tokenizer, reverse_label=True)
        train_iter, val_iter = BucketIterator.splits((train, val),
                                                     sort_key=lambda x: len(x.src),
                                                     sort_within_batch=False,
                                                     batch_size=self.bs,
                                                     device=self.device)  # 3906, 977
        test = TreeDataset(self.args, test_loader, self.dataname, self.device, self.tokenizer)
        test_iter = Iterator(test, train=False, device=self.device, batch_size=self.bs,
                             sort_key=lambda x: len(x.src), sort_within_batch=False)

        # Define trainer
        train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, True,
                              self.args.train_gen, self.model.generator, self.symbols,
                              self.model.vocab_size, self.args.label_smoothing)
        trainer = build_trainer(self.args, self.model, self.optim, train_loss, wandb=self.wandb)
        tot_train_steps = self.maxepoch * len(train_iter)

        test_stats = trainer.testing(test_iter, -1, gen_flag=True)
        test_stats.write_results(os.path.join(self.args.savepath, 'result_test.csv'),
                                 'Before', self.args.label_num)

        # Freeze the detector
        for param in self.model.parameters():
            param.requires_grad = False
        for param in self.model.decoder.parameters():
            param.requires_grad = True

        best_total_loss = 100
        lowest_acc = 1
        stop_count = 0
        for epoch in range(self.maxepoch):
            print('[Adv train generator - fold {}] {}/{}'.format(self.fold, epoch, self.maxepoch))
            message = '{}-{} epoch {}/{} adv gen'.format(self.dataname, self.fold, epoch, 4)
            trainer.train(train_iter, tot_train_steps, message)
            val_stats = trainer.validate(val_iter, epoch)
            test_stats = trainer.testing(test_iter, epoch, gen_flag=False)
            test_stats = trainer.testing(test_iter, epoch, gen_flag=True)  # polluted data
            test_stats.write_results(os.path.join(self.args.savepath, 'result_test.csv'),
                                     '{}-att'.format(epoch), self.args.label_num)

            # Save best model
            if test_stats.det_acc() < lowest_acc:
                logger.info('Save Adv model at epoch {}'.format(epoch))
                lowest_acc = test_stats.det_acc()
                trainer._save('best_adv')
                stop_count = 0
            else:
                stop_count += 1
            if stop_count == 3:
                break
    ## Step3. Train adv detector and fix generator
    logger.info('Adversarially train detector ---------------------------')
    train = CommentDataset(self.args, train_loader, self.dataname, self.device, self.tokenizer)
    val = CommentDataset(self.args, val_loader, self.dataname, self.device, self.tokenizer)
    train_iter, val_iter = BucketIterator.splits((train, val),
                                                 sort_key=lambda x: len(x.src),
                                                 sort_within_batch=False,
                                                 batch_size=self.bs,
                                                 device=self.device)  # 3906, 977
    test = TreeDataset(self.args, test_loader, self.dataname, self.device, self.tokenizer)
    test_iter = Iterator(test, train=False, device=self.device, batch_size=self.bs,
                         sort_key=lambda x: len(x.src), sort_within_batch=False)

    # Load checkpoint
    best_adv_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
    self.model.load_state_dict(
        torch.load(best_adv_model, map_location=lambda storage, loc: storage)['model'])

    # Freeze the generator
    for param in self.model.parameters():
        param.requires_grad = False
    for param in self.model.classifier.parameters():
        param.requires_grad = True
    for param in self.model.bert.parameters():
        param.requires_grad = True

    # Define trainer
    optim = [build_optim(self.args, self.model, None)]
    train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, True, False,
                          self.model.generator, self.symbols, self.model.vocab_size,
                          self.args.label_smoothing)
    trainer = build_trainer(self.args, self.model, optim, train_loss, wandb=self.wandb)

    test_stats = trainer.testing(test_iter, -1, gen_flag=False)  # clean data
    test_stats = trainer.testing(test_iter, -1, gen_flag=True)   # polluted data
    tot_train_steps = self.maxepoch * len(train_iter)

    best_xent = 10
    stop_count = 0
    for epoch in range(self.maxepoch):
        print('[Adv train detector] {}/{}'.format(epoch, self.maxepoch))
        # Freeze the detector
        message = '{}-{} epoch {}/{} adv det'.format(self.dataname, self.fold, epoch, 5)
        trainer.train(train_iter, tot_train_steps, message)
        val_stats = trainer.validate(val_iter, epoch)
        test_stats = trainer.testing(test_iter, epoch, gen_flag=False)  # clean data
        test_stats = trainer.testing(test_iter, epoch, gen_flag=True)   # polluted data
        test_stats.write_results(os.path.join(self.args.savepath, 'result_test.csv'),
                                 '{}-adv'.format(epoch), self.args.label_num)

        # Save best model
        if val_stats.det_xent() < best_xent:
            print('Save model at epoch {}'.format(epoch))
            trainer._save('best_final')
            best_xent = val_stats.det_xent()
            stop_count = 0
        else:
            stop_count += 1
        if stop_count == 3:
            break
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard
writer = SummaryWriter(f'runs/loss_plot')
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src))

encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size,
                      num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size,
                      num_layers, dec_dropout).to(device)

pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

## complete training loop
def forward(self, train_loader, val_loader, test_loader, maxepoch=None):
    """ Normal training process """
    # Build data iterators for the generator (one node of the tree becomes one example)
    #train = TreeDataset(self.args, train_loader, self.dataname, self.device, self.tokenizer)
    train = CommentDataset(self.args, train_loader, self.dataname, self.device, self.tokenizer)
    sample_per_cls = None
    #sample_per_cls = train.sample_per_cls(self.args.label_num)
    train_iter = BucketIterator(train,
                                sort_key=lambda x: len(x.src),
                                sort_within_batch=False,
                                batch_size=self.bs,
                                device=self.device)
    #val = TreeDataset(self.args, val_loader, self.dataname, self.device, self.tokenizer)
    val = CommentDataset(self.args, val_loader, self.dataname, self.device, self.tokenizer)
    val_iter = BucketIterator(val,
                              sort_key=lambda x: len(x.src),
                              sort_within_batch=False,
                              batch_size=96,
                              device=self.device)
    test = TreeDataset(self.args, test_loader, self.dataname, self.device, self.tokenizer)
    test_iter = Iterator(test, train=False, device=self.device, batch_size=96,
                         sort_key=lambda x: len(x.src), sort_within_batch=False)

    # Define trainer
    if self.args.train_gen:
        train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, True,
                              self.args.train_gen, self.model.generator, self.symbols,
                              self.model.vocab_size, self.args.label_smoothing,
                              sample_per_cls=sample_per_cls)
    else:
        train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, True,
                              sample_per_cls=sample_per_cls)
    trainer = build_trainer(self.args, self.model, self.optim, train_loss, wandb=self.wandb)

    # Start training
    best_loss = 10
    stop_count = 0
    tot_train_steps = self.maxepoch * len(train_iter)
    gen_flag = self.args.train_gen or self.args.test_gen
    if not maxepoch:
        maxepoch = self.maxepoch

    logger.info('Start training')
    for epoch in range(maxepoch):
        print('[Training] {}/{}'.format(epoch, self.maxepoch))
        if self.args.train_gen:
            job = 'gen det'
        else:
            job = 'det'
        message = '{}-{} epoch {} {}/{} '.format(self.dataname, self.fold, job, epoch, self.maxepoch)
        trainer.train(train_iter, tot_train_steps, message)
        val_stats = trainer.validate(val_iter, epoch)
        test_stats = trainer.testing(test_iter, epoch, gen_flag=gen_flag)
        test_stats.write_results(os.path.join(self.args.savepath, 'result_test.csv'),
                                 str(epoch), self.args.label_num)
        val_det_loss = val_stats.det_xent()

        # Save best model
        if val_det_loss < best_loss:
            print('Save model at epoch {}'.format(epoch))
            trainer._save('best')
            best_loss = val_det_loss
            stop_count = 0
        else:
            stop_count += 1
path="data", train="train_rnn.csv", validation="valid_rnn.csv", format="CSV", fields=fields, skip_header=True, ) test = TabularDataset(path="data/test_rnn.csv", format="CSV", fields=fields_test, skip_header=True) train_iter = BucketIterator( train, batch_size=flor.log("batch_size", 200), sort_key=lambda x: len(x.words), device=device, sort=True, sort_within_batch=True, ) valid_iter = BucketIterator( valid, batch_size=200, sort_key=lambda x: len(x.words), device=device, sort=True, sort_within_batch=True, ) test_iter = BucketIterator( test, batch_size=200, sort_key=lambda x: len(x.words),
Question.build_vocab(train_data, min_freq=2)
Answer.build_vocab(
    train_data,
    vectors=torchtext.vocab.Vectors("./python_code_glove_embedding_300.txt"),
    min_freq=2)

print(f"Unique tokens in Question vocabulary: {len(Question.vocab)}")
print(f"Unique tokens in Answer vocabulary: {len(Answer.vocab)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32
print('BATCH_SIZE:', 32)

train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort=False,
    device=device)

INPUT_DIM = len(Question.vocab)
OUTPUT_DIM = len(Answer.vocab)
HID_DIM = 300
ENC_LAYERS = 4
DEC_LAYERS = 4
ENC_HEADS = 5
DEC_HEADS = 5
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device)