def load_data(train_src, train_tgt, val_src, val_tgt, batch_size=64, save_path="checkpoint"):
    # prepare dataset
    print("Reading data...")
    train = Seq2SeqDataset.from_file(train_src, train_tgt)
    print("Building vocab...")
    train.build_vocab(max_size=300)
    val = Seq2SeqDataset.from_file(val_src, val_tgt, share_fields_from=train)
    src_vocab = train.src_field.vocab
    tgt_vocab = train.tgt_field.vocab

    # save vocab
    with open(os.path.join(save_path, "vocab.src"), "wb") as f:
        dill.dump(src_vocab, f)
    with open(os.path.join(save_path, "vocab.tgt"), "wb") as f:
        dill.dump(tgt_vocab, f)
    print("Source vocab size:", len(src_vocab))
    print("Target vocab size:", len(tgt_vocab))

    # data iterators
    # keep sort=False and shuffle=False to speed up training and reduce memory usage
    train_iterator = BucketIterator(dataset=train, batch_size=batch_size,
                                    sort=False, sort_within_batch=True,
                                    sort_key=lambda x: len(x.src),
                                    shuffle=False, device=device)
    val_iterator = BucketIterator(dataset=val, batch_size=batch_size, train=False,
                                  sort=False, sort_within_batch=True,
                                  sort_key=lambda x: len(x.src),
                                  shuffle=False, device=device)
    return src_vocab, tgt_vocab, train_iterator, val_iterator
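# Hedged usage sketch (added; not from the original source): assumes the
# load_data signature above and a module-level `device`; the file paths are
# hypothetical placeholders.
src_vocab, tgt_vocab, train_iterator, val_iterator = load_data(
    "data/train.src", "data/train.tgt",
    "data/val.src", "data/val.tgt",
    batch_size=64, save_path="checkpoint",
)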
def make_data(self):
    train_dataset = Seq2SeqDataset(self._config.train_path)
    dev_dataset = Seq2SeqDataset(self._config.dev_path)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=self._config.batch_size,
                              shuffle=True, pin_memory=True)
    dev_loader = DataLoader(dataset=dev_dataset,
                            batch_size=self._config.batch_size,
                            shuffle=False, pin_memory=True)
    return train_loader, dev_loader
def _make_data(self):
    train_dataset = Seq2SeqDataset(self._config.train_path)
    dev_dataset = Seq2SeqDataset(self._config.dev_path)
    train_loader = DataLoader(train_dataset, self._config.batch_size,
                              shuffle=True, num_workers=2)
    dev_loader = DataLoader(dev_dataset, self._config.batch_size,
                            shuffle=False, num_workers=2)
    return train_loader, dev_loader
def __init__(self, input_size, hidden_size, batch_size, learning_rate, num_epoch, method):
    dataset = Seq2SeqDataset()
    self.vocab = sorted(set(dataset.full_text))
    self.vocab_size = len(self.vocab)
    self.char2ind, self.ind2char = self.get_vocab()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = self.vocab_size
    self.method = method
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.num_epoch = num_epoch
    self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
    self.dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
    self.encoder = Encoder(input_size, hidden_size, self.vocab_size)
    self.decoder = Decoder(hidden_size, self.output_size, method)
    self.encoder = self.encoder.to(self.device)
    self.decoder = self.decoder.to(self.device)
    self.loss_function = NLLLoss()
    self.encoder_optim = optim.Adam(self.encoder.parameters(), lr=self.learning_rate)
    self.decoder_optim = optim.Adam(self.decoder.parameters(), lr=self.learning_rate)
def create_seq2seq_dataset(samples, save_path, padding=0):
    dataset = Seq2SeqDataset(samples, padding=padding,
                             max_text_len=300, max_summary_len=80)
    with open(save_path, 'wb') as f:
        pickle.dump(dataset, f)
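# Hedged usage sketch (added): round-trip the pickled dataset written by
# create_seq2seq_dataset above; `samples` and the path are hypothetical.
create_seq2seq_dataset(samples, 'datasets/seq2seq/train.pkl', padding=0)
with open('datasets/seq2seq/train.pkl', 'rb') as f:
    train_dataset = pickle.load(f)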
def train_in_parts(self, train_parts, val, val_iterator, batch_size, start_epoch=0, print_every=100):
    for epoch in range(start_epoch, self.n_epochs):
        # shuffle data each epoch
        random.shuffle(train_parts)
        for train_src_, train_tgt_ in train_parts:
            # create train dataset
            print("Training part [{}] with target [{}]...".format(train_src_, train_tgt_))
            train_ = Seq2SeqDataset.from_file(train_src_, train_tgt_, share_fields_from=val)
            # create iterator
            train_iterator_ = BucketIterator(dataset=train_, batch_size=batch_size,
                                             sort=False, sort_within_batch=True,
                                             sort_key=lambda x: len(x.src),
                                             shuffle=True, device=device)
            # train
            self._train_epoch(epoch, train_iterator_, train=True, print_every=print_every)
            # clean
            del train_
            del train_iterator_
            gc.collect()
        # save
        self.save(epoch)
        # evaluate on validation set after each epoch
        with torch.no_grad():
            self._train_epoch(epoch, val_iterator, train=False, print_every=print_every)
def create_seq2seq_dataset_without_save(samples, config, padding=0):
    dataset = Seq2SeqDataset(
        samples,
        padding=padding,
        max_text_len=config.get('max_text_len') or 300,
        max_summary_len=config.get('max_summary_len') or 80,
        train=False
    )
    return dataset
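# Hedged usage sketch (added): the `or` fallbacks above mean a config missing
# the length keys yields 300/80; this call is illustrative only.
dataset = create_seq2seq_dataset_without_save(
    samples, {'max_text_len': 400, 'max_summary_len': 100})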
def infer(self, test_sentence):
    # read raw data to list
    test_sentence = self.convert(test_sentence)
    print(test_sentence)
    lines_raw = [test_sentence]
    lines_prep = [self.preprocess(test_sentence)]

    # prepare dataset
    print("Reading test data...")
    test = Seq2SeqDataset.from_list(lines_prep)
    test.src_field.vocab = self.src_vocab

    # prepare iterator
    test_iterator = BucketIterator(dataset=test, batch_size=1, train=False,
                                   sort=False, sort_within_batch=False,
                                   shuffle=False, device=device)

    # predict
    with torch.no_grad():
        for i, batch in enumerate(test_iterator):
            # forward through model
            _, _, output = self.model(batch, has_targets=False,
                                      mask_softmax=1.0, teacher_forcing=1.0)
            # get top-1
            predicted_values, predicted_indices = torch.max(output, dim=-1)
            # convert predicted vocab indices to an actual sentence
            predicted_seq = [
                self.tgt_vocab.itos[c]
                for c in predicted_indices.squeeze(0).tolist()
            ]
            # output is log_softmax so do exp()
            predicted_values = predicted_values.exp()
            # convert to list
            predicted_values_ = predicted_values.squeeze(0).tolist()
            # beam search (strip <sos>/<eos> positions)
            predicted_seq = self.beam_lm(''.join(predicted_seq[1:-1]),
                                         predicted_values_[1:-1],
                                         lines_raw[i])
            # match case and punctuation
            predicted_seq = self.match_case(predicted_seq, lines_raw[i])
            print("{} {}".format(i, predicted_seq))
    return predicted_seq
def train(args):
    args.save_dir += "_" + args.model_type + ("_lm" if not args.seq2seq else "_seq2seq")
    os.makedirs(args.save_dir, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if args.model_type == "lstm":
        from lstm import LMModel, Seq2SeqModel
    elif args.model_type == "transformer":
        from transformer import LMModel, Seq2SeqModel
    if args.seq2seq:
        train_set = Seq2SeqDataset(device=device)
        valid_set = Seq2SeqDataset(split="valid", device=device)
        model = Seq2SeqModel(args, train_set.dictionary).to(device)
    else:
        train_set = LMDataset(device=device)
        valid_set = LMDataset(split="valid", device=device)
        model = LMModel(args, train_set.dictionary).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    train_loader = DataLoader(train_set, batch_size=args.batch_size,
                              collate_fn=train_set.collate_fn, shuffle=True)
    evaluate(model, valid_set)
    for epoch in range(args.num_epoch):
        model.train()
        with tqdm(train_loader, desc="training") as pbar:
            losses = []
            for samples in pbar:
                optimizer.zero_grad()
                loss = model.get_loss(**samples)
                loss.backward()
                optimizer.step()
                losses.append(loss.item())
                pbar.set_description("Epoch: %d, Loss: %0.8f, lr: %0.6f"
                                     % (epoch + 1, np.mean(losses),
                                        optimizer.param_groups[0]['lr']))
        if epoch % args.save_interval == 0:
            torch.save(model, args.save_dir + "/{}_{}.pt".format(args.model_type, epoch + 1))
        evaluate(model, valid_set)
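# Hedged sketch (assumption): evaluate() is called above but not defined in
# this excerpt. A plausible version averages model.get_loss over the
# validation set, mirroring the training loop's collate_fn/get_loss interface.
def evaluate(model, valid_set, batch_size=32):
    model.eval()
    loader = DataLoader(valid_set, batch_size=batch_size,
                        collate_fn=valid_set.collate_fn, shuffle=False)
    losses = []
    with torch.no_grad():
        for samples in loader:
            losses.append(model.get_loss(**samples).item())
    loss = float(np.mean(losses))
    print("Valid, Loss: %0.8f, ppl: %0.8f" % (loss, np.exp(loss)))
    return loss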
def val_dataloader(self):
    dataset = Seq2SeqDataset(data_files=Path(self.test_folder).glob("*.npz"),
                             previous_poses=self.previous_poses,
                             predicted_poses=self.predicted_poses,
                             stride=self.stride,
                             with_context=True,
                             text_folder=self.text_folder,
                             vocab=self.vocab)
    loader = DataLoader(dataset, batch_size=self.batch_size,
                        shuffle=True, collate_fn=dataset.collate_fn)
    return loader
def train_dataloader(self):
    dataset = Seq2SeqDataset(
        Path(self.train_folder).glob("*.npz"),
        self.previous_poses,
        self.predicted_poses,
        self.stride,
        self.with_context,
        text_folder=self.text_folder,
        vocab=self.vocab
    )
    loader = DataLoader(
        dataset,
        batch_size=self.batch_size,
        shuffle=True,
        collate_fn=dataset.collate_fn
    )
    return loader
def __init__(self, input_size, hidden_size, batch_size, learning_rate, method, num_layers=1):
    dataset = Seq2SeqDataset()
    self.data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
    self.vocab = dataset.vocab
    self.output_size = len(self.vocab)
    self.char2index, self.index2char = self.data_index()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.num_layers = num_layers
    self.method = method
    self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    self.attn = Attn(method, hidden_size)
    self.encoder = Encoder(input_size, hidden_size, self.output_size, self.num_layers)
    self.decoder = Decoder(hidden_size, self.output_size, method, self.num_layers)
    self.attn = self.attn.to(self.device)
    self.encoder = self.encoder.to(self.device)
    self.decoder = self.decoder.to(self.device)
    self.loss_function = NLLLoss()
    self.encoder_optim = torch.optim.Adam(self.encoder.parameters(), lr=self.learning_rate)
    self.decoder_optim = torch.optim.Adam(self.decoder.parameters(), lr=self.learning_rate)
from argparse import ArgumentParser
from pathlib import Path

from dataset import Seq2SeqDataset

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--dest", type=str, required=True)
    parser.add_argument("--src", type=str)
    parser.add_argument("--text_folder", type=str)
    args = parser.parse_args()

    system = Seq2SeqSystem.load_from_checkpoint(
        args.checkpoint, train_folder=None, test_folder="data/dataset/test")
    system = system.eval().cuda()
    dataset = Seq2SeqDataset([Path(args.src)],
                             previous_poses=system.previous_poses,
                             predicted_poses=system.predicted_poses,
                             stride=system.predicted_poses,
                             with_context=system.with_context,
                             text_folder=args.text_folder,
                             vocab=system.vocab)
    prev_poses = system.previous_poses
    pred_poses = system.predicted_poses
    all_predictions = []
    dataset_iter = iter(dataset)
    x, y, p, w = next(dataset_iter)
    x = x.unsqueeze(1).cuda()
    p = p.unsqueeze(1).cuda()
    w = w.unsqueeze(1).cuda()
    pose = system(x, p, w)
    all_predictions.append(pose.squeeze(1).detach().cpu().numpy())
with open(trainingName, "rb") as FileTraining:
    # print(sys.argv[1])
    trainingData = pickle.load(FileTraining)
"""
with open(validName, "rb") as FileValidating:
    # print(sys.argv[1])
    validData = pickle.load(FileValidating)
"""
"""
with open("data/valid.jsonl", "r") as f:
    answers = [json.loads(line) for line in f]
answers = {a['id']: a for a in answers}
"""
trainingData = Seq2SeqDataset(trainingData)
# validData = Seq2SeqDataset(validData)

with open(embeddingName, 'rb') as f:
    embedding = pickle.load(f)

encoder = EncoderRNN(len(embedding.vocab), hidden_size,
                     embedding.vectors, BATCH_SIZE).to(device)
decoder = DecoderRNN(hidden_size, len(embedding.vocab),
                     embedding.vectors, BATCH_SIZE).to(device)

loader = Data.DataLoader(
    dataset=trainingData,    # torch TensorDataset format
    batch_size=BATCH_SIZE,   # mini batch size
    shuffle=True,            # whether to shuffle the data (shuffling is better)
    # num_workers=1,         # load data with multiple worker threads
def generate(args):
    model_class, tokenizer_class = register(args.model_class)
    if args.score_reference:
        args.batch_size = 1
        test_dataset = Seq2SeqDataset(
            tokenizer_class=tokenizer_class,
            tokenizer_path=args.save_dir,
            source_data_path=args.test_source_data_path,
            target_data_path=args.test_target_data_path
        )
    else:
        test_dataset = Seq2SeqDataset(
            tokenizer_class=tokenizer_class,
            tokenizer_path=args.save_dir,
            source_data_path=args.test_source_data_path
        )
    test_dataloader = test_dataset.get_dataloader(batch_size=args.batch_size, shuffle=False)

    model = model_class.from_pretrained(args.save_dir)
    model.to(DEVICE)
    model.eval()

    if not args.debug:
        num_batches = math.ceil(len(test_dataset) / args.batch_size)
        widgets = [
            progressbar.Percentage(), ' | ',
            progressbar.SimpleProgress(), ' ',
            progressbar.Bar('▇'), ' ',
            progressbar.Timer(), ' | ',
            progressbar.ETA()
        ]
        progress = progressbar.ProgressBar(
            max_value=num_batches,
            widgets=widgets,
            redirect_stdout=True
        ).start()

    output_file = open(args.output_path, 'w')
    for itr, data in enumerate(test_dataloader):
        if args.score_reference:
            src_input_ids, src_attn_mask, tgt_input_ids, tgt_attn_mask = (x.to(DEVICE) for x in data)
            labels = shift_target_inputs_to_labels(tgt_input_ids, test_dataset.tokenizer.pad_token_id)
            with torch.no_grad():
                output = model(
                    src_input_ids,
                    attention_mask=src_attn_mask,
                    decoder_input_ids=tgt_input_ids,
                    decoder_attention_mask=tgt_attn_mask,
                    labels=labels
                )
            score = output[0].item()
            output_file.write(str(score) + '\n')
        else:
            src_input_ids, src_attn_mask = (x.to(DEVICE) for x in data)
            with torch.no_grad():
                tgt_output_ids = model.generate(
                    src_input_ids,
                    attention_mask=src_attn_mask,
                    num_beams=args.beam_size,
                    num_return_sequences=args.num_return_sequences,
                    max_length=args.max_length
                )
            for seq_ids in tgt_output_ids.to('cpu').numpy().tolist():
                seq_toks = test_dataset.tokenizer.decode(
                    seq_ids,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=args.clean_up_tokenization_spaces
                )
                output_file.write(seq_toks + '\n')
        if not args.debug:
            progress.update(itr + 1)
    if not args.debug:
        progress.finish()
    output_file.close()
                         embedding.vectors, BATCH_SIZE, maxLength,
                         dropout_p=0.1).to(device)
decoder1.load_state_dict(torch.load(decoderName))
encoder1 = encoder1.to(device)
encoder1.eval()
decoder1 = decoder1.to(device)
decoder1.eval()

with open(testDataName, "rb") as FileTesting:
    # print(sys.argv[1])
    testingData = pickle.load(FileTesting)
testingData = Seq2SeqDataset(testingData)

def pad_to_len(seqs, to_len, padding=0):
    paddeds = []
    for seq in seqs:
        paddeds.append(seq[:to_len] + [padding] * max(0, to_len - len(seq)))
    return paddeds

def attention_collate_fn(samples):
    batch = {}
    for key in ['id', 'len_text', 'len_summary']:
        batch[key] = [sample[key] for sample in samples]
    for key in ['text', 'summary', 'attention_mask']:
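# Hedged usage sketch (added): pad_to_len above truncates each sequence to
# to_len and right-pads shorter ones; the inputs here are illustrative.
print(pad_to_len([[1, 2, 3], [4, 5, 6, 7, 8, 9]], 5))
# -> [[1, 2, 3, 0, 0], [4, 5, 6, 7, 8]]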
modelName = sys.argv[4]

with open(trainingName, "rb") as FileTraining:
    # print(sys.argv[1])
    trainingData = pickle.load(FileTraining)
with open(validName, "rb") as FileValidating:
    # print(sys.argv[1])
    validData = pickle.load(FileValidating)
"""
with open("data/valid.jsonl", "r") as f:
    answers = [json.loads(line) for line in f]
answers = {a['id']: a for a in answers}
"""
trainingData = Seq2SeqDataset(trainingData)
validData = Seq2SeqDataset(validData)

with open(embeddingName, 'rb') as f:
    embedding = pickle.load(f)
with open('../datasets/seq2seq/config.json', 'r') as f:
    config = json.load(f)
maxTextLen = config.get('max_text_len')
maxSummaryLen = config.get('max_summary_len')
# print(maxTextLen, maxSummaryLen)
maxLength = max(maxTextLen, maxSummaryLen)

encoder = AttnEncoderRNN(len(embedding.vocab), hidden_size,
                         embedding.vectors, BATCH_SIZE).to(device)
arg.add_argument('--recons',
                 help='Whether to enable reconstruction model or not',
                 action='store_true')
arg.add_argument('--evaluate',
                 help='Evaluate the model using the pretrained model',
                 action='store_true')
args = arg.parse_args()

evaluate = args.evaluate
copy = args.copy
recons = args.recons

print('------------ Loading Datasets ------------\n')
train_descs, train_slogans, valid_descs, valid_slogans, test_descs, test_slogans = load_csv(
    args.dataset_path)
train_data = Seq2SeqDataset(train_descs, train_slogans, (SRC, TRG))
test_data = Seq2SeqDataset(test_descs, test_slogans, (SRC, TRG))
valid_data = Seq2SeqDataset(valid_descs, valid_slogans, (SRC, TRG))

print('------------ Building Vocab ------------\n')
SRC.build_vocab(train_data, max_size=args.vocab_size)
TRG.build_vocab(train_data, max_size=args.vocab_size)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=args.bs,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device)
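# Hedged sketch (assumption): SRC and TRG are used above but never defined in
# this excerpt. With the legacy torchtext API that BucketIterator.splits comes
# from, a typical definition would be:
from torchtext.data import Field

SRC = Field(init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(init_token='<sos>', eos_token='<eos>', lower=True)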
def train(args):
    logfile = logging.FileHandler(args.save_dir + '/log.txt', mode='w')
    logfile.setFormatter(fmt)
    logger.addHandler(logfile)

    model_class, tokenizer_class = register(args.pretrained_model_path)
    train_dataset = Seq2SeqDataset(
        tokenizer_class=tokenizer_class,
        tokenizer_path=args.pretrained_model_path,
        source_data_path=args.train_source_data_path,
        target_data_path=args.train_target_data_path,
        indivisible_tokens_path=args.indivisible_tokens_path,
        cache_dir=args.cache_dir,
        save_tokenizer=args.save_dir
    )
    train_dataloader = train_dataset.get_dataloader(batch_size=args.batch_size, shuffle=True)
    valid_dataset = Seq2SeqDataset(
        tokenizer_class=tokenizer_class,
        tokenizer_path=args.save_dir,
        source_data_path=args.valid_source_data_path,
        target_data_path=args.valid_target_data_path
    )
    valid_dataloader = valid_dataset.get_dataloader(batch_size=args.valid_batch_size, shuffle=False)

    model = model_class.from_pretrained(args.pretrained_model_path, cache_dir=args.cache_dir)
    if args.indivisible_tokens_path is not None:
        model.resize_token_embeddings(len(train_dataset.tokenizer))
    model.to(DEVICE)
    model.train()
    logger.info(f'model\n{model}')
    num_total_params = sum(p.numel() for p in model.parameters())
    logger.info(f'total parameters: {num_total_params}')

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    optimizer.zero_grad()
    logger.info(f'optimizer\n{optimizer}')

    if not args.debug:
        train_num_batchs_per_epoch = math.ceil(len(train_dataset) / args.batch_size)
        train_progress_widgets = [
            progressbar.Percentage(), ' | ',
            progressbar.SimpleProgress(), ' | ',
            progressbar.Variable('step', width=0), ' | ',
            progressbar.Variable('loss', width=0, precision=6), ' ',
            progressbar.Bar('▇'), ' ',
            progressbar.Timer(), ' | ',
            progressbar.ETA()
        ]
        valid_num_batchs_per_epoch = math.ceil(len(valid_dataset) / args.valid_batch_size)
        valid_progress_widgets = [
            progressbar.Percentage(), ' | ',
            progressbar.SimpleProgress(), ' ',
            progressbar.Bar('▇'), ' ',
            progressbar.Timer(), ' | ',
            progressbar.ETA()
        ]

    global_step = 1
    best_valid_measure = math.inf
    best_epoch_itr = 0
    for epoch_itr in range(args.max_epoch):
        train_epoch_sum_loss = 0
        train_epoch_average_loss = 0
        logger.info(f'begin training epoch {epoch_itr+1}')
        if not args.debug:
            train_progress = progressbar.ProgressBar(
                max_value=train_num_batchs_per_epoch,
                widgets=train_progress_widgets,
                redirect_stdout=True
            ).start()
        for itr, data in enumerate(train_dataloader):
            src_input_ids, src_attn_mask, tgt_input_ids, tgt_attn_mask = (x.to(DEVICE) for x in data)
            labels = shift_target_inputs_to_labels(tgt_input_ids, train_dataset.tokenizer.pad_token_id)
            output = model(
                input_ids=src_input_ids,
                attention_mask=src_attn_mask,
                decoder_input_ids=tgt_input_ids,
                decoder_attention_mask=tgt_attn_mask,
                labels=labels
            )
            loss = output[0]
            # detach before accumulating so the epoch sum does not keep every
            # batch's computation graph alive
            train_epoch_sum_loss += loss.detach() * src_input_ids.shape[0]
            normalized_loss = loss / args.update_frequency
            normalized_loss.backward()
            global_step += 1
            if not args.debug:
                train_progress.update(itr + 1, step=global_step, loss=loss)
            if (itr + 1) % args.update_frequency == 0:
                optimizer.step()
                optimizer.zero_grad()
        if not args.debug:
            train_progress.finish()
        train_epoch_average_loss = train_epoch_sum_loss.item() / len(train_dataset)
        logger.info(f'average training loss: {train_epoch_average_loss}')

        logger.info(f'begin validation for epoch {epoch_itr+1}')
        model.eval()
        if not args.debug:
            valid_progress = progressbar.ProgressBar(
                max_value=valid_num_batchs_per_epoch,
                widgets=valid_progress_widgets,
                redirect_stdout=True
            ).start()
        valid_measure = 0
        if args.valid_bleu:
            hypotheses = []
            references = []
        else:
            valid_epoch_sum_loss = 0
        for itr, data in enumerate(valid_dataloader):
            src_input_ids, src_attn_mask, tgt_input_ids, tgt_attn_mask = (x.to(DEVICE) for x in data)
            if args.valid_bleu:
                with torch.no_grad():
                    tgt_output_ids = model.generate(
                        src_input_ids,
                        attention_mask=src_attn_mask,
                        num_beams=args.valid_beam_size,
                        max_length=args.valid_max_length
                    )
                for seq_ids in tgt_output_ids.to('cpu').numpy().tolist():
                    seq_toks = valid_dataset.tokenizer.decode(
                        seq_ids,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=False
                    )
                    hypotheses.append(seq_toks)
                for seq_ids in tgt_input_ids.to('cpu').numpy().tolist():
                    seq_toks = valid_dataset.tokenizer.decode(
                        seq_ids,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=False
                    )
                    references.append(seq_toks)
            else:
                labels = shift_target_inputs_to_labels(tgt_input_ids, valid_dataset.tokenizer.pad_token_id)
                with torch.no_grad():
                    output = model(
                        input_ids=src_input_ids,
                        attention_mask=src_attn_mask,
                        decoder_input_ids=tgt_input_ids,
                        decoder_attention_mask=tgt_attn_mask,
                        labels=labels
                    )
                valid_loss = output[0]
                valid_epoch_sum_loss += valid_loss * src_input_ids.shape[0]
            if not args.debug:
                valid_progress.update(itr + 1)
        model.train()
        if not args.debug:
            valid_progress.finish()
        if args.valid_bleu:
            bleu = sacrebleu.corpus_bleu(hypotheses, [references], force=True)
            valid_measure = -bleu.score
            logger.info(f'validation BLEU: {bleu.score}')
        else:
            valid_measure = valid_epoch_sum_loss.item() / len(valid_dataset)
            logger.info(f'validation loss: {valid_measure}')
        if valid_measure < best_valid_measure:
            logger.info('saving new best checkpoints')
            best_valid_measure = valid_measure
            best_epoch_itr = epoch_itr + 1
            model.save_pretrained(args.save_dir)
        if (epoch_itr + 1 - best_epoch_itr) > args.patience:
            logger.info(f'early stop since valid performance hasn\'t improved for the last {args.patience} epochs')
            break
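# Hedged sketch (assumption): shift_target_inputs_to_labels() is used in the
# training and generation code above but not defined in these excerpts. One
# plausible implementation shifts the decoder inputs left by one step and
# masks padding with -100 so the cross-entropy loss ignores those positions.
import torch

def shift_target_inputs_to_labels(tgt_input_ids, pad_token_id):
    # drop the first (BOS) token and append one pad at the end
    pad_column = tgt_input_ids.new_full((tgt_input_ids.size(0), 1), pad_token_id)
    labels = torch.cat([tgt_input_ids[:, 1:], pad_column], dim=1)
    # replace padding with -100, the ignore_index of the Hugging Face loss
    return labels.masked_fill(labels == pad_token_id, -100)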
        sentense_ints = vocab_to_int["<UNK>"]
    else:
        sentense_ints = vocab_to_int["<UNK>"]
        # print(word)
    # if eos:
    #     sentense_ints.append(vocab_to_int["<EOS>"])
    ints.append(sentense_ints)
    ints.append(vocab_to_int["<EOS>"])
    return ints

# seq2seq_preprocess(data_path_text, data_path_motion)
print("preprocess finished")

extro_data_test_path = 'head.npz'
word2idx = dic
test_set = Seq2SeqDataset(extro_data_test_path, word2idx)
test_dataloader = DataLoader(test_set, batch_size=1,
                             collate_fn=seq2seq_collate_fn)

#%%
# building model
dof_num = 4
embed_dim = 100
learning_rate = 1e-3
encoder_hidden_dim = 32
decoder_hidden_dim = 32
model_save_folder = './saved_models/'
def predict(self, test_path, test_cleaned_path, out_path):
    # read raw data to list
    lines_id = []
    lines_raw = []
    lines_cleaned = []
    lines_prep = []
    with open(test_path, 'r') as f, open(test_cleaned_path, 'r') as fc:
        for line in f:
            line_id = line[:3]
            line_seq = line[4:]
            lines_id.append(line_id)
            lines_raw.append(line_seq)
            lines_prep.append(self.preprocess(line_seq))
        for line in fc:
            lines_cleaned.append(line[4:])

    # prepare dataset
    print("Reading test data...")
    test = Seq2SeqDataset.from_list(lines_prep)
    test.src_field.vocab = self.src_vocab

    # prepare iterator
    test_iterator = BucketIterator(dataset=test, batch_size=1, train=False,
                                   sort=False, sort_within_batch=False,
                                   shuffle=False, device=device)

    # predict
    with open(out_path, 'w') as writer:
        with torch.no_grad():
            for i, batch in enumerate(test_iterator):
                # forward through model
                _, _, output = self.model(batch, has_targets=False,
                                          mask_softmax=1.0, teacher_forcing=1.0)
                print(output.shape)
                # get top-1
                predicted_values, predicted_indices = torch.max(output, dim=-1)
                print(predicted_values.shape)
                print(predicted_indices.shape)
                # convert predicted vocab indices to an actual sentence
                predicted_seq = [
                    self.tgt_vocab.itos[c]
                    for c in predicted_indices.squeeze(0).tolist()
                ]
                # print('predicted_seq')
                # print(predicted_seq)
                # output is log_softmax so do exp()
                predicted_values = predicted_values.exp()
                # print('predicted_values')
                # print(predicted_values)
                # convert to list
                predicted_values_ = predicted_values.squeeze(0).tolist()
                # beam search
                predicted_seq = self.beam_lm(''.join(predicted_seq[1:-1]),
                                             predicted_values_[1:-1],
                                             lines_raw[i])
                # match case and punctuation
                predicted_seq = self.match_case(predicted_seq, lines_raw[i])
                # post-process to match the submission output format
                predicted_seq = self.match_output(predicted_seq, lines_cleaned[i])
                print("{} {}".format(i, predicted_seq))
                # write to file with line_id
                writer.write(lines_id[i] + ',' + predicted_seq + '\n')
def train(args):
    if args.logdir is None:
        args.logdir = "Models-{}".format(time.strftime("%Y%m%d-%H%M%S"))
    task = "lm" if not args.seq2seq else "seq2seq"
    args.logdir += "_" + args.model_type + "_" + task
    os.makedirs(args.logdir, exist_ok=True)
    os.makedirs(os.path.join(args.logdir, "models"), exist_ok=True)
    print("Experiment dir : {}".format(args.logdir))

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(args.logdir, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    device = "cuda:" + str(args.gpuid) if torch.cuda.is_available() else "cpu"
    mem_crammer = []

    if args.model_type == "lstm":
        from lstm import LMModel, Seq2SeqModel
    elif args.model_type == "transformer":
        from transformer import LMModel, Seq2SeqModel
    if args.seq2seq:
        train_set = Seq2SeqDataset(device=device)
        valid_set = Seq2SeqDataset(split="valid", device=device)
        model = Seq2SeqModel(args, train_set.dictionary).to(device)
    else:
        train_set = LMDataset(device=device)
        valid_set = LMDataset(split="valid", device=device)
        model = LMModel(args, train_set.dictionary).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    warmup_epoch = args.num_epoch * 0.1
    scheduler = ExponentialLR(optimizer, 0.1 ** (1 / (args.num_epoch - warmup_epoch)))
    iter_per_epoch = (len(train_set) + args.batch_size - 1) // args.batch_size
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * warmup_epoch)
    train_loader = DataLoader(train_set, batch_size=args.batch_size,
                              collate_fn=train_set.collate_fn, shuffle=True)

    bestppl = 1e9
    for epoch in range(args.num_epoch):
        model.train()
        if args.cram:
            # grab as much GPU memory as possible up front; blocks are freed
            # one at a time below whenever a training step runs out of memory
            while True:
                try:
                    junk = torch.rand((9999, 9999), dtype=float, device=device)
                except RuntimeError:
                    with torch.cuda.device(device):
                        torch.cuda.empty_cache()
                    break
                mem_crammer.append(junk)
        with tqdm(train_loader, desc="training") as pbar:
            losses = []
            for samples in pbar:
                if epoch < warmup_epoch:
                    warmup_scheduler.step()
                optimizer.zero_grad()
                while True:
                    success = True
                    try:
                        loss = model.get_loss(**samples)
                        loss.backward()
                        optimizer.step()
                    except RuntimeError:
                        # out of memory: release one crammed block and retry
                        del mem_crammer[-1]
                        with torch.cuda.device(device):
                            torch.cuda.empty_cache()
                        success = False
                        optimizer.zero_grad()
                    if success:
                        break
                losses.append(loss.item())
                pbar.set_description("Epoch: %d, Loss: %0.8f, lr: %0.6f"
                                     % (epoch + 1, np.mean(losses),
                                        optimizer.param_groups[0]['lr']))
            logging.info("Epoch: %d, Loss: %0.8f, lr: %0.6f"
                         % (epoch + 1, np.mean(losses),
                            optimizer.param_groups[0]['lr']))
        if epoch % args.save_interval == 0:
            savepath = os.path.join(
                args.logdir, "models/{}_{}.pt".format(args.model_type, epoch + 1))
            torch.save(model, savepath)
            logging.info("Saving to {}".format(savepath))
        if task == "lm":
            print("好 -->", model.generate("好", beam_size=3, device=device))
            print("秋水 -->", model.generate("秋水", beam_size=3, device=device))
            print("寒烟翠 -->", model.generate("寒烟翠", beam_size=3, device=device))
        elif task == "seq2seq":
            print("改革春风吹满地 -->", model.generate("改革春风吹满地", beam_size=2, device=device))
            print("牛津大学聪明人不及蟾蜍一半 -->", model.generate("牛津大学聪明人不及蟾蜍一半", beam_size=2, device=device))
            print("一支穿云箭,青天白日重新现 -->", model.generate("一支穿云箭,青天白日重新现", beam_size=2, device=device))
        loss, ppl = evaluate(model, valid_set, False)
        logging.info("Valid, Loss: %0.8f, ppl: %0.8f" % (loss, ppl))
        if ppl < bestppl:
            bestppl = ppl
            savepath = os.path.join(
                args.logdir, "models/{}_{}.pt".format(args.model_type, task))
            torch.save(model, savepath)
            logging.info("Best ppl! Saving to {}".format(savepath))
        if epoch >= warmup_epoch:
            scheduler.step()
# coding=utf-8
from tool.config import loadConfig
from model import Seq2SeqModel
from dataset import Seq2SeqDataset

args = loadConfig('config.ini')
dataset = Seq2SeqDataset(args)
model = Seq2SeqModel(args)

if args.mode == 'train':
    print('training')
    train_set = dataset.getDatas('train')
    eval_set = dataset.getDatas('eval')
    model.train(train_set, eval_set)
elif args.mode == 'eval':
    print('evaluation')
    eval_set = dataset.getDatas('eval')
    model.eval(eval_set)
elif args.mode == 'predict':
    print('prediction')
    eval_set = dataset.getDatas('eval')
    print(dataset.ftk.convert_ids_to_tokens(eval_set[0][0]))
    pred_id = model.predict(eval_set[0][0], eval_set[2][0])
    print(dataset.ftk.convert_ids_to_tokens(pred_id))
elif args.mode == 'freeze':
    print('freeze')
    model.freeze()
elif args.mode == 'infer':
    print('infer')
    eval_set = dataset.getDatas('eval')
intro_data_train_path = './data/intro_seq2seq_dataset_train.npz'
intro_data_valid_path = './data/intro_seq2seq_dataset_valid.npz'
intro_data_test_path = './data/intro_seq2seq_dataset_test.npz'
natural_data_train_path = './data/natural_seq2seq_dataset_train.npz'
natural_data_valid_path = './data/natural_seq2seq_dataset_valid.npz'
natural_data_test_path = './data/natural_seq2seq_dataset_test.npz'
# data_path = './data/extro_seq2seq_dataset.npz'

dic = json.load(open("vocab_to_int.txt"))
word2idx = dic  # load word map
# torch.cuda.empty_cache()

train_set = Seq2SeqDataset(extro_data_train_path, word2idx)
valid_set = Seq2SeqDataset(extro_data_valid_path, word2idx)
test_set = Seq2SeqDataset(extro_data_test_path, word2idx)
# training_set = EmotionDataLoaderStart(X_train, y_train, tag_train, pad_len, word2id)

train_loader = DataLoader(train_set, batch_size, shuffle=True,
                          collate_fn=seq2seq_collate_fn)
valid_loader = DataLoader(valid_set, batch_size, shuffle=True,
                          collate_fn=seq2seq_collate_fn)
test_dataloader = DataLoader(test_set,
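# Hedged sketch (assumption): seq2seq_collate_fn is passed to the DataLoaders
# above but not defined in this excerpt. A minimal version pads the text side
# of each sample to the longest sequence in the batch; the (token-id list,
# motion array) sample layout is a guess based on the surrounding code.
def seq2seq_collate_fn(samples):
    texts, motions = zip(*samples)
    max_len = max(len(t) for t in texts)
    lengths = torch.tensor([len(t) for t in texts])
    padded = torch.tensor([list(t) + [0] * (max_len - len(t)) for t in texts])
    return padded, lengths, motions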
# test_data_path = './train_valid_test_data/' + personality + '_seq2seq_dataset_test.npz'
extro_data_train_sample_path = './data/extro_seq2seq_dataset_train_sample.npz'
extro_data_test_path = './data/extro_seq2seq_dataset_test.npz'
intro_data_test_path = './data/intro_seq2seq_dataset_test.npz'
natural_data_test_path = './data/natural_seq2seq_dataset_test.npz'

dic = json.load(open("vocab_to_int.txt"))
word2idx = dic  # load word map
idx2word = {v: k for k, v in word2idx.items()}

train_set = Seq2SeqDataset(extro_data_train_sample_path, word2idx)
test_set = Seq2SeqDataset(extro_data_test_path, word2idx)
test_dataloader = DataLoader(test_set, batch_size=1,
                             collate_fn=seq2seq_collate_fn)

#%%
# building model
dof_num = 4
embed_dim = 100
learning_rate = 1e-3
encoder_hidden_dim = 32
decoder_hidden_dim = 32