def main(args):
    # with open(args.output_dir / 'config.json') as f:
    #     config = json.load(f)

    # loading datasets from jsonl files
    # with open(config['train']) as f:
    #     train = [json.loads(line) for line in f]
    with open(args.valid_data_path) as f:
        valid = [json.loads(line) for line in f]
    # with open(config['test']) as f:
    #     test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open('embedding2.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid),
                           'valid_seq2seq.pkl',
                           tokenizer.pad_token_id)
def main(args):
    # loading datasets from jsonl files
    with open(args.input_data_path) as f:
        valid = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    # Rebuild the embedding from GloVe if needed:
    # embedding = Embedding("./glove.6B.300d.txt", words=words)
    # with open('./embedding.pkl', 'wb') as f:
    #     pickle.dump(embedding, f)
    with open('./embedding.pkl', 'rb') as file:
        embedding = pickle.load(file)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid),
                           'data.pkl',
                           tokenizer.pad_token_id)
def main(args):
    with open(args.test_input) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in test]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open(args.embedding_file, 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.test_output,
        tokenizer.pad_token_id
    )
def main(path):
    with open(path) as f:
        test = [json.loads(line) for line in f]

    with open("./datasets/seq_tag/embedding.pkl", "rb") as f:
        embedding = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           './datasets/seq_tag/test.pkl')
def main(args):
    with open(args.output_dir / 'config.json') as f:
        config = json.load(f)

    # loading datasets from jsonl files
    with open(config['train']) as f:
        train = [json.loads(line) for line in f]
    with open(config['valid']) as f:
        valid = [json.loads(line) for line in f]
    with open(config['test']) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = (
        [sample['text'] for sample in train]
        + [sample['summary'] for sample in train]
        + [sample['text'] for sample in valid]
        + [sample['text'] for sample in test]
    )

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=config['lower_case'])
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    embedding = Embedding(config['embedding'], words=words)
    with open(args.output_dir / 'embedding.pkl', 'wb') as f:
        pickle.dump(embedding, f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating train dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, train),
        args.output_dir / 'train.pkl', config, tokenizer.pad_token_id
    )
    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, valid),
        args.output_dir / 'valid.pkl', config, tokenizer.pad_token_id
    )
    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.output_dir / 'test.pkl', config, tokenizer.pad_token_id
    )
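# A minimal entry point sketch for the preprocessing main() above. Only
# `output_dir` is assumed here (main() reads config.json from it and writes
# the embedding and dataset pickles back into it); the original script's
# actual argument parsing is not shown.
import argparse
import logging
from pathlib import Path


def parse_args():
    parser = argparse.ArgumentParser(
        description='Build embedding.pkl and train/valid/test seq2seq pickles.')
    parser.add_argument('output_dir', type=Path,
                        help='directory containing config.json; outputs are written here')
    return parser.parse_args()


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main(parse_args())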
def main(args):
    # Read test file
    with open(args.input_dataname) as f:
        test = [json.loads(line) for line in f]

    # Read embedding
    with open(str(args.output_dir) + '/embedding_tag.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           args.output_dir / 'test_tag.pkl',
                           tokenizer.pad_token_id)
def eval(args):
    batch_size = 32
    train_on_gpu = torch.cuda.is_available()

    enc = RNNEncoder(300, args.embedding_file)
    dec = RNNDecoder(300, args.embedding_file)
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
    model = Seq2Seq(enc, dec, device).to(device)

    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()

    embedding_matrix = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding_matrix.vocab)

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data, batch_size=batch_size, num_workers=0,
                             shuffle=False, collate_fn=eval_data.collate_fn)

    output_file = open(args.output_path, 'w')
    val_losses = []
    prediction = {}
    for batch in tqdm(eval_loader):
        pred = model(batch, 0)
        pred = torch.argmax(pred, dim=2)  # [batch, seq_len]
        for i in range(len(pred)):
            # drop the leading <s> token and cut the summary at </s>
            prediction[batch['id'][i]] = tokenizer.decode(
                pred[i]).split('</s>')[0].split(' ', 1)[1]

    pred_output = [
        json.dumps({'id': key, 'predict': value})
        for key, value in sorted(prediction.items(), key=lambda item: item[0])
    ]
    output_file.write('\n'.join(pred_output))
    output_file.write('\n')
    output_file.close()
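# Hypothetical command-line wrapper for eval() above. The flag names simply
# mirror the attributes eval() reads from `args` (embedding_file, model_path,
# test_data_path, output_path); the original script's CLI is not shown, so
# treat this as a sketch, e.g.:
#   python eval_seq2seq.py --embedding_file datasets/seq2seq/embedding.pkl \
#       --model_path src/model_state/seq2seq/ckpt.6.pt \
#       --test_data_path datasets/seq2seq/test.pkl --output_path predict.jsonl
import argparse
import logging


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate summaries with a trained seq2seq model.')
    parser.add_argument('--embedding_file', required=True, help='pickled Embedding object')
    parser.add_argument('--model_path', required=True, help='checkpoint from the training script')
    parser.add_argument('--test_data_path', required=True, help='pickled test dataset')
    parser.add_argument('--output_path', required=True, help='where to write the jsonl predictions')
    logging.basicConfig(level=logging.INFO)
    eval(parser.parse_args())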
def main(args):
    with open(args.output_dir / 'config.json', 'r') as f:
        config = json.load(f)

    with open(args.input_data) as f:
        test = [json.loads(line) for line in f]

    with open(os.path.join(args.output_dir, "embedding.pkl"), 'rb') as f:
        embedding = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, test),
                           args.output_dir / 'test_seq.pkl',
                           config,
                           tokenizer.pad_token_id)
def main(argv):
    with open(CONFIG, 'r') as f:
        config = json.load(f)

    # loading datasets from jsonl files
    testName = argv[1]
    with open(testName, 'r') as f:
        test = [json.loads(line) for line in f]

    tokenizer = Tokenizer(lower=config['lower_case'])

    logging.info('Loading embedding...')
    with open(ENBEDDINT_NAME, 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, test),
                           'testSeq2Seq.pkl',
                           config,
                           tokenizer.pad_token_id)
def eval(args):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    BATCH_SIZE = 32
    ENC_HID_DIM = 128
    DEC_HID_DIM = 128
    N_LAYERS = 1
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5
    PADDING_INDEX = 0

    embedding = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)
    embedding_matrix = embedding.vectors.to(device)
    output_dim = len(embedding.vectors)
    embedding_dim = 300

    attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
    encoder = Encoder(embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, ENC_DROPOUT)
    decoder = Decoder(output_dim, embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, DEC_DROPOUT, attn)
    model = Seq2Seq(encoder, decoder, PADDING_INDEX, device).to(device)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=PADDING_INDEX)

    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data, batch_size=BATCH_SIZE, num_workers=0,
                             shuffle=False, collate_fn=eval_data.collate_fn)

    output_file = open(args.output_path, 'w')
    val_losses = []
    prediction = {}
    for batch in tqdm(eval_loader):
        pred, attention = model(batch, 0)  # the 0 is presumably the teacher-forcing ratio
        pred = torch.argmax(pred, dim=2)   # [seq_len, batch]
        pred = pred.permute(1, 0)          # [batch, seq_len]
        for i in range(len(pred)):
            # drop the leading <s> token and cut the summary at </s>
            prediction[batch['id'][i]] = tokenizer.decode(
                pred[i]).split('</s>')[0].split(' ', 1)[1]

    pred_output = [json.dumps({'id': key, 'predict': value})
                   for key, value in sorted(prediction.items(),
                                            key=lambda item: item[0])]
    output_file.write('\n'.join(pred_output))
    output_file.write('\n')
    output_file.close()
def main():
    TRAIN = 'datasets/seq2seq/train.pkl'
    train = pickle.load(open(TRAIN, 'rb'))
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

    batch_size = 32
    train_loader = DataLoader(train, batch_size=batch_size, num_workers=0,
                              shuffle=False, collate_fn=train.collate_fn)

    embedding_matrix = pickle.load(open("datasets/seq2seq/embedding.pkl", 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding_matrix.vocab)

    encoder = RNNEncoder(300, "datasets/seq2seq/embedding.pkl")
    decoder = RNNDecoder(300, "datasets/seq2seq/embedding.pkl")
    model = Seq2Seq(encoder, decoder, device).to(device)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # 0 is the padding index

    n_epochs = 6
    print_every = 2238
    counter = 0
    valid_loss_min = np.Inf

    model.train()
    for epoch in range(1, n_epochs + 1):
        logging.info('Training')
        train_losses = []
        loss = 0
        counter = 0
        for batch in tqdm(train_loader):
            counter += 1
            # batch['text']:        [batch, txt_len]
            # batch['summary']:     [batch, trg_len]
            # batch['padding_len']: [batch]
            optimizer.zero_grad()
            output = model(batch)  # [batch, trg_len, vocab_size]
            output_dim = output.shape[-1]
            # skip the first (<s>) position and flatten for the loss
            output = output[:, 1:, :].reshape(-1, output_dim)        # [batch*(trg_len-1), vocab_size]
            target = batch['summary'][:, 1:].reshape(-1).to(device)  # [batch*(trg_len-1)]
            loss = criterion(output, target)
            train_losses.append(loss.item())
            loss.backward()
            optimizer.step()

        checkpoint_path = f'src/model_state/seq2seq/ckpt.{epoch}.pt'
        torch.save({
            'state_dict': model.state_dict(),
            'epoch': epoch,
        }, checkpoint_path)
        print("Epoch: {}/{}...".format(epoch, n_epochs),
              "Loss: {:.6f}...".format(np.mean(train_losses)))
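# Entry point sketch for the training main() above. All paths are hard-coded
# inside main(), so no CLI arguments are needed; the checkpoint directory is
# created up front because torch.save() does not create missing folders.
import logging
import os


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    os.makedirs('src/model_state/seq2seq', exist_ok=True)
    main()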
# Note: this excerpt starts mid-script; `f` is a config file handle opened
# just above it (not shown here).
config = json.load(f)
tokenizer = Tokenizer(lower=config['lower_case'])
solver = Solver(tokenizer=tokenizer)

arg = sys.argv
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if arg[1] == "train":  # python src/main.py train <batch_size>
    with open("datasets/seq2seq/train.pkl", 'rb') as f:
        train = pickle.load(f)
    with open("datasets/seq2seq/valid.pkl", 'rb') as f:
        valid = pickle.load(f)
    with open("datasets/seq2seq/embedding.pkl", 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)
    solver.tokenizer = tokenizer

    batch_size = int(arg[2])

    # number of train/valid batches (ceiling division)
    t_l = len(train)
    if t_l % batch_size == 0:
        t_bl = t_l // batch_size
    else:
        t_bl = t_l // batch_size + 1
    v_l = len(valid)
    if v_l % batch_size == 0:
        v_bl = v_l // batch_size
    else:
        v_bl = v_l // batch_size + 1

    # pre-collate the training data into batches
    train_batches = [
        train.collate_fn([train[j] for j in range(i * batch_size,
                                                  min((i + 1) * batch_size, t_l))])
        for i in range(t_bl)
    ]
    print(train_batches[0]['summary'][0])
def main(args):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    BATCH_SIZE = 32
    ENC_HID_DIM = 128
    DEC_HID_DIM = 128
    N_LAYERS = 1
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5
    PADDING_INDEX = 0

    embedding = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)
    embedding_matrix = embedding.vectors.to(device)
    output_dim = len(embedding.vectors)
    embedding_dim = 300

    attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
    encoder = Encoder(embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, ENC_DROPOUT)
    decoder = Decoder(output_dim, embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, DEC_DROPOUT, attn)
    model = Seq2Seq(encoder, decoder, PADDING_INDEX, device).to(device)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=PADDING_INDEX)

    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()  # disable dropout while visualizing attention

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data, batch_size=BATCH_SIZE, num_workers=0,
                             shuffle=False, collate_fn=eval_data.collate_fn)

    val_losses = []
    prediction = {}
    # run a single batch and visualize the attention weights of its last sample
    for batch in tqdm(eval_loader):
        text = batch['text'].to(device)
        text_len = batch['padding_len']
        truth = batch["summary"].to(device)
        text = text.permute(1, 0)
        truth = truth.permute(1, 0)
        pred, attn_weights = model(text, text_len, truth, 0)
        pred = torch.argmax(pred, dim=2)
        pred = pred.permute(1, 0)
        break

    text = text.permute(1, 0)
    attn_weights = attn_weights.permute(1, 0, 2)

    # first padding position of the input (pad id 0) ...
    for i in range(len(text[-1])):
        if text[-1][i] == 0:
            text_stop = i
            break
    # ... and first end-of-summary token of the prediction (id 2)
    for i in range(len(pred[-1])):
        if pred[-1][i] == 2:
            pred_stop = i
            break

    input_ids = text[-1][0:text_stop]
    attention = attn_weights[-1][1:pred_stop + 1, 0:text_stop]
    output_ids = pred[-1][1:pred_stop + 1]
    showAttention([embedding.vocab[t] for t in input_ids],
                  [embedding.vocab[t] for t in output_ids],
                  attention)