def transformer_predict(input_file: str, text_encoder: TextEncoder, device: int):
    if device > -1:
        device_name = "cuda"
    else:
        device_name = "cpu"
    print(input_file)

    n_ctx = 512
    transformer = TransformerModel(DEFAULT_CONFIG, n_ctx=n_ctx, requires_grad=False)
    load_openai_pretrained_model(transformer, n_ctx=n_ctx)

    with open(input_file) as f:
        sentences = f.readlines()

    encoded_sentences = text_encoder.encode(sentences)
    # binary masks marking real tokens vs. padding (currently unused below)
    masks = [
        np.concatenate((np.ones(len(s)), np.zeros(n_ctx - len(s))))
        for s in encoded_sentences
    ]
    input_tensor = torch.LongTensor([
        pad_sequence_to_length(s, desired_length=n_ctx)
        for s in encoded_sentences
    ])
    if device_name == "cuda":
        input_tensor = input_tensor.cuda()

    # stack token ids with their positional indices along the last dimension
    batch_size, num_timesteps = input_tensor.size()
    positional_encodings = get_range_vector(num_timesteps, device) + n_ctx
    batch_tensor = torch.stack(
        [input_tensor, positional_encodings.expand(batch_size, num_timesteps)],
        dim=-1)

    if device_name == "cuda":
        transformer = transformer.cuda()
    transformer_embeddings = transformer(batch_tensor)

    # save the input/output pair as test fixtures
    np.save("openai_transformer_test_input.npy", batch_tensor.data.cpu().numpy())
    np.save("openai_transformer_test_output.npy",
            transformer_embeddings.data.cpu().numpy())
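# Sketch (not from the original source): regenerating the .npy fixtures above.
# The BPE encoder paths and the input file name are placeholders; device=-1
# keeps everything on CPU.
if __name__ == "__main__":
    example_encoder = TextEncoder("model/encoder_bpe_40000.json",
                                  "model/vocab_40000.bpe")
    transformer_predict("sentences.txt", example_encoder, device=-1)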
def load_openai_gpt(n_special=1, n_ctx=512):
    text_encoder = TextEncoder("pytorch-openai-transformer-lm/model/encoder_bpe_40000.json",
                               "pytorch-openai-transformer-lm/model/vocab_40000.bpe")
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    vocab = n_vocab + n_special + n_ctx

    args = DEFAULT_CONFIG
    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                 path="pytorch-openai-transformer-lm/model/",
                                 path_names="pytorch-openai-transformer-lm/")
    # lm_model.to(device)
    lm_model.return_probs = False
    lm_model.eval()
    return lm_model, text_encoder
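# Sketch (not from the original source): running a forward pass with the model
# returned above. It assumes LMModel takes a [batch, seq, 2] tensor whose last
# dimension holds (token id, position id) pairs, and that position ids start
# right after the BPE vocab plus the single special token; the sentence is a
# placeholder.
lm_model, text_encoder = load_openai_gpt()
tokens = text_encoder.encode(["The quick brown fox jumps over the lazy dog."])[0]
n_vocab = len(text_encoder.encoder)

x = torch.zeros(1, len(tokens), 2, dtype=torch.long)
x[0, :, 0] = torch.tensor(tokens)                                   # BPE token ids
x[0, :, 1] = torch.arange(n_vocab + 1, n_vocab + 1 + len(tokens))   # position ids (n_special=1)
with torch.no_grad():
    lm_logits = lm_model(x)                                          # next-token logits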
def __init__(self):
    # initialize lm and text encoder and everything
    # set up the encoder to turn words into indices
    encoder_path = 'model/encoder_bpe_40000.json'
    bpe_path = 'model/vocab_40000.bpe'
    self.text_encoder = TextEncoder(encoder_path, bpe_path)
    self.nvocab = len(self.text_encoder.encoder)
    nctx = 512  # number of positional embeddings (nctx = number of context)
    vocab = self.nvocab + nctx

    # set up pretrained openai model
    args = DEFAULT_CONFIG
    self.lm_model = LMModel(args, vocab, nctx, return_probs=True)
    load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
    self.lm_model.eval()  # put the model in eval mode so we don't do dropout

    # set up spacy for pos tagging
    self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])
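# Sketch (hypothetical companion method, not part of the original class): using
# the spaCy pipeline initialized above to get coarse POS tags for a sentence.
def pos_tags(self, sentence):
    """Return (token, POS) pairs, e.g. [('The', 'DET'), ('cat', 'NOUN'), ...]."""
    return [(token.text, token.pos_) for token in self.nlp(sentence)]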
model_stepwise = StepwiseClassifierModel(args, n_classifier=args.n_classes,
                                         vocab_count=args.vocab_count,
                                         extra_block=args.extra_block)

model_opt = OpenAIAdam(model_stepwise.parameters(),
                       lr=args.lr, schedule=args.lr_schedule,
                       warmup=args.lr_warmup, t_total=n_updates_total,
                       b1=args.b1, b2=args.b2, e=args.e,
                       l2=args.l2, vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)

epoch_start, epoch_max, loss_best = -1, args.n_epoch, None

if args.checkpoint is None:
    load_openai_pretrained_model(
        model_stepwise.transformer,
        n_special=args.tokens_special,
        n_ctx=n_ctx,  # n_ctx adjusts embedding size to include positional
        path=pretrained_model_path + '/',
        path_names=os.path.join('.', 'orig', 'pytorch-openai-transformer-lm') + '/',
    )

model_stepwise.to(device)

if torch.cuda.device_count() > 1:
    # https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model_stepwise = nn.DataParallel(model_stepwise)

os.makedirs('./checkpoints', exist_ok=True)

if args.checkpoint is not None:
    checkpoint = torch.load(args.checkpoint, map_location=lambda storage, loc: storage)
    epoch_start = checkpoint['epoch']
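    # Sketch (assumed checkpoint layout, not shown in the snippet above): restore
    # model and optimizer state alongside the epoch counter; the 'model' and
    # 'optimizer' keys are hypothetical.
    model_stepwise.load_state_dict(checkpoint['model'])
    model_opt.load_state_dict(checkpoint['optimizer'])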
model_opt = OpenAIAdam(dh_model.parameters(),
                       lr=args.lr, schedule=args.lr_schedule,
                       warmup=args.lr_warmup, t_total=n_updates_total,
                       b1=args.b1, b2=args.b2, e=args.e,
                       l2=args.l2, vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)
compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion, args.lm_coef, model_opt)

load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)

dh_model.to(device)
dh_model = nn.DataParallel(dh_model)

n_updates = 0
n_epochs = 0
if dataset != 'stsb':
    trYt = trY
if submit:
    path = os.path.join(save_dir, desc, 'best_params')
    torch.save(dh_model.state_dict(), make_path(path))
best_score = 0
for i in range(args.n_iter):
    print("running epoch", i)
    run_epoch()
model_opt = OpenAIAdam(list(model.parameters()) + list(clf_head.parameters()) + list(lm_head.parameters()),
                       lr=args.lr, schedule=args.lr_schedule,
                       warmup=args.lr_warmup, t_total=n_updates_total,
                       b1=args.b1, b2=args.b2, e=args.e,
                       l2=args.l2, vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)
compute_loss_fct = LossCompute(criterion, criterion, args.lm_coef, model_opt)

load_openai_pretrained_model(model, n_ctx=n_ctx, n_special=n_special)

model.to(device)
lm_head.to(device)
clf_head.to(device)

n_updates = 0
n_epochs = 0
if dataset != 'stsb':
    trYt = trY
if submit:
    path = os.path.join(save_dir, desc, 'best_params')
    torch.save(model.state_dict(), make_path(path))
best_score = 0
for i in range(args.n_iter):
    print("running epoch", i)
criterion = nn.CrossEntropyLoss(reduction='none')  # per-token losses (reduce=False in older PyTorch)
model_opt = OpenAIAdam(lm_model.parameters(),
                       lr=args.lr, schedule=args.lr_schedule,
                       warmup=args.lr_warmup, t_total=n_updates_total,
                       b1=args.b1, b2=args.b2, e=args.e,
                       l2=args.l2, vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)
compute_loss_fct = LMLossCompute(criterion, model_opt)

load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special, n_vocab=n_vocab)

lm_model.to(device)
lm_model = nn.DataParallel(lm_model)

n_updates = 0
n_epochs = 0
if submit:
    path = os.path.join(save_dir, desc, 'best_params')
    print(path)
    torch.save(lm_model.state_dict(), make_path(path))
best_score = 0
for i in range(args.n_iter):
    print("running epoch", i)
    run_epoch()
    n_epochs += 1
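# Sketch (illustrative, not the original run_epoch): what a single language-model
# update with the objects built above might look like. The (tokens+positions,
# loss mask) batch layout and the shape of the logits returned by lm_model are
# assumptions.
def example_lm_step(xmb, mmb):
    lm_logits = lm_model(xmb)                             # predictions for the next token at each position
    targets = xmb[:, 1:, 0].contiguous().view(-1)         # gold next-token ids
    lm_losses = criterion(lm_logits.view(targets.size(0), -1), targets)
    mask = mmb[:, 1:].contiguous().view(-1)
    loss = (lm_losses * mask).sum() / mask.sum()          # average over unmasked positions
    loss.backward()
    model_opt.step()
    model_opt.zero_grad()
    return loss.item()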
dh_model.to(device)
dh_model = nn.DataParallel(dh_model)

print("Loading snapshot...")
snapshot_dict = torch.load(
    os.path.join(args.snapshot_dir, 'best_params'))
if args.snapshot_mode == 'transformer_only':
    # overwrite only the transformer weights; keep the current task head
    model_dict = dh_model.state_dict()
    model_dict.update({
        k: v
        for k, v in snapshot_dict.items()
        if 'task_head' not in k
    })
    snapshot_dict = model_dict
    dh_model.load_state_dict(snapshot_dict)
else:
    load_openai_pretrained_model(dh_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special,
                                 n_transfer=args.n_transfer)

dh_model.to(device)
dh_model = nn.DataParallel(dh_model)

n_train = len(trY)
n_valid = len(vaY)
n_batch_train = args.n_batch * max(n_gpu, 1)
n_updates_total = (n_train // n_batch_train) * args.n_iter

criterion = nn.CrossEntropyLoss(reduction='none')  # per-token losses (reduce=False in older PyTorch)
model_opt = OpenAIAdam(dh_model.parameters(),
                       lr=args.lr, schedule=args.lr_schedule,
                       warmup=args.lr_warmup,
def main(args):
    init(args)
    # Constants
    n_ctx = args.n_ctx
    data_dir = args.data_dir
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    text_encoder.decoder[len(encoder)] = '_start_'
    encoder['_start_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_delimiter_'
    encoder['_delimiter_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_classify_'
    encoder['_classify_'] = len(encoder)
    n_special = 3  # XD: useless for language modeling task

    vocab = n_vocab + n_special + n_ctx
    lm_model = LMModel(args, vocab, n_ctx, return_probs=True, doc_embed=args.doc_model)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        for key in list(state_dict.keys()):
            # strip the "module." prefix left by nn.DataParallel when the checkpoint was saved
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12  # mask out positional-embedding rows of the output distribution
        state_dict['pos_emb_mask'] = pos_emb_mask
        lm_model.load_state_dict(state_dict)

    lm_model.to(device)
    lm_model = DataParallelModel(lm_model)

    train_bar = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, encoder,
                           num_workers=1, shuffle=True, max_size=args.n_iter)

    srcs, hyps, refs = [], [], []
    with torch.no_grad():
        lm_model.eval()
        for i, (pad_output, mask_output) in enumerate(tqdm(train_bar), 1):
            src_strs, tgt_strs, gen_strs = generate_outputs(
                lm_model, pad_output, mask_output, text_encoder, device,
                args.beam, args.gen_len, args.k, args.decoding_strategy)
            srcs.extend(src_strs)
            hyps.extend(gen_strs)
            refs.extend(tgt_strs)

    for i in range(len(hyps)):
        print("*" * 50)
        print("Source: {}".format(srcs[i]))
        print("Hypothesis: {}".format(hyps[i]))
        print("Reference: {}".format(refs[i]))
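    # Sketch (not in the original script): a cheap unigram-overlap F1 between each
    # hypothesis and its reference, as a quick sanity check on the generations.
    def unigram_f1(hyp, ref):
        hyp_tokens, ref_tokens = hyp.split(), ref.split()
        overlap = len(set(hyp_tokens) & set(ref_tokens))
        if overlap == 0:
            return 0.0
        precision, recall = overlap / len(hyp_tokens), overlap / len(ref_tokens)
        return 2 * precision * recall / (precision + recall)

    scores = [unigram_f1(h, r) for h, r in zip(hyps, refs)]
    print("Mean unigram F1: {:.4f}".format(sum(scores) / max(len(scores), 1)))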
def main(args):
    # Constants
    n_ctx = args.n_ctx
    desc = args.desc
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    test_loader = get_loader(args.data_file, args.n_batch, encoder,
                             num_workers=1, shuffle=False, subset=args.subset)

    vocab = n_vocab + n_special + n_ctx
    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    print("Loading model...")
    load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                 path="./model/", path_names="./")
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        for key in list(state_dict.keys()):
            # strip the "module." prefix left by nn.DataParallel when the checkpoint was saved
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12  # mask out positional-embedding rows of the output distribution
        state_dict['pos_emb_mask'] = pos_emb_mask
        dh_model.load_state_dict(state_dict)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)

    stop_words = []
    if args.stop_words is not None:
        with open(args.stop_words) as f:
            for line in f:
                stop_words.append(line.strip())  # drop the trailing newline from each stop word

    evaluate_model(dh_model, test_loader, text_encoder, device, args.beam, args.gen_len,
                   args.k, args.decoding_strategy, args.save_file, args.gen_dir, args.tgt_dir,
                   args.max_len, stop_words, args)
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    text_encoder = TextEncoder(args.encoder_path, args.vocab_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    train_loader = get_loader(os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch,
                              encoder, num_workers=3, shuffle=True)
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, encoder,
                            num_workers=0, shuffle=False, max_size=args.num_val_examples)
    print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))

    vocab = n_vocab + n_special + n_ctx
    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs_dat + args.num_epochs_ft)

    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr, schedule=args.lr_schedule,
                           warmup=args.lr_warmup, t_total=n_updates_total,
                           b1=args.b1, b2=args.b2, e=args.e,
                           l2=args.l2, vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    lm_loss = LMLoss(criterion)
    summary_loss = SummaryLoss(criterion)

    print("Loading Model")
    if args.use_pretrain:
        load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                     path="./model/", path_names="./")
    start_iter, running_loss = load_checkpoint(args.checkpoint, dh_model, model_opt, vocab, n_ctx)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    lm_loss = DataParallelCriterion(lm_loss)
    summary_loss = DataParallelCriterion(summary_loss)

    for i in range(args.num_epochs_dat):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, lm_loss, model_opt,
                                             train_loader, val_loader, train_log_interval,
                                             val_log_interval, device, beam, gen_len, k,
                                             decoding_strategy, accum_iter,
                                             "DAT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_dat),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress,
                                             summary_loss=summary_loss)
    for i in range(args.num_epochs_ft):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, summary_loss, model_opt,
                                             train_loader, val_loader, train_log_interval,
                                             val_log_interval, device, beam, gen_len, k,
                                             decoding_strategy, accum_iter,
                                             "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_ft),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress)