def load_model(save_dir, model_file='model.pt', text_encoder_file='text_encoder.pkl', label_encoder_file='label_encoder.pkl'):
    """Load a saved DoubleHeadModel together with its text and label encoders.

    Args:
        save_dir: directory holding the three artifact files.
        model_file: filename of the serialized model weights.
        text_encoder_file: filename of the pickled text encoder.
        label_encoder_file: filename of the pickled label encoder.

    Returns:
        Tuple of (model, text_encoder, label_encoder).
    """
    model = DoubleHeadModel.load_from_file(join(save_dir, model_file))
    # NOTE(review): pickle.load executes arbitrary code from the file —
    # only load encoder files from a trusted save_dir.
    with open(join(save_dir, text_encoder_file), 'rb') as f:
        text_encoder = pickle.load(f)
    with open(join(save_dir, label_encoder_file), 'rb') as f:
        label_encoder = pickle.load(f)
    return model, text_encoder, label_encoder
# NOTE(review): this chunk begins mid-statement — apparently the tail of an
# `n_ctx = min(max([...` length computation whose opening is outside this
# view. Code left byte-identical.
# Sets up a 3-way veracity classifier: TrainingEngine.transform_veracity
# packs each split, DoubleHeadModel gets a ('classification', 3) head,
# the CE loss is unreduced (reduction='none'), and OpenAIAdam is scheduled
# over n_updates_total = batches-per-epoch * n_iter.
for x in vaX] + [len(x[:max_len]) for x in teX]) + 3, n_ctx) vocab = n_vocab + n_special + n_ctx training_engine = TrainingEngine() trX, trM = training_engine.transform_veracity(trX) vaX, vaM = training_engine.transform_veracity(vaX) if submit: teX, teM = training_engine.transform_veracity(teX) n_train = len(trY) n_valid = len(vaY) n_batch_train = args.n_batch * max(n_gpu, 1) n_updates_total = (n_train // n_batch_train) * args.n_iter dh_model = DoubleHeadModel(args, clf_token, ('classification', 3), vocab, n_ctx) criterion = nn.CrossEntropyLoss(reduction='none') model_opt = OpenAIAdam(dh_model.parameters(), lr=args.lr, schedule=args.lr_schedule, warmup=args.lr_warmup, t_total=n_updates_total, b1=args.b1, b2=args.b2, e=args.e, l2=args.l2, vector_l2=args.vector_l2, max_grad_norm=args.max_grad_norm) compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion, args.lm_coef, model_opt)
# NOTE(review): this chunk begins mid-statement — apparently the tail of an
# `n_ctx = min(...)` computation over the three aligned test sequences;
# the opening is outside this view. Code left byte-identical.
# ROC-stories-style setup: transform_roc packs each (x1, x2, x3) triple and
# DoubleHeadModel uses the 'multiple_choice' task head.
# NOTE(review): nn.CrossEntropyLoss(reduce=False) is deprecated in modern
# PyTorch; reduction='none' is the equivalent spelling.
] + [ len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3) ]) + 3, n_ctx) vocab = n_vocab + n_special + n_ctx trX, trM = transform_roc(trX1, trX2, trX3) vaX, vaM = transform_roc(vaX1, vaX2, vaX3) if submit: teX, teM = transform_roc(teX1, teX2, teX3) n_train = len(trY) n_valid = len(vaY) n_batch_train = args.n_batch * max(n_gpu, 1) n_updates_total = (n_train // n_batch_train) * args.n_iter dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab, n_ctx) criterion = nn.CrossEntropyLoss(reduce=False) model_opt = OpenAIAdam(dh_model.parameters(), lr=args.lr, schedule=args.lr_schedule, warmup=args.lr_warmup, t_total=n_updates_total, b1=args.b1, b2=args.b2, e=args.e, l2=args.l2, vector_l2=args.vector_l2, max_grad_norm=args.max_grad_norm) compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion, args.lm_coef, model_opt)
# Binary tweet classifier setup: fixed 140-token budget plus two special
# tokens (start + classify), DoubleHeadModel with a ('classification', 2)
# head, unreduced CE loss, and OpenAIAdam with a linear-warmup schedule.
# NOTE(review): this chunk is cut off mid-call — the MultipleChoiceLossCompute
# argument list continues outside this view. Code left byte-identical.
# NOTE(review): nn.CrossEntropyLoss(reduce=False) is deprecated in modern
# PyTorch; reduction='none' is the equivalent spelling.
start_token = n_vocab clf_token = n_vocab + 1 n_special = 2 max_len = 140 n_ctx = max_len + 2 vocab = n_vocab + n_special + n_ctx trX, trM = transform_tweet(trX1) n_train = len(trY) n_batch_train = args.n_batch * max(n_gpu, 1) n_updates_total = (n_train // n_batch_train) * args.n_iter print("updates total", n_updates_total) dh_model = DoubleHeadModel(args, clf_token, ('classification', 2), vocab, n_ctx) criterion = nn.CrossEntropyLoss(reduce=False) model_opt = OpenAIAdam(dh_model.parameters(), lr=args.lr, schedule=args.lr_schedule, warmup=args.lr_warmup, t_total=n_updates_total, b1=args.b1, b2=args.b2, e=args.e, l2=args.l2, vector_l2=args.vector_l2, max_grad_norm=args.max_grad_norm) compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion,
# NOTE(review): this chunk begins mid-statement — the head of the `meta`
# dict literal (model cfg) is outside this view, and the chunk ends at a
# dangling `else:` branch. Code left byte-identical.
# Builds DoubleHeadModel from meta['dh_model'] and, when --snapshot_dir is
# given, moves to device, wraps in DataParallel, and restores weights from
# '<snapshot_dir>/best_params'; with snapshot_mode == 'transformer_only'
# only snapshot tensors whose key does not contain 'task_head' are copied
# in, so the freshly initialized task head is kept.
attn_pdrop=args.attn_pdrop, resid_pdrop=args.resid_pdrop, afn=args.afn, clf_pdrop=args.clf_pdrop, skip_connections=args.skip_connections, )), clf_token=clf_token, task_head_type=['classification', n_class], vocab=vocab, n_ctx=n_ctx, ), encoder=dict(max_len=max_len, ), ) print(meta) dh_model = DoubleHeadModel(**meta['dh_model']) if args.snapshot_dir is not None: dh_model.to(device) dh_model = nn.DataParallel(dh_model) print("Loading snapshot...") snapshot_dict = torch.load( os.path.join(args.snapshot_dir, 'best_params')) if args.snapshot_mode == 'transformer_only': model_dict = dh_model.state_dict() model_dict.update({ k: v for k, v in snapshot_dict.items() if 'task_head' not in k }) snapshot_dict = model_dict dh_model.load_state_dict(snapshot_dict) else:
# Order sequences from shortest to longest.
seq = sorted(seq, key=len)

# --- Model setup ---
# Append the special tokens after the regular vocabulary; each gets the
# next free id in order.
for special_token in ('_start_', '_delimiter_', '_classify_'):
    encoder[special_token] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3

# Context window: longest sequence plus two wrapper tokens.
n_ctx = int(np.array([len(t) for t in seq]).max() + 2)
print(n_ctx)
vocab = int(n_vocab + n_special + n_ctx)

dh_model = DoubleHeadModel(args, clf_token, ('classification', 1), vocab, n_ctx)
load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)

if GPU:
    dh_model = dh_model.cuda()
if half:
    # Cast the whole model to half precision.
    dh_model = dh_model.half()
# NOTE(review): this chunk begins mid-loop — the enclosing loop over dev
# query indices (and the init of `t` / `tmp_length`) is outside this view —
# and it is cut off mid-call in load_openai_pretrained_model(...).
# Code left byte-identical.
# MS MARCO passage selection: caps n_ctx by the longest truncated
# query+passage pair (10 candidate passages per query, plus 3 special
# tokens), then builds DoubleHeadModel with the 'msmarco_para_select' head;
# LM criterion is unreduced CE, classification criterion is nn.KLDivLoss.
# NOTE(review): nn.CrossEntropyLoss(reduce=False) is deprecated in modern
# PyTorch; reduction='none' is the equivalent spelling.
x1 = dev_queries[i] for j in range(10): x2 = dev_passages[j][i] tmp_length.append(len(x1[:q_max_len]) + len(x2[:p_max_len])) t.append(max(tmp_length)) n_ctx = min(max(t) + 3, n_ctx) print('n_ctx is: ', n_ctx) vocab = n_vocab + n_special + n_ctx n_train = len(train_queries) n_valid = len(dev_queries) n_batch_train = args.n_batch * max(n_gpu, 1) n_updates_total = (n_train // n_batch_train) * args.n_iter dh_model = DoubleHeadModel(args, clf_token, 'msmarco_para_select', vocab, n_ctx) criterion_lm = nn.CrossEntropyLoss(reduce=False) criterion_clf = nn.KLDivLoss() model_opt = OpenAIAdam(dh_model.parameters(), lr=args.lr, schedule=args.lr_schedule, warmup=args.lr_warmup, t_total=n_updates_total, b1=args.b1, b2=args.b2, e=args.e, l2=args.l2, vector_l2=args.vector_l2, max_grad_norm=args.max_grad_norm) load_openai_pretrained_model(dh_model.transformer,
def main():
    """Run classification inference with a fine-tuned DoubleHeadModel.

    Loads model metadata and weights from --model_dir, BPE-encodes the
    headerless TSV in --input_file, predicts a label per row, writes a
    text/label/prediction TSV to --output_file, and dumps the raw logits
    and their softmax probabilities alongside it. When --mc_dropout_iter
    is > 0, additionally runs that many Monte-Carlo-dropout inference
    passes and saves per-class output/probability matrices.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', required=True)
    parser.add_argument('-o', '--output_file', required=True)
    parser.add_argument('--n_batch', type=int, default=8)
    parser.add_argument('--skip_preprocess', action='store_true')
    parser.add_argument('--sentence_pair', action='store_true')
    parser.add_argument('--force_delimiter', action='store_true')
    parser.add_argument('--encoder_path', type=str,
                        default='model/encoder_bpe_40000.json')
    parser.add_argument('--bpe_path', type=str,
                        default='model/vocab_40000.bpe')
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--mc_dropout_iter', type=int, default=0)
    args = parser.parse_args()

    # Close the metadata file deterministically (the original leaked the
    # handle returned by the bare open()).
    with open(os.path.join(args.model_dir, 'meta.json'), 'r',
              encoding='utf8') as meta_file:
        meta = json.load(meta_file)

    # Rebuild the BPE vocabulary with the same special tokens that were
    # appended at training time.
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    if args.sentence_pair or args.force_delimiter:
        encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 2 + int('_delimiter_' in encoder)
    n_ctx = meta['dh_model']['n_ctx']
    max_len = meta['encoder']['max_len']
    if args.sentence_pair:
        # Two segments must share the context window with the specials.
        max_len = min(max_len, n_ctx // 2 - 2)

    texts, labels = load_headerless_tsv(args.input_file,
                                        sentence_pair=args.sentence_pair)
    ((X, Y),) = encode_dataset(*[(texts, labels)],
                               encoder=text_encoder,
                               skip_preprocess=args.skip_preprocess)
    X, M = transform_classification(X, max_len, encoder['_start_'],
                                    clf_token, n_vocab, n_special, n_ctx,
                                    encoder.get('_delimiter_'))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    n_batch_train = args.n_batch * max(n_gpu, 1)

    meta['dh_model']['cfg'] = dotdict(meta['dh_model']['cfg'])
    dh_model = DoubleHeadModel(**meta['dh_model'])
    dh_model.to(device)
    dh_model = torch.nn.DataParallel(dh_model)

    path = os.path.join(args.model_dir, 'best_params')
    # Remap CUDA-saved tensors onto the CPU when no GPU is available.
    if device.type == 'cpu':
        map_location = lambda storage, loc: storage
    else:
        map_location = None
    dh_model.load_state_dict(torch.load(path, map_location=map_location))

    prediction_output = predict(X=X,
                                submission_dir=None,
                                filename=None,
                                pred_fn=lambda x: x,  # keep raw logits
                                label_decoder=None,
                                dh_model=dh_model,
                                n_batch_train=n_batch_train,
                                device=device)
    predictions = np.argmax(prediction_output, axis=1)

    # Sentence-pair inputs arrive as a (questions, passages) tuple.
    if isinstance(texts, tuple):
        df = pd.DataFrame({'question': texts[0], 'text': texts[1],
                           'label': labels, 'prediction': predictions})
    else:
        df = pd.DataFrame({'text': texts, 'label': labels,
                           'prediction': predictions})
    df.to_csv(args.output_file, index=False, sep='\t', header=False,
              columns=['text', 'label', 'prediction'], float_format='%.0f')

    accuracy = accuracy_score(Y, predictions) * 100.
    print('Accuracy: {}%'.format(accuracy))

    basename = os.path.splitext(args.output_file)[0]
    # NOTE(review): np.savetxt writes plain text despite the .npy suffix.
    prediction_output_file = basename + '_output.npy'
    np.savetxt(prediction_output_file, prediction_output)
    prediction_probs = np_softmax(prediction_output)
    prediction_probs_file = basename + '_probs.npy'
    np.savetxt(prediction_probs_file, prediction_probs)

    # Optional Monte-Carlo dropout: repeat inference with dropout enabled
    # to sample the predictive distribution.
    mc_dropout_prediction_output = []
    for _ in tqdm(range(args.mc_dropout_iter)):
        prediction_output = predict(X=X,
                                    submission_dir=None,
                                    filename=None,
                                    pred_fn=lambda x: x,
                                    label_decoder=None,
                                    dh_model=dh_model,
                                    n_batch_train=n_batch_train,
                                    device=device,
                                    enable_dropout=True)
        mc_dropout_prediction_output.append(prediction_output)

    if mc_dropout_prediction_output:
        mc_dropout_prediction_output = np.asarray(mc_dropout_prediction_output)
        mc_dropout_prediction_probs = np.zeros(mc_dropout_prediction_output.shape)
        for i in range(mc_dropout_prediction_output.shape[0]):
            mc_dropout_prediction_probs[i, ...] = np_softmax(
                mc_dropout_prediction_output[i, ...])
        # Reorder (iteration, sample, class) -> (class, sample, iteration)
        # so each class index gets its own output/probability file.
        transpose_dims = (2, 1, 0)
        mc_dropout_prediction_output = mc_dropout_prediction_output.transpose(transpose_dims)
        mc_dropout_prediction_probs = mc_dropout_prediction_probs.transpose(transpose_dims)
        for i in range(mc_dropout_prediction_output.shape[0]):
            prediction_output_file = '{}_class{}_{}'.format(basename, i, 'output.npy')
            np.savetxt(prediction_output_file, mc_dropout_prediction_output[i, ...])
            prediction_probs_file = '{}_class{}_{}'.format(basename, i, 'probs.npy')
            np.savetxt(prediction_probs_file, mc_dropout_prediction_probs[i, ...])