def pretrain(args, config, X_all, lead_time_idx, input_dim=None, onehot_dim=None):
    model = ModelWrapper(args, config, args.use_onehot, input_dim, onehot_dim, lead_time_idx)
    model.train((lead_time_idx, X_all))
    return model
def file_infer_test(model_file):
    # fit_gen_test = FitGeneratorWrapper(type=conf.type, file_vocab=conf.VOCAB_FILE, file_corpus=conf.TEST_FILE,
    #                                    batch_size=conf.batch_size, max_len=conf.maxlen, vector_dim=conf.vector_dim,
    #                                    pretrain_vocab=conf.pretrain_vocab, label_dict_file=conf.LABEL_DICT_FILE)
    path_data = '../data/new_all.csv.test'
    list_data = []
    with open(path_data, 'r') as f:
        for i in f.read().splitlines():
            list_data.append(i.split('\t'))
    fit_gen_train = FitGeneratorWrapper(type=conf.type, file_vocab=conf.VOCAB_FILE, file_corpus=conf.TRAIN_FILE,
                                        batch_size=conf.batch_size, max_len=conf.maxlen, vector_dim=conf.vector_dim,
                                        ner_vocab=conf.pretrain_vocab, label_dict_file=conf.LABEL_DICT_FILE)
    vocab_size_train = fit_gen_train.get_vocab_size()
    sentences_train, labels_train = fit_gen_train.read_raw_corpus(file_corpus=conf.TRAIN_FILE)
    sentences_test, labels_test = fit_gen_train.read_raw_corpus(file_corpus=conf.TEST_FILE)
    model = ModelWrapper.model(conf, train=False, vocab_size=vocab_size_train, labels_num=0)
    model.load_weights(model_file, by_name=True)
    model.summary()
    vectors_train = __predict_vectors(model, sentences_train, conf.vector_dim)
    vectors_test = __predict_vectors(model, sentences_test, conf.vector_dim)
    dic_all, labels_list_after_set = gauss_change_data(vectors_train, labels_train)
    models = {}
    n_components = 3
    model_dir = "/Users/chenhengxi/PycharmProjects/work2/sentence-encoding-qa/data/model"
    for domain in labels_list_after_set:
        modelx = GaussianMixture(n_components=n_components, covariance_type='diag', reg_covar=0.0001,
                                 max_iter=200, verbose=0, verbose_interval=1)
        data = np.array(dic_all[domain])
        modelx.fit(data)
        models[domain] = modelx
        joblib.dump(modelx, "{0}/{1}.joblib".format(model_dir, domain))
    final_dic = {}
    final_num = 0
    error = []
    for i in range(len(vectors_test)):
        print(i)
        accept_scores = {}
        for domain in labels_list_after_set:
            models[domain] = joblib.load("{0}/{1}.joblib".format(model_dir, domain))
            a = np.squeeze(vectors_test[i])
            # vectors_test[i] = a.reshape(-1, 1)
            point_array = models[domain].score_samples(a.reshape(1, conf.vector_dim))
            point = point_array[0]
            accept_scores[str(point)] = domain
        list_to_max = []
        for num in accept_scores:
            list_to_max.append(float(num))
        max_num = max(list_to_max)
        label_final = accept_scores[str(max_num)]
        final_dic[str(vectors_test[i])] = label_final
        if list_data[i][1] != label_final:
            final_num += 1
            error.append([list_data[i][0], list_data[i][1], label_final])
    print((1 - final_num / len(vectors_test)))
    print(error)
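The loop above amounts to: fit one GaussianMixture per domain on that domain's sentence vectors, then assign each test vector to the domain whose mixture gives it the highest log-likelihood. A minimal sketch of that idea, with illustrative names (fit_domain_gmms, classify_vector, vectors_by_domain) that are not part of the source code:

import numpy as np
from sklearn.mixture import GaussianMixture

def fit_domain_gmms(vectors_by_domain, n_components=3):
    """Fit one diagonal-covariance GMM per domain on its training vectors."""
    return {
        domain: GaussianMixture(n_components=n_components,
                                covariance_type='diag',
                                reg_covar=1e-4,
                                max_iter=200).fit(np.asarray(vectors))
        for domain, vectors in vectors_by_domain.items()
    }

def classify_vector(gmms, vector):
    """Return the domain whose GMM assigns the highest log-likelihood."""
    v = np.asarray(vector).reshape(1, -1)
    return max(gmms, key=lambda domain: gmms[domain].score_samples(v)[0])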
def train(args, config, X_tra, X_val, input_dim=None, onehot_dim=None):
    if args.train_task == 'cancel':
        grader = Grader(X_val)
        model = CancelModel(args.can_model, config, args.filter_all, args.use_onehot)
        if args.train_all:
            X_tra = X_tra + X_val
        model.train(X_tra)
        # cancel error rate, a.k.a. CER
        cer = grader.eval_cancel_error_rate(model, IsCancelModel=True)
        return model, cer
    elif args.train_task == 'adr' or args.train_task == 'revenue':
        grader = Grader(X_val)
        model = ModelWrapper(args, config, args.use_onehot, args.filter_all, input_dim, onehot_dim)
        if args.use_pretrain:
            pretrain_model = ModelWrapper(args, config, args.filter_all, args.use_onehot, input_dim, onehot_dim)
            pretrain_model.load('trained_models/pretrain.pkl')
            model.model.model = pretrain_model.model.model
        if args.train_all:
            X_tra = X_tra + X_val
        if args.verbose:
            model.train(X_tra, grader)
        else:
            model.train(X_tra)
        # revenue MAE, a.k.a. REV
        rev = grader.eval_revenue(model)
        mae = grader.eval_mae(model)
        return model, rev, mae
    elif args.train_task == 'label':
        grader = Grader(X_val)
        model = ModelWrapper(args, config, args.use_onehot, args.filter_all, input_dim, onehot_dim)
        if args.verbose:
            model.train(X_tra, grader)
        else:
            model.train(X_tra)
        rev = grader.eval_revenue(model)
        mae = grader.eval_mae(model)
        return model, rev, mae
def file_infer_test(self, model_file, values_file=None):
    sentences, labels = self._prepare_test_data()
    print('***************************build model***************************')
    model = ModelWrapper.model(self.conf,
                               train=False,
                               vocab_size=self.vocab_size,
                               labels_num=self.labels_num)
    model.summary()
    self._load_model(model_file, model)
    print('***************************infer test***************************')
    indexs, values, all_values, indexs2, values2 = self.do_predict(
        model,
        sentences,
        vector_dim=self.conf.vector_dim,
        header=self.conf.predict_header,
        type=self.conf.type)
    correct_num = 0
    for i in range(len(indexs)):
        if indexs[i] != labels[i]:
            labels[i] = 0
        else:
            labels[i] = 1
            correct_num += 1
    print("validation set precision {}, error number {}".format(
        correct_num / len(labels), len(labels) - correct_num))
    tpr, fpr, accuracy, best_thresholds = evaluate_best_threshold_value(
        values, labels, nrof_folds=10)
    tpr = np.mean(tpr)
    fpr = np.mean(fpr)
    accuracy = np.mean(accuracy)
    best_thresholds = np.mean(best_thresholds)
    print("cosine: tpr (positive recall, tp/(tp+fn))={} "
          "fpr (negative error rate, fp/(fp+tn))={} acc={} threshold={}".format(
              tpr, fpr, accuracy, best_thresholds))
def file_infer_test(self, model_file, values_file=None):
    sentences, labels = self._prepare_test_data()
    print('***************************build model***************************')
    model = ModelWrapper.model(self.conf,
                               train=False,
                               vocab_size=self.vocab_size,
                               labels_num=1)
    model.summary()
    self._load_model(model_file, model)
    print('***************************infer test***************************')
    # if os.path.exists(values_file):
    #     print("load cache file " + values_file)
    #     values = np.load(values_file)
    # else:
    values = self.do_predict(model,
                             sentences,
                             vector_dim=self.conf.vector_dim,
                             header=self.conf.predict_header,
                             type=self.conf.type)
    print("save cache file " + values_file)
    np.save(values_file, values)
    tpr, fpr, accuracy, best_thresholds = evaluate_best_threshold_value(
        values, labels, nrof_folds=10)
    tpr = np.mean(tpr)
    fpr = np.mean(fpr)
    accuracy = np.mean(accuracy)
    best_thresholds = np.mean(best_thresholds)
    print("cosine: tpr (positive recall, tp/(tp+fn))={} "
          "fpr (negative error rate, fp/(fp+tn))={} acc={} threshold={}".format(
              tpr, fpr, accuracy, best_thresholds))
    return best_thresholds
print('bert_sim max min: ', max(bert_sim), min(bert_sim))
fig = plt.figure()
cset = plt.scatter(output[:, 0], output[:, 1], s=8, c=bert_sim, cmap='PuBu')
plt.plot(output[opt['key_id'], 0], output[opt['key_id'], 1], 'r.')
plt.colorbar(cset)
plt.savefig(
    os.path.join(opt['model_save_dir'], 'plot',
                 '%d_bert_sim' % opt['key_id']))
fig = plt.figure()
plt.hist(bert_sim, bins=100)
plt.savefig(
    os.path.join(opt['model_save_dir'], 'plot',
                 '%d_bert_hist' % opt['key_id']))
model = ModelWrapper(opt, weibo2embid, eva=True)
model.load(os.path.join(opt['model_save_dir'], 'best_model.pt'))
clash_delta = []
for i in range(len(emb_matrix)):
    emb1 = torch.tensor([i]).cuda()
    emb2 = torch.tensor([opt['key_id']]).cuda()
    clash_delta.append(model.model.get_delta(emb1, emb2).item())
ceil = max(max(clash_delta), min(clash_delta) * (-1)) * 1.05
print(max(clash_delta), min(clash_delta))
fig = plt.figure()
cset = plt.scatter(output[:, 0], output[:, 1], s=8, c=clash_delta,
pretraining_task_config["batch_size"] = args.batch_size
if args.num_batch is not None:
    pretraining_task_config["validate_batches_per_epoch"] = args.num_batch

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")
print(
    json.dumps([model_config, pretraining_task_config, dataset_config],
               indent=4))

########################### Loading Model ###########################

data = DatasetProcessor(dataset_root_folder, dataset_config, train=False)

model = ModelWrapper(model_config)
print(model)

model = model.cuda()
model = nn.DataParallel(model, device_ids=device_ids)

pretrain_data = BertPreTrainDatasetWrapper(data)
pretrain_dataloader = torch.utils.data.DataLoader(
    pretrain_data,
    batch_size=pretraining_task_config["batch_size"],
    pin_memory=True)
pretrain_dataloader_iter = iter(pretrain_dataloader)

########################### Validate Model ###########################
                         opt['batch_size'],
                         opt,
                         weibo2embid=weibo2embid,
                         evaluation=True)
# test_batch = DataLoader(os.path.join(opt['data_dir'], 'test.csv'),
#                         opt['batch_size'],
#                         opt,
#                         weibo2embid=weibo2embid,
#                         evaluation=True)
if not os.path.exists(opt['model_save_dir']):
    os.mkdir(opt['model_save_dir'])
weibo2embid = train_batch.weibo2embid
model = ModelWrapper(opt, weibo2embid, train_batch.retw_prob)
global_step = 0
# current_lr = opt['lr']
max_steps = len(train_batch) * opt['num_epoch']
global_start_time = time.time()
format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
dev_f1_history = []
for epoch in range(1, opt['num_epoch'] + 1):
    train_loss = 0
    current_lr = model.optimizer.param_groups[0]['lr']
    for i, batch in enumerate(train_batch):
import json
import os

from model import ModelWrapper

BUCKET_NAME = os.environ["BUCKET_NAME"]

# load outside of handler for warm start
model_wrapper = ModelWrapper(bucket_name=BUCKET_NAME)
model_wrapper.load_model()


def handler(event, context):
    print("Event received:", event)
    data = event["body"]
    if isinstance(data, str):
        data = json.loads(data)
    print("Data received:", data)

    label, proba = model_wrapper.predict(data=data)

    body = {
        "prediction": {
            "label": label,
            "probability": round(proba, 4),
        },
    }

    response = {
        "statusCode": 200,
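A minimal local smoke test for the handler above; the event mirrors an API-Gateway-style payload, and the feature fields in the body are hypothetical placeholders, not taken from the source:

import json

if __name__ == "__main__":
    # "body" may arrive as a JSON string; the handler parses it itself.
    sample_event = {"body": json.dumps({"feature_a": 1.0, "feature_b": "x"})}
    # The handler never touches context, so None is enough for a local call.
    print(handler(sample_event, None))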
def __init__(self):
    self.mw = ModelWrapper.load()
with open(opt['idx_dict'], 'rb') as fin:
    weibo2embid = pickle.load(fin)

# train_batch = DataLoader(os.path.join(opt['data_dir'], 'train.csv'),
#                          opt['batch_size'],
#                          opt,
#                          weibo2embid=weibo2embid,
#                          evaluation=False)
dev_batch = DataLoader(os.path.join(opt['data_dir'], 'dev.csv'),
                       opt['batch_size'],
                       opt,
                       weibo2embid=weibo2embid,
                       evaluation=True)

model = ModelWrapper(opt, weibo2embid, eva=True)
model.load(os.path.join(opt['model_save_dir'], 'best_model.pt'))

all_probs = []
all_preds = []
for i, b in enumerate(dev_batch):
    preds, probs, _ = model.predict(b, thres=0.5)
    all_probs += probs
    all_preds += preds

acc, prec, rec, dev_f1 = metrics(dev_batch.gold(), all_preds)
print('acc: {}, prec: {}, rec: {}, f1: {}\n'.format(acc, prec, rec, dev_f1))
auc, prec, rec, f1, best_thres = tune_thres_new(dev_batch.gold(), all_probs)
print('auc: {}, prec: {}, rec: {}, f1: {}, best_thres: {}'.format(
    auc, prec, rec, f1, best_thres))
dataset_root_folder = os.path.join(curr_path, "datasets", "ALBERT-pretrain")

if not os.path.exists(checkpoint_dir):
    os.mkdir(checkpoint_dir)
checkpoint_path = utils.get_last_checkpoint(checkpoint_dir)

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")
print(json.dumps([model_config, pretraining_task_config, dataset_config],
                 indent=4))

########################### Loading Model ###########################

data = DatasetProcessor(dataset_root_folder, dataset_config)

model = ModelWrapper(model_config)
print(model)

num_parameter = 0
for weight in model.parameters():
    print(weight.size())
    size = 1
    for d in weight.size():
        size *= d
    num_parameter += size
print(f"num_parameter: {num_parameter}")

model = model.cuda()
model = nn.DataParallel(model, device_ids=device_ids)

if "from_cp" in config and checkpoint_path is None:
def find_best_threshold_for_second(self, model_file):
    sentences, labels = self._prepare_test_data()
    print('***************************build model***************************')
    model = ModelWrapper.model(self.conf,
                               train=False,
                               vocab_size=self.vocab_size,
                               labels_num=self.labels_num)
    model.summary()
    self._load_model(model_file, model)
    print('***************************infer test***************************')
    indexs, values, all_values, indexs2, values2 = self.do_predict(
        model,
        sentences,
        vector_dim=self.conf.vector_dim,
        header=self.conf.predict_header,
        type=self.conf.type)
    ground_truth = labels.copy()
    correct_num = 0
    err_min_second = 1.0      # smallest second-best score among the wrong cases
    err_max_gap = 0.0         # largest top-1/top-2 gap among the wrong cases
    correct_max_second = 0.0  # largest second-best score among the correct cases
    correct_min_gap = 1.0     # smallest top-1/top-2 gap among the correct cases
    for i in range(len(indexs)):
        if indexs[i] != labels[i]:
            labels[i] = 0
            if err_min_second > values2[i]:
                err_min_second = values2[i]
            if err_max_gap < values[i] - values2[i]:
                err_max_gap = values[i] - values2[i]
        else:
            labels[i] = 1
            correct_num += 1
            if correct_max_second < values2[i]:
                correct_max_second = values2[i]
            if correct_min_gap > values[i] - values2[i]:
                correct_min_gap = values[i] - values2[i]
    print("validation set precision {}, error number {}".format(
        correct_num / len(labels), len(labels) - correct_num))
    print("err_min_second {}, err_max_gap {}".format(
        err_min_second, err_max_gap))
    print("correct_max_second {}, correct_min_gap {}".format(
        correct_max_second, correct_min_gap))
    best_gap, best_gap_rate = self.evaluate_gaps(ground_truth, indexs, values,
                                                 indexs2, values2,
                                                 correct_min_gap, err_max_gap)
    print("best_gap {}, best_gap_rate {}".format(best_gap, best_gap_rate))
    best_threshold, best_threshold_rate = self.evaluate_second_thresholds(
        ground_truth, indexs, values2, err_min_second, correct_max_second)
    print("best_threshold {}, best_threshold_rate {}".format(
        best_threshold, best_threshold_rate))
from model import ModelWrapper

model_wrapper = ModelWrapper()
model_wrapper.train()
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
# model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
model = ModelWrapper(args.model,
                     ntokens,
                     args.emsize,
                     args.nhid,
                     args.nlayers,
                     args.dropout,
                     args.tied,
                     channel=args.channel,
                     pooling=args.pooling)
if args.cuda:
    model.cuda()

if not args.noglove:
    emb_torch = 'sst_embed1.pth'
    emb_torch2 = 'sst_embed2.pth'
    emb_vector_path = args.embedding_one
    emb_vector_path2 = args.embedding_two
    assert os.path.isfile(emb_vector_path + '.txt')
    # assert os.path.isfile(emb_vector_path2 + '.txt')
import json
import sys

from model import ModelWrapper

model_wrapper = ModelWrapper()
model_wrapper.load_model()

data = json.loads(sys.argv[1])
print(f"Data: {data}")

prediction = model_wrapper.predict(data=data)
print(f"Prediction: {prediction}")
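A hypothetical end-to-end exercise of the script above; the script name predict_cli.py and the payload fields are placeholders, not from the source. The single command-line argument is a JSON document that is handed straight to ModelWrapper.predict:

import json
import subprocess

# Serialize a sample payload and pass it as the script's only argument.
payload = json.dumps({"feature_a": 1.0, "feature_b": "x"})
subprocess.run(["python", "predict_cli.py", payload], check=True)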
print(
    json.dumps([model_config, downsteam_task_config, dataset_config],
               indent=4))

downsteam_metric = downsteam_task_config["metric"]
downsteam_task = downsteam_task_config["task"]

log_file_name = f"{args.checkpoint}-downsteam-{downsteam_task}.log".replace(" ", "_")
print(f"Log file: {log_file_name}", flush=True)
log_f = open(os.path.join(checkpoint_dir, log_file_name), "w")

########################### Load Model ###########################

data = DatasetProcessor(dataset_root_folder, dataset_config)

model = ModelWrapper(model_config)

checkpoint = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
print("Model restored", checkpoint_path)

model.init_sen_class()

model = model.cuda()
model = nn.DataParallel(model, device_ids=device_ids)

optimizer = torch.optim.AdamW(model.parameters(),
                              lr=downsteam_task_config["learning_rate"],
                              betas=(0.9, 0.999),
                              eps=1e-6,
def loadModel():
    model = ModelWrapper()
    result = model.loadModel()
    return model