Example #1
def pretrain(args,
             config,
             X_all,
             lead_time_idx,
             input_dim=None,
             onehot_dim=None):
    model = ModelWrapper(args, config, args.use_onehot, input_dim, onehot_dim,
                         lead_time_idx)
    model.train((lead_time_idx, X_all))
    return model
Example #2
def file_infer_test(model_file):
    # fit_gen_test = FitGeneratorWrapper(type=conf.type, file_vocab=conf.VOCAB_FILE, file_corpus=conf.TEST_FILE,
    #                                    batch_size=conf.batch_size, max_len=conf.maxlen, vector_dim=conf.vector_dim,
    #                                    pretrain_vocab=conf.pretrain_vocab, label_dict_file=conf.LABEL_DICT_FILE)
    path_data = '../data/new_all.csv.test'

    list_data = []
    with open(path_data, 'r') as f:
        for i in f.read().splitlines():
            list_data.append(i.split('\t'))
    fit_gen_train = FitGeneratorWrapper(type=conf.type, file_vocab=conf.VOCAB_FILE, file_corpus=conf.TRAIN_FILE,
                                        batch_size=conf.batch_size, max_len=conf.maxlen, vector_dim=conf.vector_dim,
                                        ner_vocab=conf.pretrain_vocab, label_dict_file=conf.LABEL_DICT_FILE)
    vocab_size_train = fit_gen_train.get_vocab_size()
    sentences_train, labels_train = fit_gen_train.read_raw_corpus(file_corpus=conf.TRAIN_FILE)
    sentences_test, labels_test = fit_gen_train.read_raw_corpus(file_corpus=conf.TEST_FILE)

    model = ModelWrapper.model(conf, train=False, vocab_size=vocab_size_train, labels_num=0)
    model.load_weights(model_file, by_name=True)
    model.summary()
    vectors_train = __predict_vectors(model, sentences_train, conf.vector_dim)
    vectors_test = __predict_vectors(model, sentences_test, conf.vector_dim)

    dic_all, labels_list_after_set = gauss_change_data(vectors_train, labels_train)
    models = {}
    n_components = 3
    model_dir = "/Users/chenhengxi/PycharmProjects/work2/sentence-encoding-qa/data/model"
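    # fit one GaussianMixture per domain label on that domain's training vectors and persist it with joblib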
    for domain in labels_list_after_set:
        modelx = GaussianMixture(n_components=n_components, covariance_type='diag', reg_covar=0.0001,
                                 max_iter=200, verbose=0, verbose_interval=1)
        data = np.array(dic_all[domain])
        modelx.fit(data)
        models[domain] = modelx
        joblib.dump(modelx, "{0}/{1}.joblib".format(model_dir, domain))
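    # score each test vector under every per-domain GMM and predict the highest-scoring domain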
    final_dic = {}
    final_num = 0
    error = []
    for i in range(len(vectors_test)):
        print(i)
        accept_scores = {}
        for domain in labels_list_after_set:
            models[domain] = joblib.load("{0}/{1}.joblib".format(model_dir, domain))
            a = np.squeeze(vectors_test[i])
            # vectors_test[i] = a.reshape(-1, 1)
            point = models[domain].score_samples(a.reshape(1, conf.vector_dim))[0]
            accept_scores[domain] = point
        label_final = max(accept_scores, key=accept_scores.get)
        final_dic[str(vectors_test[i])] = label_final
        if list_data[i][1] != label_final:
            final_num += 1
            error.append([list_data[i][0], list_data[i][1], label_final])
    print(1 - final_num / len(vectors_test))
    print(error)
Example #3
def train(args, config, X_tra, X_val, input_dim=None, onehot_dim=None):
    if args.train_task == 'cancel':
        grader = Grader(X_val)
        model = CancelModel(args.can_model, config, args.filter_all,
                            args.use_onehot)
        if args.train_all:
            X_tra = X_tra + X_val

        model.train(X_tra)

        # cancel error rate, a.k.a. CER
        cer = grader.eval_cancel_error_rate(model, IsCancelModel=True)
        return model, cer

    elif args.train_task == 'adr' or args.train_task == 'revenue':
        grader = Grader(X_val)
        model = ModelWrapper(args, config, args.use_onehot, args.filter_all,
                             input_dim, onehot_dim)

        if args.use_pretrain:
            pretrain_model = ModelWrapper(args, config, args.filter_all,
                                          args.use_onehot, input_dim,
                                          onehot_dim)
            pretrain_model.load('trained_models/pretrain.pkl')
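            # reuse the pretrained network weights as the starting point for this model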
            model.model.model = pretrain_model.model.model

        if args.train_all:
            X_tra = X_tra + X_val

        if args.verbose:
            model.train(X_tra, grader)
        else:
            model.train(X_tra)

        # revenue MAE, a.k.a. REV
        rev = grader.eval_revenue(model)
        mae = grader.eval_mae(model)
        return model, rev, mae

    elif args.train_task == 'label':
        grader = Grader(X_val)
        model = ModelWrapper(args, config, args.use_onehot, args.filter_all,
                             input_dim, onehot_dim)

        if args.verbose:
            model.train(X_tra, grader)
        else:
            model.train(X_tra)

        rev = grader.eval_revenue(model)
        mae = grader.eval_mae(model)
        return model, rev, mae
Example #4
    def file_infer_test(self, model_file, values_file=None):
        sentences, labels = self._prepare_test_data()

        print(
            '***************************build model***************************'
        )
        model = ModelWrapper.model(self.conf,
                                   train=False,
                                   vocab_size=self.vocab_size,
                                   labels_num=self.labels_num)
        model.summary()
        self._load_model(model_file, model)

        print(
            '***************************infer test***************************')
        indexs, values, all_values, indexs2, values2 = self.do_predict(
            model,
            sentences,
            vector_dim=self.conf.vector_dim,
            header=self.conf.predict_header,
            type=self.conf.type)

        correct_num = 0
        for i in range(len(indexs)):
            if indexs[i] != labels[i]:
                labels[i] = 0
            else:
                labels[i] = 1
                correct_num += 1

        print("validation set precision {}, error number {}".format(
            correct_num / len(labels),
            len(labels) - correct_num))

        tpr, fpr, accuracy, best_thresholds = evaluate_best_threshold_value(
            values, labels, nrof_folds=10)
        tpr = np.mean(tpr)
        fpr = np.mean(fpr)
        accuracy = np.mean(accuracy)
        best_thresholds = np.mean(best_thresholds)
        print(
            "cosine: tpr (recall of positives, tp/(tp+fn))={} fpr (false positive rate of negatives, fp/(fp+tn))={} acc={} threshold={}"
            .format(tpr, fpr, accuracy, best_thresholds))
Example #5
    def file_infer_test(self, model_file, values_file=None):
        sentences, labels = self._prepare_test_data()

        print(
            '***************************build model***************************'
        )
        model = ModelWrapper.model(self.conf,
                                   train=False,
                                   vocab_size=self.vocab_size,
                                   labels_num=1)
        model.summary()
        self._load_model(model_file, model)

        print(
            '***************************infer test***************************')
        # if os.path.exists(values_file):
        #     print("load cache file " + values_file)
        #     values = np.load(values_file)
        # else:
        values = self.do_predict(model,
                                 sentences,
                                 vector_dim=self.conf.vector_dim,
                                 header=self.conf.predict_header,
                                 type=self.conf.type)
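        # cache the predicted values so later runs can reuse them (assumes values_file is provided)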
        print("save cache file " + values_file)
        np.save(values_file, values)

        tpr, fpr, accuracy, best_thresholds = evaluate_best_threshold_value(
            values, labels, nrof_folds=10)
        tpr = np.mean(tpr)
        fpr = np.mean(fpr)
        accuracy = np.mean(accuracy)
        best_thresholds = np.mean(best_thresholds)
        print(
            "cosine: tpr (recall of positives, tp/(tp+fn))={} fpr (false positive rate of negatives, fp/(fp+tn))={} acc={} threshold={}"
            .format(tpr, fpr, accuracy, best_thresholds))
        return best_thresholds
Example #6
print('bert_sim max min: ', max(bert_sim), min(bert_sim))

fig = plt.figure()
cset = plt.scatter(output[:, 0], output[:, 1], s=8, c=bert_sim, cmap='PuBu')
plt.plot(output[opt['key_id'], 0], output[opt['key_id'], 1], 'r.')
plt.colorbar(cset)
plt.savefig(
    os.path.join(opt['model_save_dir'], 'plot', '%d_bert_sim' % opt['key_id']))

fig = plt.figure()
plt.hist(bert_sim, bins=100)
plt.savefig(
    os.path.join(opt['model_save_dir'], 'plot',
                 '%d_bert_hist' % opt['key_id']))

model = ModelWrapper(opt, weibo2embid, eva=True)
model.load(os.path.join(opt['model_save_dir'], 'best_model.pt'))

clash_delta = []
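# delta score between each embedding id and the key embedding id, as computed by the trained model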
for i in range(len(emb_matrix)):
    emb1 = torch.tensor([i]).cuda()
    emb2 = torch.tensor([opt['key_id']]).cuda()
    clash_delta.append(model.model.get_delta(emb1, emb2).item())

ceil = max(max(clash_delta), min(clash_delta) * (-1)) * 1.05
print(max(clash_delta), min(clash_delta))
fig = plt.figure()
cset = plt.scatter(output[:, 0],
                   output[:, 1],
                   s=8,
                   c=clash_delta,
Example #7
    pretraining_task_config["batch_size"] = args.batch_size

if args.num_batch is not None:
    pretraining_task_config["validate_batches_per_epoch"] = args.num_batch

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")

print(
    json.dumps([model_config, pretraining_task_config, dataset_config],
               indent=4))

########################### Loading Model ###########################

data = DatasetProcessor(dataset_root_folder, dataset_config, train=False)
model = ModelWrapper(model_config)
print(model)

model = model.cuda()
model = nn.DataParallel(model, device_ids=device_ids)

pretrain_data = BertPreTrainDatasetWrapper(data)
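# dataloader over the wrapped pretraining data; pinned memory speeds up host-to-GPU transfer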
pretrain_dataloader = torch.utils.data.DataLoader(
    pretrain_data,
    batch_size=pretraining_task_config["batch_size"],
    pin_memory=True)

pretrain_dataloader_iter = iter(pretrain_dataloader)

########################### Validate Model ###########################
Example #8
                       opt['batch_size'],
                       opt,
                       weibo2embid=weibo2embid,
                       evaluation=True)
# test_batch = DataLoader(os.path.join(opt['data_dir'], 'test.csv'),
#                    opt['batch_size'],
#                    opt,
#                    weibo2embid=weibo2embid,
#                    evaluation=True)

if not os.path.exists(opt['model_save_dir']):
    os.mkdir(opt['model_save_dir'])

weibo2embid = train_batch.weibo2embid

model = ModelWrapper(opt, weibo2embid, train_batch.retw_prob)

global_step = 0
# current_lr = opt['lr']
max_steps = len(train_batch) * opt['num_epoch']

global_start_time = time.time()
format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

dev_f1_history = []

for epoch in range(1, opt['num_epoch'] + 1):
    train_loss = 0
    current_lr = model.optimizer.param_groups[0]['lr']

    for i, batch in enumerate(train_batch):
Example #9
import json
import os

from model import ModelWrapper

BUCKET_NAME = os.environ["BUCKET_NAME"]

# load outside of handler for warm start
model_wrapper = ModelWrapper(bucket_name=BUCKET_NAME)
model_wrapper.load_model()


def handler(event, context):
    print("Event received:", event)

    data = event["body"]
    if isinstance(data, str):
        data = json.loads(data)
    print("Data received:", data)

    label, proba = model_wrapper.predict(data=data)

    body = {
        "prediction": {
            "label": label,
            "probability": round(proba, 4),
        },
    }

    response = {
        "statusCode": 200,
        "body": json.dumps(body),  # assumed completion: serialize the prediction payload
    }

    return response
Example #10
    def __init__(self):
        self.mw = ModelWrapper.load()
Example #11
with open(opt['idx_dict'], 'rb') as fin:
    weibo2embid = pickle.load(fin)

# train_batch = DataLoader(os.path.join(opt['data_dir'], 'train.csv'),
#                    opt['batch_size'],
#                    opt,
#                    weibo2embid=weibo2embid,
#                    evaluation=False)
dev_batch = DataLoader(os.path.join(opt['data_dir'], 'dev.csv'),
                       opt['batch_size'],
                       opt,
                       weibo2embid=weibo2embid,
                       evaluation=True)

model = ModelWrapper(opt, weibo2embid, eva=True)
model.load(os.path.join(opt['model_save_dir'], 'best_model.pt'))

all_probs = []
all_preds = []
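# predict on the dev set at a fixed 0.5 threshold, then tune the decision threshold on the collected probabilities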
for i, b in enumerate(dev_batch):
    preds, probs, _ = model.predict(b, thres=0.5)
    all_probs += probs
    all_preds += preds

acc, prec, rec, dev_f1 = metrics(dev_batch.gold(), all_preds)
print('acc: {}, prec: {}, rec: {}, f1: {}\n'.format(acc, prec, rec, dev_f1))

auc, prec, rec, f1, best_thres = tune_thres_new(dev_batch.gold(), all_probs)
print('auc: {}, prec: {}, rec: {}, f1: {}, best_thres: {}'.format(auc, prec, rec, f1, best_thres))
Example #12
dataset_root_folder = os.path.join(curr_path, "datasets", "ALBERT-pretrain")

if not os.path.exists(checkpoint_dir):
    os.mkdir(checkpoint_dir)

checkpoint_path = utils.get_last_checkpoint(checkpoint_dir)

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")

print(json.dumps([model_config, pretraining_task_config, dataset_config], indent=4))

########################### Loading Model ###########################

data = DatasetProcessor(dataset_root_folder, dataset_config)
model = ModelWrapper(model_config)
print(model)

num_parameter = 0
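# count parameters by multiplying the dimensions of each weight tensor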
for weight in model.parameters():
    print(weight.size())
    size = 1
    for d in weight.size():
        size *= d
    num_parameter += size
print(f"num_parameter: {num_parameter}")

model = model.cuda()
model = nn.DataParallel(model, device_ids=device_ids)

if "from_cp" in config and checkpoint_path is None:
Example #13
    def find_best_threshold_for_second(self, model_file):
        sentences, labels = self._prepare_test_data()

        print(
            '***************************build model***************************'
        )
        model = ModelWrapper.model(self.conf,
                                   train=False,
                                   vocab_size=self.vocab_size,
                                   labels_num=self.labels_num)
        model.summary()
        self._load_model(model_file, model)

        print(
            '***************************infer test***************************')
        indexs, values, all_values, indexs2, values2 = self.do_predict(
            model,
            sentences,
            vector_dim=self.conf.vector_dim,
            header=self.conf.predict_header,
            type=self.conf.type)

        ground_truth = labels.copy()
        correct_num = 0
        err_min_second = 1.0  # smallest second-best score among the wrongly predicted cases
        err_max_gap = 0.0  # largest gap between the top score and the second score among the wrongly predicted cases
        correct_max_second = 0.0  # largest second-best score among the correctly predicted cases
        correct_min_gap = 1.0  # smallest gap between the top score and the second score among the correctly predicted cases
        for i in range(len(indexs)):
            if indexs[i] != labels[i]:
                labels[i] = 0
                if err_min_second > values2[i]:
                    err_min_second = values2[i]
                if err_max_gap < values[i] - values2[i]:
                    err_max_gap = values[i] - values2[i]
            else:
                labels[i] = 1
                correct_num += 1
                if correct_max_second < values2[i]:
                    correct_max_second = values2[i]
                if correct_min_gap > values[i] - values2[i]:
                    correct_min_gap = values[i] - values2[i]

        print("validation set precision {}, error number {}".format(
            correct_num / len(labels),
            len(labels) - correct_num))
        print("err_min_second {}, err_max_gap {}".format(
            err_min_second, err_max_gap))
        print("correct_max_second {}, correct_min_gap {}".format(
            correct_max_second, correct_min_gap))

        best_gap, best_gap_rate = self.evaluate_gaps(ground_truth, indexs,
                                                     values, indexs2, values2,
                                                     correct_min_gap,
                                                     err_max_gap)
        print("best_gap {}, best_gap_rate {}".format(best_gap, best_gap_rate))

        best_threshold, best_threshold_rate = self.evaluate_second_thresholds(
            ground_truth, indexs, values2, err_min_second, correct_max_second)
        print("best_threshold {}, best_threshold_rate {}".format(
            best_threshold, best_threshold_rate))
Example #14
from model import ModelWrapper


model_wrapper = ModelWrapper()
model_wrapper.train()
Example #15
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
# model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
model = ModelWrapper(args.model,
                     ntokens,
                     args.emsize,
                     args.nhid,
                     args.nlayers,
                     args.dropout,
                     args.tied,
                     channel=args.channel,
                     pooling=args.pooling)

if args.cuda:
    model.cuda()

if not args.noglove:
    emb_torch = 'sst_embed1.pth'
    emb_torch2 = 'sst_embed2.pth'
    emb_vector_path = args.embedding_one
    emb_vector_path2 = args.embedding_two
    assert os.path.isfile(emb_vector_path + '.txt')
    # assert os.path.isfile(emb_vector_path2 + '.txt')
Example #16
import json
import sys

from model import ModelWrapper


model_wrapper = ModelWrapper()
model_wrapper.load_model()

data = json.loads(sys.argv[1])
print(f"Data: {data}")

prediction = model_wrapper.predict(data=data)
print(f"Prediction: {prediction}")
Example #17
print(
    json.dumps([model_config, downsteam_task_config, dataset_config],
               indent=4))

downsteam_metric = downsteam_task_config["metric"]
downsteam_task = downsteam_task_config["task"]

log_file_name = f"{args.checkpoint}-downsteam-{downsteam_task}.log".replace(
    " ", "_")
print(f"Log file: {log_file_name}", flush=True)
log_f = open(os.path.join(checkpoint_dir, log_file_name), "w")

########################### Load Model ###########################

data = DatasetProcessor(dataset_root_folder, dataset_config)
model = ModelWrapper(model_config)

checkpoint = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
print("Model restored", checkpoint_path)

model.init_sen_class()

model = model.cuda()

model = nn.DataParallel(model, device_ids=device_ids)

optimizer = torch.optim.AdamW(model.parameters(),
                              lr=downsteam_task_config["learning_rate"],
                              betas=(0.9, 0.999),
                              eps=1e-6,
Example #18
def loadModel():
	model = ModelWrapper()
	result = model.loadModel()
	return model