def init_db():
    import models
    import helpers

    engine, db_session = db_connect()
    models.Base.metadata.create_all(bind=engine)

    tag_highlight = models.Tag()
    tag_highlight.name = "Tag 1"
    tag_highlight.color = "#00aabb"
    db_session.add(tag_highlight)

    tag_exclude = models.Tag()
    tag_exclude.name = "Tag 2"
    tag_exclude.color = "#bc5a45"
    db_session.add(tag_exclude)

    config = models.Config()
    config.key = "workdir"
    config.set_value(str(settings.work_dir))
    db_session.add(config)
    db_session.commit()

    user = models.User()
    user.username = "******"
    user.name = "local"
    user.set_password("!@#$%¨&¨BFGFGBFffglkdfk*")
    db_session.add(user)
    db_session.commit()
def main(config='config/finetune/agnews/train.json'):
    cfg = Config(**json.load(open(config, "r")))
    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    TaskDataset = data.get_class(cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file, do_lower_case=True)
    dataset = TaskDataset(cfg_data.data_file[cfg.mode], pipelines=[
        data.RemoveSymbols('\\'),
        data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        data.AddSpecialTokensWithTruncation(cfg_data.max_len),
        data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                           TaskDataset.labels, cfg_data.max_len)
    ], n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset, batch_size=cfg_optim.batch_size, shuffle=True)

    classifier = models.Classifier4Transformer(cfg_model, len(TaskDataset.labels))
    optimizer = optim.optim4GPU(cfg_optim, classifier)
    train_loop = trainer.TrainLoop(cfg_optim, classifier, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch, global_step):  # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  # .cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file, cfg.pretrain_file)
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        results = train_loop.eval(evaluate, cfg.model_file)
        total_accuracy = torch.cat(results).mean().item()
        print(f"Accuracy: {total_accuracy}")
def test_get_model_raises_MultipleMatches_when_several_patterns_match(self):
    model1 = models.Model(name='switch1', pattern='foo', oids={})
    model2 = models.Model(name='switch2', pattern='bar', oids={})
    config = models.Config(models=(model1, model2), default_oids={})
    with self.assertRaises(models.MultipleMatches):
        config.get_model('sysDescr includes both foo and bar patterns')
def get_user_config():
    """Returns the current user-defined configuration from the database."""
    config = models.Config.query.get(0)
    if config is None:
        config = models.Config()
        config.id = 0
        db.session.add(config)
    return config
def post(self, user):
    vc_sid = self.request.get('vc_sid')
    yahoo_sid = self.request.get('yahoo_sid')
    rakuten_aid = self.request.get('rakuten_aid')
    amazon_aid = self.request.get('amazon_aid')
    kumapon_aid = self.request.get('kumapon_aid')
    conf = models.Config(
        key_name=user['id'],
        vc_sid=vc_sid,
        yahoo_sid=yahoo_sid,
        rakuten_aid=rakuten_aid,
        amazon_aid=amazon_aid,
        kumapon_aid=kumapon_aid,
    )
    conf.put()
    return self.redirect(self.url_for('admin_conf'))
def main(config='config/finetune/agnews/train.json'):
    cfg = Config(**json.load(open(config, "r")))
    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    ### Prepare Dataset and Preprocessing ###
    TaskDataset = data.get_class(cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file, do_lower_case=True)
    dataset = TaskDataset(cfg_data.data_file[cfg.mode], pipelines=[
        data.RemoveSymbols('\\'),
        data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        data.AddSpecialTokensWithTruncation(cfg_data.max_len),
        data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                           TaskDataset.labels, cfg_data.max_len)
    ], n_data=None)
    tensors = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(tensors, batch_size=cfg_optim.batch_size, shuffle=False)

    ### Fetch Teacher's output and put it into the dataset ###
    def fetch_logits(model):
        def get_logits(model, batch):
            input_ids, segment_ids, input_mask, label_id = batch
            logits = model(input_ids, segment_ids, input_mask)
            return 0.0, logits

        train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, None, None, get_device())
        results = torch.cat(train_loop.eval(get_logits, cfg.model_file))
        return results

    if cfg.mode == "train":
        print("Fetching teacher's output...")
        teacher = models.Classifier4Transformer(cfg_model, len(TaskDataset.labels))
        teacher.load_state_dict(torch.load(cfg.model_file))  # use trained model
        with torch.no_grad():
            teacher_logits = fetch_logits(teacher)

        tensors = TensorDataset(teacher_logits, *dataset.get_tensors())  # To Tensors
        data_iter = DataLoader(tensors, batch_size=cfg_optim.batch_size, shuffle=False)

    ### Models ###
    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)
    train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch, global_step):  # make sure loss is a scalar tensor
        teacher_logits, input_ids, segment_ids, input_mask, label_id = batch
        T = 1.0
        logits = model(input_ids, segment_ids, input_mask)
        loss = 0.1 * nn.CrossEntropyLoss()(logits, label_id)
        loss += 0.9 * nn.KLDivLoss()(F.log_softmax(logits / T, dim=1),
                                     F.softmax(teacher_logits / T, dim=1))
        # loss = 0.9 * nn.MSELoss()(logits, teacher_logits)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  # .cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    if cfg.mode == "train":
        train_loop.train(get_loss, None, None)  # not use pretrain file
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        results = train_loop.eval(evaluate, cfg.model_file)
        total_accuracy = torch.cat(results).mean().item()
        print(f"Accuracy: {total_accuracy}")
def main(train_cfg='config/pretrain.json',
         model_cfg='config/bert_base.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         save_dir='../exp/bert/pretrain',
         log_dir='../exp/bert/pretrain/runs',
         max_len=512,
         max_pred=20,
         mask_prob=0.15):

    cfg = Config(**json.load(open(train_cfg, "r")))  # read from `train_cfg` (the original used the undefined name `config`)
    cfg_optim = train.Config(**json.load(open(cfg.cfg_optim, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))

    set_seeds(cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [Preprocess4Pretrain(max_pred,
                                    mask_prob,
                                    list(tokenizer.vocab.keys()),
                                    tokenizer.convert_tokens_to_ids,
                                    max_len)]
    data_iter = SentPairDataLoader(data_file,
                                   cfg.batch_size,
                                   tokenize,
                                   max_len,
                                   pipeline=pipeline)

    model = BertModel4Pretrain(cfg_model)  # parsed model config (the original passed `model_cfg`, a JSON path)
    criterion1 = nn.CrossEntropyLoss(reduction='none')
    criterion2 = nn.CrossEntropyLoss()

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, save_dir, get_device())

    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    def get_loss(model, batch, global_step):  # make sure loss is a tensor
        input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = batch
        logits_lm, logits_clsf = model(input_ids, segment_ids, input_mask, masked_pos)
        loss_lm = criterion1(logits_lm.transpose(1, 2), masked_ids)  # for masked LM
        loss_lm = (loss_lm * masked_weights.float()).mean()
        loss_clsf = criterion2(logits_clsf, is_next)  # for sentence classification
        writer.add_scalars('data/scalar_group',
                           {'loss_lm': loss_lm.item(),
                            'loss_clsf': loss_clsf.item(),
                            'loss_total': (loss_lm + loss_clsf).item(),
                            'lr': optimizer.get_lr()[0]},
                           global_step)
        return loss_lm + loss_clsf

    trainer.train(get_loss, model_file, None, data_parallel)
def main(config='config/blendcnn/mrpc/eval.json', args=None):
    cfg = Config(**json.load(open(config, "r")))
    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    TaskDataset = data.get_class(cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file, do_lower_case=True)
    dataset = TaskDataset(args.dataset_location, pipelines=[
        data.RemoveSymbols('\\'),
        data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        data.AddSpecialTokensWithTruncation(cfg_data.max_len),
        data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                           TaskDataset.labels, cfg_data.max_len)
    ], n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)
    train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch, global_step):  # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  # .cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    class Bert_DataLoader(object):
        def __init__(self, loader=None, model_type=None, device='cpu', batch_size=1):
            self.loader = loader
            self.model_type = model_type
            self.device = device
            self.batch_size = batch_size

        def __iter__(self):
            for batch in self.loader:
                batch = tuple(t.to(self.device) for t in batch)
                outputs = {
                    'output_all': (batch[0], batch[1], batch[2]),
                    'labels': batch[3]
                }
                yield outputs['output_all'], outputs['labels']

    def benchmark(model):
        total_samples = 0
        total_time = 0
        index = 0

        class RandomDataset(object):
            def __init__(self, size, shape):
                self.len = size
                self.input_ids = torch.randint(low=0, high=30522, size=(size, shape), dtype=torch.int64)
                self.segment_ids = torch.randint(low=0, high=1, size=(size, shape), dtype=torch.int64)
                self.input_mask = torch.randint(low=0, high=1, size=(size, shape), dtype=torch.int64)
                self.data = (self.input_ids, self.segment_ids, self.input_mask)

            def __getitem__(self, index):
                return (self.data[0][index], self.data[1][index], self.data[2][index])

            def __len__(self):
                return self.len

        rand_loader = DataLoader(dataset=RandomDataset(size=5000, shape=128),
                                 batch_size=args.batch_size, shuffle=True)

        for batch in rand_loader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        input_ids, segment_ids, input_mask = batch
                        _ = model(*batch)
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    input_ids, segment_ids, input_mask = batch
                    _ = model(*batch)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic

        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % throughput)
        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

    def eval_func(model):
        results = []  # prediction results
        total_samples = 0
        total_time = 0
        index = 0
        model.eval()
        eval_dataloader = Bert_DataLoader(loader=data_iter, batch_size=args.batch_size)
        for batch, label in eval_dataloader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        accuracy, result = evaluate(model, (*batch, label))
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    accuracy, result = evaluate(model, (*batch, label))
            results.append(result)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic

        total_accuracy = torch.cat(results).mean().item()
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % throughput)
        print('Accuracy: %.3f' % total_accuracy)
        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
        return total_accuracy

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file, None)  # not use pretrain_file
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        # results = train_loop.eval(evaluate, cfg.model_file)
        # total_accuracy = torch.cat(results).mean().item()
        # print(f"Accuracy: {total_accuracy}")
        if args.tune:
            import lpot
            from lpot import common

            # lpot tune
            model.load_state_dict(torch.load(args.input_model))
            eval_dataloader = Bert_DataLoader(loader=data_iter, batch_size=args.batch_size)
            quantizer = lpot.Quantization(args.tuned_yaml)
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = eval_dataloader
            quantizer.eval_func = eval_func
            q_model = quantizer()
            q_model.save(args.tuned_checkpoint)
        elif args.int8:
            from lpot.utils.pytorch import load
            int8_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
            print(int8_model)
            if args.accuracy_only:
                eval_func(int8_model)
            elif args.benchmark:
                benchmark(int8_model)
        else:
            model.load_state_dict(torch.load(args.input_model))
            print(model)
            if args.accuracy_only:
                eval_func(model)
            elif args.benchmark:
                benchmark(model)
def test_get_model_raises_UnknownSwitchModel_when_no_patterns_match(self):
    model = models.Model(name='ok', pattern='foo', oids={})
    config = models.Config(models=(model,), default_oids={})
    with self.assertRaises(models.UnknownSwitchModel):
        config.get_model('sysDescr value that does not contain pattern')
def test_init_raises_BadConfiguration_when_parameters_not_given(self):
    with self.assertRaises(models.BadConfiguration):
        models.Config(models=None, default_oids=None, line='none')
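The three tests above imply a contract for the switch-model Config: constructing it without models or default OIDs raises BadConfiguration, get_model raises UnknownSwitchModel when no pattern occurs in the sysDescr string, and MultipleMatches when more than one does. The following is a minimal sketch of that contract only; the class layout, exception hierarchy, and substring matching are assumptions inferred from the tests, not the actual models module.

# Hypothetical sketch inferred from the tests above; the real models module may differ.
from dataclasses import dataclass, field
from typing import Dict


class BadConfiguration(Exception):
    """Raised when Config is constructed without the required parameters."""


class UnknownSwitchModel(Exception):
    """Raised when no model pattern matches the given sysDescr value."""


class MultipleMatches(Exception):
    """Raised when several model patterns match the same sysDescr value."""


@dataclass
class Model:
    name: str
    pattern: str
    oids: Dict[str, str] = field(default_factory=dict)


class Config:
    def __init__(self, models, default_oids, line=''):
        if models is None or default_oids is None:
            raise BadConfiguration('models and default_oids are required')
        self.models = tuple(models)
        self.default_oids = default_oids
        self.line = line

    def get_model(self, sys_descr):
        # Substring match of each model's pattern against the sysDescr string.
        matches = [m for m in self.models if m.pattern in sys_descr]
        if not matches:
            raise UnknownSwitchModel(sys_descr)
        if len(matches) > 1:
            raise MultipleMatches(sys_descr)
        return matches[0]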