def MakeTable():
    """Load the table named by args.csvname and build an Oracle estimator.

    Returns a (table, train_dataset, oracle_estimator) triple; the middle
    element is a common.TableDataset only when args.run_bn is set, else None.
    """
    print(args.csvname)
    loaded = datasets.LoadDmv(args.csvname)  # modify
    oracle = estimators_lib.Oracle(loaded)
    train_ds = common.TableDataset(loaded) if args.run_bn else None
    return loaded, train_ds, oracle
def MakeTable():
    """Load the DMV table selected by args.dataset ('dmv-tiny' or 'dmv').

    Returns (table, train_dataset, oracle_estimator); train_dataset is a
    common.TableDataset when args.run_bn is set, otherwise None.
    """
    assert args.dataset in ['dmv-tiny', 'dmv']
    if args.dataset == 'dmv-tiny':
        tbl = datasets.LoadDmv('dmv-tiny.csv')
    else:
        # Guarded by the assert above: the only other choice is 'dmv'.
        tbl = datasets.LoadDmv()
    oracle = estimators_lib.Oracle(tbl)
    if args.run_bn:
        return tbl, common.TableDataset(tbl), oracle
    return tbl, None, oracle
def MakeTable():
    """Load the table for args.dataset ('dmv-tiny', 'dmv', or 'PF').

    For 'dmv' and 'PF' the CSV name is derived from args.version.
    Returns (table, train_dataset, oracle_estimator); train_dataset is a
    common.TableDataset when args.run_bn is set, otherwise None.
    """
    name = args.dataset
    if name == 'dmv-tiny':
        tbl = datasets.LoadDmv('dmv-tiny.csv')
    elif name == 'dmv':
        tbl = datasets.LoadDmv(args.version + '.csv')  # modify
    elif name == 'PF':
        tbl = datasets.LoadPF(args.version + '.csv')  # modify
    oracle = estimators_lib.Oracle(tbl)
    if args.run_bn:
        return tbl, common.TableDataset(tbl), oracle
    return tbl, None, oracle
def MakeTableDataset(self, table):
    """Wrap `table` in a TableDataset, optionally column-factorized.

    When self.factorize is set, the dataset is further wrapped in a
    common.FactorizedTable using self.word_size_bits as the sub-column
    word size.
    """
    dataset = common.TableDataset(table)
    if not self.factorize:
        return dataset
    return common.FactorizedTable(dataset,
                                  word_size_bits=self.word_size_bits)
def TrainTask(seed=0):
    """Train a density model (MADE or Transformer) on the DMV table.

    Loads the CSV named by args.csvname, computes the table's empirical
    entropy in bits as a data lower bound, trains for args.epochs epochs,
    evaluates mean negative log-likelihood on the full data, and saves a
    checkpoint under models/.

    Args:
        seed: forwarded to the model constructors for weight/order RNG.
              NOTE(review): the torch/numpy global seeds below are
              hard-coded to 0 and ignore this parameter — confirm intended.
    """
    torch.manual_seed(0)  # NOTE(review): fixed at 0, not `seed`
    np.random.seed(0)
    # Pins the process to GPU index 1; must be set before CUDA initializes.
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
    table = datasets.LoadDmv(args.csvname)
    # Empirical entropy of the joint distribution, base 2 -> bits per tuple.
    table_bits = Entropy(
        table,
        table.data.fillna(value=0).groupby([c.name for c in table.columns
                                            ]).size(), [2])[0]
    fixed_ordering = None
    if args.order is not None:
        print('Using passed-in order:', args.order)
        fixed_ordering = args.order
    print(table.data.info())
    table_train = table
    if args.heads > 0:
        # args.heads > 0 selects the Transformer variant.
        model = MakeTransformer(cols_to_train=table.columns,
                                fixed_ordering=fixed_ordering,
                                seed=seed)
    else:
        if args.dataset in ['dmv-tiny', 'dmv', 'forest', 'power']:
            model = MakeMade(
                scale=args.fc_hiddens,
                cols_to_train=table.columns,
                seed=seed,
                fixed_ordering=fixed_ordering,
            )
        else:
            assert False, args.dataset
    # mb: value returned by ReportModel — presumably model size in MB
    # (used in the checkpoint filename below); verify against ReportModel.
    mb = ReportModel(model)
    if not isinstance(model, transformer.Transformer):
        print('Applying InitWeight()')
        model.apply(InitWeight)
    if isinstance(model, transformer.Transformer):
        # Transformer uses explicit Adam betas/eps; MADE uses Adam defaults
        # at the same learning rate.
        opt = torch.optim.Adam(
            list(model.parameters()),
            2e-4,
            betas=(0.9, 0.98),
            eps=1e-9,
        )
    else:
        opt = torch.optim.Adam(list(model.parameters()), 2e-4)
    bs = args.bs
    log_every = 200
    train_data = common.TableDataset(table_train)
    train_losses = []
    train_start = time.time()
    for epoch in range(args.epochs):
        mean_epoch_train_loss = RunEpoch('train',
                                         model,
                                         opt,
                                         train_data=train_data,
                                         val_data=train_data,
                                         batch_size=bs,
                                         epoch_num=epoch,
                                         log_every=log_every,
                                         table_bits=table_bits)
        if epoch % 1 == 0:  # always true: logs every epoch
            print('epoch {} train loss {:.4f} nats / {:.4f} bits'.format(
                epoch, mean_epoch_train_loss,
                mean_epoch_train_loss / np.log(2)))
            since_start = time.time() - train_start
            print('time since start: {:.1f} secs'.format(since_start))
        train_losses.append(mean_epoch_train_loss)
    print('Training done; evaluating likelihood on full data:')
    # Final pass over the full table (no optimizer) to collect per-item losses.
    all_losses = RunEpoch('test',
                          model,
                          train_data=train_data,
                          val_data=train_data,
                          opt=None,
                          batch_size=1024,
                          log_every=500,
                          table_bits=table_bits,
                          return_losses=True)
    model_nats = np.mean(all_losses)
    model_bits = model_nats / np.log(2)
    model.model_bits = model_bits
    # Checkpoint filename encodes dataset, size, model/data bits, epochs,
    # seed (and the column order when one was passed in).
    if fixed_ordering is None:
        if seed is not None:
            PATH = 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}.pt'.format(
                args.dataset, mb, model.model_bits, table_bits, model.name(),
                args.epochs, seed)
        else:
            PATH = 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}-{}.pt'.format(
                args.dataset, mb, model.model_bits, table_bits, model.name(),
                args.epochs, seed, time.time())
    else:
        annot = ''
        if args.inv_order:
            annot = '-invOrder'
        PATH = 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}-order{}{}.pt'.format(
            args.dataset, mb, model.model_bits, table_bits, model.name(),
            args.epochs, seed, '_'.join(map(str, fixed_ordering)), annot)
    os.makedirs(os.path.dirname(PATH), exist_ok=True)
    torch.save(model.state_dict(), PATH)
    print('Saved to:')
    print(PATH)
def MakeTableNew(filename):
    """Load a JOB table from `filename` and pair it with its TableDataset."""
    loaded = datasets.LoadJobTables(filename)
    dataset = common.TableDataset(loaded)
    return loaded, dataset
def TrainTask(seed=0):
    """Resume-train ("update") a model on a new version of the table.

    Loads the table for args.dataset/args.version, locates the pre-update
    checkpoint matching '<version>update_before*.pt' under ./models, resumes
    training from it, and after every epoch: saves an
    '<version>update_after...' checkpoint, appends timing to a metric log,
    and shells out to eval_model.py for query evaluation.

    Args:
        seed: initial RNG seed for model construction; overwritten below by
              the seed parsed out of the checkpoint filename.

    NOTE(review): depends on module-level globals `update`, `version`,
    `fnmatch`, and `glob`. If update != 'yes', `all_ckpts` is never bound
    and `s = all_ckpts[0]` raises NameError — confirm callers always run
    with update == 'yes'.
    """
    import re
    torch.manual_seed(0)  # NOTE(review): fixed at 0, ignores `seed`
    np.random.seed(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'  # pin to GPU index 1
    if args.dataset == 'dmv-tiny':
        table = datasets.LoadDmv('dmv-tiny.csv')
    elif args.dataset == 'dmv':
        table = datasets.LoadDmv(args.version + '.csv')
    elif args.dataset == 'PF':
        table = datasets.LoadPF(args.version + '.csv')
    # Empirical entropy of the joint distribution, base 2 -> bits per tuple.
    table_bits = Entropy(
        table,
        table.data.fillna(value=0).groupby(
            [c.name for c in table.columns]).size(), [2])[0]
    fixed_ordering = None
    if args.order is not None:
        print('Using passed-in order:', args.order)
        fixed_ordering = list(args.order)  # = args.order
    print(table.data.info())
    table_train = table
    if update == 'yes':
        # Find the pre-update checkpoint for this table version.
        for f_name in os.listdir('./models'):
            if fnmatch.fnmatch(f_name, version + 'update_before' +
                               '*.pt'):  # filename_match
                all_ckpts = glob.glob('./models/' + f_name)
                print("Load " + f_name)
    s = all_ckpts[0]
    # Recover model bits / data bits / seed from the checkpoint filename.
    z = re.match('.*?model([\d\.]+)-data([\d\.]+).+seed([\d\.]+).*.pt', s)
    assert z
    model_bits = float(z.group(1))
    data_bits = float(z.group(2))
    seed = int(z.group(3))
    bits_gap = model_bits - data_bits  # model-vs-data gap in bits (unused below)
    # print(torch.load(s))
    # model.load_state_dict(torch.load(s))
    args.epochs = 60  # modify here: number of update epochs (translated comment)
    if args.heads > 0:
        model = MakeTransformer(cols_to_train=table.columns,
                                fixed_ordering=fixed_ordering,
                                seed=seed)
    else:
        if args.dataset in ['dmv-tiny', 'dmv', 'PF']:
            model = MakeMade(
                scale=args.fc_hiddens,
                cols_to_train=table.columns,
                seed=seed,
                fixed_ordering=fixed_ordering,
            )
    ReportModel(model)
    print('Loading ckpt:', s)
    model.load_state_dict(torch.load(s))
    mb = ReportModel(model)  # presumably model size in MB; see ReportModel
    if not isinstance(model, transformer.Transformer):
        print('Applying InitWeight()')
        # NOTE(review): InitWeight is applied AFTER load_state_dict, which
        # would re-randomize the loaded weights for non-Transformer models —
        # confirm this ordering is intentional.
        model.apply(InitWeight)
    if isinstance(model, transformer.Transformer):
        opt = torch.optim.Adam(
            list(model.parameters()),
            2e-4,
            betas=(0.9, 0.98),
            eps=1e-9,
        )
    else:
        opt = torch.optim.Adam(list(model.parameters()), 2e-4)
    bs = args.bs
    log_every = 200
    train_data = common.TableDataset(table_train)
    train_losses = []
    train_start = time.time()
    # timetest1/timetest2 bracket the external eval_model.py run so its
    # wall time can be subtracted from the next epoch's training time.
    timetest1 = 0
    timetest2 = 0
    for epoch in range(args.epochs):
        mean_epoch_train_loss = RunEpoch('train',
                                         model,
                                         opt,
                                         train_data=train_data,
                                         val_data=train_data,
                                         batch_size=bs,
                                         epoch_num=epoch,
                                         log_every=log_every,
                                         table_bits=table_bits)
        if epoch % 1 == 0:  # always true: logs every epoch
            print('epoch {} train loss {:.4f} nats / {:.4f} bits'.format(
                epoch, mean_epoch_train_loss,
                mean_epoch_train_loss / np.log(2)))
            since_start = time.time() - train_start
            print('time since start: {:.1f} secs'.format(since_start))
        train_losses.append(mean_epoch_train_loss)
        # fmetric.write('Traintime since start: {:.1f} secs'.format(since_start)+'\n')
        # Append per-epoch training time (excluding last eval_model.py run).
        fmetric = open(
            '/home/jintao/naru_update/metric_result/' + args.version +
            '.update_naru.txt', 'a')
        fmetric.write('\nepoch' + str(epoch) + ':\n' +
                      "Training Time: {}s".format(since_start - timetest2 +
                                                  timetest1) + '\n')
        fmetric.close()
        # Per-epoch full-data likelihood evaluation (no optimizer).
        all_losses = RunEpoch('test',
                              model,
                              train_data=train_data,
                              val_data=train_data,
                              opt=None,
                              batch_size=1024,
                              log_every=500,
                              table_bits=table_bits,
                              return_losses=True)
        model_nats = np.mean(all_losses)
        model_bits = model_nats / np.log(2)
        model.model_bits = model_bits
        # Save the post-update checkpoint each epoch (overwritten per seed).
        if fixed_ordering is None:
            if seed is not None:
                PATH = 'models/' + version + 'update_after' + '{:.1f}MB-model{:.3f}-data{:.3f}-seed{}'.format(
                    mb, model.model_bits, table_bits, seed) + '.pt'
                # 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}.pt'.format(args.dataset, mb, model.model_bits, table_bits, model.name(),args.epochs, seed)
            else:
                PATH = 'models/' + version + 'update_after' + '.pt'
        else:
            annot = ''
            if args.inv_order:
                annot = '-invOrder'
            PATH = 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}-order{}{}.pt'.format(
                args.dataset, mb, model.model_bits, table_bits, model.name(),
                args.epochs, seed, '_'.join(map(str, fixed_ordering)), annot)
        os.makedirs(os.path.dirname(PATH), exist_ok=True)
        torch.save(model.state_dict(), PATH)
        # Shell out to the query evaluator; timed so it can be excluded from
        # the next epoch's reported training time.
        timetest1 = time.time()
        os.system(
            'python eval_model.py --testfilepath /home/jintao/naru_update/sql_truecard/ --version '
            + version + ' --table ' + version + ' --alias ' + version +
            ' --dataset=PF --glob=\'<ckpt from above>\' --num-queries=500 --residual --layers=5 --fc-hiddens=256 --direct-io --column-masking --update yes --nepoch '
            + str(epoch))
        timetest2 = time.time()
    print('Training done; evaluating likelihood on full data:')


# NOTE(review): dangling triple-quote below — appears to open a
# commented-out region that continues beyond this chunk; its closing
# quote is not visible here.
'''