def MakeTable():
    """Load the DMV table named by args.csvname and build an oracle estimator.

    Returns (table, train_dataset_or_None, oracle_est); the TableDataset is
    materialized only when args.run_bn is set.
    """
    print(args.csvname)
    loaded = datasets.LoadDmv(args.csvname)  # modify
    oracle = estimators_lib.Oracle(loaded)
    train_ds = common.TableDataset(loaded) if args.run_bn else None
    return loaded, train_ds, oracle
Example #2
0
def MakeTable():
    """Load the table selected by args.dataset ('dmv-tiny' or 'dmv').

    Returns (table, train_dataset_or_None, oracle_est); the TableDataset is
    built only when args.run_bn is set.
    """
    assert args.dataset in ['dmv-tiny', 'dmv']
    if args.dataset == 'dmv-tiny':
        tbl = datasets.LoadDmv('dmv-tiny.csv')
    elif args.dataset == 'dmv':
        tbl = datasets.LoadDmv()

    oracle = estimators_lib.Oracle(tbl)
    train_ds = common.TableDataset(tbl) if args.run_bn else None
    return tbl, train_ds, oracle
def MakeTable():
    """Load the table selected by args.dataset and pair it with an Oracle.

    Supports 'dmv-tiny', 'dmv', and 'PF'; the latter two load
    '<args.version>.csv'. Returns (table, TableDataset-or-None, oracle_est);
    the TableDataset is built only when args.run_bn is set.

    Fix: an unrecognized args.dataset previously fell through every branch
    and raised a confusing NameError on `table`; now fails fast with an
    assertion, consistent with the other MakeTable variant in this file.
    """
    assert args.dataset in ['dmv-tiny', 'dmv', 'PF'], args.dataset
    if args.dataset == 'dmv-tiny':
        table = datasets.LoadDmv('dmv-tiny.csv')
    elif args.dataset == 'dmv':
        table = datasets.LoadDmv(args.version + '.csv')  # modify
    elif args.dataset == 'PF':
        table = datasets.LoadPF(args.version + '.csv')  # modify
    oracle_est = estimators_lib.Oracle(table)
    if args.run_bn:
        return table, common.TableDataset(table), oracle_est
    return table, None, oracle_est
Example #4
0
 def MakeTableDataset(self, table):
     """Wrap `table` in a TableDataset, factorizing columns when enabled.

     When self.factorize is set, the dataset is further wrapped in a
     FactorizedTable using self.word_size_bits.
     """
     dataset = common.TableDataset(table)
     if not self.factorize:
         return dataset
     return common.FactorizedTable(dataset,
                                   word_size_bits=self.word_size_bits)
Example #5
0
def TrainTask(seed=0):
    """Train a density model on the table in args.csvname and checkpoint it.

    Fix: the `seed` parameter was ignored by RNG seeding — torch/numpy were
    always seeded with 0 even though `seed` is passed to the model
    constructors and embedded in the checkpoint filename, so seed != 0 runs
    were not actually controlled by the seed. The RNGs are now seeded with
    `seed`, falling back to 0 when it is None (torch.manual_seed rejects
    None). Default call sites (TrainTask() / TrainTask(0)) are unchanged.

    Args:
        seed: RNG / model-initialization seed; also part of the saved path.
            May be None, in which case the checkpoint name is timestamped.
    """
    torch.manual_seed(0 if seed is None else seed)
    np.random.seed(0 if seed is None else seed)
    # NOTE(review): hard-codes GPU #1 — presumably machine-specific; confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'

    table = datasets.LoadDmv(args.csvname)

    # Entropy of the empirical joint distribution, in bits (base 2).
    table_bits = Entropy(
        table,
        table.data.fillna(value=0).groupby([c.name for c in table.columns
                                            ]).size(), [2])[0]
    fixed_ordering = None

    if args.order is not None:
        print('Using passed-in order:', args.order)
        fixed_ordering = args.order

    print(table.data.info())

    table_train = table

    # Transformer when attention heads are requested, else a MADE model.
    if args.heads > 0:
        model = MakeTransformer(cols_to_train=table.columns,
                                fixed_ordering=fixed_ordering,
                                seed=seed)
    else:
        if args.dataset in ['dmv-tiny', 'dmv', 'forest', 'power']:
            model = MakeMade(
                scale=args.fc_hiddens,
                cols_to_train=table.columns,
                seed=seed,
                fixed_ordering=fixed_ordering,
            )
        else:
            assert False, args.dataset

    mb = ReportModel(model)  # model size, used in the checkpoint name

    if not isinstance(model, transformer.Transformer):
        print('Applying InitWeight()')
        model.apply(InitWeight)

    # Transformer gets the Adam hyperparameters typical for attention
    # models; everything else uses plain Adam at the same learning rate.
    if isinstance(model, transformer.Transformer):
        opt = torch.optim.Adam(
            list(model.parameters()),
            2e-4,
            betas=(0.9, 0.98),
            eps=1e-9,
        )
    else:
        opt = torch.optim.Adam(list(model.parameters()), 2e-4)

    bs = args.bs
    log_every = 200

    train_data = common.TableDataset(table_train)

    train_losses = []
    train_start = time.time()
    for epoch in range(args.epochs):

        mean_epoch_train_loss = RunEpoch('train',
                                         model,
                                         opt,
                                         train_data=train_data,
                                         val_data=train_data,
                                         batch_size=bs,
                                         epoch_num=epoch,
                                         log_every=log_every,
                                         table_bits=table_bits)

        if epoch % 1 == 0:
            print('epoch {} train loss {:.4f} nats / {:.4f} bits'.format(
                epoch, mean_epoch_train_loss,
                mean_epoch_train_loss / np.log(2)))
            since_start = time.time() - train_start
            print('time since start: {:.1f} secs'.format(since_start))

        train_losses.append(mean_epoch_train_loss)

    print('Training done; evaluating likelihood on full data:')
    all_losses = RunEpoch('test',
                          model,
                          train_data=train_data,
                          val_data=train_data,
                          opt=None,
                          batch_size=1024,
                          log_every=500,
                          table_bits=table_bits,
                          return_losses=True)
    model_nats = np.mean(all_losses)
    model_bits = model_nats / np.log(2)
    model.model_bits = model_bits

    # Checkpoint name encodes dataset, size, model/data bits, and seed;
    # a fixed column ordering (and optional inverse-order flag) is appended.
    if fixed_ordering is None:
        if seed is not None:
            PATH = 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}.pt'.format(
                args.dataset, mb, model.model_bits, table_bits, model.name(),
                args.epochs, seed)
        else:
            PATH = 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}-{}.pt'.format(
                args.dataset, mb, model.model_bits, table_bits, model.name(),
                args.epochs, seed, time.time())
    else:
        annot = ''
        if args.inv_order:
            annot = '-invOrder'

        PATH = 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}-order{}{}.pt'.format(
            args.dataset, mb, model.model_bits, table_bits, model.name(),
            args.epochs, seed, '_'.join(map(str, fixed_ordering)), annot)
    os.makedirs(os.path.dirname(PATH), exist_ok=True)
    torch.save(model.state_dict(), PATH)
    print('Saved to:')
    print(PATH)
Example #6
0
def MakeTableNew(filename):
    """Load a JOB table from `filename`; return (table, TableDataset)."""
    loaded = datasets.LoadJobTables(filename)
    dataset = common.TableDataset(loaded)
    return loaded, dataset
def TrainTask(seed=0):
    import re
    torch.manual_seed(0)
    np.random.seed(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'

    if args.dataset == 'dmv-tiny':
        table = datasets.LoadDmv('dmv-tiny.csv')
    elif args.dataset == 'dmv':
        table = datasets.LoadDmv(args.version + '.csv')
    elif args.dataset == 'PF':
        table = datasets.LoadPF(args.version + '.csv')

    table_bits = Entropy(
        table,
        table.data.fillna(value=0).groupby([c.name
                                            for c in table.columns]).size(),
        [2])[0]
    fixed_ordering = None

    if args.order is not None:
        print('Using passed-in order:', args.order)
        fixed_ordering = list(args.order)  # = args.order

    print(table.data.info())

    table_train = table

    if update == 'yes':
        for f_name in os.listdir('./models'):
            if fnmatch.fnmatch(f_name, version + 'update_before' +
                               '*.pt'):  # filename_match
                all_ckpts = glob.glob('./models/' + f_name)
                print("Load " + f_name)
        s = all_ckpts[0]
        z = re.match('.*?model([\d\.]+)-data([\d\.]+).+seed([\d\.]+).*.pt', s)
        assert z
        model_bits = float(z.group(1))
        data_bits = float(z.group(2))
        seed = int(z.group(3))
        bits_gap = model_bits - data_bits

        # print(torch.load(s))
        # model.load_state_dict(torch.load(s))
        args.epochs = 60  # 淇敼杩欓噷

    if args.heads > 0:
        model = MakeTransformer(cols_to_train=table.columns,
                                fixed_ordering=fixed_ordering,
                                seed=seed)
    else:
        if args.dataset in ['dmv-tiny', 'dmv', 'PF']:
            model = MakeMade(
                scale=args.fc_hiddens,
                cols_to_train=table.columns,
                seed=seed,
                fixed_ordering=fixed_ordering,
            )

    ReportModel(model)
    print('Loading ckpt:', s)
    model.load_state_dict(torch.load(s))

    mb = ReportModel(model)

    if not isinstance(model, transformer.Transformer):
        print('Applying InitWeight()')
        model.apply(InitWeight)

    if isinstance(model, transformer.Transformer):
        opt = torch.optim.Adam(
            list(model.parameters()),
            2e-4,
            betas=(0.9, 0.98),
            eps=1e-9,
        )
    else:
        opt = torch.optim.Adam(list(model.parameters()), 2e-4)

    bs = args.bs
    log_every = 200

    train_data = common.TableDataset(table_train)

    train_losses = []
    train_start = time.time()
    timetest1 = 0
    timetest2 = 0
    for epoch in range(args.epochs):

        mean_epoch_train_loss = RunEpoch('train',
                                         model,
                                         opt,
                                         train_data=train_data,
                                         val_data=train_data,
                                         batch_size=bs,
                                         epoch_num=epoch,
                                         log_every=log_every,
                                         table_bits=table_bits)

        if epoch % 1 == 0:
            print('epoch {} train loss {:.4f} nats / {:.4f} bits'.format(
                epoch, mean_epoch_train_loss,
                mean_epoch_train_loss / np.log(2)))
            since_start = time.time() - train_start

            print('time since start: {:.1f} secs'.format(since_start))

        train_losses.append(mean_epoch_train_loss)
        # fmetric.write('Traintime since start: {:.1f} secs'.format(since_start)+'\n')
        fmetric = open(
            '/home/jintao/naru_update/metric_result/' + args.version +
            '.update_naru.txt', 'a')
        fmetric.write('\nepoch' + str(epoch) + ':\n' +
                      "Training Time: {}s".format(since_start - timetest2 +
                                                  timetest1) + '\n')
        fmetric.close()
        all_losses = RunEpoch('test',
                              model,
                              train_data=train_data,
                              val_data=train_data,
                              opt=None,
                              batch_size=1024,
                              log_every=500,
                              table_bits=table_bits,
                              return_losses=True)
        model_nats = np.mean(all_losses)
        model_bits = model_nats / np.log(2)
        model.model_bits = model_bits
        if fixed_ordering is None:
            if seed is not None:
                PATH = 'models/' + version + 'update_after' + '{:.1f}MB-model{:.3f}-data{:.3f}-seed{}'.format(
                    mb, model.model_bits, table_bits, seed) + '.pt'
                # 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}.pt'.format(args.dataset, mb, model.model_bits, table_bits, model.name(),args.epochs, seed)
            else:
                PATH = 'models/' + version + 'update_after' + '.pt'
        else:
            annot = ''
            if args.inv_order:
                annot = '-invOrder'

            PATH = 'models/{}-{:.1f}MB-model{:.3f}-data{:.3f}-{}-{}epochs-seed{}-order{}{}.pt'.format(
                args.dataset, mb, model.model_bits, table_bits, model.name(),
                args.epochs, seed, '_'.join(map(str, fixed_ordering)), annot)
        os.makedirs(os.path.dirname(PATH), exist_ok=True)
        torch.save(model.state_dict(), PATH)
        timetest1 = time.time()
        os.system(
            'python eval_model.py --testfilepath /home/jintao/naru_update/sql_truecard/ --version '
            + version + ' --table ' + version + ' --alias ' + version +
            ' --dataset=PF --glob=\'<ckpt from above>\' --num-queries=500 --residual --layers=5 --fc-hiddens=256 --direct-io --column-masking --update yes --nepoch '
            + str(epoch))
        timetest2 = time.time()

    print('Training done; evaluating likelihood on full data:')
    '''