Exemple #1
0
    else:
        pb.set_postfix(loss=logs_train['loss'])
    # schedule lr
    # if opt.patience > 0 and score < 12:
    #     lr_scheduler.step(score)
    # lr = optimizer.param_groups[0]['lr']
    # if lr <= 1e-5:
    #     break
# ------------------------ Test ------------------------
# Final evaluation after the training loop: forecast the held-out horizon,
# score it against `test_data`, then persist predictions, config and model.
model.eval()
with torch.no_grad():
    # Generate one prediction per held-out time step (opt.nt - opt.nt_train).
    x_pred, _ = model.generate(opt.nt - opt.nt_train)
    # Same metric twice: unreduced (iterated per entry below) and reduced.
    score_ts = rmse(x_pred, test_data, reduce=False)
    score = rmse(x_pred, test_data)
    # Reshape to (held-out steps, opt.nx) so it can be written as a 2-D text table.
    x_pred = x_pred.view(opt.nt - opt.nt_train, opt.nx)
    x_pred = x_pred.cpu().numpy()
    np.savetxt(os.path.join(get_dir(opt.outputdir), opt.xp, 'pred.txt'), x_pred)
logger.log('test.rmse', score)
# One log entry per element of `score_ts`, keyed by its enumeration index.
logger.log('test.ts', {t: {'rmse': scr.item()} for t, scr in enumerate(score_ts)})
opt.test_loss = score
# logs_train['loss'] = logs_train['mse_dec'] + logs_train['loss_dyn']
# `logs_train` is the log dict left over from the last training epoch.
opt.train_loss = logs_train['loss']

# Record end-of-run timing in the options before they are dumped.
opt.end = time_dir()
end_st = datetime.datetime.now()
opt.end_time = datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S')
# NOTE(review): `start_st` is defined before this chunk — presumably the
# datetime captured at the start of the run; confirm.
opt.time = str(end_st - start_st)
# Write the full option set (including the scores set above) next to the predictions.
with open(os.path.join(get_dir(opt.outputdir), opt.xp, 'config.json'), 'w') as f:
    json.dump(opt, f, sort_keys=True, indent=4)
logger.save(model)
def _parse_command_line_options():
    """Parse the experiment options from the command line into a DotDict."""
    p = configargparse.ArgParser()
    # -- data
    p.add('--datadir', type=str, help='path to dataset', default='data')
    p.add('--dataset', type=str, help='dataset name', default='ncov_confirmed')
    p.add('--nt_train', type=int, help='time for training', default=50)
    p.add('--validation_ratio', type=float, help='validation/train', default=0.1)
    p.add('--start_time', type=int, help='start time for data', default=0)
    p.add('--rescaled', type=str, help='rescaled method', default='d')
    p.add('--normalize_method', type=str, help='normalize method for relation', default='all')
    # -- xp
    p.add('--outputdir', type=str, help='path to save xp', default='default')
    p.add('--xp', type=str, help='xp name', default='stnn')
    p.add('--xp_auto', type=boolean_string, help='time', default=False)
    p.add('--xp_time', type=boolean_string, help='xp_time', default=True)
    p.add('--auto', type=boolean_string, help='dataset_model + time', default=False)
    # -- model
    p.add('--model', type=str, help='STNN Model', default='default')
    p.add('--mode', type=str, help='STNN mode (default|refine|discover)', default='default')
    p.add('--nz', type=int, help='laten factors size', default=1)
    p.add('--activation', type=str, help='dynamic module activation function (identity|tanh)', default='tanh')
    p.add('--khop', type=int, help='spatial depedencies order', default=1)
    p.add('--nhid', type=int, help='dynamic function hidden size', default=0)
    p.add('--nlayers', type=int, help='dynamic function num layers', default=1)
    p.add('--nhid_de', type=int, help='dynamic function hidden size', default=0)
    p.add('--nlayers_de', type=int, help='dynamic function num layers', default=1)
    p.add('--dropout_f', type=float, help='latent factors dropout', default=.5)
    p.add('--dropout_d', type=float, help='dynamic function dropout', default=.5)
    p.add('--dropout_de', type=float, help='dynamic function dropout', default=.5)
    p.add('--lambd', type=float, help='lambda between reconstruction and dynamic losses', default=.1)
    # -- optim
    p.add('--lr', type=float, help='learning rate', default=3e-3)
    p.add('--optimizer', type=str, help='learning algorithm', default='Adam')
    p.add('--beta1', type=float, default=.0, help='adam beta1')
    p.add('--beta2', type=float, default=.999, help='adam beta2')
    p.add('--eps', type=float, default=1e-9, help='adam eps')
    p.add('--wd', type=float, help='weight decay', default=1e-6)
    p.add('--wd_z', type=float, help='weight decay on latent factors', default=1e-7)
    p.add('--l2_z', type=float, help='l2 between consecutives latent factors', default=0.)
    p.add('--l1_rel', type=float, help='l1 regularization on relation discovery mode', default=0.)
    p.add('--sch_bound', type=float, help='learning rate', default=0.001)
    # -- learning
    p.add('--batch_size', type=int, default=1131, help='batch size')
    p.add('--patience', type=int, default=150, help='number of epoch to wait before trigerring lr decay')
    p.add('--nepoch', type=int, default=10, help='number of epochs to train for')
    p.add('--test', type=boolean_string, default=False, help='test during training')
    # -- gpu
    p.add('--device', type=int, default=-1, help='-1: cpu; > -1: cuda device id')
    # -- seed
    p.add('--manualSeed', type=int, help='manual seed')
    # -- logs
    p.add('--checkpoint_interval', type=int, default=100, help='check point interval')
    return DotDict(vars(p.parse_args()))


def _matlab_default_options():
    """Return the hard-coded option set used when no command line is parsed.

    Every option that `train` reads later is set here.  The entries marked
    "mirrors the CLI default" were previously missing from this branch even
    though the rest of `train` depends on them (model / optimizer selection,
    data loading, logger and seeding).
    """
    print('Use Matlab')
    opt = DotDict()
    # -- data
    opt.datadir = 'data'
    opt.dataset = 'ncov_confirmed'
    opt.nt_train = 15
    opt.validation_ratio = 0.1  # mirrors the CLI default
    opt.start_time = 0
    opt.rescaled = 'd'
    opt.normalize_method = 'row'
    # -- xp
    opt.outputdir = 'default'
    opt.xp = 'stnn'
    opt.xp_auto = False
    opt.xp_time = True
    opt.auto = False
    # -- model
    opt.model = 'default'  # mirrors the CLI default
    opt.mode = 'default'
    opt.nz = 1
    opt.activation = 'tanh'
    opt.khop = 1
    opt.nhid = 0
    opt.nlayers = 1
    opt.nhid_de = 0  # mirrors the CLI default
    opt.nlayers_de = 1  # mirrors the CLI default
    opt.dropout_f = .5
    opt.dropout_d = .5
    opt.dropout_de = .5  # mirrors the CLI default
    opt.lambd = .1
    # -- optim
    opt.lr = 3e-3
    opt.optimizer = 'Adam'  # mirrors the CLI default
    opt.beta1 = .0
    opt.beta2 = .999
    opt.eps = 1e-9
    opt.wd = 1e-6
    opt.wd_z = 1e-7
    opt.l2_z = 0.
    opt.l1_rel = 0.
    opt.sch_bound = 0.017
    # -- learning
    opt.batch_size = 1000
    opt.patience = 150
    opt.nepoch = 100
    opt.test = False
    opt.device = -1
    # -- seed / logs
    opt.manualSeed = None  # a random seed is drawn later, as on the CLI path
    opt.checkpoint_interval = 100  # mirrors the CLI default
    print(opt)
    return opt


def _build_model(opt, relations, device):
    """Instantiate the network selected by ``opt.model`` and move it to ``device``."""
    head = (relations, opt.nx, opt.nt_train, opt.nd, opt.nz, opt.mode, opt.nhid, opt.nlayers)
    tail = (opt.dropout_f, opt.dropout_d, opt.activation, opt.periode)
    if opt.model == 'default':
        model = SaptioTemporalNN(*head, *tail)
    elif opt.model == 'GRU':
        model = SaptioTemporalNN_GRU(*head, *tail)
    elif opt.model == 'LSTM':
        model = SaptioTemporalNN_LSTM(*head, *tail)
    elif opt.model == 'ld':
        # The large-decoder variant takes three extra decoder arguments.
        model = SaptioTemporalNN_largedecoder(*head, opt.nhid_de, opt.nlayers_de, opt.dropout_de, *tail)
    else:
        # Fail here with a clear message instead of hitting an unbound `model` later.
        raise ValueError("unknown model %r (expected 'default', 'GRU', 'LSTM' or 'ld')" % (opt.model,))
    return model.to(device)


def _build_optimizer(opt, model):
    """Create the optimizer selected by ``opt.optimizer`` over the model's parameter groups."""
    params = [{'params': model.factors_parameters(), 'weight_decay': opt.wd_z},
              {'params': model.dynamic.parameters()},
              {'params': model.decoder.parameters()}]
    if opt.mode in ('refine', 'discover'):
        params.append({'params': model.rel_parameters(), 'weight_decay': 0.})

    if opt.optimizer == 'Adam':
        return optim.Adam(params, lr=opt.lr, betas=(opt.beta1, opt.beta2), eps=opt.eps, weight_decay=opt.wd)
    if opt.optimizer == 'SGD':
        return optim.SGD(params, lr=opt.lr, weight_decay=opt.wd)
    if opt.optimizer == 'Rmsprop':
        return optim.RMSprop(params, lr=opt.lr, weight_decay=opt.wd)
    if opt.optimizer == 'Adagrad':
        return optim.Adagrad(params, lr=opt.lr, weight_decay=opt.wd)
    # Fail here with a clear message instead of hitting an unbound `optimizer` later.
    raise ValueError("unknown optimizer %r (expected 'Adam', 'SGD', 'Rmsprop' or 'Adagrad')" % (opt.optimizer,))


def train(command=False):
    """Train an STNN model, evaluate it on the test split and save the results.

    Args:
        command: when truthy, options are parsed from the command line and
            progress is shown through a ``trange`` bar; otherwise a hard-coded
            option set is used and progress is printed line by line.

    Returns:
        None.  Results are written to disk: one ``true_pred_XXX.txt`` file per
        output dimension, ``config.json`` with the final options and scores,
        and whatever ``Logger.checkpoint`` / ``Logger.save`` persist.

    Raises:
        ValueError: if ``opt.model`` or ``opt.optimizer`` names an unsupported choice.
    """
    ###################################################################################################################
    # Options - CUDA - Random seed
    ###################################################################################################################
    opt = _parse_command_line_options() if command else _matlab_default_options()

    # Output directory: dataset_mode when left at 'default' or when --auto is set.
    # (The original tested a non-existent `auto_all` key, so --auto never applied.)
    if opt.outputdir == 'default' or opt.auto:
        opt.outputdir = opt.dataset + "_" + opt.mode
    opt.outputdir = get_dir(opt.outputdir)

    # Experiment name: optional timestamp suffix, or the timestamp alone.
    if opt.xp_time:
        opt.xp = opt.xp + "_" + get_time()
    if opt.xp_auto or opt.auto:
        opt.xp = get_time()
    # Only 'refine' and 'discover' are passed on as a mode; anything else becomes None.
    opt.mode = opt.mode if opt.mode in ('refine', 'discover') else None
    opt.xp = 'ori-' + opt.xp
    opt.start = time_dir()
    start_st = datetime.datetime.now()
    opt.st = datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S')

    # device: the requested GPU is exposed as the only visible one, hence 'cuda:0'
    if opt.device > -1:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(opt.device)
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    # seed
    if opt.manualSeed is None:
        opt.manualSeed = random.randint(1, 10000)
    random.seed(opt.manualSeed)
    torch.manual_seed(opt.manualSeed)
    if opt.device > -1:
        torch.cuda.manual_seed_all(opt.manualSeed)

    ###################################################################################################################
    # Data
    ###################################################################################################################
    setup, (train_data, test_data, validation_data), relations = get_stnn_data(
        opt.datadir, opt.dataset, opt.nt_train, opt.khop, opt.start_time,
        rescaled_method=opt.rescaled, normalize_method=opt.normalize_method,
        validation_ratio=opt.validation_ratio)
    train_data = train_data.to(device)
    test_data = test_data.to(device)
    # The validation split is compared against on-device predictions when
    # opt.test is set, so it must live on the same device as the model.
    if torch.is_tensor(validation_data):
        validation_data = validation_data.to(device)
    relations = relations.to(device)
    # Dataset-derived settings are merged into the options; the code below
    # reads nx, nt, nd, periode, validation_length, normalize, mean, std, min
    # and max from `opt` without setting them anywhere else.
    for k, v in setup.items():
        opt[k] = v

    # -- train inputs: every (time, series) index pair of the training window
    t_idx = torch.arange(opt.nt_train, out=torch.LongTensor()).unsqueeze(1).expand(opt.nt_train, opt.nx).contiguous()
    x_idx = torch.arange(opt.nx, out=torch.LongTensor()).expand_as(t_idx).contiguous()
    # dynamic: skips t = 0 because each target is predicted from step t - 1
    idx_dyn = torch.stack((t_idx[1:], x_idx[1:])).view(2, -1).to(device)
    nex_dyn = idx_dyn.size(1)
    # decoder: all time steps
    idx_dec = torch.stack((t_idx, x_idx)).view(2, -1).to(device)
    nex_dec = idx_dec.size(1)

    ###################################################################################################################
    # Model - Optimizer - Logs
    ###################################################################################################################
    model = _build_model(opt, relations, device)
    optimizer = _build_optimizer(opt, model)
    if opt.patience > 0:
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=opt.patience)
    logger = Logger(opt.outputdir, opt.xp, opt.checkpoint_interval)

    ###################################################################################################################
    # Training
    ###################################################################################################################
    lr = opt.lr
    opt.mintest = 1000.0
    # Defined up front so the final bookkeeping also works when nepoch == 0.
    logs_train = defaultdict(float)
    pb = trange(opt.nepoch) if command else range(opt.nepoch)
    for e in pb:
        # ------------------------ Train ------------------------
        model.train()
        logs_train = defaultdict(float)
        # --- decoder: reconstruct observations from the latent factors ---
        idx_perm = torch.randperm(nex_dec).to(device)
        for batch in idx_perm.split(opt.batch_size):
            optimizer.zero_grad()
            input_t = idx_dec[0][batch]
            input_x = idx_dec[1][batch]
            x_target = train_data[input_t, input_x]
            x_rec = model.dec_closure(input_t, input_x)
            mse_dec = F.mse_loss(x_rec, x_target)
            mse_dec.backward()
            optimizer.step()
            # Weighted by batch size so the epoch value is a per-example mean.
            logs_train['mse_dec'] += mse_dec.item() * len(batch)
        # --- dynamic: predict the factors at t from those at t - 1 ---
        idx_perm = torch.randperm(nex_dyn).to(device)
        for batch in idx_perm.split(opt.batch_size):
            optimizer.zero_grad()
            input_t = idx_dyn[0][batch]
            input_x = idx_dyn[1][batch]
            z_inf = model.factors[input_t, input_x]
            z_pred = model.dyn_closure(input_t - 1, input_x)
            mse_dyn = z_pred.sub(z_inf).pow(2).mean()
            loss_dyn = mse_dyn * opt.lambd
            if opt.l2_z > 0:
                # Penalise the squared difference between consecutive factors.
                loss_dyn = loss_dyn + opt.l2_z * model.factors[input_t - 1, input_x].sub(
                    model.factors[input_t, input_x]).pow(2).mean()
            if opt.mode in ('refine', 'discover') and opt.l1_rel > 0:
                loss_dyn = loss_dyn + opt.l1_rel * model.get_relations().abs().mean()
            loss_dyn.backward()
            optimizer.step()
            logs_train['mse_dyn'] += mse_dyn.item() * len(batch)
            logs_train['loss_dyn'] += loss_dyn.item() * len(batch)

        # --- logs ---
        logs_train['mse_dec'] /= nex_dec
        logs_train['mse_dyn'] /= nex_dyn
        logs_train['loss_dyn'] /= nex_dyn
        logs_train['loss'] = logs_train['mse_dec'] + logs_train['loss_dyn']
        logger.log('train_epoch', logs_train)
        logger.checkpoint(model)

        # ------------------------ Validation ------------------------
        if opt.test:
            model.eval()
            with torch.no_grad():
                x_pred, _ = model.generate(opt.validation_length)
                score = rmse(x_pred, validation_data)
            if command:
                pb.set_postfix(loss=logs_train['loss'], test=score)
            else:
                print(e, 'loss=', logs_train['loss'], 'test=', score)
            logger.log('test_epoch.rmse', score)
            if opt.mintest > score:
                opt.mintest = score
            # The scheduler is only stepped once the score is below sch_bound.
            if opt.patience > 0 and score < opt.sch_bound:
                lr_scheduler.step(score)
            lr = optimizer.param_groups[0]['lr']
            # Stop early once the learning rate has decayed to 1e-5 or less.
            if lr <= 1e-5:
                break
        elif command:
            pb.set_postfix(loss=logs_train['loss'])
        else:
            print(e, 'loss=', logs_train['loss'])

    # ------------------------ Test ------------------------
    model.eval()
    with torch.no_grad():
        x_pred, _ = model.generate(opt.nt - opt.nt_train)
        score = rmse(x_pred, test_data)

    # Undo the data normalisation so the score is also reported on the
    # rescaled values.
    if opt.normalize == 'variance':
        true_pred_data = x_pred * opt.std + opt.mean
        true_test_data = test_data * opt.std + opt.mean
    elif opt.normalize == 'min_max':
        # NOTE(review): min-max scaling is usually inverted with `+ opt.min`;
        # `+ opt.mean` is kept as in the original — confirm against get_stnn_data.
        true_pred_data = x_pred * (opt.max - opt.min) + opt.mean
        true_test_data = test_data * (opt.max - opt.min) + opt.mean
    else:
        # No recognised normalisation: use the data as-is.  The original left
        # random placeholders here, so it scored and saved pure noise.
        true_pred_data = x_pred
        true_test_data = test_data
    true_score = rmse(true_pred_data, true_test_data)

    # One CSV file per output dimension (last axis of the prediction tensor).
    xp_dir = os.path.join(get_dir(opt.outputdir), opt.xp)
    for i in range(opt.nd):
        d_pred = true_pred_data[:, :, i].cpu().numpy()
        np.savetxt(os.path.join(xp_dir, 'true_pred_' + str(i).zfill(3) + '.txt'), d_pred, delimiter=',')

    # Record the final scores and timings, then dump the options and the model.
    opt.test_loss = score
    opt.true_loss = true_score
    logs_train['loss'] = logs_train['mse_dec'] + logs_train['loss_dyn']
    opt.train_loss = logs_train['loss']
    opt.end = time_dir()
    end_st = datetime.datetime.now()
    opt.et = datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S')
    opt.time = str(end_st - start_st)
    with open(os.path.join(xp_dir, 'config.json'), 'w') as f:
        json.dump(opt, f, sort_keys=True, indent=4)
    logger.save(model)