def main(n_aggregation, dim_feature, n_epochs, batch_size, eps):
    W = np.random.normal(0, 0.4, [dim_feature, dim_feature])
    A = np.random.normal(0, 0.4, dim_feature)
    b = np.array([0.])
    model = GraphNeuralNetwork(W, A, b, n_aggregation=n_aggregation)
    optimizer = Adam(model)

    dataset = util.get_train_data('../../datasets')
    train_data, valid_data = util.random_split(dataset, train_ratio=0.5)
    print('train_size: %d, valid_size: %d' %
          (len(train_data), len(valid_data)))

    for epoch in range(n_epochs):
        train_loss = util.AverageMeter()
        train_acc = util.AverageMeter()
        for graphs, labels in util.get_shuffled_batches(
                train_data, batch_size):
            grads_flat = 0
            for graph, label in zip(graphs, labels):
                x = np.zeros([len(graph), dim_feature])
                x[:, 0] = 1
                grads_flat += calc_grads(model, graph, x, label,
                                         bce_with_logit, eps) / batch_size

                outputs = model(graph, x)
                train_loss.update(bce_with_logit(outputs, label), 1)
                train_acc.update((sigmoid(outputs) > 0.5) == label, 1)

            optimizer.update(grads_flat)

        valid_loss, valid_acc = test(model, valid_data, dim_feature)
        print(
            'epoch: %d, train_loss: %f, train_acc: %f, valid_loss: %f, valid_acc: %f'
            % (epoch, train_loss.avg, train_acc.avg, valid_loss, valid_acc))
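
The calc_grads helper used above is not shown. Since it is handed the loss function and a small eps, a plausible reading is a numerical (finite-difference) gradient over the model's flattened parameters; the sketch below assumes that, and the set_flat_params helper and the W/A/b attribute access are assumptions, not part of the original code.

def set_flat_params(model, theta):
    # Hypothetical helper: scatter a flat vector back into W, A and b.
    n_w, n_a = model.W.size, model.A.size
    model.W = theta[:n_w].reshape(model.W.shape)
    model.A = theta[n_w:n_w + n_a].reshape(model.A.shape)
    model.b = theta[n_w + n_a:]

def calc_grads(model, graph, x, label, loss_fn, eps):
    # Hedged sketch: central-difference gradient of the loss w.r.t. the flat parameters.
    theta = np.concatenate([model.W.ravel(), model.A.ravel(), model.b.ravel()])
    grads = np.zeros_like(theta)
    for i in range(len(theta)):
        d = np.zeros_like(theta)
        d[i] = eps
        set_flat_params(model, theta + d)
        loss_plus = loss_fn(model(graph, x), label)
        set_flat_params(model, theta - d)
        loss_minus = loss_fn(model(graph, x), label)
        grads[i] = (loss_plus - loss_minus) / (2 * eps)
    set_flat_params(model, theta)  # restore the original parameters
    return grads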
Example #2
def train():
    graphs, labels = load_data("datasets/train")
    train_inputs, train_targets, val_inputs, val_targets = utils.split_train_val(
        graphs, labels, val_rate=0.3)

    model = GNNModel(8)
    loss_func = BinaryCrossEntropy()
    optimizer = Adam()
    batch_generator = utils.BatchGenerator(batch_size=32)

    min_loss = 100000
    for epoch in range(50):
        print(f"Epoch{epoch + 1}")

        train_losses = []
        for inputs, targets in batch_generator.generator(
                train_inputs, train_targets):
            train_loss, loss_grad = loss_func(model,
                                              inputs,
                                              targets,
                                              is_grad=True)
            optimizer.update(model, loss_grad)

            train_losses.append(train_loss)

        train_mean_loss = np.mean(train_losses)
        pred = np.array([model.predict(input_)
                         for input_ in train_inputs]).squeeze()
        train_accuracy = accuracy(pred, train_targets)

        val_losses = []
        for inputs, targets in batch_generator.generator(
                val_inputs, val_targets):
            val_loss, _ = loss_func(model, inputs, targets, is_grad=False)
            val_losses.append(val_loss)

        val_mean_loss = np.mean(val_losses)
        pred = np.array([model.predict(input_)
                         for input_ in val_inputs]).squeeze()
        val_accuracy = accuracy(pred, val_targets)

        if val_mean_loss < min_loss:
            min_loss = val_mean_loss
            print(
                f"Train loss: {train_mean_loss}\tTrain accuracy: {train_accuracy}"
            )
            print(
                f"Validation loss: {val_mean_loss}\tValidation accuracy: {val_accuracy}"
            )
            print("")
Example #3
def get_optimizer(model, args):
    """Set up the optimizer."""

    # Build parameter groups (weight decay and non-decay).
    while isinstance(model, (DDP, FP16_Module)):
        model = model.module
    layers = model.model.bert.encoder.layer
    pooler = model.model.bert.pooler
    lmheads = model.model.cls.predictions
    nspheads = model.model.cls.seq_relationship
    embeddings = model.model.bert.embeddings
    param_groups = []
    param_groups += list(get_params_for_weight_decay_optimization(layers))
    param_groups += list(get_params_for_weight_decay_optimization(pooler))
    param_groups += list(get_params_for_weight_decay_optimization(nspheads))
    param_groups += list(get_params_for_weight_decay_optimization(embeddings))
    param_groups += list(
        get_params_for_weight_decay_optimization(lmheads.transform))
    param_groups[1]['params'].append(lmheads.bias)

    # Use Adam.
    optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay)

    # Wrap into fp16 optimizer.
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   dynamic_loss_args={
                                       'scale_window': args.loss_scale_window,
                                       'min_scale': args.min_scale,
                                       'delayed_shift': args.hysteresis
                                   })

    return optimizer
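
get_params_for_weight_decay_optimization is not shown here. In Megatron-style code it conventionally returns two parameter groups per module, one with weight decay for ordinary weights and one without for biases and LayerNorm parameters; the sketch below follows that convention and is an assumption about the original (which likely checks its own LayerNorm class rather than torch.nn.LayerNorm).

import torch

def get_params_for_weight_decay_optimization(module):
    # Sketch: one group that receives weight decay, one (biases, LayerNorm) that does not.
    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for m in module.modules():
        if isinstance(m, torch.nn.LayerNorm):
            no_weight_decay_params['params'].extend(m.parameters(recurse=False))
        else:
            for name, p in m.named_parameters(recurse=False):
                group = no_weight_decay_params if name == 'bias' else weight_decay_params
                group['params'].append(p)
    return weight_decay_params, no_weight_decay_params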
Example #4
    def __init__(self, args):
        super(PPO, self).__init__()
        # saved path
        self.saved_path = args.saved_path
        if not os.path.exists(self.saved_path):
            os.makedirs(self.saved_path)

        # neural networks
        self.actor_critic = ActorCritic(action_size=args.action_size, hidden_size=args.hidden_size,
                                        extra_hidden=args.extra_hidden, enlargement=args.enlargement,
                                        recurrent=args.recurrent, device=args.device).to(args.device)
        # args
        self.rank = args.rank
        self.device = args.device
        self.num_steps = args.num_steps
        self.num_envs = args.num_envs
        self.num_rollouts = args.num_rollouts
        self.render = args.render
        self.action_size = args.action_size

        self.update_epochs = args.update_epochs
        self.batch_size = args.batch_size
        self.clip_range = args.clip_range
        self.max_grad_norm = args.max_grad_norm
        self.gamma = args.gamma
        self.lamda = args.lamda
        self.coeff_ent = args.coeff_ent

        # optimizer
        self.optimizer = Adam(
            self.actor_critic.parameters(), args.learning_rate)

        # batch
        self.sample_envs = self.batch_size // self.num_steps
Example #5
def main(n_aggregation, dim_feature, n_epochs, batch_size, eps, outputfile):
    W = np.random.normal(0, 0.4, [dim_feature, dim_feature])
    A = np.random.normal(0, 0.4, dim_feature)
    b = np.array([0.])
    model = GraphNeuralNetwork(W, A, b, n_aggregation=n_aggregation)
    optimizer = Adam(model)

    # Training
    train_data = util.get_train_data('../../datasets')
    print('train_size: %d' % len(train_data))
    for epoch in range(n_epochs):
        train_loss = util.AverageMeter()
        train_acc = util.AverageMeter()
        for graphs, labels in util.get_shuffled_batches(
                train_data, batch_size):
            grads_flat = 0
            for graph, label in zip(graphs, labels):
                x = np.zeros([len(graph), dim_feature])
                x[:, 0] = 1
                grads_flat += calc_grads(model, graph, x, label,
                                         bce_with_logit, eps) / batch_size

                outputs = model(graph, x)
                train_loss.update(bce_with_logit(outputs, label), 1)
                train_acc.update((sigmoid(outputs) > 0.5) == label, 1)

            optimizer.update(grads_flat)

        print('epoch: %d, train_loss: %f, train_acc: %f' %
              (epoch, train_loss.avg, train_acc.avg))

    # Prediction
    test_data = util.get_test_data('../../datasets')
    with open(outputfile, 'w') as o:
        for graph in test_data:
            x = np.zeros([len(graph), dim_feature])
            x[:, 0] = 1
            logit = model(graph, x)
            pred = sigmoid(logit) > 0.5
            o.write(str(int(pred[0])) + '\n')
Example #6
def train(seq, dataloader, epochs=10):
    criterion = CrossEntropyLoss(seq)
    optimizer = Adam(seq)
    for epoch in range(epochs):
        epoch_loss = 0.0
        epoch_accuracy = 0.0
        n_batch = 0
        for batch, labels in dataloader:
            n_batch += 1
            outputs = seq(batch)
            loss = criterion(outputs, labels)
            accuracy = accuracy_score(outputs.argmax(axis=1), labels)

            loss.backward()
            optimizer.step()

            epoch_loss += loss
            epoch_accuracy += accuracy

        print("Epoch {}/{}   -    loss: {:%.5f}   accuracy: {:%.5f}".format(
            epoch + 1, epochs, epoch_loss / n_batch, epoch_accuracy / n_batch))

    print("Finished training !")
Example #7
def main():
    args = parse_args()
    with open(args.input, 'r') as fp:
        data_loader = DataLoader(fp.read(), batch_size=args.seq_length)

    rnn = RNN()
    params = init_params(data_loader.vocab_size, hidden_size=args.hidden_size)
    optimizer = Adam(params, lr=args.lr)
    it = 0
    for epoch in range(args.num_epochs):
        hidden_state = np.zeros((1, args.hidden_size))
        for x, y in data_loader:
            if it % args.sample_every == 0:
                one_hot = sample(rnn, hidden_state, x[0], params,
                                 args.sample_size)
                generated_text = data_loader.decode(one_hot)
                print(generated_text)
            loss, hidden_state, dparams = rnn_training_step(
                rnn, hidden_state, x, y, params)
            if it % args.print_every == 0:
                print('iteration: {}, loss: {}'.format(it, loss))
            optimizer.step(dparams)
            it += 1
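
The Adam class used by the NumPy examples (optimizer.update(grads_flat) above, optimizer.step(dparams) here) is not shown. As an illustration of the update rule only, here is a minimal NumPy Adam over a flat parameter vector; the GNN examples actually construct Adam(model), so the constructor and attribute handling below are assumptions.

class Adam:
    # Minimal sketch of Adam over a 1-D numpy parameter array.
    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.params = params
        self.lr, self.beta1, self.beta2, self.eps = lr, beta1, beta2, eps
        self.m = np.zeros_like(params)
        self.v = np.zeros_like(params)
        self.t = 0

    def step(self, grads):
        self.t += 1
        self.m = self.beta1 * self.m + (1 - self.beta1) * grads
        self.v = self.beta2 * self.v + (1 - self.beta2) * grads ** 2
        m_hat = self.m / (1 - self.beta1 ** self.t)
        v_hat = self.v / (1 - self.beta2 ** self.t)
        self.params -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)

    update = step  # alias matching the update(...) call in the GNN examples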
Example #8
def smooth_curve(x):  # the def line and docstring opening were cut off; reconstructed here
    """
    Reference: http://glowingpython.blogspot.jp/2012/02/convolution-with-numpy.html
    """
    window_len = 11
    s = np.r_[x[window_len - 1:0:-1], x, x[-1:-window_len:-1]]
    w = np.kaiser(window_len, 2)
    y = np.convolve(w / w.sum(), s, mode='valid')
    return y[5:len(y) - 5]


## Define the optimizers to compare
optimizers = OrderedDict()
optimizers['SGD'] = SGD()
optimizers['momentum'] = momentum()
optimizers['adagrad'] = adagrad()
optimizers['Adam'] = Adam()

## Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True,
                                                  one_hot_label=True)

# Set hyperparameters and initialize one network per optimizer
train_size = x_train.shape[0]
batch_size = 100
iter_num = 1000

train_loss = {}
networks = {}
for key in optimizers.keys():

    networks[key] = backnet(input_size=784, hidden_size=50, output_size=10)
Example #9
def main(args, local_rank):

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    vocabs = dict()
    vocabs['src'] = Vocab(args.src_vocab, 0, [BOS, EOS])
    vocabs['tgt'] = Vocab(args.tgt_vocab, 0, [BOS, EOS])

    if args.world_size == 1 or (dist.get_rank() == 0):
        logger.info(args)
        for name in vocabs:
            logger.info("vocab %s, size %d, coverage %.3f", name,
                        vocabs[name].size, vocabs[name].coverage)

    set_seed(19940117)

    #device = torch.device('cpu')
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)

    if args.resume_ckpt:
        model = MatchingModel.from_pretrained(vocabs, args.resume_ckpt)
    else:
        model = MatchingModel.from_params(vocabs, args.layers, args.embed_dim,
                                          args.ff_embed_dim, args.num_heads,
                                          args.dropout, args.output_dim,
                                          args.bow)

    if args.world_size > 1:
        set_seed(19940117 + dist.get_rank())

    model = model.to(device)

    if args.resume_ckpt:
        dev_data = DataLoader(vocabs,
                              args.dev_data,
                              args.dev_batch_size,
                              addition=args.additional_negs)
        acc = validate(model, dev_data, device)
        logger.info("initialize from %s, initial acc %.2f", args.resume_ckpt,
                    acc)

    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     betas=(0.9, 0.98),
                     eps=1e-9)
    lr_schedule = get_linear_schedule_with_warmup(optimizer, args.warmup_steps,
                                                  args.total_train_steps)
    train_data = DataLoader(vocabs,
                            args.train_data,
                            args.per_gpu_train_batch_size,
                            worddrop=args.worddrop,
                            addition=args.additional_negs)
    global_step, step, epoch = 0, 0, 0
    tr_stat = Statistics()
    logger.info("start training")
    model.train()
    while global_step <= args.total_train_steps:
        for batch in train_data:
            batch = move_to_device(batch, device)
            loss, acc, bsz = model(batch['src_tokens'], batch['tgt_tokens'],
                                   args.label_smoothing)
            tr_stat.update({
                'loss': loss.item() * bsz,
                'nsamples': bsz,
                'acc': acc * bsz
            })
            tr_stat.step()
            loss.backward()

            step += 1
            if not (step % args.gradient_accumulation_steps
                    == -1 % args.gradient_accumulation_steps):
                continue

            if args.world_size > 1:
                average_gradients(model)

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_schedule.step()
            optimizer.zero_grad()
            global_step += 1

            if args.world_size == 1 or (dist.get_rank() == 0):
                if global_step % args.print_every == -1 % args.print_every:
                    logger.info("epoch %d, step %d, loss %.3f, acc %.3f",
                                epoch, global_step,
                                tr_stat['loss'] / tr_stat['nsamples'],
                                tr_stat['acc'] / tr_stat['nsamples'])
                    tr_stat = Statistics()
                if global_step > args.warmup_steps and global_step % args.eval_every == -1 % args.eval_every:
                    dev_data = DataLoader(vocabs,
                                          args.dev_data,
                                          args.dev_batch_size,
                                          addition=args.additional_negs)
                    acc = validate(model, dev_data, device)
                    logger.info("epoch %d, step %d, dev, dev acc %.2f", epoch,
                                global_step, acc)
                    save_path = '%s/epoch%d_batch%d_acc%.2f' % (
                        args.ckpt, epoch, global_step, acc)
                    model.save(args, save_path)
                    model.train()
            if global_step > args.total_train_steps:
                break
        epoch += 1
    logger.info('rank %d, finish training after %d steps', local_rank,
                global_step)
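
average_gradients is not defined in this listing. For multi-GPU runs it presumably all-reduces each parameter's gradient and divides by the world size; a minimal sketch of that pattern:

import torch.distributed as dist

def average_gradients(model):
    # Sketch: sum gradients across ranks, then average.
    world_size = float(dist.get_world_size())
    for p in model.parameters():
        if p.grad is not None:
            dist.all_reduce(p.grad.data, op=dist.ReduceOp.SUM)
            p.grad.data.div_(world_size)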
Example #10
def main(args, local_rank):

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    vocabs = dict()
    vocabs['src'] = Vocab(args.src_vocab, 0, [BOS, EOS])
    vocabs['tgt'] = Vocab(args.tgt_vocab, 0, [BOS, EOS])

    if args.world_size == 1 or (dist.get_rank() == 0):
        logger.info(args)
        for name in vocabs:
            logger.info("vocab %s, size %d, coverage %.3f", name,
                        vocabs[name].size, vocabs[name].coverage)

    set_seed(19940117)

    #device = torch.device('cpu')
    torch.cuda.set_device(local_rank)
    device = torch.device('cuda', local_rank)

    if args.arch == 'vanilla':
        model = Generator(vocabs, args.embed_dim, args.ff_embed_dim,
                          args.num_heads, args.dropout, args.enc_layers,
                          args.dec_layers, args.label_smoothing)
    elif args.arch == 'mem':
        model = MemGenerator(vocabs, args.embed_dim, args.ff_embed_dim,
                             args.num_heads, args.dropout, args.mem_dropout,
                             args.enc_layers, args.dec_layers,
                             args.mem_enc_layers, args.label_smoothing,
                             args.use_mem_score)
    elif args.arch == 'rg':
        logger.info("start building model")
        logger.info("building retriever")
        retriever = Retriever.from_pretrained(
            args.num_retriever_heads,
            vocabs,
            args.retriever,
            args.nprobe,
            args.topk,
            local_rank,
            use_response_encoder=(args.rebuild_every > 0))

        logger.info("building retriever + generator")
        model = RetrieverGenerator(vocabs, retriever, args.share_encoder,
                                   args.embed_dim, args.ff_embed_dim,
                                   args.num_heads, args.dropout,
                                   args.mem_dropout, args.enc_layers,
                                   args.dec_layers, args.mem_enc_layers,
                                   args.label_smoothing)

    global_step = 0
    if args.resume_ckpt:
        model.load_state_dict(torch.load(args.resume_ckpt)['model'])

    if args.world_size > 1:
        set_seed(19940117 + dist.get_rank())

    model = model.to(device)

    retriever_params = [
        v for k, v in model.named_parameters() if k.startswith('retriever.')
    ]
    other_params = [
        v for k, v in model.named_parameters()
        if not k.startswith('retriever.')
    ]

    optimizer = Adam([{
        'params': retriever_params,
        'lr': args.embed_dim**-0.5 * 0.1
    }, {
        'params': other_params,
        'lr': args.embed_dim**-0.5
    }],
                     betas=(0.9, 0.98),
                     eps=1e-9)
    lr_schedule = get_inverse_sqrt_schedule_with_warmup(
        optimizer, args.warmup_steps, args.total_train_steps)
    train_data = DataLoader(vocabs,
                            args.train_data,
                            args.per_gpu_train_batch_size,
                            for_train=True,
                            rank=local_rank,
                            num_replica=args.world_size)

    model.eval()
    #dev_data = DataLoader(vocabs, cur_dev_data, args.dev_batch_size, for_train=False)
    #bleu = validate(device, model, dev_data, beam_size=5, alpha=0.6, max_time_step=10)

    step, epoch = 0, 0
    tr_stat = Statistics()
    logger.info("start training")
    model.train()

    best_dev_bleu = 0.
    while global_step <= args.total_train_steps:
        for batch in train_data:
            #step_start = time.time()
            batch = move_to_device(batch, device)
            if args.arch == 'rg':
                loss, acc = model(
                    batch,
                    update_mem_bias=(global_step >
                                     args.update_retriever_after))
            else:
                loss, acc = model(batch)

            tr_stat.update({
                'loss': loss.item() * batch['tgt_num_tokens'],
                'tokens': batch['tgt_num_tokens'],
                'acc': acc
            })
            tr_stat.step()
            loss.backward()
            #step_cost = time.time() - step_start
            #print ('step_cost', step_cost)
            step += 1
            if not (step % args.gradient_accumulation_steps
                    == -1 % args.gradient_accumulation_steps):
                continue

            if args.world_size > 1:
                average_gradients(model)

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_schedule.step()
            optimizer.zero_grad()
            global_step += 1

            if args.world_size == 1 or (dist.get_rank() == 0):
                if global_step % args.print_every == -1 % args.print_every:
                    logger.info("epoch %d, step %d, loss %.3f, acc %.3f",
                                epoch, global_step,
                                tr_stat['loss'] / tr_stat['tokens'],
                                tr_stat['acc'] / tr_stat['tokens'])
                    tr_stat = Statistics()
                if global_step % args.eval_every == -1 % args.eval_every:
                    model.eval()
                    max_time_step = 256 if global_step > 2 * args.warmup_steps else 5
                    bleus = []
                    for cur_dev_data in args.dev_data:
                        dev_data = DataLoader(vocabs,
                                              cur_dev_data,
                                              args.dev_batch_size,
                                              for_train=False)
                        bleu = validate(device,
                                        model,
                                        dev_data,
                                        beam_size=5,
                                        alpha=0.6,
                                        max_time_step=max_time_step)
                        bleus.append(bleu)
                    bleu = sum(bleus) / len(bleus)
                    logger.info("epoch %d, step %d, dev bleu %.2f", epoch,
                                global_step, bleu)
                    if bleu > best_dev_bleu:
                        testbleus = []
                        for cur_test_data in args.test_data:
                            test_data = DataLoader(vocabs,
                                                   cur_test_data,
                                                   args.dev_batch_size,
                                                   for_train=False)
                            testbleu = validate(device,
                                                model,
                                                test_data,
                                                beam_size=5,
                                                alpha=0.6,
                                                max_time_step=max_time_step)
                            testbleus.append(testbleu)
                        testbleu = sum(testbleus) / len(testbleus)
                        logger.info("epoch %d, step %d, test bleu %.2f", epoch,
                                    global_step, testbleu)
                        torch.save({
                            'args': args,
                            'model': model.state_dict()
                        }, '%s/best.pt' % (args.ckpt, ))
                        if not args.only_save_best:
                            torch.save(
                                {
                                    'args': args,
                                    'model': model.state_dict()
                                },
                                '%s/epoch%d_batch%d_devbleu%.2f_testbleu%.2f' %
                                (args.ckpt, epoch, global_step, bleu,
                                 testbleu))
                        best_dev_bleu = bleu
                    model.train()

            if args.rebuild_every > 0 and (global_step % args.rebuild_every
                                           == -1 % args.rebuild_every):
                model.retriever.drop_index()
                torch.cuda.empty_cache()
                next_index_dir = '%s/batch%d' % (args.ckpt, global_step)
                if args.world_size == 1 or (dist.get_rank() == 0):
                    model.retriever.rebuild_index(next_index_dir)
                    dist.barrier()
                else:
                    dist.barrier()
                model.retriever.update_index(next_index_dir, args.nprobe)

            if global_step > args.total_train_steps:
                break
        epoch += 1
    logger.info('rank %d, finish training after %d steps', local_rank,
                global_step)
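
get_inverse_sqrt_schedule_with_warmup is a project-specific helper, not a stock one. Presumably it ramps the learning rate linearly over warmup_steps and then decays it as 1/sqrt(step); a sketch built on LambdaLR under that assumption (total_steps is accepted only for signature parity here):

from torch.optim.lr_scheduler import LambdaLR

def get_inverse_sqrt_schedule_with_warmup(optimizer, warmup_steps, total_steps):
    # Sketch: linear warmup, then inverse-square-root decay.
    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / max(1.0, float(warmup_steps))
        return (float(warmup_steps) / float(step)) ** 0.5
    return LambdaLR(optimizer, lr_lambda)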
Example #11
## Starting position
init_pos = (-7.0, 2.0)
params = {}
params['x'] = init_pos[0]
params['y'] = init_pos[1]

grads = {}
grads['x'] = 0
grads['y'] = 0

optimizers = OrderedDict()  # ordered dict
optimizers['SGD'] = SGD(lr=0.95)
optimizers['momentum'] = momentum(lr=0.1)
optimizers['adagrad'] = adagrad(lr=1.5)
optimizers['Adam'] = Adam(lr=0.3)

idx = 1  # subplot position for the figure
for key in optimizers.keys():  # try each optimizer in turn
    # reset the parameters for this optimizer
    optimizer = optimizers[key]
    params['x'] = init_pos[0]
    params['y'] = init_pos[1]
    x_history = []
    y_history = []

    # define the gradient source and run the descent loop
    for i in range(30):
        x_history.append(params['x'])
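        # The rest of this loop is not included in the snippet. In this style of
        # optimizer comparison the objective is commonly f(x, y) = x**2 / 20 + y**2;
        # assuming that function and a two-argument update(params, grads) convention
        # for these optimizer classes, the missing body would look roughly like:
        y_history.append(params['y'])
        grads['x'] = params['x'] / 10.0
        grads['y'] = 2.0 * params['y']
        optimizer.update(params, grads)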
Example #12
def foo(mod, op, d):
    if (op[0] == "linear"):
        xx = Linear(d)

    # rnncell, lstmcell, grucell
    elif (mod[0] in ["LSTMCell", "GRUCell"]) and (op[0] == "forward"):
        xx = RNNCell(d)

    elif op[0] in [
            "conv1d",
            "conv2d",
    ]:
        xx = Conv(d)

    elif (op[0] in Pointwise.ops):
        xx = Pointwise(d)

    elif (op[0] in Convert.ops):
        xx = Convert(d)

    elif op[0] in ["__matmul__", "matmul"]:
        xx = Matmul(d)

    elif op[0] == "embedding":
        xx = Embedding(d)

    #reduction
    elif op[0] == "sum":
        xx = Sum(d)

    elif op[0] == "mean":
        xx = Mean(d)

    elif op[0] == "norm":
        xx = Norm(d)

    elif op[0] == "dropout":
        xx = Dropout(d)

    #Index, Slice, Join, Mutate
    elif (op[0] == "cat"):
        xx = Cat(d)

    elif (op[0] == "reshape"):
        xx = Reshape(d)

    elif (op[0] == "masked_scatter_"):
        xx = MaskedScatter(d)

    elif (op[0] == "gather"):
        xx = Gather(d)

    elif (op[0] == "nonzero"):
        xx = Nonzero(d)

    elif (op[0] == "index_select"):
        xx = IndexSelect(d)

    elif (op[0] == "masked_select"):
        xx = MaskedSelect(d)

    #blas
    elif op[0] in ["addmm", "addmm_"]:
        xx = Addmm(d)

    elif op[0] == "mm":
        xx = Mm(d)

    elif op[0] == "bmm":
        xx = Bmm(d)

    #softmax
    elif op[0] == "softmax":
        xx = Softmax(d)

    elif op[0] == "log_softmax":
        xx = LogSoftmax(d)

    #loss
    elif op[0] == "mse_loss":
        xx = MSELoss(d)

    #optimizers
    elif op[0] == "adam":
        xx = Adam(d)

    #normalization
    elif op[0] == "batch_norm":
        xx = BatchNorm(d)

    #random
    elif op[0] == "randperm":
        xx = RandPerm(d)

    #misc
    elif op[0] == "copy_":
        xx = Copy(d)

    elif op[0] == "clone":
        xx = Clone(d)

    elif op[0] == "contiguous":
        xx = Contiguous(d)

    elif op[0] == "any":
        xx = Any(d)

    elif (op[0] in Activation.ops):
        xx = Activation(d)

    elif op[0] == "to":
        xx = Convert(d)

    else:
        xx = Foo(d)

    return xx
Example #13
         scores, deprocess(targets, n_bits))
     # multinomial sampling needs to be processed to [-1,1] at generation
     generate_fn = partial(pixelcnn.generate_fn,
                           preprocess_fn=preprocess,
                           n_bits=args.n_bits)
     optimizer = RMSprop(model.parameters(), lr=args.lr, polyak=args.polyak)
     scheduler = None
 elif args.model == 'pixelcnnpp':
     from generative_models_toolbox.algos.pixelcnn import pixelcnnpp
     model = pixelcnnpp.PixelCNNpp(args.image_dims, args.n_channels,
                                   args.n_res_layers, args.n_logistic_mix,
                                   args.n_cond_classes).to(args.device)
     loss_fn = pixelcnnpp.loss_fn
     generate_fn = pixelcnnpp.generate_fn
     optimizer = Adam(model.parameters(),
                      lr=args.lr,
                      betas=(0.95, 0.9995),
                      polyak=args.polyak)
     scheduler = torch.optim.lr_scheduler.ExponentialLR(
         optimizer, args.lr_decay)
 elif args.model == 'pixelsnail':
     from generative_models_toolbox.algos.pixelcnn import pixelsnail, pixelcnnpp
     model = pixelsnail.PixelSNAIL(args.image_dims, args.n_channels,
                                   args.n_res_layers, args.attn_n_layers,
                                   args.attn_nh, args.attn_dq, args.attn_dv,
                                   args.attn_drop_rate, args.n_logistic_mix,
                                   args.n_cond_classes).to(args.device)
     loss_fn = pixelcnnpp.loss_fn
     generate_fn = pixelcnnpp.generate_fn
     optimizer = Adam(model.parameters(),
                      lr=args.lr,
                      betas=(0.95, 0.9995),
Example #14
images = np.asarray(images)


img_viewer_examples(images, labels.tolist()[0], greyscale=True)


model = Model()
model.add(Dense(784, 90))
model.add(ReLU())
model.add(Dense(90, 45))
model.add(ReLU())
model.add(Dense(45, 10))


model.set_loss(CrossEntropyLoss())
optimizer = Adam(model.parameters(), learning_rate=0.01)
lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.1)


# weights path
path = "./checkpoints/Linear_MINST_weights.sav"
# model = load_weights(path)

epochs = 6
for epoch in range(epochs):
    i = 0
    for image, label in dataloader:
        if epoch == 5:
            model.graph()
        image = image/255
        i = i + 1
Example #15
def run(hparams,
        model,
        train_dataloader,
        valid_dataloader,
        device,
        out_dir='checkpoints'):
    learning_rate = hparams['learning_rate']
    accumulate_step = hparams['accumulate_step']
    lr_schedule = hparams['lr_schedule']
    warmup_steps = hparams['warmup_steps']
    warmup_proportion = hparams['warmup_proportion']
    n_embd = hparams['n_embd']
    num_optim_steps = hparams['num_optim_steps']
    train_batch_size = hparams['train_batch_size']
    valid_step = hparams['valid_step']
    no_token_id = hparams['no_token_id']

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    total_params = sum([np.prod(p.size()) for p in model_parameters])
    logger.info('Number of parameters = {}'.format(total_params))

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'ln']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = Adam(optimizer_grouped_parameters,
                     learning_rate,
                     max_grad_norm=1.0)

    step = 0
    global_step = 0
    epoch = 0

    while True:
        model.train()
        (tr_loss, tr_ppl, mean_ppl, nb_tr_examples,
         nb_tr_steps) = 0.0, 0.0, 0.0, 0, 0
        n_token_real, n_token_total = 0, 0
        pbar = tqdm.tqdm(enumerate(train_dataloader),
                         total=len(train_dataloader))

        for i, batch in pbar:
            batch = tuple(t.cuda() for t in batch)
            input_ids, position_ids, token_type_ids, label_ids, *_ = batch
            if no_token_id:
                token_type_ids = None
            loss, ppl = model(input_ids, position_ids, token_type_ids,
                              label_ids)
            loss = loss.mean()
            loss = loss / (train_batch_size / input_ids.shape[0])
            loss.backward()
            nb_tr_steps += 1
            tr_loss += float(
                loss.sum().item()) * (train_batch_size / input_ids.shape[0])

            if ppl.sum().item() < 1000000:
                tr_ppl += ppl.sum().item()
            else:
                tr_ppl += mean_ppl

            mean_loss = tr_loss / nb_tr_steps
            mean_ppl = tr_ppl / nb_tr_steps

            n_token_total += input_ids.shape[0] * input_ids.shape[1]
            n_token_real += (input_ids != 0).sum().item()

            #gradient update
            step += 1
            if step % accumulate_step == 0:
                set_lr(optimizer, global_step, lr_schedule, learning_rate,
                       warmup_steps, warmup_proportion, n_embd,
                       num_optim_steps)
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                print(
                    'epoch: {}, global_step: {}, step: {}, mean_loss: {}, mean_ppl:{}'
                    .format(epoch + 1, global_step + 1, step + 1, mean_loss,
                            mean_ppl),
                    file=train_logger)

                if global_step % valid_step == 0:
                    print('Saving model...')
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'epoch': epoch,
                            'hparams': hparams,
                        },
                        os.path.join(out_dir,
                                     f'GPT2-pretrain-step-{global_step}.pkl'))
                    eval_loss, eval_ppl = valid(model, valid_dataloader, epoch,
                                                device)
                    print('{},{},{},{},{}'.format(epoch + 1, global_step + 1,
                                                  step + 1, eval_loss,
                                                  eval_ppl),
                          file=valid_logger)
                    logger.info('current learning rate: ' +
                                str(optimizer.param_groups[0]['lr']))
                    model.train()
                if global_step >= num_optim_steps:
                    break
            if (step + 1) % CACHE_EMPTY_STEP == 0:
                torch.cuda.empty_cache()
        if global_step >= num_optim_steps:
            break
        epoch += 1
    train_logger.close()
    valid_logger.close()
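
set_lr is not shown. Given its arguments (the step count, a schedule name, warmup settings, n_embd and the total number of optimization steps), it presumably implements a warmup learning-rate schedule; the sketch below is an assumption, covering a noam-style decay and a plain linear warmup/decay fallback.

def set_lr(optimizer, step, schedule, lr, warmup_steps, warmup_proportion,
           n_embd, tot_steps):
    # Hedged sketch: the real helper may support other schedule names.
    step = max(1, step)
    if schedule == 'noam':
        scale = n_embd ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    else:
        warmup = warmup_steps or int(warmup_proportion * tot_steps)
        if step <= warmup:
            scale = step / warmup
        else:
            scale = max(0.0, (tot_steps - step) / max(1, tot_steps - warmup))
    for group in optimizer.param_groups:
        group['lr'] = lr * scale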
Example #16
            "Please install apex from https://www.github.com/nvidia/apex "
            "to use distributed and fp16 training.")

    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          bias_correction=False,
                          max_grad_norm=1.0)
    if args.loss_scale == 0:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True,
                                   verbose=False)
    else:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.loss_scale,
                                   verbose=False)
else:
    optimizer = Adam(optimizer_grouped_parameters, args.learning_rate,
                     max_grad_norm=1.0)

#########################################################################
# Training !
##########################################################################

if args.local_rank == -1 or get_rank() == 0:

    with open(join(log_dir, 'train_log.txt'), 'a+', buffering=1) as train_logger:
        print('epoch,global_step,step,mean_loss,mean_ppl,n_token_real,'
              'n_token_total,epoch_time', file=train_logger)
    with open(join(log_dir, 'eval_log.txt'), 'a+', buffering=1) as eval_logger:
        print('epoch,global_step,step,eval_loss,eval_ppl', file=eval_logger)

global_step = 0
step = 0
Example #17
        return ll
    
    def likelihood(x,y):
        out = model(x)
        #return -1*((y - out)**2).sum(1)
        return log_normal(y,out,zero+np.log(9)).sum(1)
    
    def lossf(x,y):
        ll = likelihood(x,y).sum() + prior() + bnn.params.merged_sampler.logdet 
        return -ll/float(n)
    
    

    
    L = 32
    adam = Adam(bnn.params.parameters(), 0.001)
    


    
    T = 2500
    x1, x2 = -6, 6
    y1, y2 = -100, 100
    for i in range(T):
        
        adam.zero_grad()
        bnn.params.sample()
        loss = lossf(X_,Y_)
        loss.backward()
        adam.step()