Example #1
def create_post(node):
    operation = request.GET.get("operation")
    uci_model = client.get_uci_config()
    parent = uci_model.find_child(node)
    if isinstance(parent, uci_raw.Section):
        if operation == "add-list":
            form = UciRawForm(uci_raw.List, editable_key=True)
            if form.validates(request.POST):
                new_element = form.to_model()
        elif operation == "add-option":
            form = UciRawForm(uci_raw.Option, editable_key=True)
            if form.validates(request.POST):
                new_element = form.to_model()
        else:
            raise ValueError(
                "Requested operation not allowed for Section node.")
    elif isinstance(parent, uci_raw.Config):
        form = UciRawForm(uci_raw.Section, editable_key=True)(request.POST)
        if form.validates(request.POST):
            new_element = form.to_model()
    elif isinstance(parent, uci_raw.List):
        form = UciRawForm(uci_raw.Value, editable_key=True)(request.POST)
        if form.validates(request.POST):
            new_element = form.to_model()
    else:
        raise ValueError("New node cannot be created here.")

    if not form.valid:
        return dict(node_path=node, form=form)

    new_element.operation = "create"
    parent.add(new_element)
    print_model(new_element)
    edit_uci_config(new_element)
    bottle.redirect(reverse("uci_index"))
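The print_model call above (and in the debug() view of Example #11 further down) renders a UCI model node as text for inspection; the helper itself is not reproduced on this page. A minimal sketch of such a dumper, assuming purely for illustration that every node exposes a name and an iterable children attribute, could look like:

# Hypothetical sketch only; the real foris helper may differ.
def print_model(node, indent=0):
    # render the node, then recurse into its children, one indent level per depth
    line = "%s%s" % ("  " * indent, getattr(node, "name", repr(node)))
    children = getattr(node, "children", []) or []
    return "\n".join([line] + [print_model(child, indent + 1) for child in children])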
Example #3
def mnist_utilizando_cnn_simples():
    (X_train, y_train), (X_test, y_test) = load_mnist_dataset('mnist.npz')
    # reshape to the format [instances][pixels][width][height]
    X_train = X_train.reshape(X_train.shape[0], 1, 28, 28).astype('float32')
    X_test = X_test.reshape(X_test.shape[0], 1, 28, 28).astype('float32')
    # normalize the pixel values from 0-255 to 0-1
    X_train = X_train / 255
    X_test = X_test / 255
    # convert the integer label into binary categories (one-hot); the value maps to its position,
    # e.g. 5 becomes the list [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
    y_train = np_utils.to_categorical(y_train)
    y_test = np_utils.to_categorical(y_test)
    num_classes = y_test.shape[1]
    # define the network topology and compile it
    model = create_compile_model_cnn_simples(num_classes)
    utils.print_model(model, "model_simples.png")
    # train the network
    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_test, y_test),
                        epochs=10,
                        batch_size=100,
                        verbose=2)
    #print_history_accuracy(history)
    utils.print_history_loss(history)
    # final evaluation on the test cases
    scores = model.evaluate(X_test, y_test, verbose=0)
    print('Scores: ', scores)
    print("Erro modelo MLP: %.2f%%" % (100 - scores[1] * 100))
def ciclo_completo():
    (input_attributes,
     output_attributes) = read_cvs_dataset("pima-indians-diabetes.csv", 8)
    model = create_model()
    utils.print_model(model, "model_MLP.png")
    compile_model(model)
    history = fit_model(model, input_attributes, output_attributes)
    utils.print_history_loss(history)
    model_evaluate(model, input_attributes, output_attributes)
    model_print_predictions(model, input_attributes, output_attributes)
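The Keras examples on this page log the architecture with utils.print_model(model, "<file>.png") and plot the loss curve with utils.print_history_loss(history). The utils module is not shown here; a plausible minimal version, assuming it simply wraps model.summary(), keras.utils.plot_model and matplotlib (an assumption, not the original code), would be:

# Hedged sketch of the Keras-side helpers used above; the real utils module is not on this page.
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt

def print_model(model, fich):
    # print a textual summary and write the architecture diagram to a file
    model.summary()
    plot_model(model, to_file=fich, show_shapes=True)

def print_history_loss(history):
    # plot the training/validation loss collected by model.fit()
    plt.plot(history.history['loss'])
    plt.plot(history.history.get('val_loss', []))
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper right')
    plt.show()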
Example #5
def train(model,
          v_emb,
          q_emb,
          groundtruth,
          num_epochs,
          output,
          opt=None,
          s_epoch=0):
    lr_default = 1e-3 * 0.5
    lr_decay_step = 2
    lr_decay_rate = .25
    lr_decay_epochs = range(10, 20, lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default
    ]
    saving_epoch = 3
    grad_clip = .25

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \
        (lr_default, lr_decay_step, lr_decay_rate, grad_clip))
    v_emb = v_emb.cuda()
    q_emb = q_emb.cuda()

    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = 0

        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' %
                         optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])

        gw = model(v_emb, q_emb)

        loss = gw
        print(loss)
        loss.backward()
        optim.step()
        optim.zero_grad()
def ciclo_ler_dataset_treinar_gravar():
    (input_attributes,
     output_attributes) = read_cvs_dataset("pima-indians-diabetes.csv", 8)
    model = create_model()
    utils.print_model(model, "model2.png")
    compile_model(model)
    history = fit_model(model, input_attributes, output_attributes)
    utils.print_history_accuracy(history)
    utils.print_history_loss(history)
    model_evaluate(model, input_attributes, output_attributes)
    utils.save_model_json(model, "model.json")
    utils.save_weights_hdf5(model, "model.h5")
    return (input_attributes, output_attributes)
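ciclo_ler_dataset_treinar_gravar ends by saving the topology with utils.save_model_json and the weights with utils.save_weights_hdf5. Assuming these are thin wrappers over the standard Keras serialization API (the originals are not on this page), they could be sketched, together with a matching loader, as:

# Hedged sketch of the persistence helpers assumed above.
from tensorflow.keras.models import model_from_json

def save_model_json(model, fich):
    with open(fich, "w") as f:
        f.write(model.to_json())          # architecture only

def save_weights_hdf5(model, fich):
    model.save_weights(fich)              # weights only

def load_model_json(fich):
    with open(fich) as f:
        return model_from_json(f.read())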
Example #7
def train_foil(model, train_loader, eval_loader, num_epochs, output, lr):
    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=lr)
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_accuracy = 0
    utils.print_model(model, logger)
    logger.write('optim: adam lr=%.4f' % lr)
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        total_loss = 0
        train_score = 0
        t = time.time()
        N = len(train_loader.dataset)
        bar = progressbar.ProgressBar(max_value=N)
        idx = 0
        for i, (v, b, q, a) in enumerate(train_loader):
            model.train(True)
            bar.update(idx)
            batch_size = v.size(0)
            v = Variable(v).cuda()
            b = Variable(b).cuda()
            q = Variable(q).cuda()
            a = Variable(a).cuda()
            idx += batch_size
            pred, att = model(v, b, q, a)
            loss = instance_bce_with_logits(pred, a)
            optim.zero_grad()
            loss.backward()
            optim.step()
            batch_score = compute_accuracy_with_logits(pred, a.data)
            total_loss += loss.data[0] * v.size(0)
            train_score += batch_score
        bar.update(idx)
        total_loss /= N
        train_score = 100 * train_score / N
        if eval_loader is not None:
            model.train(False)
            eval_score = evaluate_foil(model, eval_loader)
            model.train(True)
        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, score: %.2f' %
                     (total_loss, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f' % (100 * eval_score))
        if eval_loader is not None and eval_score > best_eval_accuracy:
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_accuracy = eval_score
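train_foil, like most of the VQA loops below, leans on small helpers for the loss and the soft accuracy over logits that live elsewhere in those projects. A common formulation, given here as an assumption rather than the exact project code, is:

# Hedged sketch of the loss/score helpers used by the VQA train loops on this page.
import torch
import torch.nn.functional as F

def instance_bce_with_logits(logits, labels):
    # multi-label BCE, rescaled by the number of answer candidates
    loss = F.binary_cross_entropy_with_logits(logits, labels)
    return loss * labels.size(1)

def compute_score_with_logits(logits, labels):
    # soft VQA accuracy: credit of the ground-truth vector at the argmax answer
    pred = torch.max(logits, 1)[1]          # argmax over answers
    one_hots = torch.zeros_like(labels)
    one_hots.scatter_(1, pred.view(-1, 1), 1)
    return one_hots * labels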
Example #8
def main_quantified_TransE():
    # Define some hyper-parameters for training
    emb_dim = 100
    lr = 0.0004
    margin = 0.5
    n_epochs = 1000
    batch_size = 2097152

    # Load dataset
    data_path = "/tmp/pycharm_project_583/data/uncmtrd/agg6_202005_ALL_tv.csv"

    kg_train, kg_val, kg_test = load_custom_qr(data_path=data_path)

    model = TransEQuantifiedRelations(
        emb_dim, kg_train.n_ent, kg_train.n_rel, dissimilarity_type="L2"
    )

    print_model(model) # check we only have two embedding layers - one for entity, the other for relations

    dataset_name = data_path.split('/')[-1].replace('.csv', '')
    curr_time = datetime.now().strftime('%Y%m%d%H%M%S')
    model_prefix = os.path.join('./pretrained', f'{dataset_name}_emb{emb_dim}_lr{lr}_mgn{margin}_epch{n_epochs}_bsize{batch_size}_t{curr_time}')

    criterion = MarginLoss(margin)
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    trainer = Trainer(
        model,
        criterion,
        kg_train,
        n_epochs,
        batch_size,
        optimizer=optimizer,
        sampling_type="bern",
        use_cuda=None,
    )

    trainer.run(kg_test=kg_test, model_prefix=model_prefix)
def LSTM_sales_data(normalizer=None):
	df, scaler = get_data(normalizer=normalizer)
	print("Dataset: ", df.shape)

	janela = 6 # size of the sliding window (quarterly, monthly, half-yearly)
	X_train, y_train, X_test, y_test = split_data(df, janela)
	print("X_train", X_train.shape)
	print("y_train", y_train.shape)
	print("X_test", X_test.shape)
	print("y_test", y_test.shape)

	model = build_model(janela)

	model.fit(X_train, y_train, batch_size=10, epochs=300, validation_split=0.1, verbose=1) # validation uses 0.1 of the 0.66 reserved for training
	
	utils.print_model(model,"lstm_model.png")

	trainScore = model.evaluate(X_train, y_train, verbose=0)
	print('\n Train Score: %.2f MSE (%.2f RMSE)' % (trainScore[0], math.sqrt(trainScore[0])))
	testScore = model.evaluate(X_test, y_test, verbose=0)
	print(' Test Score: %.2f MSE (%.2f RMSE)' % (testScore[0], math.sqrt(testScore[0])))
	print('\n****************** UNSCALED*******************')

	# Unscale Results to get real value predictions and error
	trainScore = trainScore[0].reshape(-1, 1).astype('float32')
	unscaled_Train = scaler.inverse_transform(trainScore)
	print('\n Unscaled Train Score: %.2f MSE (%.2f RMSE)' % (unscaled_Train, math.sqrt(unscaled_Train)))
	testScore = testScore[0].reshape(-1, 1).astype('float32')
	unscaled_Test = scaler.inverse_transform(testScore)
	print(' Unscaled Test Score: %.2f MSE (%.2f RMSE) \n' % (unscaled_Test, math.sqrt(unscaled_Test)))

	p = model.predict(X_test)
	predic = np.squeeze(np.asarray(p)) # convert a matrix with one column and n rows into an np array of n elements
	
	print_series_prediction(y_test,predic)
	print('')
	print_series_prediction(y_test,predic, normalizer=scaler)
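The LSTM example finishes by comparing the test series against its predictions with print_series_prediction, which is not included here. A minimal plotting sketch with the same call shape (an assumption, not the original helper) might be:

# Hypothetical sketch of print_series_prediction as used above.
import numpy as np
import matplotlib.pyplot as plt

def print_series_prediction(y_test, predic, normalizer=None):
    y = np.asarray(y_test).reshape(-1, 1)
    p = np.asarray(predic).reshape(-1, 1)
    if normalizer is not None:
        # bring both series back to the original scale
        y = normalizer.inverse_transform(y)
        p = normalizer.inverse_transform(p)
    plt.plot(y, label='real')
    plt.plot(p, label='predicted')
    plt.legend()
    plt.show()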
Example #10
def mnist_utilizando_mlp():
    (X_train, y_train), (X_test, y_test) = load_mnist_dataset('mnist.npz')

    # flatten the 28*28 image matrix into a vector of 784 attributes per image (because it is a multilayer perceptron)
    num_pixels = X_train.shape[1] * X_train.shape[2]
    X_train = X_train.reshape(X_train.shape[0], num_pixels).astype('float32')
    X_test = X_test.reshape(X_test.shape[0], num_pixels).astype('float32')

    # normalize the pixel values from 0-255 to 0-1
    X_train = X_train / 255
    X_test = X_test / 255

    # convert the integer label into binary categories (one-hot); the value maps to its position,
    # e.g. 5 becomes the list [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
    y_train = np_utils.to_categorical(y_train)
    y_test = np_utils.to_categorical(y_test)
    num_classes = y_test.shape[1]

    # define the network topology and compile it
    model = create_compile_model_mlp(num_pixels, num_classes)
    utils.print_model(model, "model.png")

    # train the network
    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_test, y_test),
                        epochs=10,
                        batch_size=50,
                        verbose=2)
    #utils.print_history_accuracy(history)
    utils.print_history_loss(history)

    # final evaluation on the test cases
    scores = model.evaluate(X_test, y_test, verbose=0)
    print('Scores: ', scores)
    print("Erro modelo MLP: %.2f%%" % (100 - scores[1] * 100))
Example #11
File: uci.py Project: jtojnar/foris
def debug(node):
    uci_model = client.get_uci_config()
    node_model = uci_model.find_child(node)
    return "<pre>%s</pre>" % websafe(print_model(node_model))
Example #12
def train(model,
          train_loader,
          eval_loader,
          num_epochs,
          output,
          opt=None,
          s_epoch=0,
          logger=None,
          save_one_ckpt=True):
    lr_default = 1e-3 if eval_loader is not None else 7e-4
    lr_decay_step = 2
    lr_decay_rate = .25
    lr_decay_epochs = range(
        10, 20, lr_decay_step) if eval_loader is not None else range(
            10, 20, lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default
    ]
    saving_epoch = 3
    grad_clip = .25
    dset = train_loader.dataset

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    if logger is None:
        logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \
        (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    model_path = os.path.join(output, 'model_epoch-1.pth')

    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        train_zcore = 0
        total_norm = 0
        count_norm = 0
        n_answer_type = torch.zeros(len(dset.idx2type))
        score_answer_type = torch.zeros(len(dset.idx2type))
        t = time.time()
        N = len(train_loader.dataset)
        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' %
                         optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])

        for i, (v, b, q, a, c, at) in enumerate(train_loader):
            v = v.cuda()
            b = b.cuda()
            q = q.cuda()
            a = a.cuda()
            c = c.cuda().unsqueeze(-1).float()
            at = at.cuda()
            answer_type = torch.zeros(v.size(0), len(dset.idx2type)).cuda()
            answer_type.scatter_(1, at.unsqueeze(1), 1)

            pred, conf, att = model(v, b, q, a, c)
            loss = instance_bce_with_logits(pred, a)
            loss.backward(retain_graph=True)
            losz = instance_bce_with_logits(conf, c)
            losz.backward()
            total_norm += nn.utils.clip_grad_norm_(model.parameters(),
                                                   grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()

            batch_score = compute_score_with_logits(pred, a.data)
            type_score = batch_score.sum(-1, keepdim=True) * answer_type
            batch_score = batch_score.sum()

            total_loss += loss.item() * v.size(0)
            train_score += batch_score.item()

            batch_zcore = compute_zcore_with_logits(conf, c.data).sum()
            train_zcore += batch_zcore.item()

            n_answer_type += answer_type.sum(0).cpu()
            score_answer_type += type_score.sum(0).cpu()

        total_loss /= N
        train_score = 100 * train_score / N
        train_zcore = 100 * train_zcore / N
        if None != eval_loader:
            model.train(False)
            eval_score, eval_zcore, bound, entropy, val_n_answer_type, val_score_answer_type = evaluate(
                model, eval_loader)
            model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write(
            '\ttrain_loss: %.2f, norm: %.4f, score: %.2f, confidence: %.2f' %
            (total_loss, total_norm / count_norm, train_score, train_zcore))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' %
                         (100 * eval_score, 100 * bound))
            logger.write('\tconfidence: %.2f (%.2f)' % (100 * eval_zcore, 100))

        if eval_loader is not None and entropy is not None:
            info = ''
            for i in range(entropy.size(0)):
                info = info + ' %.2f' % entropy[i]
            logger.write('\tentropy: ' + info)

        if (eval_loader is not None and eval_score > best_eval_score) or (
                eval_loader is None and epoch >= saving_epoch):
            if save_one_ckpt and os.path.exists(model_path):
                os.remove(model_path)
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, optim)
            best_type = val_score_answer_type
            if eval_loader is not None:
                best_eval_score = eval_score
    return best_eval_score, bound, n_answer_type, val_n_answer_type, score_answer_type / n_answer_type, best_type / val_n_answer_type
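Every PyTorch train() on this page opens a utils.Logger on output/log.txt and calls utils.print_model(model, logger) before the first epoch. Those utilities belong to the respective projects; a compatible sketch (assumed, not copied from them) is:

# Hedged sketch of the Logger / print_model pair assumed by the train() loops.
class Logger:
    def __init__(self, path):
        self.log_file = open(path, 'a')

    def write(self, msg):
        # echo to stdout and append to the log file
        print(msg)
        self.log_file.write(msg + '\n')
        self.log_file.flush()

def print_model(model, logger):
    logger.write(str(model))
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.write('number of trainable parameters: %d' % n_params)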
Example #13
def train(model,
          train_loader,
          eval_loader,
          num_epochs,
          output,
          opt=None,
          s_epoch=0):
    lr_default = 1e-3 if eval_loader is not None else 7e-4
    lr_decay_step = 2
    lr_decay_rate = 1
    lr_decay_epochs = range(
        10, 20, lr_decay_step) if eval_loader is not None else range(
            10, 20, lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default
    ]
    saving_epoch = 3
    grad_clip = .25

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \
        (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    woman = 0
    woman_true = 0
    woman_man = 0
    woman_other = 0
    man = 0
    man_true = 0
    man_woman = 0
    man_other = 0
    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = len(train_loader.dataset)
        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' %
                         optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] = 1e-3
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])

        import pickle as pkl
        from PIL import Image, ImageDraw
        lab2ans = pkl.load(open("./data/cache/trainval_label2ans.pkl", 'rb'))
        '''
        for i, (v, b, q, a,ques,im,g,gender) in enumerate(train_loader):
          
            v = v.cuda()
            b = b.cuda()
            q = q.cuda()
            a = a.cuda()

            visual_pred, att = model(v, b, q, a)
           
      
            loss = instance_bce_with_logits(visual_pred, a)
            loss.backward()
            
            total_norm += nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()

            batch_score = compute_score_with_logits(visual_pred, a.data).sum()
            total_loss += loss.item() * v.size(0)
            train_score += batch_score.item()
            
        '''
        total_loss /= N
        train_score = 100 * train_score / N

        if None != eval_loader:
            model.train(False)
            eval_score, bound, _ = evaluate(model, eval_loader)
            model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))

        logger.write('\teval score: %.2f (%.2f)' %
                     (100 * eval_score, 100 * bound))

        if (eval_loader is not None and eval_score > best_eval_score) or (
                eval_loader is None and epoch >= saving_epoch):
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_score = eval_score
Example #14
def train(model,
          train_loader,
          eval_loader,
          num_epochs,
          output,
          opt=None,
          s_epoch=0):
    lr_default = 1e-3 if eval_loader is not None else 7e-4
    lr_decay_step = 2
    lr_decay_rate = 1
    lr_decay_epochs = range(
        10, 20, lr_decay_step) if eval_loader is not None else range(
            10, 20, lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default
    ]
    saving_epoch = 3
    grad_clip = .25

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \
        (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = len(train_loader.dataset)
        print(N)
        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' %
                         optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] = 1e-3
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])
        '''
        for i, (v, b, q, a,ques,im,g,gender) in enumerate(train_loader):
            
            v = v.cuda()
            b = b.cuda()
            q = q.cuda()
            a = a.cuda()

            visual_pred, att = model(v, b, q, a)
            
            #import pdb;pdb.set_trace()
            gender=gender.squeeze(1)
            weights=torch.Tensor([2.0,1.0,0.0001]).cuda()
            #loss = instance_bce_with_logits(visual_pred, g.cuda())
            loss=nn.CrossEntropyLoss(weights)
            loss=loss(visual_pred,gender.cuda())
            #import pdb;pdb.set_trace()
            loss.backward()
            total_norm += nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()
           
            batch_score=torch.eq(visual_pred.argmax(1),gender.cuda()).sum()
            #batch_score = compute_score_with_logits(visual_pred, g.cuda()).sum()
            #total_loss += loss.item() * v.size(0)
            train_score += batch_score.item()
            #train_score+=batch_score
        '''
        total_loss /= N
        train_score = 100 * train_score / N

        if None != eval_loader:
            model.train(False)
            eval_score, bound, _ = evaluate(model, eval_loader)
            model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))

        logger.write('\teval score: %.2f (%.2f)' %
                     (100 * eval_score, 100 * bound))

        model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
        utils.save_model(model_path, model, epoch, optim)
def train(model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0):
    lr_default = 1e-3 if eval_loader is not None else 7e-4
    lr_decay_step = 2
    lr_decay_rate = 0.01
    lr_decay_epochs = range(16,50,lr_decay_step) if eval_loader is not None else range(10,20,lr_decay_step)
    gradual_warmup_steps = [0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default]
    saving_epoch = 3
    grad_clip = .25

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \
        (lr_default, lr_decay_step, lr_decay_rate, grad_clip))
    
    
    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        train_score_vqa=0
        total_norm = 0
        count_norm = 0
        total_fair_loss=0
        total_dis_loss=0
        woman=0
        woman_o=0
        man=0
        man_o=0
        other=0
        other_o=0
        t = time.time()
        N = len(train_loader.dataset)
        print(N)
        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] =optim.param_groups[0]['lr']*lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])
  
        for name,subnet in model.named_children():
            if name=='w_emb' or name=='q_emb' or name=='q_att' or name=='v_att' or name=='v_net' or name=='q_net' or name=='classifier2':
                print(name)
            
                for param in subnet.parameters():
                    param.requires_grad=False
       
    
        for i, (v, b, q, a,ques,im,g,gender) in enumerate(train_loader):
            
            v = v.cuda()
            b = b.cuda()
            q = q.cuda()
            a = a.cuda()
            
            visual_pred, vqa_pred,att = model(v, b, q, a)
            
            #import pdb;pdb.set_trace()
            gender=gender.squeeze(1)
            weights=torch.Tensor([2.0,1.0,0.001]).cuda()
            vqa_loss = instance_bce_with_logits(vqa_pred, a)
            loss=nn.CrossEntropyLoss(weights)
            loss=loss(visual_pred,gender.cuda())
            #dis_loss=torch.abs(visual_pred[:,0]-visual_pred[:,1]).mean()
            #dis_loss=dis_loss.cuda()
            if epoch < 12:
                t_loss=vqa_loss
            else:    
                t_loss=loss+vqa_loss
            t_loss.backward()
            #import pdb;pdb.set_trace()
            #vp=visual_pred[:,:2].cuda()
            #g=g[:,:2]
            #crossloss=instance_bce_with_logits(vp,g.cuda())
            
            #mseloss=torch.nn.functional.mse_loss(vp.softmax(1),g.cuda())
            #g_swap=g[:,[1,0]].cuda()
            
            #swap_loss=(vp.softmax(1)*g_swap).sum(1)
            #swap_loss=swap_loss.sum()
           
            
            for j in range(len(v)):
            
                if gender[j]==0:
                    woman=woman+1
                    
                    #if visual_pred[j].argmax()==0 or visual_pred[j].argmax()==1:
                    if visual_pred[j].argmax()==gender[j].cuda():
                        woman_o=woman_o+1
            

                elif gender[j]==1:
                    #if visual_pred[j].argmax()==0 or visual_pred[j].argmax()==1:
                    man=man+1
                    
                    if visual_pred[j].argmax()==gender[j].cuda():
                        man_o=man_o+1
                else:
                    other=other+1

                    if visual_pred[j].argmax()==gender[j].cuda():
                        other_o=other_o+1
                   
            
            total_norm += nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()
            #total_fair_loss+=soft_fair_loss
            #total_dis_loss+=dis_loss
            batch_score=torch.eq(visual_pred.argmax(1),gender.cuda()).sum()
            batch_score_vqa = compute_score_with_logits(vqa_pred, a.data).sum()

            #batch_score = compute_score_with_logits(visual_pred, g.cuda()).sum()
            #total_loss += loss.item() * v.size(0)
            train_score += batch_score.item()
            train_score_vqa+=batch_score_vqa.item()
            #train_score+=batch_score
            
            if i==0:
                print(loss)
                #print(10*soft_fair_loss)
                print("\n\n")
        total_loss /= N
        train_score = 100 * train_score / N
        train_score_vqa = 100 * train_score_vqa / N
        
        print("epoch",epoch)
        woman_score=float(woman_o)/woman
        man_score=float(man_o)/man
        other_score=float(other_o)/other
        print("woman",woman)
        print("man",man)
        print("other",other)
        print("train_woman_score",woman_score*100)
        print("train_man_score",man_score*100)
        print("train_other_score",other_score*100)
        print("vqa",train_score_vqa)
     
        if None != eval_loader:
            model.train(False)
            eval_score, bound, _ = evaluate(model, eval_loader)
            model.train(True)
        #print("total_fair_loss",total_fair_loss)
        #print("totla_dis_loss",total_dis_loss)
        logger.write('epoch %d, time: %.2f' % (epoch, time.time()-t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, train_score))
        #logger.write('\total_fair_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, total_fair_loss))
        
        logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound))

        
        
        
        model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
        utils.save_model(model_path, model, epoch, optim)
Example #16
def train(model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0):
    lr_default = 1e-3 if eval_loader is not None else 7e-4
    lr_decay_step = 2
    lr_decay_rate = .25
    lr_decay_epochs = range(10,20,lr_decay_step) if eval_loader is not None else range(10,20,lr_decay_step)
    gradual_warmup_steps = [0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default]
    saving_epoch = 3
    grad_clip = .25

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \
        (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = len(train_loader.dataset)
        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])

        for i, (v, b, q, a) in enumerate(train_loader):
            v = Variable(v).cuda()
            b = Variable(b).cuda()
            q = Variable(q).cuda()
            a = Variable(a).cuda()

            pred, att = model(v, b, q, a)
            loss = instance_bce_with_logits(pred, a)
            loss.backward()
            total_norm += nn.utils.clip_grad_norm(model.parameters(), grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()

            batch_score = compute_score_with_logits(pred, a.data).sum()
            total_loss += loss.data[0] * v.size(0)
            train_score += batch_score

        total_loss /= N
        train_score = 100 * train_score / N
        if None != eval_loader:
            model.train(False)
            eval_score, bound, entropy = evaluate(model, eval_loader)
            model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time()-t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound))

        if eval_loader is not None and entropy is not None:
            info = ''
            for i in range(entropy.size(0)):
                info = info + ' %.2f' % entropy[i]
            logger.write('\tentropy: ' + info)

        if (eval_loader is not None and eval_score > best_eval_score) or (eval_loader is None and epoch >= saving_epoch):
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_score = eval_score
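When an epoch improves the best evaluation score, the loops above checkpoint through utils.save_model(model_path, model, epoch, optim). A sketch consistent with that call signature (an assumption about the projects' helper) is:

# Hedged sketch of the checkpointing helper assumed by the loops above.
import torch

def save_model(path, model, epoch, optimizer=None):
    state = {'epoch': epoch, 'model_state': model.state_dict()}
    if optimizer is not None:
        state['optimizer_state'] = optimizer.state_dict()
    torch.save(state, path)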
Example #17
def train(model, train_loader, eval_loader, args, device=torch.device("cuda")):
    N = len(train_loader.dataset)
    lr_default = args.base_lr
    num_epochs = args.epochs
    lr_decay_epochs = range(args.lr_decay_start, num_epochs,
                            args.lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default
    ]

    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=lr_default,
                               betas=(0.9, 0.999),
                               eps=1e-8,
                               weight_decay=args.weight_decay)

    logger = utils.Logger(os.path.join(args.output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f,' %
                 (lr_default, args.lr_decay_step, args.lr_decay_rate) +
                 'grad_clip=%.2f' % args.grad_clip)
    logger.write('LR decay epochs: ' +
                 ','.join([str(i) for i in lr_decay_epochs]))
    last_eval_score, eval_score = 0, 0
    relation_type = train_loader.dataset.relation_type

    for epoch in range(0, num_epochs):
        pbar = tqdm(total=len(train_loader))
        total_norm, count_norm = 0, 0
        total_loss, train_score = 0, 0
        count, average_loss, att_entropy = 0, 0, 0
        t = time.time()
        if epoch < len(gradual_warmup_steps):
            for i in range(len(optim.param_groups)):
                optim.param_groups[i]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' %
                         optim.param_groups[-1]['lr'])
        elif (epoch in lr_decay_epochs
              or eval_score < last_eval_score and args.lr_decay_based_on_val):
            for i in range(len(optim.param_groups)):
                optim.param_groups[i]['lr'] *= args.lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[-1]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[-1]['lr'])
        last_eval_score = eval_score

        mini_batch_count = 0
        batch_multiplier = args.grad_accu_steps
        for i, (v, norm_bb, q, q_target, target, _, _, bb, spa_adj_matrix,
                sem_adj_matrix) in enumerate(train_loader):
            batch_size = v.size(0)
            num_objects = v.size(1)
            if mini_batch_count == 0:
                optim.step()
                optim.zero_grad()
                mini_batch_count = batch_multiplier

            ### Debugging ###
            # with autograd.detect_anomaly():
            v = Variable(v).to(device)
            norm_bb = Variable(norm_bb).to(device)
            q = Variable(q).to(device)
            q_target = Variable(q_target).to(device)
            target = Variable(target).to(device)
            pos_emb, sem_adj_matrix, spa_adj_matrix = prepare_graph_variables(
                relation_type, bb, sem_adj_matrix, spa_adj_matrix, num_objects,
                args.nongt_dim, args.imp_pos_emb_dim, args.spa_label_num,
                args.sem_label_num, device)
            q_type, pred, att = model(v, norm_bb, q, pos_emb, sem_adj_matrix,
                                      spa_adj_matrix, target)
            loss = instance_bce_with_logits(
                pred, target) + instance_bce_with_logits(q_type, q_target)

            loss /= batch_multiplier
            loss.backward()

            mini_batch_count -= 1
            total_norm += nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip)
            count_norm += 1
            batch_score = compute_score_with_logits(pred, target, device).sum()
            total_loss += loss.data.item() * batch_multiplier * v.size(0)
            train_score += batch_score
            pbar.update(1)

            if args.log_interval > 0:
                average_loss += loss.data.item() * batch_multiplier
                if model.module.fusion == "ban":
                    current_att_entropy = torch.sum(calc_entropy(att.data))
                    att_entropy += current_att_entropy / batch_size / att.size(
                        1)
                count += 1
                if i % args.log_interval == 0:
                    att_entropy /= count
                    average_loss /= count
                    print(
                        "step {} / {} (epoch {}), ave_loss {:.3f},".format(
                            i, len(train_loader), epoch, average_loss),
                        "att_entropy {:.3f}".format(att_entropy))
                    average_loss = 0
                    count = 0
                    att_entropy = 0

        total_loss /= N
        train_score = 100 * train_score / N
        if eval_loader is not None:
            eval_score, bound, entropy = evaluate(model, eval_loader, device,
                                                  args)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' %
                         (100 * eval_score, 100 * bound))

            if entropy is not None:
                info = ''
                for i in range(entropy.size(0)):
                    info = info + ' %.2f' % entropy[i]
                logger.write('\tentropy: ' + info)
        if (eval_loader is not None)\
           or (eval_loader is None and epoch >= args.saving_epoch):
            logger.write("saving current model weights to folder")
            model_path = os.path.join(args.output, 'model_%d.pth' % epoch)
            opt = optim if args.save_optim else None
            utils.save_model(model_path, model, epoch, opt)
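Example #17 spreads each optimizer step over args.grad_accu_steps mini-batches: the loss is divided by batch_multiplier and optim.step()/optim.zero_grad() only run when the countdown reaches zero. Stripped of the VQA specifics, the pattern looks roughly like this (generic sketch, not the project's code):

# Generic gradient-accumulation sketch mirroring the mini_batch_count logic above.
def train_with_grad_accumulation(model, train_loader, criterion, optim, accu_steps):
    countdown = 0
    for x, y in train_loader:
        if countdown == 0:
            optim.step()        # flush gradients accumulated over the last accu_steps batches
            optim.zero_grad()
            countdown = accu_steps
        loss = criterion(model(x), y) / accu_steps   # keep the effective step size constant
        loss.backward()
        countdown -= 1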
Example #18
def main():
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    if args.dataset == 'cifar10':
        dataloader = datasets.CIFAR10
        num_classes = 10
    else:
        dataloader = datasets.CIFAR100
        num_classes = 100

    trainset = dataloader(root='./data',
                          train=True,
                          download=True,
                          transform=transform_train)
    trainloader = data.DataLoader(trainset,
                                  batch_size=args.train_batch,
                                  shuffle=True,
                                  num_workers=args.workers)

    testset = dataloader(root='./data',
                         train=False,
                         download=False,
                         transform=transform_test)
    testloader = data.DataLoader(testset,
                                 batch_size=args.test_batch,
                                 shuffle=False,
                                 num_workers=args.workers)

    # Model
    print("==> creating model '{}'".format(args.arch))
    if args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            cardinality=args.cardinality,
            num_classes=num_classes,
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif args.arch.startswith('densenet'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            growthRate=args.growthRate,
            compressionRate=args.compressionRate,
            dropRate=args.drop,
        )
    elif args.arch.startswith('wrn'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif args.arch.endswith('resnet'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            block_name=args.block_name,
        )
    else:
        model = models.__dict__[args.arch](num_classes=num_classes)

    print("Geometric LR: {}".format(args.geo_lr))

    model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))
    criterion = nn.CrossEntropyLoss()

    param_lr = GradientRatioScheduler.get_params_base_lr(model, args.lr)
    optimizer = optim.SGD(param_lr,
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    scheduler = GradientRatioScheduler(optimizer)

    print_model(model)
    input("Cont?")

    # Resume
    title = 'cifar-10-' + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title=title,
                        resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names([
            'Epoch', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.',
            'Time', 'Learning Rate'
        ])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(testloader, model, criterion, start_epoch,
                                   use_cuda)
        print(' Test Loss:  %.8f, Test Acc:  %.2f' % (test_loss, test_acc))
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(scheduler, epoch)

        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.epochs, max(scheduler.get_lr())))

        st = time.time()
        train_loss, train_acc = train(trainloader, model, criterion, optimizer,
                                      scheduler, epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   use_cuda)

        # append logger file
        logger.append([
            epoch, train_loss, test_loss, train_acc, test_acc,
            time.time() - st,
            scheduler.get_lr()
        ])

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        if args.save_checkpoint_model:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'acc': test_acc,
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                checkpoint=args.checkpoint)

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))

    print('Best acc:')
    print(best_acc)
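Example #18 persists checkpoints through save_checkpoint(state, is_best, checkpoint=...). Assuming the usual torch.save plus copy-best behaviour (the helper is not shown on this page), it could be:

# Hedged sketch of save_checkpoint as used in Example #18; assumed behaviour.
import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint='checkpoint', filename='checkpoint.pth.tar'):
    filepath = os.path.join(checkpoint, filename)
    torch.save(state, filepath)
    if is_best:
        # keep a separate copy of the best-performing checkpoint
        shutil.copyfile(filepath, os.path.join(checkpoint, 'model_best.pth.tar'))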
Example #20
def train(model, train_loader, eval_loader,output):
    lr_default=0.01
    grad_clip = .25
    epoch=0
    i_iter=0
    max_iter=45071
    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default)
    scheduler = get_optim_scheduler(optim)
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0
    utils.print_model(model, logger)

    while i_iter<max_iter:
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        epoch=epoch+1
        N = len(train_loader.dataset)
        logger.write('lr: %.4f' % optim.param_groups[0]['lr'])
        t=time.time()

        for i, (v, b, q, a) in enumerate(train_loader):
            i_iter=i_iter+1
            if i_iter>max_iter:
                break
            scheduler.step(i_iter)
            optim.zero_grad()
            v = Variable(v).cuda()
            b = Variable(b).cuda()
            q = Variable(q).cuda()
            a = Variable(a).cuda()

            pred= model(v, b, q, a)
            loss = instance_bce_with_logits(pred, a)
            loss.backward()
            total_norm += nn.utils.clip_grad_norm(model.parameters(), grad_clip)
            count_norm += 1

            batch_score = compute_score_with_logits(pred, a.data).sum()
            total_loss += loss.data[0] * v.size(0)
            train_score += batch_score
            #print('batch_score: %.2f' % (batch_score))
            #print(train_score)
            optim.step()

        total_loss /= N
        train_score = 100 * train_score / N
        if None != eval_loader:
            model.train(False)
            eval_score = evaluate(model, eval_loader)
            model.train(True)
        logger.write('epoch: %d, time: %.2f' % (epoch, time.time()-t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, train_score))

        if eval_loader is not None:
            logger.write('\teval score: %.2f' % (100 * eval_score))
        if (eval_loader is not None and eval_score > best_eval_score):
            model_path = os.path.join(output, 'model_epoch%d.pth' % (epoch))
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_score = eval_score
def train(model,
          train_loader,
          eval_loader,
          num_epochs,
          output,
          opt=None,
          s_epoch=0):
    lr_default = 1e-3 if eval_loader is not None else 7e-4
    lr_decay_step = 2
    lr_decay_rate = .25
    lr_decay_epochs = range(
        10, 20, lr_decay_step) if eval_loader is not None else range(
            10, 20, lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default
    ]
    saving_epoch = 3
    grad_clip = .25

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \
        (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = 0

        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' %
                         optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])

        for i, (v, b, p, e, n, a, idx, types) in enumerate(train_loader):
            v = v.cuda()
            b = b.cuda()
            p = p.cuda()
            e = e.cuda()
            a = a.cuda()

            _, logits = model(v, b, p, e, a)
            n_obj = logits.size(2)
            logits.squeeze_()

            merged_logit = torch.cat(
                tuple(logits[j, :, :n[j][0]] for j in range(n.size(0))),
                -1).permute(1, 0)
            merged_a = torch.cat(
                tuple(a[j, :n[j][0], :n_obj] for j in range(n.size(0))), 0)

            loss = instance_bce_with_logits(merged_logit, merged_a,
                                            'sum') / v.size(0)
            N += n.sum().float()

            batch_score = compute_score_with_logits(merged_logit,
                                                    merged_a.data).sum()

            loss.backward()
            total_norm += nn.utils.clip_grad_norm_(model.parameters(),
                                                   grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()
            total_loss += loss.item() * v.size(0)
            train_score += batch_score.item()

        total_loss /= N
        train_score = 100 * train_score / N
        if None != eval_loader:
            model.train(False)
            eval_score, bound, entropy = evaluate(model, eval_loader)
            model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f/%.2f/%.2f (%.2f)' %
                         (100 * eval_score[0], 100 * eval_score[1],
                          100 * eval_score[2], 100 * bound))
            eval_score = eval_score[0]

        if eval_loader is not None and entropy is not None:
            info = ''
            for i in range(entropy.size(0)):
                info = info + ' %.2f' % entropy[i]
            logger.write('\tentropy: ' + info)

        if (eval_loader is not None and eval_score > best_eval_score) or (
                eval_loader is None and epoch >= saving_epoch):
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_score = eval_score
Example #22
def train6(model, train_loader, eval_loader, num_epochs, output,s_epoch=0):
    lr_default=0.001
    grad_clip = .25
    utils.create_dir(output)
    lr_decay_step = 2
    lr_decay_rate = .5
    lr_decay_epochs = range(9, 12, lr_decay_step)
    gradual_warmup_steps = [0.5 * lr_default, 1.0 * lr_default,2.0*lr_default]
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default)
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0
    utils.print_model(model, logger)
    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = len(train_loader.dataset)
        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])
        for i, (v, b, q, a,image_id) in enumerate(train_loader):
            v = Variable(v).cuda()
            b = Variable(b).cuda()
            q = Variable(q).cuda()
            a = Variable(a).cuda()

            pred= model(v, b, q, a)
            loss = instance_bce_with_logits(pred, a)
            loss.backward()
            total_norm += nn.utils.clip_grad_norm(model.parameters(), grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()

            batch_score = compute_score_with_logits(pred, a.data).sum()
            total_loss += loss.data[0] * v.size(0)
            train_score += batch_score

        total_loss /= N
        train_score = 100 * train_score / N
        if None != eval_loader:
            model.train(False)
            eval_score,eval_loss= evaluate3(model, eval_loader)
            model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time()-t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f,eval loss:%.2f' % (100 * eval_score,eval_loss))

        if (eval_loader is not None and eval_score > best_eval_score) or (eval_loader is None and epoch>=0):
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_score = eval_score
Example #23
0
def run_trainer(data_loader: dict,
                model: models,
                optimizer: optim,
                lr_scheduler: optim.lr_scheduler,
                criterion: nn,
                train_epochs: int,
                log_training_progress_every: int,
                log_val_progress_every: int,
                checkpoint_every: int,
                tb_summaries_dir: str,
                chkpt_dir: str,
                resume_from: str,
                to_device: object,
                to_cpu: object,
                attackers: object = None,
                train_adv_periodic_ops: int = None,
                *args,
                **kwargs):
    def mk_lr_step(loss):
        lr_scheduler.step(loss)

    def train_step(engine, batch):
        model.train()
        optimizer.zero_grad()
        x, y = map(lambda _: to_device(_), batch)
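        # every train_adv_periodic_ops iterations, replace the clean batch with adversarial examples from a randomly chosen attacker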
        if (train_adv_periodic_ops is not None) and (
                engine.state.iteration % train_adv_periodic_ops == 0):
            random_attacker = random.choice(list(attackers))
            x = attackers[random_attacker].perturb(x, y)
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        return loss.item()

    def eval_step(engine, batch):
        model.eval()
        with torch.no_grad():
            x, y = map(lambda _: to_device(_), batch)
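            # with probability 0.5, perturb the evaluation batch with a randomly chosen attacker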
            if random.choice(range(2)) % 2 == 0:
                random_attacker = random.choice(list(attackers))
                x = attackers[random_attacker].perturb(x, y)
            y_pred = model(x)
            return y_pred, y

    def chkpt_score_func(engine):
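        # run a validation pass and score the checkpoint by the mean validation loss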
        val_eval.run(data_loader['val'])
        y_pred, y = val_eval.state.output
        loss = criterion(y_pred, y)
        return np.mean(to_cpu(loss, convert_to_np=True))

    # set up ignite engines
    trainer = Engine(train_step)
    train_eval = Engine(eval_step)
    val_eval = Engine(eval_step)

    @trainer.on(Events.ITERATION_COMPLETED(every=log_training_progress_every))
    def log_training_results(engine):
        step = True
        run_type = 'train'
        train_eval.run(data_loader['train'])
        y_pred, y = train_eval.state.output
        loss = criterion(y_pred, y)
        log_results(to_cpu(y_pred, convert_to_np=True),
                    to_cpu(y, convert_to_np=True),
                    to_cpu(loss, convert_to_np=True), run_type, step,
                    engine.state.iteration, total_train_steps, writer)

    @trainer.on(Events.ITERATION_COMPLETED(every=log_val_progress_every))
    def log_val_results(engine):
        step = True
        run_type = 'val'
        val_eval.run(data_loader['val'])
        y_pred, y = val_eval.state.output
        loss = criterion(y_pred, y)
        mk_lr_step(loss)
        log_results(to_cpu(y_pred, convert_to_np=True),
                    to_cpu(y, convert_to_np=True),
                    to_cpu(loss, convert_to_np=True), run_type, step,
                    engine.state.iteration, total_train_steps, writer)

    # set up vars
    total_train_steps = len(data_loader['train']) * train_epochs

    # reporter to identify memory usage
    # bottlenecks throughout network
    reporter = MemReporter()
    print_model(model, reporter)

    # set up tensorboard summary writer
    writer = create_summary_writer(model, data_loader['train'],
                                   tb_summaries_dir)
    # move model to device
    model = to_device(model)

    # set up progress bar
    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')
    pbar = ProgressBar(persist=True, bar_format="")
    pbar.attach(trainer, ['loss'])

    # set up checkpoint
    objects_to_checkpoint = {
        'trainer': trainer,
        'model': model,
        'optimizer': optimizer,
        'lr_scheduler': lr_scheduler
    }
    training_checkpoint = Checkpoint(to_save=objects_to_checkpoint,
                                     save_handler=DiskSaver(
                                         chkpt_dir, require_empty=False),
                                     n_saved=3,
                                     filename_prefix='best',
                                     score_function=chkpt_score_func,
                                     score_name='val_loss')

    # register events
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=checkpoint_every),
        training_checkpoint)

    # if resuming
    if resume_from and os.path.exists(resume_from):
        print(f'resume model from: {resume_from}')
        checkpoint = torch.load(resume_from)
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    # fire training engine
    trainer.run(data_loader['train'], max_epochs=train_epochs)
Example #24
0
def main(_):
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    if FLAGS.job_name == "ps":
        ps_config = tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=0.00001))

        # Create and start a server for the local task.
        server = tf.train.Server(
            cluster,
            #                                protocol = "grpc_rdma",
            job_name=FLAGS.job_name,
            task_index=FLAGS.task_index,
            config=ps_config)
        server.join()
    elif FLAGS.job_name == "worker":

        # Create and start a server for the local task.
        server = tf.train.Server(
            cluster,
            #                                 protocol = "grpc_rdma",
            job_name=FLAGS.job_name,
            task_index=FLAGS.task_index)

        local_worker_device = "/job:worker/task:%d" % FLAGS.task_index
        with tf.device(
                tf.train.replica_device_setter(
                    ps_device='/job:ps/cpu:0',
                    worker_device=local_worker_device,
                    cluster=cluster)):
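            # pick the model implementation that matches the requested network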

            if FLAGS.network == 'lstm':
                from models.lstm import KitModel
            elif FLAGS.network == 'gru':
                from models.gru import KitModel
            elif FLAGS.network == 'fc':
                from models.fullyconnect import KitModel
            elif FLAGS.network == 'alexnet':
                from models.alexnet import KitModel
            elif FLAGS.network == 'vgg16':
                from models.vgg16 import KitModel
            elif FLAGS.network == 'vgg19' or FLAGS.network == 'vgg_e':
                from models.vgg19 import KitModel
            elif FLAGS.network == 'inception_v3':
                from models.inception_v3 import KitModel
            elif FLAGS.network == 'resnet':
                from models.resnet import KitModel
            elif FLAGS.network == 'seq2seq':
                import models.translate.translate
                from models.translate.translate import dist_train
                dist_train(FLAGS, server, cluster)
                sys.exit()
            else:
                sys.exit("Invalid network [%s]" % args.network)

            this_model = KitModel(FLAGS)
            this_model.build_model()

        train_dir = tempfile.mkdtemp()

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=[
                "/job:ps", "/job:worker/task:%d" % FLAGS.task_index
            ],
            graph_options=tf.GraphOptions(
                optimizer_options=tf.OptimizerOptions(
                    opt_level=tf.OptimizerOptions.L1)),
            gpu_options=tf.GPUOptions(visible_device_list=""))

        if FLAGS.infer_shapes:
            sess_config.graph_options.infer_shapes = FLAGS.infer_shapes

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 logdir=train_dir,
                                 init_op=tf.global_variables_initializer(),
                                 global_step=this_model.global_step,
                                 summary_writer=None,
                                 saver=None)

        if FLAGS.task_index == 0:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  FLAGS.task_index)

        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=sess_config,
                                              start_standard_services=True)

        print_model()

        print("Start warmup %d epoch." % FLAGS.warmup)
        for _ in range(FLAGS.warmup):
            this_model.get_data()
            sess.run(this_model.train_op, feed_dict=this_model.get_feed_dict())

        current_step = 0
        duration = 0
        while current_step < FLAGS.epoch:
            current_step += 1
            this_model.get_data()
            print("Start step %d" % current_step)
            start_time = time.time()
            _, step_loss = sess.run([this_model.train_op, this_model.cost],
                                    feed_dict=this_model.get_feed_dict())
            end_time = time.time()
            print(
                "Finish step %d, loss = %f, speed = %f sampes/s, duration = %f seconds"
                % (current_step, step_loss, FLAGS.batch_size /
                   (end_time - start_time), end_time - start_time))
            duration += end_time - start_time

        print("Total Time = %f s." % duration)
        #writer.close()

    else:
        sys.exit("Invalid job role name [%s]!" % args.job_name)
Example #25
0
def train(model, train_loader, eval_loader, opt):
    utils.create_dir(opt.output)
    optim = torch.optim.Adam(model.parameters(), lr=opt.learning_rate, betas=(0.9, 0.999), eps=1e-08,
                             weight_decay=opt.weight_decay)

    
    logger = utils.Logger(os.path.join(opt.output, 'log.txt'))

    utils.print_model(model, logger)
    for param_group in optim.param_groups:
        param_group['lr'] = opt.learning_rate

    scheduler = MultiStepLR(optim, milestones=[100], gamma=0.8)

    scheduler.last_epoch = opt.s_epoch

    

    best_eval_score = 0
    for epoch in range(opt.s_epoch, opt.num_epochs):
        total_loss = 0
        total_norm = 0
        count_norm = 0
        train_score = 0
        t = time.time()
        N = len(train_loader.dataset)
        scheduler.step()

        for i, (v, b, a, _, qa_text, _, _, q_t, bias) in enumerate(train_loader):
            v = v.cuda()
            b = b.cuda()
            a = a.cuda()
            bias = bias.cuda()
            qa_text = qa_text.cuda()
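            # draw a random permutation of the candidate answers and apply it consistently to texts, labels and bias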
            rand_index = random.sample(range(0, opt.train_candi_ans_num), opt.train_candi_ans_num)
            qa_text = qa_text[:, rand_index, :]
            a = a[:, rand_index]
            bias = bias[:, rand_index]

            if opt.lp == 0:
                logits = model(qa_text, v, b, epoch, 'train')
                loss = instance_bce_with_logits(logits, a, reduction='mean')
            elif opt.lp == 1:
                logits = model(qa_text, v, b, epoch, 'train')
                loss_pos = instance_bce_with_logits(logits, a, reduction='mean')
                index = random.sample(range(0, v.shape[0]), v.shape[0])
                v_neg = v[index]
                b_neg = b[index]
                logits_neg = model(qa_text, v_neg, b_neg, epoch, 'train')
                self_loss = compute_self_loss(logits_neg, a)
                loss = loss_pos + opt.self_loss_weight * self_loss
            elif opt.lp == 2:
                logits, loss = model(qa_text, v, b, epoch, 'train', bias, a)
            else:
                assert False, 'unexpected value for opt.lp: %s' % opt.lp

            loss.backward()

            total_norm += nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip)
            count_norm += 1

            optim.step()
            optim.zero_grad()

            score = compute_score_with_logits(logits, a.data).sum()
            train_score += score.item()
            total_loss += loss.item() * v.size(0)

            if i != 0 and i % 100 == 0:
                print(
                    'training: %d/%d, train_loss: %.6f, train_acc: %.6f' %
                    (i, len(train_loader), total_loss / (i * v.size(0)),
                     100 * train_score / (i * v.size(0))))
        total_loss /= N
        if eval_loader is not None:
            model.train(False)
            eval_score, bound = evaluate(model, eval_loader, opt)
            model.train(True)

        logger.write('\nlr: %.7f' % optim.param_groups[0]['lr'])
        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write(
            '\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound))


        if (eval_loader is not None and eval_score > best_eval_score):
            if opt.lp == 0:
                model_path = os.path.join(opt.output, 'SAR_top'+str(opt.train_candi_ans_num)+'_best_model.pth')
            elif opt.lp == 1:
                model_path = os.path.join(opt.output, 'SAR_SSL_top'+str(opt.train_candi_ans_num)+'_best_model.pth')
            elif opt.lp == 2:
                model_path = os.path.join(opt.output, 'SAR_LMH_top'+str(opt.train_candi_ans_num)+'_best_model.pth')
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_score = eval_score
Example #26
0
                                 args.relation_type,
                                 adaptive=args.adaptive,
                                 pos_emb_dim=args.imp_pos_emb_dim,
                                 dataroot=args.data_folder)
    train_dset = VQAFeatureDataset('train',
                                   dictionary,
                                   args.relation_type,
                                   adaptive=args.adaptive,
                                   pos_emb_dim=args.imp_pos_emb_dim,
                                   dataroot=args.data_folder)

    # 5. Initialize ReGAT_all
    print("[LOG] 5. Initializing ReGAT_all...")
    model = build_regat_all(val_dset, args).to(device)
    logger = utils.Logger(os.path.join(args.output, 'model_all_log.txt'))
    utils.print_model(model, logger)

    # 6. tfidf
    # Takes around 4 minutes
    print("[LOG] 6. tfidf_from_questions...")
    tfidf = None
    weights = None
    if args.tfidf:
        tfidf, weights = tfidf_from_questions(['train', 'val', 'test2015'],
                                              dictionary)

    # 7. Initialize word embeddings
    print("[LOG] 7. Initializing word embeddings...")
    model.w_emb.init_embedding(
        join(args.data_folder, 'glove/glove6b_init_300d.npy'), tfidf, weights)
Example #27
0
def train(model,
          train_loader,
          eval_loader,
          num_epochs,
          output,
          opt=None,
          s_epoch=0):
    lr_default = 1e-3 if eval_loader is not None else 7e-4
    lr_decay_step = 2
    lr_decay_rate = 0.25
    lr_decay_epochs = range(10, 20, lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default
    ]
    saving_epoch = 3
    grad_clip = .25

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \
        (lr_default, lr_decay_step, lr_decay_rate, grad_clip))
    import pickle as pkl
    lab2ans = pkl.load(open("./data/cache/trainval_label2ans.pkl", 'rb'))
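    # answer words used to bucket predictions by gender for the training-time fairness statistics below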
    woman_answer_words = [
        'woman', 'women', 'female', 'girl', 'lady', 'she', 'her', 'hers',
        'ladies', 'girls'
    ]
    man_answer_words = [
        'man', 'men', 'male', 'boy', 'he', 'his', 'gentleman', 'gentlemen',
        'boys'
    ]
    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        train_score_vqa = 0
        total_norm = 0
        count_norm = 0
        total_fair_loss = 0
        total_dis_loss = 0
        woman = 0
        woman_true = 0
        man = 0
        woman_man = 0
        man_woman = 0
        man_true = 0
        other = 0.0001
        other_o = 0
        t = time.time()
        N = len(train_loader.dataset)
        print(N)
        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' %
                         optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0][
                'lr'] = optim.param_groups[0]['lr'] * lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])

        for i, (v, b, q, a, ques, im, g, gender) in enumerate(train_loader):

            v = v.cuda()
            b = b.cuda()
            q = q.cuda()
            a = a.cuda()

            visual_pred, vqa_pred, att = model(v, b, q, a)

            #import pdb;pdb.set_trace()
            gender = gender.squeeze(1)
            weights = torch.Tensor([2.0, 1.0, 0.001]).cuda()
            vqa_loss = instance_bce_with_logits(vqa_pred, a)
            loss = 0
            #loss=nn.CrossEntropyLoss(weights)
            #loss=loss(visual_pred,gender.cuda())
            #dis_loss=torch.abs(visual_pred[:,0]-visual_pred[:,1]).mean()
            #dis_loss=dis_loss.cuda()
            if epoch < 30:
                t_loss = vqa_loss
            else:
                t_loss = loss + vqa_loss
            t_loss.backward()
            #import pdb;pdb.set_trace()
            #vp=visual_pred[:,:2].cuda()
            #g=g[:,:2]
            #crossloss=instance_bce_with_logits(vp,g.cuda())

            #mseloss=torch.nn.functional.mse_loss(vp.softmax(1),g.cuda())
            #g_swap=g[:,[1,0]].cuda()

            #swap_loss=(vp.softmax(1)*g_swap).sum(1)
            #swap_loss=swap_loss.sum()

            #import pdb;pdb.set_trace()

            for j in range(len(v)):
                answer_word = lab2ans[int(vqa_pred[j].argmax())]
                if gender[j] == 0:
                    woman = woman + 1
                    if answer_word in woman_answer_words:
                        woman_true = woman_true + 1
                    if answer_word in man_answer_words:
                        woman_man = woman_man + 1

                if gender[j] == 1:
                    man = man + 1
                    if answer_word in man_answer_words:
                        man_true = man_true + 1
                    if answer_word in woman_answer_words:
                        man_woman = man_woman + 1

            total_norm += nn.utils.clip_grad_norm_(model.parameters(),
                                                   grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()
            #total_fair_loss+=soft_fair_loss
            #total_dis_loss+=dis_loss
            #batch_score=torch.eq(visual_pred.argmax(1),gender.cuda()).sum()
            batch_score_vqa = compute_score_with_logits(vqa_pred, a.data).sum()

            #batch_score = compute_score_with_logits(visual_pred, g.cuda()).sum()
            #total_loss += loss.item() * v.size(0)
            #train_score += batch_score.item()
            train_score_vqa += batch_score_vqa.item()
            #train_score+=batch_score

            if i == 50 or i == 100 or i == 500:
                print(loss)
                #print(10*soft_fair_loss)
                print("\n\n")

        total_loss /= N
        train_score = 100 * train_score / N
        train_score_vqa = 100 * train_score_vqa / N
        #import pdb;pdb.set_trace()

        print("epoch", epoch)
        woman_score = float(woman_true) / woman
        man_score = float(man_true) / man
        #other_score=float(other_o)/other
        print("woman", woman)
        print("man", man)
        print("other", other)
        print("train_woman_score", woman_score * 100)
        print("train_man_score", man_score * 100)
        #print("train_other_score",other_score*100)

        print("vqa", train_score_vqa)
        print("\n\n")

        if eval_loader is not None:
            model.train(False)
            eval_score, bound, _ = evaluate(model, eval_loader)
            model.train(True)
        #print("total_fair_loss",total_fair_loss)
        #print("totla_dis_loss",total_dis_loss)
        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))
        #logger.write('\total_fair_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, total_fair_loss))

        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' %
                         (100 * eval_score, 100 * bound))

        model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
        utils.save_model(model_path, model, epoch, optim)
Example #28
0
def train(model, train_loader, eval_loader, opt):

    utils.create_dir(opt.output)
    optim = torch.optim.Adam(model.parameters(),
                             lr=opt.learning_rate,
                             betas=(0.9, 0.999),
                             eps=1e-08,
                             weight_decay=opt.weight_decay)
    logger = utils.Logger(os.path.join(opt.output, 'log.txt'))

    utils.print_model(model, logger)

    # load snapshot
    if opt.checkpoint_path is not None:
        print('loading %s' % opt.checkpoint_path)
        model_data = torch.load(opt.checkpoint_path)
        model.load_state_dict(model_data.get('model_state', model_data))
        optim.load_state_dict(model_data.get('optimizer_state', model_data))
        opt.s_epoch = model_data['epoch'] + 1

    for param_group in optim.param_groups:
        param_group['lr'] = opt.learning_rate

    scheduler = MultiStepLR(optim,
                            milestones=[10, 15, 20, 25, 30, 35],
                            gamma=0.5)
    scheduler.last_epoch = opt.s_epoch

    best_eval_score = 0
    for epoch in range(opt.s_epoch, opt.num_epochs):
        total_loss = 0
        total_bce_loss = 0
        self_loss = 0
        total_self_loss = 0
        train_score_pos = 0
        train_score_neg = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = len(train_loader.dataset)
        scheduler.step()

        for i, (v, b, q, a, _) in enumerate(train_loader):
            v = v.cuda()
            q = q.cuda()
            a = a.cuda()

            # for the labeled samples
            if epoch < opt.pretrain_epoches:
                logits_pos, _ = model(q, v, False)
                if opt.ml_loss:
                    bce_loss_pos = instance_bce_with_logits(logits_pos,
                                                            a,
                                                            reduction='mean')
                else:
                    bce_loss_pos = instance_bce(logits_pos, a)
                loss = bce_loss_pos
            else:
                logits_pos, logits_neg, _, _ = model(q, v, True)
                if opt.ml_loss:  #use multi-label loss
                    bce_loss_pos = instance_bce_with_logits(logits_pos,
                                                            a,
                                                            reduction='mean')
                else:  #use cross-entropy loss
                    bce_loss_pos = instance_bce(logits_pos, a)

                self_loss = compute_self_loss(logits_neg, a)

                loss = bce_loss_pos + opt.self_loss_weight * self_loss

            loss.backward()
            total_norm += nn.utils.clip_grad_norm_(model.parameters(),
                                                   opt.grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()

            score_pos = compute_score_with_logits(logits_pos, a.data).sum()
            train_score_pos += score_pos.item()
            total_loss += loss.item() * v.size(0)
            total_bce_loss += bce_loss_pos.item() * v.size(0)

            if epoch < opt.pretrain_epoches:  # pretrain
                total_self_loss = 0
                train_score_neg = 0
            else:  # finetune
                score_neg = compute_score_with_logits(logits_neg, a.data).sum()
                total_self_loss += self_loss.item() * v.size(0)
                train_score_neg += score_neg.item()
            if i != 0 and i % 100 == 0:
                print(
                    'training: %d/%d, train_loss: %.6f, bce_loss: %.6f, self_loss: %.6f, neg_train_acc: %.6f, pos_train_acc: %.6f'
                    %
                    (i, len(train_loader), total_loss /
                     (i * v.size(0)), total_bce_loss /
                     (i * v.size(0)), total_self_loss /
                     (i * v.size(0)), 100 * train_score_neg /
                     (i * v.size(0)), 100 * train_score_pos / (i * v.size(0))))

        total_loss /= N
        total_bce_loss /= N
        total_self_loss /= N
        train_score_pos = 100 * train_score_pos / N
        if eval_loader is not None:
            model.train(False)
            eval_score, bound, entropy = evaluate(model, eval_loader)
            model.train(True)

        logger.write('\nlr: %.7f' % optim.param_groups[0]['lr'])
        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score_pos))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' %
                         (100 * eval_score, 100 * bound))

        if eval_loader is not None and entropy is not None:
            info = '' + ' %.2f' % entropy
            logger.write('\tentropy: ' + info)

        if (eval_loader is not None and eval_score > best_eval_score):
            model_path = os.path.join(opt.output, 'best_model.pth')
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_score = eval_score
Example #29
0
def train(args,
          model,
          train_loader,
          eval_loader,
          num_epochs,
          output,
          opt=None,
          s_epoch=0):
    device = args.device
    # Scheduler learning rate
    lr_default = args.lr
    lr_decay_step = 2
    lr_decay_rate = 0.75
    lr_decay_epochs = range(10, 20, lr_decay_step)
    gradual_warmup_steps = [
        0.5 * lr_default,
        1.0 * lr_default,
        1.5 * lr_default,
        2.0 * lr_default,
    ]
    saving_epoch = 15  # Start point for model saving
    grad_clip = args.clip_norm

    utils.create_dir(output)

    # Adamax optimizer
    optim = (torch.optim.Adamax(filter(lambda p: p.requires_grad,
                                       model.parameters()),
                                lr=lr_default) if opt is None else opt)

    # Loss function
    criterion = torch.nn.BCEWithLogitsLoss(reduction="sum")
    ae_criterion = torch.nn.MSELoss()

    # write hyper-parameter to log file
    logger = utils.Logger(os.path.join(output, "log.txt"))
    logger.write(args.__repr__())
    utils.print_model(model, logger)
    logger.write(
        "optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f"
        % (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    # create trainer
    trainer = Trainer(args, model, criterion, optim, ae_criterion)
    update_freq = int(args.update_freq)
    wall_time_start = time.time()

    best_eval_score = 0
    # Epoch passing in training phase
    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        num_updates = 0
        t = time.time()
        N = len(train_loader.dataset)
        num_batches = int(N / args.batch_size + 1)
        if epoch < len(gradual_warmup_steps):
            trainer.optimizer.param_groups[0]["lr"] = gradual_warmup_steps[
                epoch]
            logger.write("gradual warm up lr: %.4f" %
                         trainer.optimizer.param_groups[0]["lr"])
        elif epoch in lr_decay_epochs:
            trainer.optimizer.param_groups[0]["lr"] *= lr_decay_rate
            logger.write("decreased lr: %.4f" %
                         trainer.optimizer.param_groups[0]["lr"])
        else:
            logger.write("lr: %.4f" % trainer.optimizer.param_groups[0]["lr"])

        # Predicting and computing score
        for i, (v, q, a, _, _, _) in enumerate(train_loader):
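            # reshape the MAML / autoencoder image streams into single-channel 2-D inputs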
            if args.maml:
                v[0] = v[0].reshape(v[0].shape[0], 84, 84).unsqueeze(1)
            if args.autoencoder:
                v[1] = v[1].reshape(v[1].shape[0], 128, 128).unsqueeze(1)
            v[0] = v[0].to(device)
            v[1] = v[1].to(device)
            q = q.to(device)
            a = a.to(device)
            sample = [v, q, a]

            if i < num_batches - 1 and (i + 1) % update_freq > 0:
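                # not at an update boundary yet: accumulate this mini-batch's gradients without updating the parameters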
                trainer.train_step(sample, update_params=False)
            else:
                loss, grad_norm, batch_score = trainer.train_step(
                    sample, update_params=True)
                total_norm += grad_norm
                count_norm += 1

                total_loss += loss.item()
                train_score += batch_score
                num_updates += 1
                if num_updates % int(args.print_interval / update_freq) == 0:
                    print(
                        "Iter: {}, Loss {:.4f}, Norm: {:.4f}, Total norm: {:.4f}, Num updates: {}, Wall time: {:.2f}, ETA: {}"
                        .format(
                            i + 1,
                            total_loss / ((num_updates + 1)),
                            grad_norm,
                            total_norm,
                            num_updates,
                            time.time() - wall_time_start,
                            utils.time_since(t, i / num_batches),
                        ))

        total_loss /= num_updates
        train_score = 100 * train_score / (num_updates * args.batch_size)

        # Evaluation
        if eval_loader is not None:
            print("Evaluating...")
            trainer.model.train(False)
            eval_score, bound = evaluate(model, eval_loader, args)
            trainer.model.train(True)

        logger.write("epoch %d, time: %.2f" % (epoch, time.time() - t))
        logger.write("\ttrain_loss: %.2f, norm: %.4f, score: %.2f" %
                     (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write("\teval score: %.2f (%.2f)" %
                         (100 * eval_score, 100 * bound))

        # Save per epoch
        if epoch >= saving_epoch:
            model_path = os.path.join(output, "model_epoch%d.pth" % epoch)
            utils.save_model(model_path, model, epoch, trainer.optimizer)
            # Save best epoch
            if eval_loader is not None and eval_score > best_eval_score:
                model_path = os.path.join(output, "model_epoch_best.pth")
                utils.save_model(model_path, model, epoch, trainer.optimizer)
                best_eval_score = eval_score
Example #30
0
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch MNIST Example',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset',
                        type=str,
                        default='data/mnist.npy',
                        help='path to dataset')
    parser.add_argument('--batch-size',
                        type=int,
                        default=100,
                        metavar='N',
                        help='input batch size for training')
    parser.add_argument('--epochs',
                        type=int,
                        default=101,
                        metavar='N',
                        help='number of epochs to train')
    parser.add_argument('--LR',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate')
    parser.add_argument('--L2',
                        type=float,
                        default=0.0001,
                        metavar='L2',
                        help='L2 weight decay strength')
    parser.add_argument('--L1_1',
                        type=float,
                        default=5e-4,
                        metavar='L1',
                        help='L1 weight decay strength')
    parser.add_argument('--L1_2',
                        type=float,
                        default=1e-5,
                        metavar='L1',
                        help='L1 weight decay strength')
    parser.add_argument('--L3',
                        type=float,
                        default=0.05,
                        metavar='L3',
                        help='gradient decay strength')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.9,
                        metavar='M',
                        help='SGD momentum')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed')
    parser.add_argument('--use_bias',
                        dest='use_bias',
                        action='store_true',
                        help='use biases')
    parser.add_argument('--q_a',
                        type=int,
                        default=4,
                        metavar='S',
                        help='quantize activations to this number of bits')
    parser.add_argument('--act_max',
                        type=float,
                        default=1.0,
                        help='clipping threshold for activations')
    parser.add_argument('--w_max',
                        type=float,
                        default=0.,
                        help='clipping threshold for weights')
    parser.add_argument('--stochastic',
                        type=float,
                        default=0.5,
                        help='stochastic quantization')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        help='debug')
    parser.add_argument('--calculate_running',
                        dest='calculate_running',
                        action='store_true',
                        help='calculate_running')
    parser.add_argument('--plot',
                        dest='plot',
                        action='store_true',
                        help='plot')
    parser.add_argument('--save',
                        dest='save',
                        action='store_true',
                        help='save')
    parser.add_argument('--bn1', dest='bn1', action='store_true', help='bn1')
    parser.add_argument('--bn2', dest='bn2', action='store_true', help='bn2')
    parser.add_argument('--track_running_stats',
                        dest='track_running_stats',
                        action='store_true',
                        help='track_running_stats')
    parser.add_argument('--augment',
                        dest='augment',
                        action='store_true',
                        help='augment')
    parser.add_argument('--triple_input',
                        dest='triple_input',
                        action='store_true',
                        help='triple_input')
    parser.add_argument('--dropout_input',
                        type=float,
                        default=0.2,
                        help='dropout_input drop prob')
    parser.add_argument('--dropout_act',
                        type=float,
                        default=0.4,
                        help='dropout_act drop prob')
    parser.add_argument('--prune_weights1',
                        type=float,
                        default=0.0,
                        help='percentage of smallest weights to set to zero')
    parser.add_argument('--prune_weights2',
                        type=float,
                        default=0.0,
                        help='percentage of smallest weights to set to zero')
    parser.add_argument('--prune_epoch',
                        type=float,
                        default=90,
                        help='do pruning at the end of this epoch')
    parser.add_argument('--var_name', type=str, default='', help='var_name')
    parser.add_argument('--gpu', type=str, default=None, help='gpu')
    parser.add_argument('--num_sims',
                        type=int,
                        default=1,
                        help='number of simulation runs')
    args = parser.parse_args()

    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    np.set_printoptions(precision=4, linewidth=200, suppress=True)

    data = np.load(args.dataset, allow_pickle=True)
    train_data, val_data = data
    train_inputs, train_labels = train_data
    test_inputs, test_labels = val_data
    train_inputs = torch.from_numpy(train_inputs).cuda()
    train_labels = torch.from_numpy(train_labels).cuda()
    test_inputs = torch.from_numpy(test_inputs).cuda()
    test_labels = torch.from_numpy(test_labels).cuda()

    results = {}

    if args.var_name == 'L1_1':
        var_list = [
            0, 1e-6, 2e-6, 3e-6, 5e-6, 7e-6, 1e-5, 2e-5, 3e-5, 4e-5, 5e-5,
            7e-5, 1e-4, 2e-4
        ]
    elif args.var_name == 'L1_2':
        var_list = [
            0, 1e-6, 2e-6, 3e-6, 5e-6, 7e-6, 1e-5, 2e-5, 3e-5, 4e-5, 5e-5,
            7e-5, 1e-4, 2e-4
        ]
    elif args.var_name == 'L3':
        var_list = [
            0, 0.001, 0.002, 0.003, 0.005, 0.007, 0.01, 0.02, 0.03, 0.04, 0.05,
            0.06, 0.08, 0.1, 0.2
        ]
    elif args.var_name == 'L2':
        var_list = [
            0, 5e-6, 1e-5, 2e-5, 3e-5, 4e-5, 5e-5, 7e-5, 1e-4, 2e-4, 3e-4,
            4e-4, 5e-4, 0.001
        ]
    else:
        var_list = [' ']

    total_list = []

    for var in var_list:
        if args.var_name != '':
            print('\n\n********** Setting {} to {} **********\n\n'.format(
                args.var_name, var))
            setattr(args, args.var_name, var)

        results[var] = []
        best_accs = []

        for s in range(args.num_sims):
            model = Net(args).cuda()
            optimizer = optim.SGD(model.parameters(),
                                  lr=args.LR,
                                  momentum=args.momentum,
                                  weight_decay=args.L2)
            num_train_batches = int(len(train_inputs) / args.batch_size)
            best_acc = 0

            if s == 0:
                utils.print_model(model, args)

            for epoch in range(args.epochs):

                rnd_idx = np.random.permutation(len(train_inputs))
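                # apply the same random permutation to inputs and labels to reshuffle the training set each epoch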
                train_inputs = train_inputs[rnd_idx]
                train_labels = train_labels[rnd_idx]

                if epoch % 70 == 0 and epoch != 0:
                    print('\nReducing learning rate ')
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = param_group['lr'] / 10.
                train_acc = train(args, model, num_train_batches, train_inputs,
                                  train_labels, optimizer)
                val_acc = test(model, test_inputs, test_labels)

                if (args.prune_weights1 > 0 or args.prune_weights2 > 0
                    ) and epoch % args.prune_epoch == 0 and epoch != 0:
                    print('\n\nAccuracy before pruning: {:.2f}\n\n'.format(
                        val_acc))
                    sparsities = prune_weights(args, model)
                    val_acc = test(model, test_inputs, test_labels)
                    print('\n\nAccuracy after pruning: {:.2f}\n\n'.format(
                        val_acc))
                else:
                    sparsities = [
                        p.data[torch.abs(p.data) < 0.01 *
                               p.data.max()].numel() / p.data.numel() * 100.0
                        for _, p in model.named_parameters()
                    ]
                print(
                    'Epoch {:>2d} train acc {:>.2f} test acc {:>.2f}  LR {:.4f}  sparsity {:>3.1f} {:>3.1f}'
                    .format(epoch, train_acc, val_acc,
                            optimizer.param_groups[0]['lr'], sparsities[0],
                            sparsities[1]))
                if val_acc > best_acc:
                    best_acc = val_acc

                    if epoch > 80 and (args.save or args.plot):

                        sparsities = prune_weights(args, model)
                        val_acc = test(model, test_inputs, test_labels)
                        print('\n\nAccuracy after pruning: {:.2f}\n\n'.format(
                            val_acc))

                        w_pos = model.fc1.weight.clone()
                        w_pos[w_pos < 0] = 0
                        w_neg = model.fc1.weight.clone()
                        w_neg[w_neg >= 0] = 0
                        pos = F.linear(model.quantized_input, w_pos)
                        neg = F.linear(model.quantized_input, w_neg)
                        sep1 = torch.cat((neg, pos), 0)

                        w_pos = model.fc2.weight.clone()
                        w_pos[w_pos < 0] = 0
                        w_neg = model.fc2.weight.clone()
                        w_neg[w_neg >= 0] = 0
                        pos = F.linear(model.act, w_pos)
                        neg = F.linear(model.act, w_neg)
                        sep2 = torch.cat((neg, pos), 0)

                        dict_names = [
                            'input', 'fc1_weights', 'preact', 'diff_preact',
                            'act', 'fc2_weights', 'output', 'diff_output'
                        ]
                        tensors = [
                            model.quantized_input, model.fc1.weight,
                            model.preact, sep1, model.act, model.fc2.weight,
                            model.output, sep2
                        ]
                        shapes = [list(t.shape) for t in tensors]
                        arrays = [
                            t.detach().cpu().half().numpy() for t in tensors
                        ]
                        mlp_dict = {
                            key: value
                            for key, value in zip(dict_names, shapes)
                        }
                        if args.save:
                            print('\n\nSaving MLP:\n{}\n'.format(mlp_dict))
                            # np.save('mlp.npy', arrays[1:])
                            # scipy.io.savemat('chip_plots/mnist_val.mat', mdict={key: value for key, value in zip(names[:], values[:])})
                            # scipy.io.savemat('chip_plots/mnist_labels.mat', mdict={'mnist_test_labels': test_labels.detach().cpu().numpy()})
                            # print('\nLabels:', test_labels.detach().cpu().numpy().shape, test_labels.detach().cpu().numpy()[:20], '\n\n')
                            scipy.io.savemat(
                                'chip_plots/mlp.mat',
                                mdict={
                                    key: value
                                    for key, value in zip(
                                        dict_names[1:], arrays[1:])
                                })
                            # scipy.io.savemat('chip_plots/mlp_first_layer_q4_act_1_acc_.mat', mdict={dict_names[2]: arrays[2], dict_names[3]: arrays[3]})

                        if args.plot:
                            names = [
                                'input', 'weights', 'output', 'diff_output'
                            ]
                            layers = []
                            layer = []
                            print('\n\nlen(arrays) // len(names):',
                                  len(arrays), len(names),
                                  len(arrays) // len(names), '\n\n')
                            num_layers = len(arrays) // len(names)
                            for k in range(num_layers):
                                print('layer', k, names)
                                for j in range(len(names)):
                                    layer.append([arrays[len(names) * k + j]])
                                layers.append(layer)
                                layer = []

                            info = []
                            neuron_inputs = []
                            for n, p in model.named_parameters():
                                if 'weight' in n:
                                    neuron_inputs.append(np.prod(p.shape[1:]))

                            for idx in range(len(neuron_inputs)):
                                temp = []
                                temp.append('{:d} neuron inputs '.format(
                                    neuron_inputs[idx]))
                                #if args.plot_power:
                                #temp.append('{:.2f}mW '.format(self.power[idx][0]))
                                info.append(temp)

                            if args.plot:
                                print('\nPlotting {}\n'.format(names))
                                plot_layers(num_layers=len(layers),
                                            models=['chip_plots/'],
                                            epoch=epoch,
                                            i=0,
                                            layers=layers,
                                            names=names,
                                            var='',
                                            vars=[''],
                                            infos=info,
                                            pctl=99.9,
                                            acc=val_acc)

                            #plot_grid([[[v] for v in values]], ['input', 'quantized_input', 'weights', 'output'], path='chip_plots/epoch_' + str(epoch), filename='_mlp_histograms.png')
                            #layers = [[[a1, aa1], [a2, aa2]]]
                            #raise(SystemExit)
            if args.plot and os.path.exists('chip_plots/mlp.mat'):
                os.rename(
                    r'chip_plots/mlp.mat',
                    r'chip_plots/mlp_act_max_{:.1f}_w_max_{:.1f}_L2_{:.4f}_L3_{:.1f}_drop_{:.2f}_{:.2f}_LR_{:.3f}_acc_{:.2f}.mat'
                    .format(args.act_max, args.w_max, args.L2, args.L3,
                            args.dropout_input, args.dropout_act, args.LR,
                            best_acc))

            print('\nSimulation {:d}  Best Accuracy: {:.2f}\n\n'.format(
                s, best_acc))
            best_accs.append(best_acc)

        total_list.append(
            (np.mean(best_accs), np.min(best_accs), np.max(best_accs)))
        print('\n{:d} runs:  {} {} {:.2f} ({:.2f}/{:.2f})\n'.format(
            args.num_sims, args.var_name, var, *total_list[-1]))

    print('\n\n')
    for var, (mean, min, max) in zip(var_list, total_list):
        print('{} {:>5} acc {:.2f} ({:.2f}/{:.2f})'.format(
            args.var_name, var, mean, min, max))
    print('\n\n')
Example #31
0
def main(_):
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    if FLAGS.job_name == "ps":
        ps_config = tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=0.01))

        # Create and start a server for the local task.
        server = tf.train.Server(
            cluster,
            #                                protocol = "grpc_rdma",
            job_name=FLAGS.job_name,
            task_index=FLAGS.task_index,
            config=ps_config)
        server.join()
    elif FLAGS.job_name == "worker":

        # Create and start a server for the local task.
        server = tf.train.Server(
            cluster,
            #                                 protocol = "grpc+verbs",
            job_name=FLAGS.job_name,
            task_index=FLAGS.task_index)

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.data_dir)

        #####################################
        # Select the preprocessing function #
        #####################################
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            FLAGS.network, is_training=True)

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(FLAGS.network,
                                                 FLAGS.num_classes,
                                                 is_training=True)

        if FLAGS.dataset_name != "synthetic":
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)

            [image, label] = provider.get(['image', 'label'])

            image = image_preprocessing_fn(image,
                                           network_fn.default_image_size,
                                           network_fn.default_image_size)

            images, labels = tf.train.batch([image, label],
                                            batch_size=FLAGS.batch_size,
                                            num_threads=4,
                                            capacity=5 * FLAGS.batch_size)
        else:
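            # synthetic data path: random images and labels, useful for measuring raw training throughput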
            images = random_ops.random_uniform(
                (FLAGS.batch_size, network_fn.default_image_size,
                 network_fn.default_image_size, 3),
                maxval=1)
            labels = random_ops.random_uniform((FLAGS.batch_size, ),
                                               maxval=FLAGS.num_classes - 1,
                                               dtype=tf.int32)

        with tf.device(
                tf.train.replica_device_setter(
                    ps_device='/job:ps/cpu:0',
                    worker_device=("/job:worker/task:%d" % FLAGS.task_index),
                    cluster=cluster)):

            global_step = tf.contrib.framework.get_or_create_global_step()

            #images, labels = cifar.distorted_inputs(FLAGS)
            logits, end_points = network_fn(images)
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=labels)
            cost = tf.reduce_mean(loss)
            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                cost, global_step=global_step)

        print_model()

        train_dir = tempfile.mkdtemp()

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=[
                "/job:ps", "/job:worker/task:%d" % FLAGS.task_index
            ],
            graph_options=tf.GraphOptions(
                optimizer_options=tf.OptimizerOptions(
                    opt_level=tf.OptimizerOptions.L1)),
            gpu_options=tf.GPUOptions(visible_device_list=""))

        if FLAGS.infer_shapes:
            sess_config.graph_options.infer_shapes = FLAGS.infer_shapes

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 logdir=train_dir,
                                 init_op=tf.global_variables_initializer(),
                                 global_step=global_step,
                                 summary_writer=None,
                                 saver=None)

        if FLAGS.task_index == 0:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  FLAGS.task_index)

        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=sess_config,
                                              start_standard_services=True)

        print("Start warmup %d epoch." % FLAGS.warmup)
        for _ in range(FLAGS.warmup):
            sess.run(train_op)

        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        current_step = 0
        duration = 0
        while current_step < FLAGS.epoch:
            current_step += 1
            start_time = time.time()
            _, step_loss = sess.run([train_op, cost],
                                    options=options,
                                    run_metadata=run_metadata)
            end_time = time.time()
            print(
                "Finish step %d, loss = %f, speed = %f sampes/s, duration = %f seconds"
                % (current_step, step_loss, FLAGS.batch_size /
                   (end_time - start_time), end_time - start_time))
            duration += end_time - start_time

            if current_step == 3:
                fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                chrome_trace = fetched_timeline.generate_chrome_trace_format()
                with open('timeline.json', 'w') as f:
                    f.write(chrome_trace)

        print("Total Time = %f s." % duration)
        #writer.close()

    else:
        sys.exit("Invalid job role name [%s]!" % args.job_name)
Example #32
0
def main(_):  
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    if FLAGS.job_name == "ps":
        ps_config = tf.ConfigProto(
                gpu_options=tf.GPUOptions(
                    per_process_gpu_memory_fraction=0.00001                
                ))

        # Create and start a server for the local task.
        server = tf.train.Server(cluster,
#                                 protocol = "grpc_rdma",
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index,
                                 config = ps_config)
        server.join()
    elif FLAGS.job_name == "worker":
        maybe_download_and_extract(FLAGS.data_dir, FLAGS.data_url)
        cifar.modify_flags(FLAGS)
        print (FLAGS.data_dir)      

        # Create and start a server for the local task.
        server = tf.train.Server(cluster,
#                                 protocol = "grpc_rdma",
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)

        local_worker_device = "/job:worker/task:%d" % FLAGS.task_index
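        # replica_device_setter places variables on the parameter servers
        # (CPU) and the remaining ops on this worker's local device.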
        with tf.device(tf.train.replica_device_setter(
            ps_device='/job:ps/cpu:0',
            worker_device=local_worker_device,
            cluster=cluster)):
            
            if FLAGS.network == 'fc':
                from models.fullyconnect import KitModel
            elif FLAGS.network == 'cifar':
                from models.cifar import KitModel
            elif FLAGS.network == 'alexnet':
                from models.alexnet import KitModel
            elif FLAGS.network == 'vgg19' or FLAGS.network == 'vgg_e':
                from models.vgg19 import KitModel
            elif FLAGS.network == 'inception_v3' :
                from models.inception_v3 import KitModel
            elif FLAGS.network == 'resnet':                
                from models.resnet import KitModel
            else:
                sys.exit("Invalid network [%s]" % FLAGS.network)
      
            this_model = KitModel(FLAGS)
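            # Build the graph: distorted CIFAR input batches, the forward
            # pass (inference), the loss and the training op.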
            images, labels = cifar.distorted_inputs(FLAGS) 
            logits = this_model.inference(images)
            loss = this_model.loss(labels)
            train_op = this_model.train()

        train_dir = tempfile.mkdtemp()
        
        # device_filters restrict this worker to communicating only with the
        # parameter servers and its own task.
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index],
            graph_options=tf.GraphOptions(
                optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L1)),
            gpu_options=tf.GPUOptions(visible_device_list=""))

        if FLAGS.infer_shapes:
            sess_config.graph_options.infer_shapes = FLAGS.infer_shapes
         
        # The chief worker (task 0) runs init_op; the other workers wait for
        # the shared session in prepare_or_wait_for_session below.
        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 logdir=train_dir,
                                 init_op=tf.global_variables_initializer(),
                                 global_step=this_model.global_step,
                                 summary_writer=None,
                                 saver=None)

        if FLAGS.task_index == 0:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." % FLAGS.task_index)
               
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=sess_config,
                                              start_standard_services=True)

        print_model()
       
        print ("Start warmup for %d mini-batch." % FLAGS.warmup)
        for _ in range(FLAGS.warmup):
            sess.run(this_model.train_op)

        current_step = 0
        current_epoch = 1
        duration = 0
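        # Convert the requested number of epochs into a number of mini-batch
        # steps so the loop below can count individual sess.run() calls.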
        FLAGS.epoch = FLAGS.epoch * NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN // FLAGS.batch_size
        print("Start training for %d mini-batches." % FLAGS.epoch)
        while current_step < FLAGS.epoch:
            current_step += 1
            start_time = time.time()
            _, step_loss = sess.run([this_model.train_op, this_model.cost])
            end_time = time.time()
#            print("Finish step %d, loss = %f, speed = %f sampes/s, duration = %f seconds" % (current_step, step_loss, FLAGS.batch_size / (end_time - start_time), end_time - start_time))
            duration += end_time - start_time
            print("Time: %f seconds, step_loss:  %f" % (duration, step_loss))
            if current_step * FLAGS.batch_size > current_epoch * NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN:
                print ("Finish epoch %d" % (current_epoch))
                current_epoch += 1


        print ("Total Time = %f s." % duration)
        #writer.close()

    else:
        sys.exit("Invalid job role name [%s]!" % FLAGS.job_name)