Example #1
0
def main(argv):
    """Train the model described by the config file named on the command line.

    Reads the raw tab-separated dataset, fits a TF-IDF vectorizer on the
    ``name`` column, builds the training loader and a validation evaluator,
    then runs the epoch loop: forward pass, backward pass, optional gradient
    clipping, optimizer step, periodic logging and checkpointing.  After each
    epoch the model is saved and evaluated, and the learning rate is updated
    once the evaluation score has been worse than the reference score for
    three consecutive epochs.

    Progress is persisted as ``epoch,iteration`` in the file
    ``config.general.current_iteration_file_name`` under
    ``config.general.output_path``; if that file exists the counters are
    restored from it.

    Args:
        argv: Argument list handed to ``parser.parse_args``.

    Returns:
        None.
    """
    opts, _ = parser.parse_args(argv)
    config = ConfigParser(opts.config)

    if torch.cuda.is_available():
        # Plain ``int`` parsing replaces ``np.int``, which was removed in
        # NumPy 1.24.  Only the first listed GPU is used.
        gpu_ids = [int(gpu_id) for gpu_id in config.general.gpu_ids.split()]
        device = torch.device('cuda:{0}'.format(gpu_ids[0]))
    else:
        device = torch.device('cpu')

    raw_df = pd.read_csv(config.dataset.raw_path, sep="\t")
    name_vectorizer = train_tf_idf(MIN_NAME_DF, 'name', raw_df)

    train_loader, dataset = init_dataset(config, DBType.Train, name_vectorizer, raw_df)

    current_iteration_path = os.path.join(config.general.output_path, config.general.current_iteration_file_name)

    if os.path.isfile(current_iteration_path):
        start_epoch, epoch_iteration = np.loadtxt(current_iteration_path, delimiter=',', dtype=int)
        print('resuming from epoch %d at iteration %d' % (start_epoch, epoch_iteration))
    else:
        start_epoch, epoch_iteration = 0, 0

    model = Model(config, dataset)
    model.train()

    dataset_size = len(dataset)
    logger = Logger(config)
    current_step = start_epoch * dataset_size + epoch_iteration

    steps_counter = 0      # optimizer steps taken in this process
    accumulated_loss = 0   # sum of batch losses over the current epoch
    freq_loss = 0          # sum of batch losses since the last log line
    freq_steps = 0         # number of batches folded into freq_loss

    evaluator = Evaluator(DBType.Validation, config, name_vectorizer, raw_df)
    # Drop this function's reference to the raw frame; it is not used below.
    raw_df = None

    model.to(device)
    freq_start_time = time.time()

    # The evaluation score is treated as lower-is-better: an increase counts
    # as a "bad" epoch.
    current_eval = last_eval = 99999999
    bad_eval_count = 0

    for epoch in range(start_epoch, config.train.num_epochs):
        epoch_start_time = time.time()

        if epoch != start_epoch:
            epoch_iteration = 0

        # NOTE: when resuming, only the counters start at the restored
        # offset; the loader itself is iterated from its beginning.
        epoch_batches = 0

        for data in train_loader:
            if steps_counter % 500 == 0:
                print('{} / {}'.format(epoch_iteration, dataset_size))

            # Both counters advance in units of samples, not batches.
            current_step += config.train.batch_size
            epoch_iteration += config.train.batch_size

            name = data['name'].to(device)
            cid = data['cid'].to(device)
            c_name = data['c_name'].to(device)
            b_name = data['b_name'].to(device)
            price = data['price'].to(device).unsqueeze(1)
            shipping = data['shipping'].to(device)
            desc = data['desc'].to(device)
            desc_len = data['desc_len'].to(device)

            # The model's forward pass returns the loss, reduced to a scalar here.
            loss = model(name, cid, c_name, b_name, shipping, desc, desc_len, price)
            loss = torch.mean(loss)

            model.optimizer.zero_grad()
            loss.backward()

            if config.general.clip_grads:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)

            model.optimizer.step()

            # Read the scalar once instead of calling .item() repeatedly.
            loss_value = loss.item()
            accumulated_loss += loss_value
            freq_loss += loss_value
            freq_steps += 1
            epoch_batches += 1

            if (steps_counter % config.general.print_logs_freq == 0) and steps_counter != 0:
                # Divide by the number of batches actually summed: the first
                # window also contains step 0, so it is one batch longer than
                # print_logs_freq.
                freq_loss = freq_loss / freq_steps

                print('freq_loss {}. time {}'.format(freq_loss, time.time() - freq_start_time))

                losses_dict = {'loss': loss_value, 'freq_loss': freq_loss}
                logger.dump_current_errors(losses_dict, current_step)

                freq_loss = 0
                freq_steps = 0
                freq_start_time = time.time()

            if (steps_counter % config.general.save_checkpoint_freq == 0) and steps_counter != 0:
                print('========== saving model (epoch %d, total_steps %d) =========' % (epoch, current_step))
                model.save('latest')
                np.savetxt(current_iteration_path, (epoch, epoch_iteration), delimiter=',', fmt='%d')

            steps_counter += 1

        print('end of epoch %d / %d \t time taken: %d sec' %
              (epoch, config.train.num_epochs, time.time() - epoch_start_time))

        # Mean batch loss over the epoch; skipped when the loader produced
        # no batches so an empty epoch does not raise.
        if epoch_batches:
            accumulated_loss = accumulated_loss / epoch_batches

        print('accumulated loss {}'.format(accumulated_loss))

        losses_dict = {'accumulated_loss': accumulated_loss}
        logger.dump_current_errors(losses_dict, current_step)

        accumulated_loss = 0
        model.save('latest')
        model.save(str(epoch))

        # Record that the next run should start at the following epoch.
        np.savetxt(current_iteration_path, (epoch + 1, 0), delimiter=',', fmt='%d')

        current_eval = evaluator.eval(model, max_iterations=config.train.max_eval_iterations)

        if current_eval > last_eval:
            bad_eval_count += 1

            # After three consecutive worse evaluations, update the learning
            # rate and take the current score as the new reference.
            if bad_eval_count == 3:
                model.update_learning_rate()
                bad_eval_count = 0
                last_eval = current_eval
        else:
            bad_eval_count = 0
            last_eval = current_eval
Example #2
0
def main():
    """Build the fully connected model, split the data, and train it.

    Sets up TensorBoard output, constructs ``Model`` on a new TensorFlow
    session with the hyper-parameters defined below, loads the data via
    ``get_data``, holds out 30% of it for validation, and starts training.

    Returns:
        None.
    """
    tensorboard_directory = './tmp/tensorboard/001'
    tensorboard_paths = [
        r'C:\Users\parth\Documents\GitHub\Kaggle-Santander-Value-Prediction-Challenge\tmp\tensorboard\001'
    ]
    tensorboard_names = ['rmse']

    # Model Parameters
    # --------------------------------------------------------------------------

    use_dropout = False
    use_batch_norm = False

    # Per-layer dropout settings.  Each entry, when supplied, is a dict with
    #     use : to use dropout in this layer
    #     rate : dropout rate
    # The list is passed empty here (use_dropout is False as well).
    dropout_parameters = []

    # Fully Connected Layers unit size
    fc_parameters = [{'units': 5000} for _ in range(4)]

    data_shape = [None, 4990]
    batch_size = 500
    epochs = 100000
    learning_rate = 0.001
    session = tf.Session()

    Tensorboard.make(paths=tensorboard_paths,
                     names=tensorboard_names,
                     host='127.0.0.1',
                     port='6006',
                     output=True,
                     start=False)

    # NOTE(review): num_dense is passed as 2 although fc_parameters lists
    # four layers -- confirm which of the two is intended.
    model = Model(sess=session,
                  data_shape=data_shape,
                  num_classes=1,
                  num_dense=2,
                  learning_rate=learning_rate,
                  use_batch_norm=use_batch_norm,
                  use_dropout=use_dropout,
                  dropout_parameters=dropout_parameters,
                  fc_parameters=fc_parameters,
                  tensorboard_directory=tensorboard_directory)

    train_data, train_labels = get_data()
    # Hold out 30% of the samples for validation.
    train_data, val_data, train_labels, val_labels = train_test_split(
        train_data, train_labels, test_size=0.30)

    print('> Training Data: {} {}'.format(train_data.shape,
                                          train_labels.shape))
    print('> Val Data: {} {}'.format(val_data.shape, val_labels.shape))

    model.train_data(data=train_data, labels=train_labels)

    model.val_data(data=val_data, labels=val_labels)

    model.train(batch_size=batch_size, epochs=epochs)