import os
import sys
from io import BytesIO

import urllib3
from PIL import Image

# try_makedirs is provided by the surrounding project.


def DownloadImage(key_url):
    """Download a single image given a (key, url) pair and save it as a JPEG."""
    out_dir = sys.argv[2]
    (key, url) = key_url
    if isinstance(key, tuple):
        filename = os.path.join(out_dir, str(key[1]), '%s.jpg' % key[0])
    else:
        filename = os.path.join(out_dir, 'test', '%s.jpg' % key)

    if os.path.exists(filename):
        print('Image %s already exists. Skipping download.' % filename)
        return
    else:
        try_makedirs(os.path.dirname(filename))

    try:
        # print('Trying to get %s.' % url)
        http = urllib3.PoolManager()
        response = http.request('GET', url, timeout=10)
        image_data = response.data
    except Exception:
        print('Warning: Could not download image %s from %s' %
              (os.path.basename(filename), url))
        return

    try:
        pil_image = Image.open(BytesIO(image_data))
    except Exception:
        print('Warning: Failed to parse image %s %s' %
              (os.path.basename(filename), url))
        return

    try:
        pil_image_rgb = pil_image.convert('RGB')
    except Exception:
        print('Warning: Failed to convert image %s to RGB' %
              os.path.basename(filename))
        return

    try:
        pil_image_rgb.save(filename, format='JPEG', quality=90)
    except Exception:
        print('Warning: Failed to save image %s' % filename)
        return
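
# Usage sketch (an assumption, not from the original source): DownloadImage
# is usually driven by a multiprocessing pool over (key, url) pairs, with the
# script invoked as `python script.py <data_file> <out_dir>`; `parse_data`
# here stands in for whatever CSV parser the original script uses.
#
#     from multiprocessing import Pool
#     key_url_list = parse_data(sys.argv[1])  # hypothetical CSV parser
#     with Pool(processes=12) as pool:
#         pool.map(DownloadImage, key_url_list)
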
import glob
import os

import torch

# load_model, train, test and try_makedirs come from the project's own modules.


def train_one_dataset(params, file_name, train_q_data, train_qa_data,
                      train_pid, valid_q_data, valid_qa_data, valid_pid,
                      test_q_data, test_qa_data, test_pid):
    # ================================== model initialization ==================================

    model = load_model(params)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=params.lr,
                                 betas=(0.9, 0.999),
                                 eps=1e-8)

    print("\n")

    # total_params = sum(p.numel() for p in model.parameters())
    # print(f'{total_params:,} total parameters.')
    # total_trainable_params = sum(
    #     p.numel() for p in model.parameters() if p.requires_grad)
    # print(f'{total_trainable_params:,} training parameters.')

    # ================================== start training ==================================
    all_train_loss = {}
    all_train_accuracy = {}
    all_train_auc = {}
    all_valid_loss = {}
    all_valid_accuracy = {}
    all_valid_auc = {}
    all_test_loss = {}
    all_test_accuracy = {}
    all_test_auc = {}
    best_valid_auc = 0
    best_epoch = 0  # initialize so the early-stopping check below cannot raise NameError
    cur_train_auc = 0
    cur_test_auc = 0

    for idx in range(params.max_iter):
        # Train Model
        train_loss, train_accuracy, train_auc = train(model,
                                                      params,
                                                      optimizer,
                                                      train_q_data,
                                                      train_qa_data,
                                                      train_pid,
                                                      label='Train')
        # Validation step
        valid_loss, valid_accuracy, valid_auc = test(model,
                                                     params,
                                                     optimizer,
                                                     valid_q_data,
                                                     valid_qa_data,
                                                     valid_pid,
                                                     label='Valid')
        # Test step
        test_loss, test_accuracy, test_auc = test(model,
                                                  params,
                                                  optimizer,
                                                  test_q_data,
                                                  test_qa_data,
                                                  test_pid,
                                                  label='Test')

        print('epoch', idx + 1)
        print("\ttrain_auc\t", train_auc, "valid_auc\t", valid_auc,
              "\ttest_auc\t", test_auc)
        print("\ttrain_accuracy\t", train_accuracy, "valid_accuracy\t", valid_accuracy,\
              "\ttest_accuracy\t", test_accuracy)
        print("\ttrain_loss\t", train_loss, "valid_loss\t", valid_loss,
              "test_loss\t", test_loss)

        try_makedirs('model')
        try_makedirs(os.path.join('model', params.model))
        try_makedirs(os.path.join('model', params.model, params.save))

        # record per-epoch metrics (written to the result log below)
        all_valid_auc[idx + 1] = valid_auc
        all_train_auc[idx + 1] = train_auc
        all_test_auc[idx + 1] = test_auc
        all_valid_loss[idx + 1] = valid_loss
        all_train_loss[idx + 1] = train_loss
        all_test_loss[idx + 1] = test_loss
        all_valid_accuracy[idx + 1] = valid_accuracy
        all_train_accuracy[idx + 1] = train_accuracy
        all_test_accuracy[idx + 1] = test_accuracy

        # output the epoch with the best validation auc
        if valid_auc > best_valid_auc:
            # remove stale checkpoints for this run before saving the new best
            path = os.path.join('model', params.model, params.save,
                                file_name) + '_*'
            for i in glob.glob(path):
                os.remove(i)
            print(best_valid_auc, ' to ', valid_auc)
            best_valid_auc = valid_auc
            cur_train_auc = train_auc
            cur_test_auc = test_auc
            best_epoch = idx + 1
            torch.save(
                {
                    'epoch': idx,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': train_loss,
                },
                os.path.join('model', params.model, params.save, file_name) +
                '_' + str(idx + 1))
        # early stopping: give up after 40 epochs without validation improvement
        if idx - best_epoch > 40:
            break

    print("cur_train_auc\t", cur_train_auc, "best_valid_auc\t", best_valid_auc, "\n", "cur_test_auc\t",\
          cur_test_auc)

    try_makedirs('result')
    try_makedirs(os.path.join('result', params.model))
    try_makedirs(os.path.join('result', params.model, params.save))
    with open(os.path.join('result', params.model, params.save, file_name),
              'w') as f_save_log:
        f_save_log.write("valid_auc:\n" + str(all_valid_auc) + "\n\n")
        f_save_log.write("train_auc:\n" + str(all_train_auc) + "\n\n")
        f_save_log.write("test_auc:\n" + str(all_test_auc) + "\n\n")
        f_save_log.write("valid_loss:\n" + str(all_valid_loss) + "\n\n")
        f_save_log.write("train_loss:\n" + str(all_train_loss) + "\n\n")
        f_save_log.write("test_loss:\n" + str(all_test_loss) + "\n\n")
        f_save_log.write("valid_accuracy:\n" + str(all_valid_accuracy) + "\n\n")
        f_save_log.write("train_accuracy:\n" + str(all_train_accuracy) + "\n\n")
        f_save_log.write("test_accuracy:\n" + str(all_test_accuracy) + "\n\n")
    return best_epoch
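
# Usage sketch (an assumption, not from the original source): a typical call
# passes the question/answer/problem-id arrays produced by the dataset loader,
# e.g.
#
#     best_epoch = train_one_dataset(params, 'run_fold1',
#                                    train_q, train_qa, train_pid,
#                                    valid_q, valid_qa, valid_pid,
#                                    test_q, test_qa, test_pid)
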
from glob import glob
from os import path, system
from random import shuffle
from shutil import rmtree

import lmdb
import pandas as pd

# get_logger, try_makedirs, get_genre_labels, generate_images, make_datum,
# get_percentage, CWD and DATA_PATH come from the project's own modules.


def main():
    """Build the train/validation LMDBs for genre classification."""

    logger = get_logger(path.splitext(path.basename(__file__))[0] + '.log')
    logger.info('Greetings from ' + path.basename(__file__))

    validation_ratio = 6  # every 6th generated image goes to the validation split
    db_data_path = path.join(CWD, 'input')
    train_db_path = path.join(db_data_path, 'train_lmdb')
    validation_db_path = path.join(db_data_path, 'validation_lmdb')

    try_makedirs(db_data_path)

    if path.exists(train_db_path):
        logger.info('Removing ' + train_db_path)
        rmtree(train_db_path)
    if path.exists(validation_db_path):
        logger.info('Removing ' + validation_db_path)
        rmtree(validation_db_path)

    all_data_info = pd.read_csv(path.join(DATA_PATH, 'all_data_info.csv'))
    # Creating label, genre data frame
    # genres = pd.DataFrame({'genre': all_data_info['genre'].dropna().unique()})
    genres = pd.DataFrame(columns=['label', 'genre', 'amount'])
    genre_label = get_genre_labels()
    for i, data in enumerate(genre_label):
        amount = all_data_info[all_data_info['genre'].isin(data['addition'])]
        genres.loc[i] = [data['label'], data['genre'], len(amount)]
    genres.to_csv(path.join(DATA_PATH, 'genre_labels.csv'), index=False)

    logger.info('Creating train_db_path and validation_db_path')

    train_images = glob(path.join(DATA_PATH, 'train', '*.jpg'))
    shuffle(train_images)
    null_genre = 0
    null_label = 0
    generated_imgs = 0
    genre_label = get_genre_labels(True)
    train_db = lmdb.open(train_db_path, map_size=int(1e12))
    validation_db = lmdb.open(validation_db_path, map_size=int(1e12))
    with train_db.begin(write=True) as train_tnx, validation_db.begin(
            write=True) as validation_tnx:
        for in_idx, img_path in enumerate(train_images):
            # look up the painting's genre by filename
            genre = all_data_info[all_data_info['new_filename'] ==
                                  path.basename(img_path)]['genre'].dropna()
            # some paintings have no genre; skip them
            if len(genre) < 1:
                null_genre += 1
                continue
            if genre.values[0] not in genre_label:
                # the genre has no label; log it and move on
                null_label += 1
                logger.critical(str(genre.values[0]) + ' has no label!')
                continue
            label = genre_label[genre.values[0]]['label']
            imgs = generate_images(
                img_path,
                genres[genres['label'] == int(label)]['amount'].values[0])
            for i, img in enumerate(imgs):
                datum = make_datum(img, int(label))
                # the key must be unique per augmented image; keying on in_idx
                # alone would overwrite augmentations of the same painting
                key = '{:0>5d}_{:0>3d}'.format(in_idx, i).encode('ascii')
                if (in_idx + generated_imgs + i) % validation_ratio != 0:
                    train_tnx.put(key, datum.SerializeToString())
                else:
                    validation_tnx.put(key, datum.SerializeToString())
            generated_imgs += len(imgs)
            logger.debug('{:0>5d}'.format(in_idx) + ':' + img_path + ' (+ ' +
                         str(len(imgs)) + ' augmented)')

            # printing progress and file name
            print(
                get_percentage(in_idx, len(train_images)) + str(label) + ' ' +
                path.basename(img_path))
    train_db.close()
    validation_db.close()

    logger.info('Genre is null: ' + str(null_genre))
    logger.info('Label is null: ' + str(null_label))
    logger.info('Finished processing all images')
    logger.info('Computing image mean')
    # compute the training-set image mean with Caffe's compute_image_mean tool
    system('compute_image_mean -backend=lmdb ' + train_db_path + ' ' +
           path.join(db_data_path, 'mean.binaryproto'))
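
# Entry-point guard (assumed; the original listing omits it):
if __name__ == '__main__':
    main()
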
from multiprocessing import cpu_count
from os import path

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import plot_model
from sklearn.metrics import confusion_matrix

# Imports assume standalone Keras 2.x. get_model, plot_loss_acc,
# plot_confusion_matrix, try_makedirs, config, CLASSES and the
# *_DATA_PATH / MODELS_PATH constants come from the project's own modules.


def train_and_predict(model_type, gpus):
    """Train the model and write prediction files."""
    # creating data generators
    # creating data generators
    train_datagen = ImageDataGenerator(rescale=1. / 255, horizontal_flip=True)
    test_datagen = ImageDataGenerator(rescale=1. / 255)
    train_generator = train_datagen.flow_from_directory(
        TRAIN_DATA_PATH,
        class_mode='binary',
        seed=171717,
        **config[model_type]['flow_generator'])
    validation_generator = test_datagen.flow_from_directory(
        VALIDATION_DATA_PATH,
        class_mode='binary',
        shuffle=False,  # keep order aligned with .classes for the confusion matrix
        **config[model_type]['flow_generator'])
    test_generator = test_datagen.flow_from_directory(
        TEST_DATA_PATH,
        class_mode=None,
        classes=CLASSES,
        shuffle=False,
        **config[model_type]['flow_generator'])

    # loading the model
    parallel_model, model = get_model(model=model_type, gpus=gpus)
    print('Training model')
    print(model.summary())
    history = parallel_model.fit_generator(
        train_generator,
        validation_data=validation_generator,
        callbacks=[
            EarlyStopping(monitor='val_loss', min_delta=0, patience=5),
            ReduceLROnPlateau(
                monitor='val_loss', factor=0.2, patience=3, min_lr=0.000001),
            TerminateOnNaN()
        ],
        max_queue_size=100,
        use_multiprocessing=True,
        workers=cpu_count(),
        **config[model_type]['fit_generator'])
    # history of training
    # print(history.history.keys())
    # Saving architecture + weights + optimizer state
    model_path = path.join(MODELS_PATH, '{}_{:.4f}_{:.4f}'.format(
        model_type, history.history['val_loss'][-1]
        if 'val_loss' in history.history else history.history['loss'][-1],
        history.history['val_acc'][-1]
        if 'val_acc' in history.history else history.history['acc'][-1]))
    try_makedirs(model_path)
    plot_model(model, path.join(model_path, 'model.png'), show_shapes=True)
    plot_loss_acc(history, model_path)

    print('Saving model')
    model.save(path.join(model_path, 'model.h5'))
    # Building confusion matrices for every class for validation data
    print("Building confusion matrices")
    val_preds = model.predict_generator(
        validation_generator,
        max_queue_size=100,
        use_multiprocessing=True,
        workers=cpu_count())
    plot_confusion_matrix(
        confusion_matrix(
            list(validation_generator.classes), np.argmax(val_preds, axis=1)),
        CLASSES, model_path)

    print('Generating predictions')
    predictions = model.predict_generator(
        test_generator,
        max_queue_size=100,
        use_multiprocessing=True,
        workers=cpu_count())
    pred_classes = np.argmax(predictions, axis=1)
    # strip the directory prefix ('test/') and '.jpg' suffix from each filename
    ids = list(map(lambda name: name[5:-4], test_generator.filenames))
    # probability of the predicted class for each sample
    proba = predictions[np.arange(len(predictions)), pred_classes]
    # Generating predictions.csv for Kaggle
    pd.DataFrame({
        'id': ids,
        'predicted': pred_classes,
    }).sort_values(by='id').to_csv(
        path.join(model_path, 'predictions.csv'), index=False)
    # Generating predictions.csv with some additional data for post-processing
    pd.DataFrame({
        'id': ids,
        'predicted': pred_classes,
        'proba': proba
    }).sort_values(by='id').to_csv(
        path.join(model_path, 'predictions_extd.csv'), index=False)
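
# Usage sketch (an assumption, not from the original source): with the
# project's `config` providing per-model 'flow_generator' and 'fit_generator'
# kwargs, a typical call might look like
#
#     train_and_predict('xception', gpus=2)
#
# where 'xception' is whatever key the config defines.
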
import glob
import os

import torch

# load_model, train, test and try_makedirs come from the project's own modules.


def train_one_dataset(params, file_name, train_s_data, train_sa_data,
                      train_eid, train_tid, train_fid, train_xid, train_yid,
                      valid_s_data, valid_sa_data, valid_eid, valid_tid,
                      valid_fid, valid_xid, valid_yid):
    # ================================== model initialization ==================================

    model = load_model(params)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=params.lr,
                                 betas=(0.9, 0.999),
                                 eps=1e-8)

    print("\n")

    # ================================== start training ==================================
    all_train_loss = {}
    all_train_acc = {}
    all_train_auc = {}
    all_valid_loss = {}
    all_valid_acc = {}
    all_valid_auc = {}
    best_valid_auc = 0
    best_epoch = 0  # initialize so the early-stopping check below cannot raise NameError

    for idx in range(params.max_iter):
        # Train Model
        train_loss, train_acc, train_auc = train(model,
                                                 params,
                                                 optimizer,
                                                 train_s_data,
                                                 train_sa_data,
                                                 train_eid,
                                                 train_tid,
                                                 train_fid,
                                                 train_xid,
                                                 train_yid,
                                                 label='Train')
        # Validation step
        valid_loss, valid_acc, valid_auc = test(model,
                                                params,
                                                optimizer,
                                                valid_s_data,
                                                valid_sa_data,
                                                valid_eid,
                                                valid_tid,
                                                valid_fid,
                                                valid_xid,
                                                valid_yid,
                                                label='Valid')

        print('epoch', idx + 1)
        print("valid_auc\t", valid_auc, "\ttrain_auc\t", train_auc)
        print("valid_acc\t", valid_acc, "\ttrain_acc\t", train_acc)
        print("valid_loss\t", valid_loss, "\ttrain_loss\t", train_loss)

        try_makedirs('model')
        try_makedirs(os.path.join('model', params.model))
        try_makedirs(os.path.join('model', params.model, params.save))

        all_valid_auc[idx + 1] = valid_auc
        all_train_auc[idx + 1] = train_auc
        all_valid_loss[idx + 1] = valid_loss
        all_train_loss[idx + 1] = train_loss
        all_valid_acc[idx + 1] = valid_acc
        all_train_acc[idx + 1] = train_acc

        # output the epoch with the best validation auc
        if valid_auc > best_valid_auc:
            # remove stale checkpoints for this run before saving the new best
            path = os.path.join('model', params.model, params.save,
                                file_name) + '_*'
            for i in glob.glob(path):
                os.remove(i)
            best_valid_auc = valid_auc
            best_epoch = idx + 1
            torch.save(
                {
                    'epoch': idx,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': train_loss,
                },
                os.path.join('model', params.model, params.save, file_name) +
                '_' + str(idx + 1))
        # early stopping: give up after 40 epochs without validation improvement
        if idx - best_epoch > 40:
            break

    try_makedirs('result')
    try_makedirs(os.path.join('result', params.model))
    try_makedirs(os.path.join('result', params.model, params.save))
    with open(os.path.join('result', params.model, params.save, file_name),
              'w') as f_save_log:
        f_save_log.write("valid_auc:\n" + str(all_valid_auc) + "\n\n")
        f_save_log.write("train_auc:\n" + str(all_train_auc) + "\n\n")
        f_save_log.write("valid_loss:\n" + str(all_valid_loss) + "\n\n")
        f_save_log.write("train_loss:\n" + str(all_train_loss) + "\n\n")
        f_save_log.write("valid_acc:\n" + str(all_valid_acc) + "\n\n")
        f_save_log.write("train_acc:\n" + str(all_train_acc) + "\n\n")
    return best_epoch
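
# Usage sketch (an assumption, not from the original source): the caller is
# expected to pass the per-split tensors produced by the project's data
# loader, e.g.
#
#     best_epoch = train_one_dataset(params, 'run_fold1',
#                                    train_s, train_sa, train_e, train_t,
#                                    train_f, train_x, train_y,
#                                    valid_s, valid_sa, valid_e, valid_t,
#                                    valid_f, valid_x, valid_y)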