def run_cross_validation(self, locs_in=["SAB"]):

        print(" \n\n--- Cross Validation for locations: {}\n".format(locs_in))

        locations = utilities.get_locations(locs_in)

        data_in = self.data.get_data()
        data_in = data_in[data_in["Environment"].isin(locations)]

        label_encoder, pipeline = utilities.prepare_skl_interface(
            data_in, self.classifier)

        # shuffle with random seed if specified
        if self.random_state is not None:
            data_in_shuffled = skl.utils.shuffle(
                data_in, random_state=self.random_state)
        else:
            data_in_shuffled = skl.utils.shuffle(data_in)

        # get metrics
        output_metrics = utilities.cross_validation(pipeline,
                                                    self.folds,
                                                    data_in_shuffled,
                                                    label_encoder,
                                                    self.srp_dict,
                                                    data_aug=self.data_aug)

        metrics.print_metrics(output_metrics, self.paper_metrics_only)
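
For reference, the shuffle-then-cross-validate pattern above can be reproduced with plain scikit-learn; this is a minimal sketch on synthetic data, not the project's utilities.cross_validation helper (which also handles the SRP features and data augmentation):

import sklearn.utils
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X, y = sklearn.utils.shuffle(X, y, random_state=42)  # reproducible shuffle, as above

pipeline = make_pipeline(StandardScaler(), SVC())
scores = cross_val_score(pipeline, X, y, cv=5)  # one accuracy score per fold
print("mean accuracy = {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))
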
Example #2
def test_with_svm(dataset_test, classifier, preprocessing, pca_processing,
                  show_testing_metrics, labels, names_test, names) -> List:

    # Apply the PCA/KPCA transformation to the test data
    dataset_test_pca = preprocess_dataset(pca_processing, preprocessing,
                                          dataset_test)
    labels_test_mapped_to_labels_train = []

    testing_with_training_dataset = True
    for label in labels:
        try:
            label_mapped = list(names).index(names_test[label])
        except ValueError:
            # If the name is not in the training dataset, the label stays unmapped
            label_mapped = label
            # Assume the user is not testing with the training dataset
            testing_with_training_dataset = False
            show_testing_metrics = False
        labels_test_mapped_to_labels_train.append(label_mapped)

    # Note: the scaler is fit on the test data itself; ideally the scaler
    # fitted on the training data would be reused here
    sc = StandardScaler()
    scaled_dataset_test_pca = sc.fit_transform(dataset_test_pca)

    # Test classifier
    y_pred = classifier.predict(scaled_dataset_test_pca)
    # classifier.save(preprocessing, pca_processing)

    # To obtain metrics
    print_metrics(y_pred, names, labels, labels_test_mapped_to_labels_train,
                  names_test, testing_with_training_dataset,
                  show_testing_metrics)

    return [names[int(y_pred[i])] for i in range(len(y_pred))]
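
The label-mapping loop is the subtle part of this function: each test label indexes into names_test, and the resulting name is looked up in the training names to recover the corresponding training label. A self-contained sketch with made-up name lists (names, names_test, and labels here are assumptions):

names = ["alice", "bob", "carol"]  # training identities; index = training label
names_test = ["bob", "dave"]       # test identities; index = test label
labels = [0, 1]                    # test labels to remap

mapped = []
for label in labels:
    try:
        mapped.append(names.index(names_test[label]))  # test label -> training label
    except ValueError:
        mapped.append(label)  # identity unseen in training: keep the raw label

print(mapped)  # [1, 1] -- "bob" maps to training label 1, "dave" is unseen
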
Example #3
def main():
    # Add seed
    random_seed = 42
    torch.manual_seed(random_seed)
    args = parser.get()
    X_train = load('./datas/X_train.npy')
    y_train = load('./datas/y_train.npy')
    X_test = load('./datas/X_test.npy')
    train_dataset = data.DatasetXy(X_train, y_train)
    test_dataset = data.DatasetX(X_test)
    data_class = data.Dataloader(args, train_dataset, test_dataset)

    train, test = data_class.train(), data_class.test()

    model = models.get(args)
    optimizer = optimizers.get(args, model.parameters())
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(args.epochs):
        train_metrics = runner.run(
            model,
            criterion,
            optimizer,
            train,
            True,
            {
                "loss": metrics.loss,
                "accuracy": metrics.accuracy
            },
        )
        metrics.print_metrics(train_metrics)

    y_test_pred = runner.run(
        model,
        criterion,
        optimizer,
        test,
        False,
        {
            "loss": metrics.loss,
            "accuracy": metrics.accuracy
        },
    )

    print(y_test_pred)
    y_test_pred = [item for sublist in y_test_pred for item in sublist]
    #print((y_test_pred[0]).shape)
    #_, y_pred = torch.max(y_test_pred, dim = 1)

    #y_pred = torch.round(y_test_pred)
    # _, y_pred = torch.max(y_test_pred, dim = 1)
    # y_pred = y_pred.cpu().numpy()
    #print(len(y_pred_list))
    #print(y_pred.type)
    y_test = np.asarray(y_test_pred)
    pd.DataFrame({
        "Id": np.arange(len(y_test)),
        "Category": y_test
    }).astype(int).to_csv("solution.csv", index=False)
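
The post-processing at the end (flatten the batched predictions, then write a Kaggle-style submission CSV) works standalone; a sketch with dummy per-batch outputs in place of runner.run's return value:

import numpy as np
import pandas as pd

batched_preds = [[0, 2, 1], [1, 1], [0]]  # dummy list-of-batches, as assumed above
flat = [item for sublist in batched_preds for item in sublist]  # flatten batches

pd.DataFrame({
    "Id": np.arange(len(flat)),
    "Category": np.asarray(flat)
}).astype(int).to_csv("solution.csv", index=False)
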
Example #4
def svm():
    # *********************    load the dataset and split into X & y   ***********************
    from sklearn.datasets import make_blobs
    X, Y = make_blobs(cluster_std=0.9,
                      random_state=20,
                      n_samples=1000,
                      centers=10,
                      n_features=10)

    from Algorithms.ML_.helper.data_helper import split_train_val_test
    X, Xv, y, Yv, Xt, Yt = split_train_val_test(X, Y)
    print(X.shape, y.shape, Xv.shape, Yv.shape, Xt.shape, Yt.shape)

    # *********************   build model    ***********************
    from model import SVM
    from activation import Activation, Softmax, Hinge
    from regularization import Regularization, L1, L2, L12
    from optimizer import Vanilla
    model = SVM()
    learning_rate, reg_rate = 1e-3, 5e-1
    model.compile(alpha=learning_rate,
                  lambda_=reg_rate,
                  activation=Softmax(),
                  reg=L2(),
                  opt=Vanilla())
    model.describe()
    # *********************    train   ***********************
    loss_train, loss_val = model.train(X,
                                       y,
                                       val=(Xv, Yv),
                                       iter_=1000,
                                       return_loss=True,
                                       verbose=True,
                                       eps=1e-3)
    import matplotlib.pyplot as plt
    plt.plot(range(len(loss_train)), loss_train)
    plt.plot(range(len(loss_val)), loss_val)
    plt.legend(['train', 'val'])
    plt.xlabel('Iteration')
    plt.ylabel('Training loss')
    plt.title('Training Loss history')
    plt.show()
    # *********************    predict   ***********************
    pred_train = model.predict(X)
    pred_val = model.predict(Xv)
    pred_test = model.predict(Xt)

    import metrics

    print('train accuracy=', metrics.accuracy(y, pred_train))
    print('val accuracy=', metrics.accuracy(Yv, pred_val))
    print('test accuracy=', metrics.accuracy(Yt, pred_test))
    print('null accuracy=', metrics.null_accuracy(y))
    import metrics
    metrics.print_metrics(Yt, pred_test)
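
For a quick cross-check, the same blobs problem can be run through scikit-learn's built-in linear SVM instead of the custom model module; a sketch (LinearSVC uses hinge loss, whereas the snippet above compiles a Softmax activation):

from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

X, Y = make_blobs(cluster_std=0.9, random_state=20,
                  n_samples=1000, centers=10, n_features=10)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=20)
clf = LinearSVC(C=1.0).fit(X_train, y_train)
print("test accuracy =", accuracy_score(y_test, clf.predict(X_test)))

Example #5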
    def run_generalisation(self,
                           train_locs=["DA"],
                           test_locs=["DB"],
                           save_classifier=True):

        print("\n\n --- Generalization across locations --- \n")
        print("Locations in train set: {}".format(train_locs))
        print("Locations in test set: {}".format(test_locs))

        train_locs = utilities.get_locations(train_locs)
        test_locs = utilities.get_locations(test_locs)

        data_in = self.data.get_data()
        label_encoder, pipeline = utilities.prepare_skl_interface(
            data_in, self.classifier)

        train_data = data_in[data_in["Environment"].isin(train_locs)]
        test_data = data_in[data_in["Environment"].isin(test_locs)]

        accuracy, conf_mat = utilities.train_and_test(
            train_data,
            test_data,
            pipeline,
            label_encoder,
            self.srp_dict,
            save_cls=save_classifier,
            out_folder=self.output_folder)

        all_metrics = {
            "overall_accuracy": (accuracy, 0),
            "per_class_accuracy":
            (metrics.getPCaccuracy(conf_mat), np.zeros(4)),
            "per_class_precision":
            (metrics.getPCPrecision(conf_mat), np.zeros(4)),
            "per_class_recall": (metrics.getPCRecall(conf_mat), np.zeros(4)),
            "per_class_iou": (metrics.getPCIoU(conf_mat), np.zeros(4))
        }

        metrics.print_metrics(all_metrics, self.paper_metrics_only)
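
The getPC* helpers are project-specific, but all four per-class quantities follow directly from the confusion matrix. A sketch of the standard formulas, assuming rows are true classes and columns are predictions:

import numpy as np

def per_class_metrics(conf_mat):
    conf_mat = np.asarray(conf_mat, dtype=float)
    tp = np.diag(conf_mat)
    fp = conf_mat.sum(axis=0) - tp  # predicted as class c, but wrong
    fn = conf_mat.sum(axis=1) - tp  # true class c, but missed
    recall = tp / (tp + fn)         # a.k.a. per-class accuracy
    precision = tp / (tp + fp)
    iou = tp / (tp + fp + fn)       # intersection over union
    return recall, precision, iou

cm = np.array([[50, 2, 3], [4, 40, 1], [0, 5, 45]])
print(per_class_metrics(cm))
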
Example #6
def main():
    args = parser.get()

    data_class = data.Dataset(args)
    train, validation = data_class.train(), data_class.validation()

    model = models.get(args)
    optimizer = optimizers.get(args, model.parameters())
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(args.epochs):
        train_metrics = runner.run(
            model,
            criterion,
            optimizer,
            train,
            True,
            {
                "loss": metrics.loss,
                "accuracy": metrics.accuracy
            },
        )
        metrics.print_metrics(train_metrics)
        validation_metrics = runner.run(
            model,
            criterion,
            optimizer,
            validation,
            False,
            {
                "loss": metrics.loss,
                "accuracy": metrics.accuracy
            },
        )
        metrics.print_metrics(validation_metrics)
Example #7
def test_with_svm(dataset_test, classifier, preprocessing, pca_processing,
                  show_testing_metrics, labels_test, labels_train, names_test,
                  names):
    # Apply the PCA transformation to the test data
    dataset_test_pca = preprocess_dataset(pca_processing, preprocessing,
                                          dataset_test)

    labels_test_mapped_to_labels_train = []

    testing_with_training_dataset = True
    for label in labels_test:
        try:
            label_mapped = list(names).index(names_test[label])
        except ValueError:
            # If the name is not in the training dataset, the label stays unmapped
            label_mapped = label
            # Assume the user is not testing with the training dataset
            testing_with_training_dataset = False
            show_testing_metrics = False
        labels_test_mapped_to_labels_train.append(label_mapped)

    print(f"Shape of test set {dataset_test_pca}")
    # Test classifier
    y_pred = classifier.predict(dataset_test_pca)
    # classifier.save(preprocessing, pca_processing)

    # dataset_test = np.array(dataset_test_pca)
    # for i in range(dataset_test.shape[0]):
    #     pca_processing.reconstruct_image(dataset_test[i], names_test[labels_test[i]], names[y_pred[i]])

    # To obtain metrics
    print_metrics(y_pred, names, labels_test,
                  labels_test_mapped_to_labels_train, names_test,
                  testing_with_training_dataset, show_testing_metrics)

    return [names[int(y_pred[i])] for i in range(len(y_pred))]
Example #8
    predictions = model.predict_generator(
        ds.generate_test(BATCH_SIZE),
        steps=ds.num_batches_test(BATCH_SIZE),
        workers=7,
        use_multiprocessing=False,
        max_queue_size=BATCH_SIZE,
        verbose=1)

    # Get test targets from generator
    y_set = None
    for x, y in ds.generate_test(BATCH_SIZE):
        y_set = np.array(y) if y_set is None else np.vstack((y_set, y))

    # Compute all the metrics
    metrics = compute_metrics(y_set, predictions, ds.num_classes)

    # Print metrics
    print_metrics(metrics)

    # Write test results to file
    with results_file_path.open('a') as f:
        f.write(F"\n{metrics_to_string(metrics)},{time.time() - start_time}")

    # Write confusion matrix to text file
    with cm_file_path.open('a') as f:
        f.write(f"===== Fold {fold} ======\n\n")
        f.write(str(metrics['Confusion matrix']))
        f.write("\n\n\n\n")

    del model
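
Re-running the generator to collect targets, as above, assumes it yields batches in a fixed order (and that it terminates). The metric step itself reduces to standard scikit-learn calls; a sketch with dummy one-hot targets in place of the project's compute_metrics:

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

y_set = np.array([[1, 0], [0, 1], [0, 1]])                    # dummy one-hot targets
predictions = np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])  # dummy softmax output

y_true = y_set.argmax(axis=1)
y_pred = predictions.argmax(axis=1)
print("accuracy =", accuracy_score(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))
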
Example #9
def regression():
    # *********************    load the dataset and split into X & y   ***********************
    from sklearn.datasets import make_blobs
    X, Y = make_blobs(cluster_std=0.9,
                      random_state=20,
                      n_samples=1000,
                      centers=10,
                      n_features=10)

    from Algorithms.ML_.helper.data_helper import split_train_val_test
    X, Xv, y, Yv, Xt, Yt = split_train_val_test(X, Y)
    print(X.shape, y.shape, Xv.shape, Yv.shape, Xt.shape, Yt.shape)

    # *********************   build model    ***********************
    from model import Regression
    from layer import Layer, Dense
    from activation import Activation, Softmax, Sigmoid, ReLU
    from regularization import Regularization, L1, L2, L12
    from optimizer import Vanilla
    model = Regression()
    input_size = X.shape[1]
    hidden_size = 50
    num_classes = 10
    learning_rate, reg_rate = 1e-3, 0.5
    model = Regression([
        Dense(hidden_size,
              input_shape=(input_size, ),
              activation=ReLU(),
              alpha=learning_rate,
              lambda_=reg_rate),
    ])
    model += Dense(num_classes,
                   activation=Softmax(),
                   alpha=learning_rate,
                   lambda_=reg_rate)  # add layer with +=
    model.compile()
    model.describe()
    # *********************    train   ***********************
    loss_train, loss_val = model.train(X,
                                       y,
                                       val=(Xv, Yv),
                                       iter_=5000,
                                       batch=32,
                                       return_loss=True,
                                       verbose=True)

    import matplotlib.pyplot as plt
    plt.plot(range(len(loss_train)), loss_train)
    plt.plot(range(len(loss_val)), loss_val)
    plt.legend(['train', 'val'])
    plt.xlabel('Iteration')
    plt.ylabel('Training loss')
    plt.title('Training Loss history')
    plt.show()
    # *********************    predict   ***********************
    pred_train = model.predict(X)
    pred_val = model.predict(Xv)
    pred_test = model.predict(Xt)

    import metrics
    print('train accuracy=', metrics.accuracy(y, pred_train))
    print('val accuracy=', metrics.accuracy(Yv, pred_val))
    print('test accuracy=', metrics.accuracy(Yt, pred_test))
    print('null accuracy=', metrics.null_accuracy(y))
    import metrics
    metrics.print_metrics(Yt, pred_test)
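
The two-layer network above (50 ReLU units feeding a softmax output) has a close scikit-learn analogue; a sketch with MLPClassifier, not the custom Regression/Dense classes:

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X, Y = make_blobs(cluster_std=0.9, random_state=20,
                  n_samples=1000, centers=10, n_features=10)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=20)
net = MLPClassifier(hidden_layer_sizes=(50,), activation='relu',
                    alpha=0.5, learning_rate_init=1e-3, max_iter=500)
net.fit(X_train, y_train)
print("test accuracy =", net.score(X_test, y_test))
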
Example #10
#train data
image = misc.imread("pics/cat.jpg", flatten=True, mode="L")
image_list = [image]
image_list = tuple(image_list)

#train
auto = LSTMAutoencoder()
auto.train(image_list, block_size, layer_step, number_of_layers, epoches,
           batch_size, split_rate)
auto.save_models(num=str(num_try))
# auto.load_models(num = str(num_try))

#test
image = misc.imread("pics/lena.jpg", flatten=True, mode="L")
r = auto.encode(image)
s = auto.decode(r)

result = Image.fromarray((s * 255).astype(np.uint8))
result.save(path_to_result + "decoded.jpg")
misc.imsave(path_to_result + 'using_imsave.jpg', s)

r = np.array(r)
s = np.array(s)

print_metrics(image, r, s)
write_to_file(path_to_result, image, r, s, len(image_list), block_size,
              layer_step, number_of_layers, epoches, batch_size, split_rate)

num_try += 1
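
print_metrics here receives the original image along with its encoding and reconstruction. Typical reconstruction metrics such as MSE and PSNR can be computed as follows; a sketch, since the actual print_metrics implementation is not shown:

import numpy as np

def reconstruction_metrics(original, reconstructed, max_val=255.0):
    original = np.asarray(original, dtype=np.float64)
    reconstructed = np.asarray(reconstructed, dtype=np.float64)
    mse = np.mean((original - reconstructed) ** 2)
    psnr = 10 * np.log10(max_val ** 2 / mse) if mse > 0 else float('inf')
    return mse, psnr

img = np.random.randint(0, 256, (64, 64))
noisy = np.clip(img + np.random.normal(0, 5, img.shape), 0, 255)
print(reconstruction_metrics(img, noisy))
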
Example #11
def main(*kargs, **kwargs):
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    result_fname = kwargs['output']
    embeds_fname = kwargs['embeds']
    logger_fname = kwargs['logger']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    warm_start = kwargs['warm_start']
    format_embeds = kwargs['format_embeds']
    config = kwargs['config']
    train_clear = kwargs['train_clear']
    test_clear = kwargs['test_clear']
    output_dir = kwargs['output_dir']
    norm_prob = kwargs['norm_prob']
    norm_prob_koef = kwargs['norm_prob_koef']
    gpus = kwargs['gpus']

    model_file = {
        'dense': os.path.join(output_dir, 'dense.h5'),
        'cnn': os.path.join(output_dir, 'cnn.h5'),
        'lstm': os.path.join(output_dir, 'lstm.h5'),
        'concat': os.path.join(output_dir, 'concat.h5'),
        'lr': os.path.join(output_dir, '{}_logreg.bin'),
        'catboost': os.path.join(output_dir, '{}_catboost.bin')
    }

    # ====Create logger====
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    swear_words = load_data(swear_words_fname,
                            func=lambda x: set(x.T[0]),
                            header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1]
                                                 for val in x})

    tokinizer = RegexpTokenizer(r'\w+')
    regexps = [
        re.compile("([a-zA-Z]+)([0-9]+)"),
        re.compile("([0-9]+)([a-zA-Z]+)")
    ]

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    embed_dim = 300
    embeds = Embeds(embeds_fname, 'fasttext', format=format_embeds)

    # ====Clean texts====
    logger.info('Cleaning text...')
    if warm_start:
        logger.info('Use warm start...')
    else:
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'],
                                                    tokinizer,
                                                    wrong_words_dict,
                                                    swear_words, regexps)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'],
                                                   tokinizer, wrong_words_dict,
                                                   swear_words, regexps)
        train_df.to_csv(train_clear, index=False)
        test_df.to_csv(test_clear, index=False)

    # ====Calculate maximum seq length====
    logger.info('Calc text length...')
    train_df.fillna('unknown', inplace=True)
    test_df.fillna('unknown', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    max_seq_len = np.round(train_df['text_len'].mean() +
                           3 * train_df['text_len'].std()).astype(int)
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')
    max_words = 100000

    train_df['comment_seq'], test_df[
        'comment_seq'], word_index = convert_text2seq(
            train_df['comment_text_clear'].tolist(),
            test_df['comment_text_clear'].tolist(),
            max_words,
            max_seq_len,
            lower=True,
            char_level=False,
            uniq=True)
    logger.debug('Dictionary size = {}'.format(len(word_index)))

    logger.info('Preparing embedding matrix...')
    embedding_matrix, words_not_found = get_embedding_matrix(
        embed_dim, embeds, max_words, word_index)
    logger.debug('Embedding matrix shape = {}'.format(
        np.shape(embedding_matrix)))
    logger.debug('Number of null word embeddings = {}'.format(
        np.sum(np.sum(embedding_matrix, axis=1) == 0)))

    logger.info('Deleting unknown words from seq...')
    train_df['comment_seq'] = clean_seq(train_df['comment_seq'],
                                        embedding_matrix, max_seq_len)
    test_df['comment_seq'] = clean_seq(test_df['comment_seq'],
                                       embedding_matrix, max_seq_len)

    # ====Train/test split data====
    x = np.array(train_df['comment_seq'].tolist())
    y = np.array(train_df[target_labels].values)
    x_train_nn, x_test_nn, y_train_nn, y_test_nn, train_idxs, test_idxs = split_data(
        x, y, test_size=0.2, shuffle=True, random_state=42)
    test_df_seq = np.array(test_df['comment_seq'].tolist())
    y_nn = []
    logger.debug('X shape = {}'.format(np.shape(x_train_nn)))

    # ====Train models====

    params = Params(config)

    cnn = get_cnn(embedding_matrix,
                  num_classes,
                  max_seq_len,
                  num_filters=params.get('cnn').get('num_filters'),
                  l2_weight_decay=params.get('cnn').get('l2_weight_decay'),
                  dropout_val=params.get('cnn').get('dropout_val'),
                  dense_dim=params.get('cnn').get('dense_dim'),
                  add_sigmoid=True,
                  train_embeds=params.get('cnn').get('train_embeds'),
                  gpus=gpus)
    lstm = get_lstm(embedding_matrix,
                    num_classes,
                    max_seq_len,
                    l2_weight_decay=params.get('lstm').get('l2_weight_decay'),
                    lstm_dim=params.get('lstm').get('lstm_dim'),
                    dropout_val=params.get('lstm').get('dropout_val'),
                    dense_dim=params.get('lstm').get('dense_dim'),
                    add_sigmoid=True,
                    train_embeds=params.get('lstm').get('train_embeds'),
                    gpus=gpus)
    concat = get_concat_model(
        embedding_matrix,
        num_classes,
        max_seq_len,
        n_layers=params.get('concat').get('n_layers'),
        concat=params.get('concat').get('concat'),
        pool=params.get('concat').get('pool'),
        num_filters=params.get('concat').get('num_filters'),
        l2_weight_decay=params.get('concat').get('l2_weight_decay'),
        lstm_dim=params.get('concat').get('lstm_dim'),
        dropout_val=params.get('concat').get('dropout_val'),
        dense_dim=params.get('concat').get('dense_dim'),
        add_sigmoid=True,
        train_embeds=params.get('concat').get('train_embeds'),
        gpus=gpus)

    models = []
    for model_label in params.get('models'):
        if model_label == 'cnn':
            models.append([model_label, cnn])
        elif model_label == 'dense':
            # NOTE: `dense` is assumed to be built elsewhere; no get_dense call
            # appears in this snippet
            models.append([model_label, dense])
        elif model_label == 'lstm':
            models.append([model_label, lstm])
        elif model_label == 'concat':
            models.append([model_label, concat])
        else:
            raise ValueError(
                'Invalid model {}. Model has not been defined.'.format(model_label))

    for i in range(len(models)):
        model_label, model = models[i]
        logger.info("training {} ...".format(model_label))
        if params.get(model_label).get('warm_start') and os.path.exists(
                params.get(model_label).get('model_file')):
            logger.info('{} warm starting...'.format(model_label))
            model = load_model(params.get(model_label).get('model_file'))
            models[i][1] = model
        else:
            hist = train(
                x_train_nn,
                y_train_nn,
                model,
                batch_size=params.get(model_label).get('batch_size'),
                num_epochs=params.get(model_label).get('num_epochs'),
                learning_rate=params.get(model_label).get('learning_rate'),
                early_stopping_delta=params.get(model_label).get(
                    'early_stopping_delta'),
                early_stopping_epochs=params.get(model_label).get(
                    'early_stopping_epochs'),
                use_lr_strategy=params.get(model_label).get('use_lr_strategy'),
                lr_drop_koef=params.get(model_label).get('lr_drop_koef'),
                epochs_to_drop=params.get(model_label).get('epochs_to_drop'),
                logger=logger)
        y_nn.append(model.predict(x_test_nn))
        save_predictions(test_df, model.predict(test_df_seq), target_labels,
                         model_label)
        metrics = get_metrics(y_test_nn, y_nn[-1], target_labels)
        logger.debug('{} metrics:\n{}'.format(model_label,
                                              print_metrics(metrics)))
        logger.debug('Model path = {}'.format(model_file[model_label]))
        model.save(model_file[model_label])

    # TFIDF + LogReg
    logger.info('training LogReg over tfidf...')
    train_tfidf, val_tfidf, test_tfidf, word_tfidf, char_tfidf = get_tfidf(
        train_df['comment_text_clear'].values[train_idxs],
        train_df['comment_text_clear'].values[test_idxs],
        test_df['comment_text_clear'].values)

    models_lr = []
    metrics_lr = {}
    y_tfidf = []
    for i, label in enumerate(target_labels):
        model = LogisticRegression(C=4.0,
                                   solver='sag',
                                   max_iter=1000,
                                   n_jobs=16)
        model.fit(train_tfidf, y_train_nn[:, i])
        y_tfidf.append(model.predict_proba(val_tfidf)[:, 1])
        test_df['tfidf_{}'.format(label)] = model.predict_proba(test_tfidf)[:,
                                                                            1]
        metrics_lr[label] = calc_metrics(y_test_nn[:, i], y_tfidf[-1])
        models_lr.append(model)
        joblib.dump(model, model_file['lr'].format(label))
    metrics_lr['Avg'] = {
        'Logloss':
        np.mean([metric['Logloss'] for label, metric in metrics_lr.items()])
    }
    logger.debug('LogReg(TFIDF) metrics:\n{}'.format(
        print_metrics(metrics_lr)))

    # Bow for catboost
    if params.get('catboost').get('add_bow'):
        top_pos_words = []
        top_neg_words = []
        for i in range(num_classes):
            top_pos_words.append([])
            top_neg_words.append([])
            top_pos_words[-1], top_neg_words[
                -1] = get_most_informative_features(
                    [word_tfidf, char_tfidf],
                    models_lr[i],
                    n=params.get('catboost').get('bow_top'))

        top_pos_words = list(
            set(
                np.concatenate([[val for score, val in top]
                                for top in top_pos_words])))
        top_neg_words = list(
            set(
                np.concatenate([[val for score, val in top]
                                for top in top_neg_words])))
        top = list(set(np.concatenate([top_pos_words, top_neg_words])))
        train_bow = get_bow(train_df['comment_text_clear'].values[train_idxs],
                            top)
        val_bow = get_bow(train_df['comment_text_clear'].values[test_idxs],
                          top)
        test_bow = get_bow(test_df['comment_text_clear'].values, top)
        logger.debug('Count bow words = {}'.format(len(top)))

    # Meta catboost
    logger.info('training catboost as metamodel...')
    train_df['text_unique_len'] = train_df['comment_text_clear'].apply(
        calc_text_uniq_words)
    test_df['text_unique_len'] = test_df['comment_text_clear'].apply(
        calc_text_uniq_words)

    train_df['text_unique_koef'] = train_df['text_unique_len'] / train_df[
        'text_len']
    test_df[
        'text_unique_koef'] = test_df['text_unique_len'] / test_df['text_len']

    text_len_features = train_df[[
        'text_len', 'text_unique_len', 'text_unique_koef'
    ]].values[test_idxs]

    x_train_catboost = []
    y_train_catboost = y_test_nn
    features = y_nn
    features.extend([text_len_features, np.array(y_tfidf).T])
    if params.get('catboost').get('add_bow'):
        features.append(val_bow)
    for feature in zip(*features):
        x_train_catboost.append(np.concatenate(feature))

    models_cb = []
    metrics_cb = {}
    x_train_cb, x_val_cb, y_train_cb, y_val_cb = train_test_split(
        x_train_catboost, y_train_catboost, test_size=0.20, random_state=42)
    for i, label in enumerate(target_labels):
        model = CatBoostClassifier(
            loss_function='Logloss',
            iterations=params.get('catboost').get('iterations'),
            depth=params.get('catboost').get('depth'),
            rsm=params.get('catboost').get('rsm'),
            learning_rate=params.get('catboost').get('learning_rate'),
            device_config=params.get('catboost').get('device_config'))
        model.fit(x_train_cb,
                  y_train_cb[:, i],
                  eval_set=(x_val_cb, y_val_cb[:, i]),
                  use_best_model=True)
        y_hat_cb = model.predict_proba(x_val_cb)
        metrics_cb[label] = calc_metrics(y_val_cb[:, i], y_hat_cb[:, 1])
        models_cb.append(model)
        joblib.dump(model, model_file['catboost'].format(label))
    metrics_cb['Avg'] = {
        'Logloss':
        np.mean([metric['Logloss'] for label, metric in metrics_cb.items()])
    }
    logger.debug('CatBoost metrics:\n{}'.format(print_metrics(metrics_cb)))

    # ====Predict====
    logger.info('Applying models...')
    text_len_features = test_df[[
        'text_len', 'text_unique_len', 'text_unique_koef'
    ]].values
    y_tfidf_test = test_df[[
        'tfidf_{}'.format(label) for label in target_labels
    ]].values
    x_test_cb = []
    features = []
    for model_label, _ in models:
        features.append(test_df[[
            '{}_{}'.format(model_label, label) for label in target_labels
        ]].values)
    features.extend([text_len_features, y_tfidf_test])
    if params.get('catboost').get('add_bow'):
        features.append(test_bow)
    for feature in tqdm(zip(*features)):
        x_test_cb.append(np.concatenate(feature))

    for label, model in zip(target_labels, models_cb):
        pred = model.predict_proba(x_test_cb)
        test_df[label] = np.array(list(pred))[:, 1]

    # ====Normalize probabilities====
    if norm_prob:
        for label in target_labels:
            test_df[label] = norm_prob_koef * test_df[label]

    # ====Save results====
    logger.info('Saving results...')
    test_df[[
        'id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]].to_csv(result_fname, index=False, header=True)
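
One detail worth isolating: the maximum sequence length is set to the mean token count plus three standard deviations, which covers nearly all comments while keeping padding bounded. In isolation:

import numpy as np
import pandas as pd

texts = pd.Series(["a b c", "a b", "a b c d e f g", "a"])
text_len = texts.apply(lambda words: len(words.split()))
max_seq_len = np.round(text_len.mean() + 3 * text_len.std()).astype(int)
print(max_seq_len)
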
Example #12
def main(*kargs, **kwargs):
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    result_fname = kwargs['output']
    word_embeds_fname = kwargs['word_embeds']
    char_embeds_fname = kwargs['char_embeds']
    logger_fname = kwargs['logger']
    mode = kwargs['mode']
    max_words = kwargs['max_words']
    use_only_exists_words = kwargs['use_only_exists_words']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    embeds_format = kwargs['format_embeds']
    config = kwargs['config']
    output_dir = kwargs['output_dir']
    norm_prob = kwargs['norm_prob']
    norm_prob_koef = kwargs['norm_prob_koef']
    gpus = kwargs['gpus']

    seq_col_name_words = 'comment_seq_lw_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))
    seq_col_name_ll3 = 'comment_seq_ll3_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))

    model_file = {
        'dense': os.path.join(output_dir, 'dense.h5'),
        'cnn': os.path.join(output_dir, 'cnn.h5'),
        'lstm': os.path.join(output_dir, 'lstm.h5'),
        'lr': os.path.join(output_dir, '{}_logreg.bin'),
        'catboost': os.path.join(output_dir, '{}_catboost.bin')
    }

    # ====Create logger====
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    swear_words = load_data(swear_words_fname,
                            func=lambda x: set(x.T[0]),
                            header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1]
                                                 for val in x})

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    embeds_word = Embeds().load(word_embeds_fname, embeds_format)
    embeds_ll3 = Embeds().load(char_embeds_fname, embeds_format)

    # ====Clean texts====
    if mode in ('preprocess', 'all'):
        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'],
                                                    wrong_words_dict,
                                                    autocorrect=True)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'],
                                                   wrong_words_dict,
                                                   autocorrect=True)
        train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'), index=False)

    # ====Calculate maximum seq length====
    logger.info('Calc text length...')
    train_df.fillna('__NA__', inplace=True)
    test_df.fillna('__NA__', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    max_seq_len = np.round(train_df['text_len'].mean() +
                           3 * train_df['text_len'].std()).astype(int)
    max_char_seq_len = 2000  # empirical
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')

    if mode in ('preprocess', 'all'):
        train_df[seq_col_name_words], test_df[
            seq_col_name_words], word_index, train_df[
                seq_col_name_ll3], test_df[
                    seq_col_name_ll3], ll3_index = convert_text2seq(
                        train_df['comment_text_clear'].tolist(),
                        test_df['comment_text_clear'].tolist(),
                        max_words,
                        max_seq_len,
                        max_char_seq_len,
                        embeds_word,
                        lower=True,
                        oov_token='__NA__',
                        uniq=False,
                        use_only_exists_words=use_only_exists_words)
        logger.debug('Dictionary size use_exist{} = {}'.format(
            int(use_only_exists_words), len(word_index)))
        logger.debug('Char dict size use_exist{} = {}'.format(
            int(use_only_exists_words), len(ll3_index)))

        logger.info('Preparing embedding matrix...')
        words_not_found = embeds_word.set_matrix(max_words, word_index)
        embeds_ll3.matrix = np.random.normal(size=(len(ll3_index),
                                                   embeds_word.shape[1]))
        embeds_ll3.word_index = ll3_index
        embeds_ll3.word_index_reverse = {
            val: key
            for key, val in ll3_index.items()
        }
        embeds_ll3.shape = np.shape(embeds_ll3.matrix)
        embeds_word.save(
            os.path.join(output_dir,
                         'wiki.embeds_lw.{}k'.format(int(max_words / 1000))))
        embeds_ll3.save(
            os.path.join(output_dir,
                         'wiki.embeds_ll3.{}k'.format(int(max_words / 1000))))

        # ====Get text vector====
        pooling = {
            'max': {
                'func': np.max
            },
            'avg': {
                'func': np.sum,
                'normalize': True
            },
            'sum': {
                'func': np.sum,
                'normalize': False
            }
        }
        for p in ['max', 'avg', 'sum']:
            train_df['comment_vec_{}'.format(
                p)] = train_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
            test_df['comment_vec_{}'.format(
                p)] = test_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
        train_df.to_csv(os.path.join(output_dir, 'train_clear1.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear1.csv'),
                       index=False)
    else:
        for col in train_df.columns:
            if col.startswith('comment_seq'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, int))
                test_df[col] = test_df[col].apply(lambda x: parse_seq(x, int))
            elif col.startswith('comment_vec'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, float))
                test_df[col] = test_df[col].apply(
                    lambda x: parse_seq(x, float))

    logger.debug('Embedding matrix shape = {}'.format(embeds_word.shape))
    logger.debug('Number of null word embeddings = {}'.format(
        np.sum(np.sum(embeds_word.matrix, axis=1) == 0)))

    # ====END OF `PREPROCESS`====
    if mode == 'preprocess':
        return True

    # ====Train/test split data====
    x = np.array(train_df[seq_col_name_words].values.tolist())
    y = np.array(train_df[target_labels].values.tolist())
    x_train_nn, x_val_nn, y_train, y_val, train_idxs, val_idxs = split_data(
        x, y, test_size=0.2, shuffle=True, random_state=42)
    x_test_nn = np.array(test_df[seq_col_name_words].values.tolist())

    x_char = np.array(train_df[seq_col_name_ll3].values.tolist())
    x_char_train_nn = x_char[train_idxs]
    x_char_val_nn = x_char[val_idxs]
    x_char_test_nn = np.array(test_df[seq_col_name_ll3].values.tolist())

    x_train_tfidf = train_df['comment_text_clear'].values[train_idxs]
    x_val_tfidf = train_df['comment_text_clear'].values[val_idxs]
    x_test_tfidf = test_df['comment_text_clear'].values

    catboost_cols = catboost_features(train_df, test_df)
    x_train_cb = train_df[catboost_cols].values[train_idxs].T
    x_val_cb = train_df[catboost_cols].values[val_idxs].T
    x_test_cb = test_df[catboost_cols].values.T

    # ====Train models====
    nn_models = {'cnn': cnn, 'dense': dense, 'rnn': rnn}

    params = Params(config)

    metrics = {}
    predictions = {}
    for param in params['models']:
        for model_label, model_params in param.items():
            if model_params.get('common', {}).get(
                    'warm_start', False) and os.path.exists(
                        model_params.get('common', {}).get('model_file', '')):
                logger.info('{} warm starting...'.format(model_label))
                model = load_model(
                    model_params.get('common', {}).get('model_file', None))
            elif model_label in nn_models:
                model = nn_models[model_label](embeds_word.matrix,
                                               embeds_ll3.matrix,
                                               num_classes,
                                               max_seq_len,
                                               max_char_seq_len,
                                               gpus=gpus,
                                               **model_params['init'])
                model_alias = model_params.get('common', {}).get('alias', None)
                if not model_alias:
                    model_alias = model_label
                logger.info("training {} ...".format(model_label))
                if model_label == 'dense':
                    x_tr = [x_train_nn, x_char_train_nn]
                    x_val = [x_val_nn, x_char_val_nn]
                    x_test = [x_test_nn, x_char_test_nn]
                else:
                    x_tr = x_train_nn
                    x_val = x_val_nn
                    x_test = x_test_nn
                hist = train(x_tr,
                             y_train,
                             model,
                             logger=logger,
                             **model_params['train'])
                predictions[model_alias] = model.predict(x_val)
                save_predictions(test_df, model.predict(x_test), target_labels,
                                 model_alias)
            elif model_label == 'tfidf':
                model = TFIDF(target_labels, **model_params['init'])
                model.fit(x_train_tfidf, y_train, **model_params['train'])
                predictions[model_alias] = model.predict(x_val_tfidf)
                save_predictions(test_df, model.predict(x_test_tfidf),
                                 target_labels, model_alias)
            elif model_label == 'catboost':
                model = CatBoost(target_labels, **model_params['init'])
                model.fit(x_train_cb,
                          y_train,
                          eval_set=(x_val_cb, y_val),
                          use_best_model=True)
                predictions[model_alias] = model.predict_proba(x_val_cb)
                save_predictions(test_df, model.predict_proba(x_test_cb),
                                 target_labels, model_alias)
            metrics[model_alias] = get_metrics(y_val, predictions[model_alias],
                                               target_labels)
            logger.debug('{} params:\n{}'.format(model_alias, model_params))
            logger.debug('{} metrics:\n{}'.format(
                model_alias, print_metrics(metrics[model_alias])))
            model.save(
                os.path.join(output_dir, model_params['common']['model_file']))

    logger.info('Saving metrics...')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as f:
        f.write(json.dumps(metrics))

    # ====END OF `VALIDATE`====
    if mode == 'validate':
        return True

    # Meta catboost
    logger.info('training catboost as metamodel...')

    x_meta = [
        predictions[model_alias] for model_alias in sorted(predictions.keys())
    ]
    # stack per-model validation predictions column-wise into meta-features
    x_meta = np.hstack(x_meta)

    x_train_meta, x_val_meta, y_train_meta, y_val_meta = train_test_split(
        x_meta, y_val, test_size=0.20, random_state=42)
    meta_model = CatBoost(target_labels,
                          loss_function='Logloss',
                          iterations=1000,
                          depth=6,
                          learning_rate=0.03,
                          rsm=1)
    meta_model.fit(x_train_meta,
                   y_train_meta,
                   eval_set=(x_val_meta, y_val_meta),
                   use_best_model=True)
    y_hat_meta = meta_model.predict_proba(x_val_meta)
    metrics_meta = get_metrics(y_val_meta, y_hat_meta, target_labels)
    # meta_model.save(os.path.join(output_dir, 'meta.catboost'))
    logger.debug('{} metrics:\n{}'.format('META', print_metrics(metrics_meta)))

    # ====Predict====
    logger.info('Applying models...')
    test_cols = []
    for model_alias in sorted(predictions.keys()):
        for label in target_labels:
            test_cols.append('{}_{}'.format(model_alias, label))
    x_test = test_df[test_cols].values

    preds = meta_model.predict_proba(x_test)
    for i, label in enumerate(target_labels):
        test_df[label] = preds[:, i]

    # ====Normalize probabilities====
    if norm_prob:
        for label in target_labels:
            test_df[label] = norm_prob_koef * test_df[label]

    # ====Save results====
    logger.info('Saving results...')
    test_df[[
        'id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]].to_csv(result_fname, index=False, header=True)
    test_df.to_csv('{}_tmp'.format(result_fname), index=False, header=True)
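
The max/avg/sum pooling table above drives embed_aggregate, which reduces a sequence of word vectors to one fixed-size vector. A standalone sketch of that idea (the real Embeds class and embed_aggregate are project-specific):

import numpy as np

def embed_aggregate(seq, matrix, func=np.max, normalize=False):
    vecs = matrix[seq]            # one embedding row per token id
    agg = func(vecs, axis=0)      # reduce over the sequence axis
    return agg / len(seq) if normalize else agg  # normalized sum == average

matrix = np.random.rand(100, 8)   # toy embedding matrix: 100 tokens, dim 8
print(embed_aggregate([3, 17, 42], matrix, func=np.sum, normalize=True))
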
Example #13
    data = pd.DataFrame({'PATH': img_list, 'COUNTS': face_counts})
    return data

def print_output(paths, counts):
    for i, path in enumerate(paths):
        print(config.OUTPUT_FORMAT % (path.name, counts[i]))

if __name__ == '__main__':
    # Parse arguments
    parser = argparse.ArgumentParser(description='Train/Test Model')
    parser.add_argument('path', type=existing_path, help='image file/directory for testing')
    parser.add_argument('--model', type=str, default='MTCNN', choices=['MTCNN'], help='The model to use for face counts, default MTCNN')
    parser.add_argument('--print-metrics', action='store_true', help='Compute and Output any metrics')
    args = parser.parse_args()
    logger = get_logger()
    logger.info("App Log initiated")

    data = get_data(args.path)
    logger.info("Data built with size %d" %(len(data)))

    model = models.get_model(args.model)()
    logger.info("Model built")

    counts = model.count_faces(data['PATH'])
    print_output(data['PATH'], counts)
    if args.print_metrics:
        metrics.print_metrics(data, counts)

    logger.info("Output/metrics printed, app done!")
Example #14
from metrics import print_metrics
from PIL import Image
import numpy as np

image = Image.open('/home/maria/Documents/test/pics/lena.jpg')
image = image.convert('L')  # convert the image to *greyscale*
image = np.array(image)
print_metrics(image, image, image)
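
Example #15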
    def run_train_and_save_classifier(self,
                                      save_classifier=True,
                                      locs_in=["SAB"],
                                      data_split_ratio=0.15):

        locations = utilities.get_locations(locs_in)

        print(" \n\n--- Train a classifier for locations: {}\n".format(
            locations))
        print("Data split into train and test set at ratio: {}\n".format(
            data_split_ratio))

        data_in = self.data.get_data()
        data_in = data_in[data_in["Environment"].isin(locations)]

        label_encoder, pipeline = utilities.prepare_skl_interface(
            data_in, self.classifier)

        # stratified split of the data into train and test; stratification
        # considers both the location and the class labels

        temp_df = data_in[['Environment', 'Recording ID', 'Class']]
        temp_df = temp_df.drop(
            temp_df[temp_df['Class'] ==
                    'front'].index)  # just to avoid repeated Recording IDs

        train_bags, test_bags = train_test_split(
            temp_df,
            test_size=data_split_ratio,
            random_state=self.random_state,
            stratify=temp_df[['Environment', 'Class']])

        # check if samples from same recordings are present in both train and test
        for bag in list(test_bags['Recording ID']):
            if bag in list(train_bags['Recording ID']):
                print("Error: {}".format(bag))

        train_data = data_in[data_in['Recording ID'].isin(
            train_bags['Recording ID'])]
        test_data = data_in[data_in['Recording ID'].isin(
            test_bags['Recording ID'])]

        accuracy, conf_mat = utilities.train_and_test(
            train_data,
            test_data,
            pipeline,
            label_encoder,
            self.srp_dict,
            save_cls=save_classifier,
            out_folder=self.output_folder)

        all_metrics = {
            "overall_accuracy": (accuracy, 0),
            "per_class_accuracy":
            (metrics.getPCaccuracy(conf_mat), np.zeros(4)),
            "per_class_precision":
            (metrics.getPCPrecision(conf_mat), np.zeros(4)),
            "per_class_recall": (metrics.getPCRecall(conf_mat), np.zeros(4)),
            "per_class_iou": (metrics.getPCIoU(conf_mat), np.zeros(4))
        }

        metrics.print_metrics(all_metrics, self.paper_metrics_only)
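
The manual leakage check above (no Recording ID on both sides of the split) is exactly what scikit-learn's group-aware splitters guarantee by construction; a sketch with GroupShuffleSplit on made-up data (note it does not stratify by class, which the snippet's train_test_split call does):

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

df = pd.DataFrame({
    "Recording ID": ["r1", "r1", "r2", "r3", "r3", "r4"],
    "Class": ["left", "left", "right", "left", "left", "right"],
})
splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_idx, test_idx = next(splitter.split(df, groups=df["Recording ID"]))

# No recording appears on both sides of the split
assert not (set(df["Recording ID"].iloc[train_idx]) &
            set(df["Recording ID"].iloc[test_idx]))
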
Example #16
    def _train_one_epoch(self, criterion, optimizer, training_data_loader,
                         train_metrics, train_metrics_results, epoch,
                         global_step, scheduler):

        aggregate_batches = 1
        for m in train_metrics:
            m.reset()

        if self.state.cuda:
            self.model.cuda()

        self.model.train()

        optimizer.zero_grad()
        for idx, batch in enumerate(training_data_loader):

            assert (isinstance(batch[0], list) and isinstance(batch[1], list))
            data = [Variable(b) for b in batch[0]]
            target = [Variable(b, requires_grad=False) for b in batch[1]]

            if self.state.cuda:
                data = [d.cuda() for d in data]
                target = [t.cuda() for t in target]

            output = self.model(data)

            if isinstance(criterion, (tuple, list)):
                loss_val = [c(output, target) for c in criterion]
                loss = sum(loss_val) / (len(loss_val))
            else:
                loss_val = criterion(output, target)
                loss = loss_val

            loss.backward()

            if (idx + 1) % aggregate_batches == 0:

                #for name, param in self.model.named_parameters():
                #    self.tb_writer.add_scalar('misc/grad-max-{}'.format(name), torch.max(torch.abs(param.grad)).cpu().numpy(), global_step)

                #for param in self.model.parameters():
                #    param.grad.data = torch.clamp(param.grad.data, min=-1.0,max=1.0)

                optimizer.step()
                optimizer.zero_grad()
                if scheduler is not None:
                    scheduler.step()

            for m in train_metrics:
                m.update(output, target)

            for idx, l in enumerate(loss_val):
                self.tb_writer.add_scalar('loss/loss-{}'.format(idx), l.item(),
                                          global_step)

            for idx, param_group in enumerate(optimizer.param_groups):
                self.tb_writer.add_scalar('misc/lr-{}'.format(idx),
                                          param_group['lr'], global_step)

            global_step = global_step + 1

        for m in train_metrics:
            train_metrics_results[m.name].append(m.get())
            metrics.print_metrics(self.tb_writer, m, 'train/', epoch)

        self.state.optimizer_state = optimizer.state_dict()
        return global_step
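
The (idx + 1) % aggregate_batches check above is the standard gradient-accumulation pattern: several backward passes accumulate into .grad before one optimizer step. A minimal sketch (unlike the snippet, it divides the loss by aggregate_batches so the accumulated gradient is an average rather than a sum):

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.MSELoss()
aggregate_batches = 4  # effective batch size = 4 x loader batch size

optimizer.zero_grad()
for idx in range(16):
    x, y = torch.randn(8, 4), torch.randn(8, 2)
    loss = criterion(model(x), y) / aggregate_batches
    loss.backward()  # gradients accumulate in .grad
    if (idx + 1) % aggregate_batches == 0:
        optimizer.step()
        optimizer.zero_grad()
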
Example #17
    def _evaluate_and_save(self, evaluation_data_loader, split_into_tiles,
                           val_metrics, track_metric, val_metrics_results,
                           epoch, comparator):

        for m in val_metrics:
            m.reset()

        self.model.eval()

        for batch in evaluation_data_loader:
            gc.collect()
            #torch.cuda.empty_cache()

            assert (isinstance(batch[0], list) and isinstance(batch[1], list))

            data = batch[0]
            target = batch[1]

            if split_into_tiles and not self.eval_cpu:

                # TODO: this is a workaround that supports tiling for a single
                #       input only; add tiling for selected inputs (not just
                #       the 0th one)
                output = torch.zeros_like(batch[1][0])

                input = batch[0][0]

                tile_shape = (192, 192, 192)
                center_shape = (48, 48, 48)
                border = (72, 72, 72)

                grid = [
                    int(np.ceil(j / i))
                    for i, j in zip(center_shape, input.shape[2:])
                ]

                for i in range(grid[0]):
                    for j in range(grid[1]):
                        for k in range(grid[2]):
                            index_min, index_max = loader_helper.get_indices(
                                position=(i, j, k),
                                center_shape=center_shape,
                                border=border)
                            tile = loader_helper.copy(data=input,
                                                      tile_shape=tile_shape,
                                                      index_min=index_min,
                                                      index_max=index_max)

                            if self.state.cuda:
                                tile = tile.cuda()
                            with torch.no_grad():
                                out = self.model([tile])[0].detach().cpu()

                            loader_helper.copy_back(data=output,
                                                    tile=out,
                                                    center_shape=center_shape,
                                                    index_min=index_min,
                                                    index_max=index_max,
                                                    border=border)

                output = [output]

            elif self.eval_cpu:
                tmp_model = self.model.module.cpu()
                tmp_model.eval()
                with torch.no_grad():
                    output = tmp_model(data)

            else:
                with torch.no_grad():
                    if self.state.cuda:
                        data = [d.cuda() for d in data]
                        target = [t.cuda() for t in target]

                    output = self.model(data)

            for m in val_metrics:
                m.update(target, output)

        val = 0.0
        for m in val_metrics:
            if m.name == track_metric:
                val = m.get()

            metrics.print_metrics(self.tb_writer, m, 'val/', epoch)
            val_metrics_results[m.name].append(m.get())

        if comparator(val, self.state.best_val):
            self.state.best_val = val
            self._save(suffix='best_model')
            print('model saved')
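
loader_helper is project-specific, but the tiling arithmetic is recoverable from the snippet: the volume is covered by a grid of center regions, each padded by a border up to the tile shape. A sketch of the grid math under that reading (center_bounds is a hypothetical helper, not the actual loader_helper.get_indices):

import numpy as np

def center_bounds(position, center_shape, volume_shape):
    # start/end of the center region for one grid cell, clamped to the volume
    index_min = [p * c for p, c in zip(position, center_shape)]
    index_max = [min(m + c, s)
                 for m, s, c in zip(index_min, volume_shape, center_shape)]
    return index_min, index_max

volume_shape = (100, 100, 100)
center_shape = (48, 48, 48)
grid = [int(np.ceil(j / i)) for i, j in zip(center_shape, volume_shape)]
print(grid)  # [3, 3, 3]: 48^3 centers tile a 100^3 volume in 3 steps per axis
print(center_bounds((2, 0, 0), center_shape, volume_shape))  # last tile clamps to 100
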
Example #18
    parser = argparse.ArgumentParser(description="Simulation parameters")
    parser.add_argument("--config", help="YAML config")
    args = parser.parse_args()

    with open(args.config) as cfg:
        config = yaml.load(cfg, Loader=yaml.FullLoader)

    configure_root_logger()
    configure_csv_logger(config["simulation"]["output"])

    context, demand = create_scenario(config)

    if config["solvers"]["greedy_matcher"]["router"] == "linear":
        router = routers.LinearRouter(context.clock,
                                      config["routers"]["linear"]["speed"])
    elif config["solvers"]["greedy_matcher"]["router"] == "osrm":
        router = routers.OSRMRouter(context.clock,
                                    server=config["routers"]["osrm"]["server"])
    else:
        raise Exception("Unknown router")

    logging.info(f"Matcher router {router}")

    matcher = GreedyMatcher(context, router, config)

    simulator = Simulator(matcher, context)
    simulator.simulate(demand, config["simulation"]["duration"])

    print_metrics(config["simulation"]["output"], context.clock)
Example #19
            pcd3 = pred_ref_cloud
            o3d.visualization.draw_geometries([pcd1, pcd2, pcd3])

    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
        summary_metrics(r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic)

    return dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic


if __name__ == '__main__':
    seed = 222
    random.seed(seed)
    np.random.seed(seed)

    args = config_params()

    test_set = CustomData(args.root, args.infer_npts, False)
    test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

    if args.method == 'benchmark':
        dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
            evaluate_benchmark(args, test_loader)
        print_metrics(args.method,
                      dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic)
    elif args.method == 'icp':
        dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
            evaluate_icp(args, test_loader)
        print_metrics(args.method, dura, r_mse, r_mae, t_mse, t_mae, r_isotropic,
                      t_isotropic)
    else:
        raise NotImplementedError
Example #20
def train_model(model,
                dataloaders,
                policy_learner,
                optimizer,
                scheduler,
                num_epochs,
                device,
                writer,
                n_images=None):
    loader = {'val': dataloaders['val']}

    # best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 1e10
    if n_images is None:
        n_images = {'train': 0, 'val': 0}

    for epoch in range(num_epochs):
        loader['train'] = policy_learner()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        since = time.time()
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            # print('+++++++++ len loader', len(loader[phase]))
            if phase == 'train':
                if scheduler:
                    scheduler.step()
                for param_group in optimizer.param_groups:
                    print("LR", param_group['lr'])
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            metrics = defaultdict(float)
            epoch_samples = 0

            for enum_id, (idxs, inputs,
                          labels) in tqdm(enumerate(loader[phase]),
                                          total=len(loader[phase])):
                inputs = inputs.to(device)
                labels = labels.to(device)
                # if phase == 'train' and enum_id < 3:
                #     for idx in idxs:
                #         torch.save(torch.tensor(1),
                #                    f'tmp/trash/{policy_learner.__class__.__name__}_{epoch}_{enum_id}__{idx}'
                #                    )

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    # loss, loss_sum, loss_bce, loss_dice = calc_loss(outputs, labels, 0)
                    loss = dice_loss(outputs, labels)
                    acc_f1 = calc_f1(outputs, labels)
                    # acc_iou = calc_IOU(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        plot_grad_flow(epoch, enum_id,
                                       model.named_parameters())
                        optimizer.step()

                # statistics
                epoch_samples += inputs.size(0)
                n_images[phase] += inputs.size(0)

                writer.add_scalar(f'{phase}/loss',
                                  loss.data.cpu().numpy(), n_images[phase])
                # writer.add_scalar(f'{phase}/bce', loss_bce, n_images[phase])
                # writer.add_scalar(f'{phase}/dice', loss_dice, n_images[phase])

                metrics['loss'] += loss * inputs.size(0)
                metrics['f1'] += acc_f1 * inputs.size(0)
                # metrics['iou'] += acc_iou * inputs.size(0)

            print_metrics(writer, metrics, epoch_samples, phase)
            epoch_loss = metrics['loss'] / epoch_samples
            writer.add_scalar(f'{phase}/epoch_loss', epoch_loss, epoch)
            epoch_f1 = metrics['f1'] / epoch_samples
            writer.add_scalar(f'{phase}/epoch_F1', epoch_f1, epoch)
            # epoch_iou = metrics['iou'] / epoch_samples
            # writer.add_scalar(f'{phase}/epoch_IOU', epoch_iou, epoch)

            # # deep copy the model
            # if phase == 'val' and epoch_loss < best_loss:
            #     print("saving best model")
            #     best_loss = epoch_loss
            #     best_model_wts = copy.deepcopy(model.state_dict())

        time_elapsed = time.time() - since
        print('{:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    print('Best val loss: {:4f}'.format(best_loss))

    # load best model weights
    # model.load_state_dict(best_model_wts)
    return model, n_images
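
dice_loss and calc_f1 above are assumed helpers from the surrounding project; a common soft-Dice formulation over sigmoid logits looks like this (a sketch, not necessarily the authors' exact variant):

import torch

def dice_loss(outputs, labels, eps=1e-6):
    probs = torch.sigmoid(outputs)
    dims = tuple(range(1, labels.dim()))    # reduce over all but the batch axis
    intersection = (probs * labels).sum(dims)
    union = probs.sum(dims) + labels.sum(dims)
    dice = (2 * intersection + eps) / (union + eps)
    return 1 - dice.mean()

outputs = torch.randn(2, 1, 16, 16)         # raw logits
labels = torch.randint(0, 2, (2, 1, 16, 16)).float()
print(dice_loss(outputs, labels).item())
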