Example #1
def test_vc():
    print('###')
    print('test_vc')
    print('###')
    neg_ftrs = np.zeros((32, 1))
    pos_ftrs = np.ones((32, 1))
    weights = np.random.randint(4, size=(32, 1))
    features = weights.flatten()
    print('features.shape: ', features.shape)
    for i in range(50):
        weights_tmp = np.random.randint(4, size=(32, 1))
        features = np.vstack((features, weights_tmp.flatten()))
    # labels = np.zeros(neg_ftrs.shape[0])
    # labels = np.concatenate((labels, np.ones(pos_ftrs.shape[0])))
    labels = np.random.randint(2, size=features.shape[0])
    print('features.shape: ', features.shape)
    print('labels.shape: ', labels.shape)
    x_train, x_valid, y_train, y_valid = train_test_split(features,
                                                          labels,
                                                          shuffle=True,
                                                          stratify=labels,
                                                          test_size=0.1,
                                                          random_state=42)
    ftrs_sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    x_transformed = ftrs_sel.fit_transform(x_train)
    vc = create_voting_classifier()
    vc.fit(x_transformed, y_train)
    # apply the same variance-threshold selection before predicting
    y_pred = vc.predict(ftrs_sel.transform(x_valid))
    y_pred_train = vc.predict(x_transformed)
    acc = accuracy_score(y_valid, y_pred)
    print('accuracy: ', acc)
    plot_cm(y_valid, y_pred, 'images/test', 0.5)
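Every example on this page calls a project-specific `plot_cm` helper, so the signatures differ from project to project (raw label arrays, a precomputed matrix, or a training-history dict). The sketch below is a minimal, assumed implementation in the `(y_true, y_pred, path, name, threshold)` style that Example #3 further down uses; it is not taken from any of the projects shown here.

import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix


def plot_cm(y_true, y_pred, path, name, threshold=0.5):
    """Save a confusion-matrix heatmap to `<path>/cm_<name>.png` (sketch, assumed signature)."""
    cm = confusion_matrix(y_true, y_pred)
    cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)

    fig, ax = plt.subplots()
    im = ax.imshow(cm_norm, cmap='Blues', vmin=0.0, vmax=1.0)
    fig.colorbar(im, ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title(f'Confusion matrix ({name})')

    # Annotate each cell; `threshold` is only used here to pick a readable text color.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            color = 'white' if cm_norm[i, j] > threshold else 'black'
            ax.text(j, i, str(cm[i, j]), ha='center', va='center', color=color)

    os.makedirs(path, exist_ok=True)
    fig.savefig(os.path.join(path, f'cm_{name}.png'), bbox_inches='tight')
    plt.close(fig)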
Example #2
def run_experiment(dataset_df,
                   train_cols,
                   option='validation',
                   method='KNN',
                   n_neighbors=3,
                   hidden_layer_sizes=(100, ),
                   n_trees=100,
                   bagging=False,
                   bagging_kwargs=DEFAULT_BAGGING_KWARGS,
                   show_cm=True,
                   **kwargs):
    if len(train_cols) == 0:
        raise Exception('No columns provided to run_experiment()')

    # Split train, val, test
    train, val, test = split_train_val_test(dataset_df)

    # Select the columns used for training
    x_train = train[train_cols]
    y_train = train['label']

    if option == 'validation' or option == 'val':
        x_val = val[train_cols]
        y_val = val['label']
    elif option == 'test':
        x_val = test[train_cols]
        y_val = test['label']
    else:
        raise Exception(f'Option not recognized: {option}')

    # Choose the classifier
    if method == 'KNN':
        model = KNN(n_neighbors=n_neighbors, **kwargs)
    elif method == 'SVM':
        model = SVC(**kwargs)
    elif method == 'MLP':
        model = MLP(hidden_layer_sizes=hidden_layer_sizes, **kwargs)
    elif method == 'RF':
        model = RandomForestClassifier(n_estimators=n_trees, **kwargs)
    elif method == 'LDA':
        model = LDA(**kwargs)
    else:
        raise Exception(f'Unknown model: {method}')

    if bagging:
        model = BaggingClassifier(model, **bagging_kwargs)

    # Train the classifier
    print('Training...')
    model.fit(x_train, y_train)

    # Evaluate the model
    train_accuracy, train_cm = evaluate(model, x_train, y_train)
    val_accuracy, val_cm = evaluate(model, x_val, y_val)

    print(f'Accuracy: train: {train_accuracy}, {option}: {val_accuracy}')
    if show_cm:
        plot_cm(val_cm, title=f'{option} CM')

    return val_accuracy, val_cm
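A hedged usage sketch for `run_experiment`; the DataFrame and feature-column names below are hypothetical, and `split_train_val_test` is assumed to return three DataFrames that each contain a 'label' column, as the function above requires.

# Hypothetical data and columns, for illustration only.
feature_cols = ['mean_intensity', 'area', 'perimeter']
val_acc, val_cm = run_experiment(dataset_df,
                                 train_cols=feature_cols,
                                 option='validation',
                                 method='RF',
                                 n_trees=200,
                                 show_cm=False)
print(f'Validation accuracy: {val_acc:.3f}')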
Example #3
def classification(path_pos_class, path_neg_class, path_images):
    """
    # Notes
        Classifies features using a Voting classifier with a 'soft' voting scheme.
    # Arguments
        - path_pos_class: string, path where the keras checkpoints with the weights are stored
        for the positive class images.
        - path_neg_class: string, path where the keras checkpoints with the weights are stored
        for the negative class images.
        - path_images: string, path where to save images (eg: data/images).
    """
    features, labels = assemble_features_found(path_pos_class, path_neg_class,
                                               256, 256, 4, 2)
    print('Number of features: ', features.shape)
    print('Number of labels: ', labels.shape)
    clf = ExtraTreesClassifier(n_estimators=50)
    params_ftrs_selector = grid_search_for_extra_tree(clf, features, labels)
    clf = ExtraTreesClassifier(
        n_estimators=50,
        criterion=params_ftrs_selector['criterion'],
        max_depth=params_ftrs_selector['max_depth'],
        min_samples_split=params_ftrs_selector['min_samples_split'],
        min_samples_leaf=params_ftrs_selector['min_samples_leaf'],
        max_features=params_ftrs_selector['max_features'])
    clf = clf.fit(features, labels)
    model = SelectFromModel(clf, prefit=True)
    x_transformed = model.transform(features)
    print('Number of features after selection: ', x_transformed.shape)
    x_train, x_valid, y_train, y_valid = train_test_split(x_transformed,
                                                          labels,
                                                          shuffle=True,
                                                          stratify=labels,
                                                          test_size=0.2,
                                                          random_state=42)
    name_dt = 'dt'
    name_svm = 'svm'
    name_knn = 'knn'
    vc = create_voting_classifier(name_dt, name_knn, name_svm)
    params_grid = grid_search_for_vc(vc, x_train, y_train, name_dt, name_knn,
                                     name_svm)
    vc = create_voting_classifier(name_dt, name_knn, name_svm, params_grid)
    vc = vc.fit(x_train, y_train)
    print('x_valid.shape: ', x_valid.shape)
    y_pred = vc.predict(x_valid)
    acc = accuracy_score(y_valid, y_pred)
    print('Voting classifier accuracy(y_valid, y_pred): ', acc)
    plot_cm(y_valid, y_pred, path_images, 'valid', 0.5)
    y_pred_train = vc.predict(x_train)
    plot_cm(y_train, y_pred_train, path_images, 'train', 0.5)
    print('Voting classifier accuracy(y_train, y_pred_train): ',
          accuracy_score(y_train, y_pred_train))
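A usage sketch for `classification`; the checkpoint directories below are placeholders (the only path hinted at in the docstring is the `data/images` example for `path_images`).

# Hypothetical paths, for illustration only.
classification(path_pos_class='checkpoints/positive',
               path_neg_class='checkpoints/negative',
               path_images='data/images')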
Example #4
def eval_saved_model(args):
    print('Loading data...')
    X_train, X_val, X_test, y_train, y_val, y_test = data.load_data(
        args.feature_extractor)
    X, y = (X_test, y_test) if args.is_test else (X_val, y_val)

    print('Loading model...')
    model = models.load_model(args.model_name)
    model_type = args.model_name[:3]  # will be svm or mlp
    predictions = models.top_1_accuracy(model, X, y, args.exclude_mislabeled)
    if args.top_n:
        models.top_n_accuracy(model, X, y, args.top_n, model_type,
                              args.exclude_mislabeled)

    if args.errors:
        print('Finding misclassifications...')
        cm = utils.calculate_cm(predictions, y)
        cm = utils.normalize_cm(cm)
        misclassifications(cm)
    if args.confusion_matrix:
        print('Creating confusion matrix...')
        cm_path = 'images/cm_' + args.model_name + '.png'
        utils.plot_cm(predictions, y, cm_path)
Example #5
def train(train_dataset, valid_dataset, validation_bool, test_dataset,
          fam_dict_path, num_column, num_trains, num_tests, test_file_path,
          args):
    # load model
    model = rna_model.DeepRfam(seq_length=args.seq_length,
                               num_c=num_column,
                               num_filters=args.num_filters,
                               filter_sizes=args.filter_sizes,
                               dropout_rate=args.keep_prob,
                               num_classes=args.num_classes,
                               num_hidden=args.num_hidden)
    print(model.summary())

    # model compile
    model.compile(
        loss=args.loss_function,
        optimizer=getattr(optimizers, args.optimizer)(lr=args.learning_rate),
        metrics=['accuracy'])

    # start and record training history
    if validation_bool:
        train_history = model.fit_generator(train_dataset,
                                            epochs=args.num_epochs,
                                            verbose=1,
                                            validation_data=valid_dataset,
                                            use_multiprocessing=True,
                                            workers=6)
    else:
        train_history = model.fit_generator(train_dataset,
                                            epochs=args.num_epochs,
                                            verbose=1,
                                            use_multiprocessing=True,
                                            workers=6)

    # # test accuracy
    # t1 = time.time()
    # scores = model.evaluate_generator(test_dataset, steps=num_tests // args.batch_size + 1)
    # delta_t = time.time() - t1
    # print(f"Running time (Prediction):{delta_t} (s)\nAccuracy:{scores[1]}")
    # print(f"Running time (Prediction):{delta_t} (s)\nAccuracy:{scores[1]}")

    # =================================logging=============================================
    local_time = time.strftime("%m-%d_%H-%M", time.localtime())
    # determine log file name and `mkdir`
    if args.log_name is None:
        log_file_name = local_time
    else:
        log_file_name = local_time + '_' + args.log_name
    # os.system(f"mkdir -p {args.log_dir}/{log_file_name}")
    os.makedirs(f"{args.log_dir}/{log_file_name}")

    # save model to .h5 file
    model.save(f"{args.log_dir}/{log_file_name}/{log_file_name}.h5")

    # save the image of model structure
    plot_model(model,
               to_file=f"{args.log_dir}/{log_file_name}/model_structure.png",
               show_shapes=True)

    # save confusion matrix into .csv file
    # prediction = model.predict_generator(test_generator, workers=6, use_multiprocessing=True)
    prediction = model.predict_generator(
        test_dataset)  # avoid multiprocessing here

    # get the list of true label
    with open(test_file_path) as f:
        label_list = []
        for line in f:
            line = line.strip()
            seq_index = line.split(',').pop(0)
            if seq_index != '':
                label_list.append(int(seq_index))
            else:
                pass

    prediction = prediction[:len(label_list)]
    prediction_1d = np.argmax(prediction, axis=1)
    # print("Length of true label:", len(label_list))
    # print("Length of predict label:", len(prediction_1d))
    utils.cm2csv(true_labels=label_list,
                 predicted_labels=prediction_1d,
                 dict_file=fam_dict_path,
                 save_dir=f"{args.log_dir}/{log_file_name}")
    print('Accuracy:', accuracy_score(label_list, prediction_1d))

    # generate the confusion matrix
    if args.num_classes <= 20:
        utils.plot_cm(true_labels=label_list,
                      predicted_labels=prediction_1d,
                      dict_file=fam_dict_path,
                      title=f'Confusion Matrix',
                      save_dir=f"{args.log_dir}/{log_file_name}")
    else:
        pass

    # draw and save history plot
    utils.plot_history(train_history, f"{args.log_dir}/{log_file_name}")

    # save the classification report
    utils.classification_report(true_labels=label_list,
                                predicted_labels=prediction_1d,
                                dict_file=fam_dict_path,
                                save_dir=f"{args.log_dir}/{log_file_name}",
                                std_out=True)

    # save history to .csv file
    with open(f"{args.log_dir}/history.csv", 'a') as csv:
        print(
            f'{local_time},{log_file_name},{args.dataset},{accuracy_score(label_list, prediction_1d)},{str(args.filter_sizes).replace(","," ")},{args.num_filters},{args.batch_size},{args.num_epochs},{args.keep_prob},{args.num_hidden},{args.learning_rate},{args.loss_function},{args.optimizer}, ',
            file=csv)
Example #6
    args = parser.parse_args()
    cifar_dir = args.cifar_root
    fig_path = args.fig_path
    validation_split = args.val_split
    batch_size = args.batch_size
    epochs = args.epochs
    weight_path = args.weight_path
    weight_decay = args.weight_decay
    lr = args.lr

    SEED = args.seed  # set random seed (defaults to 1234)

    # split train, val, test from `get_data` function
    train_loader, val_loader, test_loader = get_data(cifar_dir=cifar_dir, batch_size=batch_size, augment=True, validation_split=validation_split)

    # load model
    model = VGG_lite()
    # define loss
    loss = nn.CrossEntropyLoss()
    # train the model
    model, history = train(model, train_loader, val_loader, epochs, loss, batch_size, optimizer='adam', weight_decay=weight_decay, lr=lr)

    # save the model according to `weight_path` from the parser (defaults to './weights/final.pth')
    torch.save(model.state_dict(), weight_path)

    plot_history(history, fig_path) # save figures

    acc, cm, cm_norm = evaluate(model, test_loader) # evaluate trained model
    plot_cm(cm, cm_norm, fig_path) # save confusion matrix figures
    print('Test Accuracy: {}%'.format(round(acc*100, 4))) # print the model test accuracy
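Example #6 relies on an `evaluate(model, test_loader)` helper that returns the test accuracy together with a raw and a row-normalized confusion matrix. The following is a minimal sketch under that assumption (10 CIFAR-style classes, CPU inference); it is not the project's actual implementation.

import numpy as np
import torch


def evaluate(model, data_loader, num_classes=10):
    """Return (accuracy, confusion matrix, row-normalized confusion matrix)."""
    model.eval()
    cm = np.zeros((num_classes, num_classes), dtype=np.int64)
    with torch.no_grad():
        for inputs, targets in data_loader:
            preds = model(inputs).argmax(dim=1)
            for t, p in zip(targets.tolist(), preds.tolist()):
                cm[t, p] += 1
    acc = np.trace(cm) / cm.sum()
    cm_norm = cm / cm.sum(axis=1, keepdims=True)
    return acc, cm, cm_norm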
Example #7

    ### Training ###
    model, history_training = train_model(model=model, hist=history_training, criterion=criterion, 
                                          optimizer=optimizer, dataloaders=dataloaders, dataset_sizes=dataset_sizes, 
                                          data_augment=DATA_AUGMENT, scheduler=lr_sched, num_epochs=EPOCHS, patience_es= 15)


    ### Testing ###
    history_training = test_model(model=model, hist=history_training, criterion=criterion, 
                                  dataloaders=dataloaders, dataset_sizes=dataset_sizes)


    ### Save the model ###
    save_model(model=model, hist=history_training, 
               trained_models_path=MODEL_PATH, model_type=MODEL_TYPE, do_save=SAVING)


    ### Plotting the losses ###
    plot_training(hist=history_training, graphs_path=GRAPHS_PATH, 
                  model_type=MODEL_TYPE, do_save=SAVING)


    ### Plotting the CM ###
    plot_cm(hist=history_training, graphs_path=GRAPHS_PATH,
            model_type=MODEL_TYPE, do_save=SAVING)


    ### Give the classification report ###
    classif_report(hist=history_training)
Example #8
def task2():
    # Create a MobileNet model
    mobile = MobileNet(weights='imagenet')

    # See a summary of the layers in the model
    mobile.summary()

    # Modify the model
    # Exclude the last 5 layers of the model
    x = mobile.layers[-6].output
    # Add a dropout and dense layer for predictions
    x = Dropout(0.25)(x)
    predictions = Dense(7, activation='softmax')(x)

    # Create a new model with the new outputs
    model = Model(inputs=mobile.input, outputs=predictions)

    # See a summary of the new layers in the model
    model.summary()

    # Freeze the weights of the layers that we aren't training (training the last 23)
    for layer in model.layers[:-23]:
        layer.trainable = False


    # Compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Useful variables
    data_folder = '../res/Task 2/Training'
    test_folder = '../res/Task 2/Test'
    total_train = 8012
    total_test = 2003
    labels = ["AK", "BCC", "BK", "D", "MN", "M", "VL"]
    batch_size = 100 
    epochs = 10

    # this is the augmentation configuration we will use for training
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

    # this is the augmentation configuration we will use for testing:
    # only rescaling
    test_datagen = ImageDataGenerator(rescale=1./255)

    train_generator = train_datagen.flow_from_directory(
        data_folder, class_mode='categorical', batch_size=batch_size, target_size=(224, 224),)
    test_generator = test_datagen.flow_from_directory(
        test_folder, class_mode='categorical', batch_size=batch_size, target_size=(224, 224),
        shuffle=False)  # keep file order so test_generator.classes lines up with the predictions below
    
    # Try to deal with class imbalance: calculate class_weights so that the minority classes have a larger weight
    # than the majority classes.
    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(train_generator.classes),
        y=train_generator.classes)
    class_weights = dict(enumerate(class_weights))

    # Train the model
    model.fit_generator(
        train_generator,
        steps_per_epoch=total_train // batch_size,
        epochs=epochs,
        class_weight=class_weights
        )

    # Evaluate the model accuracy with the testing dataset
    scores = model.evaluate_generator(test_generator, total_test // batch_size)
    print("Test accuracy = ", scores[1])

    # Generate predictions with the test dataset
    # softmax returns a value for each class
    # the predicted class for a given sample will be the one that has the maximum value
    predictions = model.predict_generator(test_generator, total_test // batch_size + 1)
    y_pred = np.argmax(predictions, axis=1)

    # Save the predictions in a csv file
    with open('results2.csv', mode="w") as results_file:
        writer = csv.writer(results_file, delimiter=',',
                        quotechar='"', quoting=csv.QUOTE_MINIMAL)

        for x in predictions:
            writer.writerow(x)

    # Generate confusion matrix and classification report
    # Helps to evaluate metrics such as accuracy, precision, recall
    print('Confusion Matrix')
    cm = confusion_matrix(test_generator.classes, y_pred)
    print(cm)
    plot_cm(cm, labels, "second.png")

    print('Classification Report')
    print(classification_report(test_generator.classes, y_pred, target_names=labels))
Example #9
def task1_CNN():
    # Build the model
    model = Sequential()

    model.add(Conv2D(16, (3, 3), input_shape=(150, 150, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_first"))
    model.add(BatchNormalization())

    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_first"))
    model.add(BatchNormalization())

    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format="channels_first"))

    # this converts our 3D feature maps to 1D feature vectors
    # add 1 dropout layer in order to prevent overfitting
    model.add(Flatten())
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # compile the model
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    # Useful variables
    data_folder = '../res/Task 1/Training'
    test_folder = '../res/Task 1/Test'
    total_train = 900
    total_test = 379
    batch_size = 100  # Higher batch size than usual in order to have a higher probability of encountering malignant samples in each batch
    epochs = 50
    labels = ["bening", "malignant"]

    # this is the augmentation configuration we will use for training
    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       shear_range=0.2,
                                       zoom_range=0.2,
                                       horizontal_flip=True)

    train_generator = train_datagen.flow_from_directory(
        data_folder,
        class_mode='binary',
        batch_size=batch_size,
        target_size=(150, 150),
    )

    # this is the augmentation configuration we will use for testing:
    # only rescaling
    test_datagen = ImageDataGenerator(rescale=1. / 255)

    test_generator = test_datagen.flow_from_directory(
        test_folder,
        class_mode='binary',
        batch_size=batch_size,
        target_size=(150, 150),
        shuffle=False,  # keep file order so test_generator.classes lines up with the predictions below
    )

    # Try to deal with class imbalance: calculate class_weights so that the malignant class has a larger weight
    # than the benign class.
    counter = Counter(train_generator.classes)
    max_val = float(max(counter.values()))
    class_weights = {
        class_id: max_val / num_images
        for class_id, num_images in counter.items()
    }

    # Train the model
    model.fit_generator(train_generator,
                        steps_per_epoch=total_train // batch_size,
                        epochs=epochs,
                        class_weight=class_weights)

    # Evaluate the model accuracy with the testing dataset
    scores = model.evaluate_generator(test_generator, total_test // batch_size)
    print("Test accuracy = ", scores[1])

    # Generate predictions with the test dataset
    # sigmoid returns a value between 0 and 1, with 0.5 as the decision threshold:
    # if the value is lower than 0.5, the model believes the sample is benign;
    # if the value is higher than 0.5, the model believes the sample is malignant.
    # The closer the value is to 0 or 1, the more confident the model is that the
    # sample is benign or malignant, respectively.
    predictions = model.predict_generator(test_generator,
                                          total_test // batch_size + 1)
    predicted_classes = [1 * (x[0] >= 0.5) for x in predictions]

    # Save the predictions in a csv file
    with open('results.csv', mode="w") as results_file:
        writer = csv.writer(results_file,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)

        for x in predictions:
            writer.writerow(x)

    # Generate confusion matrix and classification report
    # Helps to evaluate metrics such as accuracy, precision, recall
    true_classes = test_generator.classes
    class_labels = list(test_generator.class_indices.keys())

    print('Confusion Matrix')
    cm = confusion_matrix(true_classes, predicted_classes)
    print(cm)
    plot_cm(cm, labels, "first.png")

    print('Classification Report')
    print(
        classification_report(true_classes,
                              predicted_classes,
                              target_names=class_labels))
Example #10
def train(train_dataset, valid_dataset, validation__bool, test_dataset,
          label_list, fam_path, num_channels, num_trains, num_valids,
          num_tests, args):
    # load model
    model = rna_model.L5CFam(seq_length=args.seq_length,
                             num_filters=args.num_filters,
                             num_channels=num_channels,
                             filter_sizes=args.filter_sizes,
                             dropout_rate=args.keep_prob,
                             num_classes=args.num_classes,
                             num_hidden=args.num_hidden)
    print(model.summary())

    # model compile
    model.compile(loss=args.loss_function,
                  optimizer=args.optimizer,
                  metrics=['accuracy'])

    # start and record training history
    if validation__bool:
        train_history = model.fit_generator(train_dataset,
                                            epochs=args.num_epochs,
                                            verbose=1,
                                            validation_data=valid_dataset,
                                            workers=6,
                                            use_multiprocessing=True)
    else:
        train_history = model.fit_generator(train_dataset,
                                            epochs=args.num_epochs,
                                            verbose=1,
                                            workers=6,
                                            use_multiprocessing=True)

    # # test accuracy
    # t1 = time.time()
    # scores = model.evaluate_generator(test_dataset, steps=num_tests // args.batch_size + 1)
    # delta_t = time.time() - t1
    # print(f"Running time (Prediction):{delta_t} (s)\nAccuracy:{scores[1]}")

    # =================================logging=============================================
    local_time = time.strftime("%m-%d_%H-%M", time.localtime())
    # determine log file name and `mkdir`
    if args.log_name is None:
        log_file_name = local_time
    else:
        log_file_name = local_time + '_' + args.log_name
    # os.system(f"mkdir -p {args.log_dir}/{log_file_name}")
    os.makedirs(f"{args.log_dir}/{log_file_name}")

    # save model to .h5 file
    model.save(f"{args.log_dir}/{log_file_name}/{log_file_name}.h5")

    # save the image of model structure
    plot_model(model,
               to_file=f"{args.log_dir}/{log_file_name}/model_structure.png",
               show_shapes=True)

    # save confusion matrix into .csv file
    prediction = model.predict_generator(test_dataset,
                                         workers=6,
                                         use_multiprocessing=True)
    prediction_1d = np.argmax(prediction, axis=1)
    # generate the list of the true label
    # label_list = np.zeros((num_tests,), dtype=int)
    # no_label = 0
    # for i in range(1, num_tests):
    #     if i % int(num_tests / args.num_classes) == 0:
    #         no_label += 1
    #     label_list[i] = no_label

    utils.cm2csv(true_labels=label_list,
                 predicted_labels=prediction_1d,
                 dict_file=fam_path,
                 save_dir=f"{args.log_dir}/{log_file_name}")
    print('Accuracy:', accuracy_score(label_list, prediction_1d))

    # draw and save history plot
    utils.plot_history(train_history, f"{args.log_dir}/{log_file_name}")

    # generate the confusion matrix
    if args.num_classes <= 20:
        utils.plot_cm(true_labels=label_list,
                      predicted_labels=prediction_1d,
                      dict_file=fam_path,
                      title='Confusion Matrix',
                      save_dir=f"{args.log_dir}/{log_file_name}")
    else:
        pass

    # save the classification report
    utils.classification_report(true_labels=label_list,
                                predicted_labels=prediction_1d,
                                dict_file=fam_path,
                                save_dir=f"{args.log_dir}/{log_file_name}",
                                std_out=True)

    # save history to .csv file
    with open(f"{args.log_dir}/history.csv", 'a') as csv:
        print(
            f'{local_time},{log_file_name},{args.dataset},{accuracy_score(label_list, prediction_1d)},{str(args.filter_sizes).replace(","," ")},{str(args.num_filters).replace(",","")},{args.batch_size},{args.num_epochs},{args.keep_prob},{str(args.num_hidden).replace(",","")},{args.learning_rate},{args.loss_function},{args.optimizer}, ',
            file=csv)