Example #1
def main():
    # Data processing
    df = dp.load_data('pol_regression.csv')

    x_train, y_train, x_test, y_test = dp.split_data(df)

    # Lists to hold the training and test errors for each degree
    train_error = []
    test_error = []

    # Plot ground truth
    plt.figure()
    plt.ylim(0, 1.5)
    #plt.plot(x_train, y_train, 'bo')
    plt.plot(x_test, y_test, 'bo')

    colors = ['r', 'y', 'b', 'c', 'k', 'm', 'g']

    # Perform polynomial regression for powers 0 to 10
    for i, degree in enumerate([0, 1, 2, 3, 4, 5, 10]):
        w = 1

        if degree != 0:
            # Calculate the coefficients based on the training values
            w = pol.pol_regression(x_train, y_train, degree)

        # Make predictions for the training and test data
        y_train_hat = pol.prediction(x_train, w, degree)
        y_test_hat = pol.prediction(x_test, w, degree)

        # Plot predictions (sort by x so the curve is drawn left to right)
        sorted_pairs = zip(*sorted(zip(x_test, y_test_hat)))
        plt.plot(*sorted_pairs, color=colors[i])

        # Measure accuracy of model
        # RMSE of training set
        train_error.append(
            pol.eval_pol_regression(y_train_hat, w, x_train, y_train, degree))

        # RMSE of testing set
        test_error.append(
            pol.eval_pol_regression(y_test_hat, w, x_test, y_test, degree))

        print("[Degree: {0}] - Train: {1:.4f}, Test: {2:.4f}".format(
            degree, train_error[i], test_error[i]))

    plt.legend(('ground truth', '$x^0$', '$x$', '$x^2$', '$x^3$', '$x^4$',
                '$x^5$', '$x^{10}$'),
               loc='lower right')
    plt.savefig(os.path.join('images', 'polynomial_split.png'))

    pol.plot_error_graph(train_error, test_error)
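
The helpers pol.pol_regression, pol.prediction, and pol.eval_pol_regression come from a separate module that is not shown here. A minimal sketch of what they might look like, assuming an ordinary least-squares fit on a Vandermonde design matrix (the implementation below is an assumption, not the original module):

import numpy as np

def pol_regression(x, y, degree):
    # Design matrix [1, x, x^2, ..., x^degree], solved by least squares
    X = np.vander(np.asarray(x), degree + 1, increasing=True)
    w, *_ = np.linalg.lstsq(X, np.asarray(y), rcond=None)
    return w

def prediction(x, w, degree):
    # Evaluate the fitted polynomial; for degree 0 this is just the constant w
    X = np.vander(np.asarray(x), degree + 1, increasing=True)
    return X @ np.atleast_1d(w)

def eval_pol_regression(y_hat, w, x, y, degree):
    # Root-mean-square error between predictions and targets
    # (w, x, and degree are kept only to mirror the call signature above)
    return np.sqrt(np.mean((np.asarray(y_hat) - np.asarray(y)) ** 2))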
Example #2

def main():
    # data generator
    data_generator = ImageDataGenerator(
        featurewise_center=False,
        featurewise_std_normalization=False,
        rotation_range=0,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=.1,
        horizontal_flip=True)

    model = CNN()
    opt = optimizers.Adam(lr=0.0001)
    # opt = optimizers.SGD(lr=0.001)

    model.compile(opt, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    # callbacks
    log_file_path = base_path + 'gender_classification_training.log'
    csv_logger = CSVLogger(log_file_path, append=False)  # append=False overwrites any old log
    early_stop = EarlyStopping('val_loss', patience=patience)
    reduce_lr = ReduceLROnPlateau('val_loss', factor=0.1, patience=int(patience/4), verbose=1)

    trained_models = base_path + 'CNN.{epoch:02d}-{val_loss:.3f}-{val_acc:.2f}.hdf5'
    # model_cp = ModelCheckpoint(trained_models, 'val_acc', verbose=1, save_best_only=True)
    model_cp = ModelCheckpoint(trained_models, 'val_loss', verbose=1, save_best_only=True)
    callbacks = [model_cp, csv_logger, early_stop, reduce_lr]

    # load data
    faces, labels = load_data(data_path)
    print(len(faces))
    print(len(labels))
    faces = preprocess_input(faces)
    order = np.argsort(np.random.random(len(faces)))
    faces = faces[order]
    labels = labels[order]

    train_data, val_data = split_data(faces, labels, validation_split)
    train_faces, train_labels = train_data
    model.fit_generator(data_generator.flow(train_faces, train_labels, batch_size),
                        steps_per_epoch=len(train_faces) // batch_size,
                        epochs=num_epochs,
                        verbose=1,
                        callbacks=callbacks,
                        validation_data=val_data)
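
split_data here is a project helper that is not shown in the snippet. A minimal sketch, assuming it simply holds out the last validation_split fraction of the already-shuffled arrays:

def split_data(x, y, validation_split=0.2):
    # Hold out the last `validation_split` fraction as validation data
    num_train = int(len(x) * (1 - validation_split))
    train_data = (x[:num_train], y[:num_train])
    val_data = (x[num_train:], y[num_train:])
    return train_data, val_data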
Example #3

def make_filenames_list_from_subdir(src_dir, shape, ratio):
    """
	Use names of subdirs as a id.
	And then calculate class_index from id.
	"""

    class_id_set = set()
    #bottleneck_data = dict()
    feature_vectors, labels, filenames = [], [], []

    image_size = (shape[0], shape[1])
    listdir = os.listdir(src_dir)

    # 1) find out the number of classes
    for class_id in listdir:

        subdir = src_dir + '/' + class_id
        if not os.path.isdir(subdir): continue

        if len(os.listdir(subdir)) == 0:
            continue
        try:
            # Subdirectory names are expected to be integer class ids
            class_id_set.add(int(class_id))
        except ValueError:
            continue

    # 2) map each class_id to a class_index
    id_list = list(class_id_set)
    id_list.sort()
    print('Number of classes in the sample: {0}'.format(len(id_list)))
    print('Min class id: {0}'.format(min(id_list)))
    print('Max class id: {0}'.format(max(id_list)))
    map_id_label = {class_id: index for index, class_id in enumerate(id_list)}
    map_label_id = {index: class_id for index, class_id in enumerate(id_list)}
    maps = {'id_label': map_id_label, 'label_id': map_label_id}
    num_classes = len(map_id_label)
    NUM_CLASSES = num_classes

    save_labels_to_file(settings.LABELS_FILE,
                        map_label_id)  # create the file labels.txt

    for class_id in class_id_set:

        subdir = src_dir + '/' + str(class_id)
        print(subdir)
        files = os.listdir(subdir)
        num_files = len(files)

        for index_file, filename in enumerate(files):

            base = os.path.splitext(filename)[0]
            ext = os.path.splitext(filename)[1]
            if ext not in {'.jpg', '.png'}: continue

            # ????
            #if base.split('_')[-1] != '0p': continue # use only _0p.jpg files

            class_index = map_id_label[class_id]

            label = class_index
            #label = [0]*num_classes
            #label[class_index] = 1

            file_path = subdir + '/' + filename

            #im = Image.open(file_path)
            #im = im.resize(image_size, Image.ANTIALIAS)
            #arr = np.array(im, dtype=np.float32) / 256
            #feature_vector = bottleneck_tensor.eval(feed_dict={ x : [arr] })
            #feature_vectors.append(feature_vector)

            feature_vectors.append(0)  # not used
            filenames.append(file_path)  # filename or file_path
            labels.append(label)

            #im.close()
            print("dir={0}, class={1}: {2}/{3}: {4}".format(
                class_id, class_index, index_file, num_files, filename))

    print('----')
    print('Number of classes: {0}'.format(num_classes))
    print('Number of feature vectors: {0}'.format(len(feature_vectors)))

    data = {
        'images': feature_vectors,
        'labels': labels,
        'filenames': filenames
    }

    # shuffle data
    if settings.DO_MIX:
        print('Shuffling data')
        zip3 = list(zip(data['images'], data['labels'], data['filenames']))
        random.shuffle(zip3)
        print('Shuffle done')
        data['images'] = [x[0] for x in zip3]
        data['labels'] = [x[1] for x in zip3]
        data['filenames'] = [x[2] for x in zip3]

    print('Split data')
    #data = split_data.split_data_v3(data, ratio=ratio)
    data = split_data(data, ratio=ratio, do_balancing=settings.DO_BALANCING)

    assert type(data['train']['labels'][0]) is int
    assert type(data['train']['filenames'][0]) is str
    #print(data['train']['labels'])
    #print(data['train']['filenames'])
    print('TRAIN')
    for i in range(len(data['train']['labels'])):
        print('{0} - {1}'.format(data['train']['labels'][i],
                                 data['train']['filenames'][i]))
    print('VALID')
    for i in range(len(data['valid']['labels'])):
        print('{0} - {1}'.format(data['valid']['labels'][i],
                                 data['valid']['filenames'][i]))

    data['id_label'] = map_id_label
    data['label_id'] = map_label_id
    data['num_classes'] = num_classes

    return data
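
A hypothetical usage of the function above (the directory name and image shape are made-up values; settings, save_labels_to_file, and split_data must already be defined in the surrounding module):

data = make_filenames_list_from_subdir('dataset', shape=(224, 224, 3), ratio=0.8)
print(data['num_classes'])
print(len(data['train']['filenames']), len(data['valid']['filenames']))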
Example #4
def split_variation(X=None, y=None, step=0.05):
    """ Function used to split the data into different ratios for the training and the testing data where,
         the training data varies in "step" intervals as specified and the LR classifier is trained
        with the given splits and then we obtain the respective accuracy and F1-Score"""
    if X is None or y is None:
        print("Data not provided.")
        return

    splits = []
    accuracies = {"train": [], "test": []}
    f1_scores = {"train": [], "test": []}

    for split in np.arange(step, 1.0, step):
        splits.append(split * 100)
        X_train, X_test, y_train, y_test = split_data(X, y, split)

        classifier = Classifier(model="logistic")
        classifier.train(X_train, y_train)

        classifier.validate(X_train, y_train)
        train_accuracy = classifier.model_accuracy()
        train_score = classifier.model_score()

        classifier.validate(X_test, y_test)
        test_accuracy = classifier.model_accuracy()
        test_score = classifier.model_score()

        accuracies["train"].append(train_accuracy)
        accuracies["test"].append(test_accuracy)
        f1_scores["train"].append(train_score)
        f1_scores["test"].append(test_score)

    splits = np.array(splits)

    training = {
        "accuracy": np.array(accuracies["train"]),
        "score": np.array(f1_scores["train"]),
    }

    testing = {
        "accuracy": np.array(accuracies["test"]),
        "score": np.array(f1_scores["test"]),
    }

    table = pd.DataFrame({
        "Train size (%)": splits,
        "Training Accuracy": training["accuracy"],
        "Training F1-Score": training["score"],
        "Testing Accuracy": testing["accuracy"],
        "Testing F1-Score": testing["score"],
    })

    display(table.style.set_caption("Variation of performance with train-test split"))

    plt.figure(figsize=(15, 10))
    plt.plot(splits, training["accuracy"], c="blue", label="Training Accuracy")
    plt.plot(splits,
             testing["accuracy"],
             c="green",
             label="Validation Accuracy")
    plt.ylabel("Accuracy")
    plt.xlabel("Training Set Size (%)")
    plt.title("Training Set Size vs Accuracy")
    plt.legend()
    plt.show()
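
split_data is assumed to take the training fraction as its third argument and return the four arrays in the order unpacked above. A minimal sketch using scikit-learn's train_test_split (an assumption, not the original helper):

from sklearn.model_selection import train_test_split

def split_data(X, y, split):
    # train_size=split puts that fraction of the samples in the training set
    return train_test_split(X, y, train_size=split, random_state=42)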
Example #5
    './car.data',
    'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data')
df.columns = [
    'buying', 'maintenance', 'doors', 'people', 'lug_boot', 'safety', 'class'
]

# CONVERT STRING VALUES TO THEIR NUMERICAL COUNTERPARTS (FASTER CALCULATION)
convert_to_numerical(df,
                     columns=[
                         'buying', 'maintenance', 'doors', 'people',
                         'lug_boot', 'safety', 'class'
                     ],
                     inplace=True)

# SPLIT DATASET INTO TRAINING, VALIDATION, TESTING
training, validation, test = split_data(df, inplace=True)

# CREATE CLASSIFIER AND FIT IT TO TRAINING DATA
training_X = training.iloc[:, :-1]
training_y = training.iloc[:, -1]
my_clf = DecisionTree(metric='gini')
my_clf.fit(training_X, training_y)
sklearn_clf = DecisionTreeClassifier()
sklearn_clf.fit(training_X, training_y)

# SPLIT VALIDATION DATASETS INTO X AND y
validation_X = validation.iloc[:, :-1]
validation_y = validation.iloc[:, -1]

# PRINT METRICS FOR PRUNED AND UNPRUNED DECISION TREE
my_predictions = my_clf.predict(validation_X)
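
The listing stops at the prediction step. A minimal continuation sketch that scores both trees with scikit-learn's accuracy_score (this part is not in the original example):

from sklearn.metrics import accuracy_score

sklearn_predictions = sklearn_clf.predict(validation_X)
print('DecisionTree (gini) accuracy:', accuracy_score(validation_y, my_predictions))
print('sklearn DecisionTreeClassifier accuracy:', accuracy_score(validation_y, sklearn_predictions))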
Example #6

def main(_):
    with tf.device('/gpu:0'):

        for regularization_type in ['Blackout', 'None', 'L1', 'L2']:

            dataset_sizes = np.linspace(2500, 55000, num=22)
            for size in dataset_sizes:
                # Getting the appropriate dataset

                print(int(size))
                train_x, train_y, valid_x, valid_y, test_x, test_y = split_data(
                    dataset, int(size))

                # Reset the graph in case of multiple runs in the same console
                tf.reset_default_graph()
                for i in range(numOfTests):
                    num_layers = random.choice([5, 6, 7, 8, 9, 10])
                    num_nodes = random.choice([200, 400, 600])
                    num_inputs = int(train_x.shape[1])
                    num_steps = random.choice([50, 100, 150, 200])
                    regularization_scale = random.choice(
                        [0.01, 0.005, 0.001, 0.0005])
                    percent_connections_kept = random.choice([0.9, 0.95, 0.85])
                    num_classes = len(np.unique(train_y))

                    print('Test No. ' + str(i) + '/' + str(numOfTests))
                    print('Parameters: ' + str(size) + ',' +
                          regularization_type + ',' + str(num_layers) + ',' +
                          str(num_nodes) + ',' + str(num_steps) + ',' +
                          str(regularization_scale) + ',' +
                          str(percent_connections_kept))

                    # Create the model
                    x = tf.placeholder(tf.float32, [None, num_inputs])
                    y = create_model(x, num_layers, num_nodes, num_classes)

                    # Define loss and optimizer
                    y_ = tf.placeholder(tf.int64, [None])

                    # Retrieving weights and defining regularization penalty
                    weights = tf.trainable_variables()
                    regularization_penalty, blackout_weights = get_regularization_penalty(
                        weights, regularization_scale,
                        percent_connections_kept, regularization_type)

                    # Defining loss and optimizer
                    cross = tf.losses.sparse_softmax_cross_entropy(labels=y_,
                                                                   logits=y)
                    loss = cross + regularization_penalty
                    train_step = tf.train.RMSPropOptimizer(0.001).minimize(
                        loss)

                    # Evaluate Model
                    correct_prediction = tf.equal(tf.argmax(y, 1), y_)
                    accuracy = tf.reduce_mean(
                        tf.cast(correct_prediction, tf.float32))
                    config = tf.ConfigProto()
                    config.gpu_options.allow_growth = True

                    # Initializing session
                    sess = tf.InteractiveSession(config=config)
                    tf.global_variables_initializer().run()

                    # Train
                    #                PercentageOfConnOff=[]
                    #                LossFunctionRegu=[]
                    #                LossFunctionCrossTrain=[]
                    #                LossFunctionCrossValid=[]
                    #
                    numOfBatches = 50
                    all_batches_x, all_batches_y = get_batches(
                        train_x, train_y, numOfBatches)

                    # Train
                    for step in range(num_steps):
                        # Pick a random batch; valid indices are 0..numOfBatches-1
                        randomPick = random.randint(0, numOfBatches - 1)
                        #print(str(len(all_batches_x)) + " getting " + str(randomPick))
                        currentBatchX = all_batches_x[randomPick]
                        currentBatchY = all_batches_y[randomPick]
                        sess.run(train_step,
                                 feed_dict={
                                     x: currentBatchX,
                                     y_: currentBatchY
                                 })
                        # Test trained model
                        if step % 20 == 1:
                            print('Accuracy: ' + str(
                                sess.run(accuracy,
                                         feed_dict={
                                             x: valid_x,
                                             y_: valid_y
                                         })))
    #                            if regularization_type=='Blackout':
    #                                currentWeights=sess.run(blackout_weights)
    #                                part1=currentWeights>-0.01
    #                                part2=currentWeights<0.01
    #                                turnedOff=np.sum(np.logical_and(part1,part2))
    #                                TotalNumOfWeights=float(currentWeights.shape[0])
    #                                LossFunctionCrossTrain.append(sess.run(cross, feed_dict={x: train_x, y_: train_y}))
    #                                LossFunctionCrossValid.append(sess.run(cross, feed_dict={x: valid_x, y_: valid_y}))
    #                                LossFunctionRegu.append(sess.run(regularization_penalty))
    #                                PercentageOfConnOff.append((TotalNumOfWeights-turnedOff)/TotalNumOfWeights)
    #if regularization_type=='Blackout':
    #    fig = plt.figure()
    #    ax1 = fig.add_subplot(1, 2, 1)
    #   ax2 = fig.add_subplot(1, 2, 2)
    #    ax1.plot(PercentageOfConnOff)
    #    ax2.plot(LossFunctionCrossTrain,label='Cross-Entropy Train')
    #    ax2.plot(LossFunctionCrossValid,label='Cross-Entropy Validation')
    #    ax2.plot(LossFunctionRegu,label='Regularization')
    #    ax2.legend()
    #    fig.show()
                    accuracyVal = sess.run(accuracy,
                                           feed_dict={
                                               x: valid_x,
                                               y_: valid_y
                                           })
                    accuracyTest = sess.run(accuracy,
                                            feed_dict={
                                                x: test_x,
                                                y_: test_y
                                            })
                    tf.reset_default_graph()
                    store_results(dataset, regularization_type, num_layers,
                                  num_nodes, num_steps, regularization_scale,
                                  percent_connections_kept, accuracyVal,
                                  accuracyTest, size)
                    print('Accuracy Val: ' + str(accuracyVal) +
                          ' , Accuracy Test: ' + str(accuracyTest))
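
get_batches is another project helper that is not shown. A minimal sketch, assuming it just slices the training data into numOfBatches roughly equal chunks:

import numpy as np

def get_batches(train_x, train_y, num_batches):
    # Split features and labels into the same number of chunks
    all_batches_x = np.array_split(train_x, num_batches)
    all_batches_y = np.array_split(train_y, num_batches)
    return all_batches_x, all_batches_y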
Example #7
    tensorflow>=1.5.0 
"""

import numpy as np

from sklearn.metrics import accuracy_score
from sklearn import preprocessing

from dbn import SupervisedDBNClassification

import data_processing

# load data
original_train_set, original_test_set = data_processing.split_data(
    npy_data_file='../data/all_train.npy',
    train_portion=0.01,
    split_mode='first',
    save='npy')
#original_train_set = np.load('../data/all_train.npy')
#original_test_set = np.load('../data/test_set.npy')

# fill in missing values
train_set = data_processing.missing_values(original_train_set, method='median')
test_set = data_processing.missing_values(original_test_set, method='median')

# get X and y
X_train = train_set[1:, 1:-1]
#X_scaled_train = preprocessing.scale(X_train)
min_max_scaler = preprocessing.MinMaxScaler()
X_scaled_train = min_max_scaler.fit_transform(X_train)
y_train = train_set[1:, -1]
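
The snippet ends before the DBN itself is built. A minimal continuation sketch, assuming the constructor arguments documented for SupervisedDBNClassification; the hyperparameter values are illustrative and not taken from the original:

classifier = SupervisedDBNClassification(hidden_layers_structure=[256, 256],
                                         learning_rate_rbm=0.05,
                                         learning_rate=0.1,
                                         n_epochs_rbm=10,
                                         n_iter_backprop=100,
                                         batch_size=32,
                                         activation_function='relu',
                                         dropout_p=0.2)
classifier.fit(X_scaled_train, y_train)

# Score on the test set, scaled with the same MinMaxScaler
X_test = test_set[1:, 1:-1]
y_test = test_set[1:, -1]
X_scaled_test = min_max_scaler.transform(X_test)
print('Test accuracy: {0:.4f}'.format(
    accuracy_score(y_test, classifier.predict(X_scaled_test))))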