def build_dataloader(args):
    """build data loader for model training and validding

    Args:
        args (argparse.Namespace): Command parameter parser

    Returns:
        DataLoader: train dataloader
        DataLoader: valid dataloader
        Vocab: vocab of corpus
    """

    max_seq_len = args.max_seq_len
    valid_rate = args.valid_rate
    batch_size = args.batch_size

    # Get training data and validation data set
    print('Getting the data...')
    data = get_data(args.data_json_path, args.wv_name, args.formated_data_path,
                    max_seq_len)
    x, y, vocab = data['x'], data['y'], data['vocab']

    # get dataloader
    dataset = gluon.data.SimpleDataset([[field1, field2]
                                        for field1, field2 in zip(x, y)])
    train_dataset, valid_dataset = nlp.data.train_valid_split(
        dataset, valid_rate)

    train_dataloader = get_dataloader(train_dataset, batch_size, is_train=True)
    valid_dataloader = get_dataloader(valid_dataset,
                                      batch_size,
                                      is_train=False)

    return train_dataloader, valid_dataloader, vocab
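
# Usage sketch (not from the original file): build_dataloader only needs the three
# attributes read above plus the arguments forwarded to get_data(); every concrete
# value below is an illustrative assumption.
if __name__ == '__main__':
    import argparse

    _args = argparse.Namespace(
        data_json_path='./data/train.json',        # assumed path
        wv_name='glove.6B.100d',                   # assumed embedding name
        formated_data_path='./data/formated.pkl',  # assumed cache path
        max_seq_len=64,
        valid_rate=0.1,
        batch_size=32)
    train_loader, valid_loader, vocab = build_dataloader(_args)
    for batch in train_loader:
        print(batch)  # one batch of [x, y] pairs as produced by get_dataloader()
        break
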
__author__ = 'joseph'

import prepare_data as prepare
import evaluate
import io_helper
from sklearn.neighbors import KNeighborsClassifier

train_data, validation_data, test_data, basic_users_info = prepare.get_data()
label_encoder = {}
train_x, train_y = prepare.get_exclude_ndf_x(train_data, basic_users_info, label_encoder)
validation_x, validation_y = prepare.get_exclude_ndf_x(validation_data, basic_users_info, label_encoder)

# max_ndcg = 0
# max_k = 0
# k_ndcg = {}
# for k in range(1, 100):
#     neighbor_classifier = KNeighborsClassifier(n_neighbors=k)
#     neighbor_classifier.fit(train_x, train_y)
#     validation_predict = neighbor_classifier.predict(validation_x)
#     predict_list = [[predict] for predict in validation_predict]
#
#     ndcg = evaluate.ndcg(predict_list, validation_data)
#     k_ndcg.setdefault(k, ndcg)
#     if ndcg > max_ndcg:
#         max_ndcg = ndcg
#         max_k = k
#         print(max_ndcg, max_k)
#
# io_helper.write_map_data(k_ndcg, '../records/k_neighbors_classifier.csv')
# (0.8724597056762439, 25)
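
# Fitting the final model with the swept value recorded above (best NDCG ~0.872 at
# k=25); this mirrors the commented-out loop and is a sketch, not code from the source.
neighbor_classifier = KNeighborsClassifier(n_neighbors=25)
neighbor_classifier.fit(train_x, train_y)
validation_predict = neighbor_classifier.predict(validation_x)
print(evaluate.ndcg([[predict] for predict in validation_predict], validation_data))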
Example #3
def extract_features(images):
    # Hypothetical name and signature; the original excerpt begins mid-function.
    extracted_images = []
    for image in images:
        extracted_images.append(
            np.array([
                average_intensity(image),
                detect_vertical_line(image),
                detect_horizontal_line(image),
                enclosed_space(image),
                average_horizontal_std(image),
                average_vertical_std(image)
            ]))
    return extracted_images


# load data set
(training_images, training_labels, validation_images, validation_labels,
 test_images, test_labels) = prepare_data.get_data()

# create classifier and fit to training data
baseline_classifier = tree.DecisionTreeClassifier(criterion="entropy")
tuned_classifier = tree.DecisionTreeClassifier(criterion="entropy",
                                               max_depth=9,
                                               min_samples_leaf=6)
feature_extracted_classifier = tree.DecisionTreeClassifier(criterion="entropy")

# -----------------------------------------------------------
# Select the classifier to be used (comment out all but one)

classifier = baseline_classifier
# classifier = tuned_classifier
# classifier = feature_extracted_classifier
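
# Continuation sketch (not from the original file): fit the selected tree and score the
# validation split. Assumes the arrays from prepare_data.get_data() are 2-D feature
# matrices acceptable to scikit-learn; flatten them first if they are raw images.
classifier.fit(training_images, training_labels)
validation_accuracy = classifier.score(validation_images, validation_labels)
print("validation accuracy: %.4f" % validation_accuracy)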
import sys

sys.path.append('..')
from util import *

from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

from prepare_data import get_data

X_all, y_all, groups, feature_names, subjects, labels, class_names = get_data(
    use_precomputed=True)

try:
    # feature_index = [('90' in x and 'deriv' in x and 'eog h' in x) for x in feature_names].index(True)
    feature_index = [('max of eog v' in x)
                     for x in feature_names].index(True)  #super great
    # feature_index = [('min' in x and 'deriv' in x and 'eog l' in x) for x in feature_names].index(True)

    # feature_index = [('std dev of eog h' in x) for x in feature_names].index(True) #meh
    # feature_index = [('std dev of eog v' in x) for x in feature_names].index(True) #quite good
    # feature_index = [('std dev of eog r' in x) for x in feature_names].index(True) #same as previous
    # feature_index = [('std dev of eog l' in x) for x in feature_names].index(True) #same as previous

    # feature_index = [('energy of eog l' in x) for x in feature_names].index(True) #different std devs
    # feature_index = [('max of gyro z' in x) for x in feature_names].index(True) #very bad
except ValueError:
    print("Feature not found. Exiting.")
    sys.exit(1)
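
# Illustrative follow-up (assumption, not original code): one plausible use of the
# resolved feature_index is a per-class histogram of that single feature. Assumes the
# columns of X_all align with feature_names and labels/class_names index the classes.
plt.figure()
for label, name in zip(labels, class_names):
    plt.hist(X_all[y_all == label, feature_index], bins=30, alpha=0.5, label=name)
plt.xlabel(feature_names[feature_index])
plt.legend()
plt.show()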
import sys
sys.path.append('..')
from util import *

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering, KMeans

from prepare_data import get_data



X_all, y_all, groups, feature_names, subjects, labels, class_names, is_moving_data, include_eog, include_imu = get_data(use_precomputed=True)


def rand_jitter(arr):
    stdev = 0.05*(max(arr)-min(arr))
    return arr + np.random.randn(len(arr)) * stdev



print("reducing dimensions with PCA")
plt.figure(1)
pca = PCA(n_components=2)
pca.fit(X_all)
X_all_reduced = pca.transform(X_all)
for i,label in enumerate(labels):
    mask = np.where(y_all == label)
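    # The excerpt stops mid-loop; a plausible completion (assumption, not the original
    # code) scatters the PCA-reduced points per class, jittered with rand_jitter() above.
    pts = X_all_reduced[mask]
    plt.scatter(rand_jitter(pts[:, 0]), rand_jitter(pts[:, 1]),
                s=4, alpha=0.6, label=class_names[i])
plt.legend()
plt.title('PCA, 2 components')
plt.show()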
Example #6
    use_response_similarity = False  # Can't use with discussion
    use_book_similarity = True

    # ---

    # Defining some key variables that will be used later on in the training
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 4
    VALID_BATCH_SIZE = 2
    N_EPOCHS = 16
    LEARNING_RATE = 1e-05
    print("DistilBertTokenizer")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

    # Get data
    mes_train, mes_test, class_train, class_test, book_idx_train, book_idx_test, response_link_train, response_link_test, class_dict = get_data(
        sheet, use_response_similarity, use_book_similarity)

    # Initialise data as dicts of lists.
    data_train = {'Message': mes_train, 'ENCODE_CAT': class_train}

    data_test = {'Message': mes_test, 'ENCODE_CAT': class_test}

    # Create DataFrame
    df_train = pd.DataFrame(data_train)
    df_test = pd.DataFrame(data_test)

    # Creating the dataset and dataloader for the neural network
    train_dataset = df_train
    test_dataset = df_test
    if len(test_dataset) % 2 != 0:
        test_dataset = test_dataset[:-1]
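
    # Illustrative sketch (not in the original excerpt): encoding a single message with
    # the tokenizer and MAX_LEN defined above; the keyword arguments are standard
    # HuggingFace tokenizer options, not values taken from this project.
    example_message = df_train['Message'].iloc[0]
    encoded = tokenizer(example_message,
                        max_length=MAX_LEN,
                        padding='max_length',
                        truncation=True,
                        return_tensors='pt')
    print(encoded['input_ids'].shape)  # (1, MAX_LEN)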
Example #7
# Imports reconstructed so the excerpt can run standalone; the project-local module
# names (models, prepare_data, utils) and the tf.keras import are assumptions.
import datetime
import os

import numpy as np
from matplotlib import pyplot as plt
from tensorflow import keras

import models
import prepare_data
import utils


def train():
    """Classify first, then propose."""
    log_name = str(datetime.datetime.now()).replace(' ', '_')[:-7]
    os.mkdir(f'./clf/logs/{log_name}')
    os.mkdir(f'./regress/logs/{log_name}')

    callbacks_clf = [
        keras.callbacks.TensorBoard(f'./clf/logs/{log_name}'),
        keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)
    ]
    callbacks_regress = [
        keras.callbacks.TensorBoard(f'./regress/logs/{log_name}'),
        keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
    ]

    data_classify, data_regress, n = prepare_data.get_data()
    model = models.bbox_regressor((128, 128, 3), logits_output=False)

    model_clf = keras.Model(inputs=model.inputs, outputs=model.outputs[0])
    model_clf.compile(
        optimizer=keras.optimizers.Adam(3e-6),  # 1e-5
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=[keras.metrics.SparseCategoricalAccuracy()])
    train_data_classify, val_data_classify = prepare_data.split_data(
        data_classify, n)
    model_clf.fit(train_data_classify,
                  epochs=10000,
                  validation_data=val_data_classify,
                  callbacks=callbacks_clf)
    # model.save('./clf/first_model.h5')

    # imgs = []
    # for label in prepare_data_v2.classes:
    #     imgs.append(plt.imread(f'./tiny_vid/{label}/000001.JPEG'))
    # imgs = np.array(imgs)
    # rsts = model_clf.predict(imgs)
    # for i, img, label in zip(range(1, imgs.shape[0] + 1), imgs, rsts):
    #     plt.subplot(3, 2, i)
    #     plt.imshow(img)
    #     label = np.argmax(label)
    #     plt.title(prepare_data_v2.classes[label])
    # plt.savefig(f'clf_full_{log_name}.jpg')
    # plt.show()
    #
    # freeze conv layers
    for layer in model.layers:
        name = layer.name
        if 'block' in name:
            layer.trainable = False

    model_regress = keras.Model(inputs=model.inputs, outputs=model.outputs[1])
    model_regress.compile(
        optimizer=keras.optimizers.Adam(5e-7),  # too large
        loss=keras.losses.MeanSquaredError(),
        metrics=[keras.metrics.MeanAbsoluteError()])
    train_data_regress, val_data_regress = prepare_data.split_data(
        data_regress, n)
    model_regress.fit(train_data_regress,
                      epochs=10000,
                      validation_data=val_data_regress,
                      callbacks=callbacks_regress)

    # Visualize

    imgs = []

    for label in prepare_data.classes:
        imgs.append(plt.imread(f'./tiny_vid/{label}/000111.JPEG'))

    imgs = np.array(imgs)

    rsts = model.predict(imgs)

    for i, img, label, cor in zip(range(1, imgs.shape[0] + 1), imgs, rsts[0],
                                  rsts[1]):
        plt.subplot(3, 2, i)
        plt.imshow(img)
        # utils.plot_box_from_xywh(cor)
        utils.plot_box_from_min_max(cor)
        label = np.argmax(label)
        plt.xticks([]), plt.yticks([])
        plt.title(prepare_data.classes[label])

    plt.savefig(f'full_{log_name}.jpg')
    plt.show()
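
# Usage sketch (assumption, not original code): the os.mkdir() calls at the top of
# train() expect ./clf/logs and ./regress/logs to exist, so create them first.
if __name__ == '__main__':
    os.makedirs('./clf/logs', exist_ok=True)
    os.makedirs('./regress/logs', exist_ok=True)
    train()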
Example #8
                    learning_rate = 0.5 * learning_rate
                    print('Setting learning rate to %f' % learning_rate)
                sys.stdout.flush()

            # For each training example ...
            for i in range(len(y_train)):
                self.numpy_sdg_step(X_train[i], y_train[i], learning_rate)
                num_examples_seen += 1

        return losses


if __name__ == '__main__':
    src_root = '/Users/jinzixiang/Documents/workspace/python/rnn'
    vocabulary_size = 4000
    X_train, y_train = prepare_data.get_data(src_root, vocabulary_size)
    np.random.seed(10)

    model = RNNNumpy(vocabulary_size)
    predictions = model.predict(X_train[0][0:100])
    print(predictions)

    print("Expected loss for random predictions: %f" % np.log(vocabulary_size))
    print("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

    gradient_check_vocabulary_size = 100
    model = RNNNumpy(gradient_check_vocabulary_size, hidden_dim=10, bptt_truncate=1000)
    model.gradient_check([0, 1, 2, 3], [1, 2, 3, 4])
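
    # Illustrative continuation (not from the original file): a single SGD step using
    # only methods visible above (numpy_sdg_step, calculate_loss); 0.005 is an assumed
    # learning rate.
    model = RNNNumpy(vocabulary_size)
    print("Loss before step: %f" % model.calculate_loss(X_train[:100], y_train[:100]))
    model.numpy_sdg_step(X_train[0], y_train[0], 0.005)
    print("Loss after step:  %f" % model.calculate_loss(X_train[:100], y_train[:100]))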
Example #9
from keras.layers import Input, Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from matplotlib import pyplot as plt
import tensorflow as tf
from keras import backend as K
from keras.callbacks import TensorBoard, EarlyStopping
from keras import regularizers
import prepare_data as pd  # note: pd is the local prepare_data module here, not pandas
import numpy as np
sess = tf.Session()
K.set_session(sess)

folder = "D:\PickledData\\"

x_train, x_test, Y_train, Y_test, categories = pd.get_data(folder)
print(x_train.shape)
input_img = Input(shape=(5001,1608, 1))
X_train =  x_train.astype('float32')
X_test = x_test.astype('float32')
X_train /= np.amax(X_train) #- 0.5
X_test /= np.amax(X_test) #- 0.5
x_train = np.reshape(X_train, (len(X_train), 5001, 1608, 1))  # adapt this if using `channels_first` image data format
x_test = np.reshape(X_test, (len(X_test), 5001, 1608, 1))  # adapt this if using `channels_first` image data format
#X_train = X_train.reshape((-1,784))
# X_test = X_test.reshape((-1,784))
print(X_train.shape)

# One-hot encode the labels returned by get_data() above (Y_train / Y_test).
Y_train = np_utils.to_categorical(Y_train, 10)
Y_test = np_utils.to_categorical(Y_test, 10)