Example 1
def load_test_page(page_name, model):
    """Load test data page.

    This function must return each character as a 10-d feature
    vector with the vectors stored as rows of a matrix.

    :param page_name: name of page file
    :param model: dictionary storing data passed from training stage
    :return: The feature vectors reduced to 10 dimensions
    """

    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    fvectors_test = images_to_feature_vectors(images_test, bbox_size)

    # Remove noise from characters with a high noise level
    for row in fvectors_test:
        col = row.flatten()
        # Count grey pixels (neither pure black nor pure white) as a
        # noise estimate for this character
        noise_estimate = np.sum(col < 255) - np.sum(col == 0)

        # If a lot of noise is detected in the character image, remove it
        # by pushing dark pixels to black and light pixels to white
        if noise_estimate > 75:
            row[row < 20] = 0
            row[row > 120] = 255

    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions_test(fvectors_test, model)

    return fvectors_test_reduced
Example 2
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        n_before = len(images_train)
        images_train = utils.load_char_images(page_name, images_train)
        print('Applying Gaussian filter to page', page_name.split('.')[1])
        # Filter only the images loaded from this page; filtering the
        # whole accumulated list would blur earlier pages repeatedly.
        images_train[n_before:] = [
            ndimage.gaussian_filter(image, 0.9)
            for image in images_train[n_before:]
        ]
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')
    model_data['pca_matrix'] = learn_pca(fvectors_train_full)
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    model_data['fvectors_train'] = fvectors_train.tolist()

    print('Generating dictionaries of words for evaluation stage')
    model_data = generate_dictionaries(model_data)
    return model_data
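Neither `learn_pca` nor the matching `reduce_dimensions` is shown in this listing. A minimal sketch of `learn_pca`, assuming it returns the top-10 eigenvectors of the training covariance matrix in the same way as the inline PCA code in Examples 8 and 21:

import numpy as np
import scipy.linalg

def learn_pca(fvectors_full, n_components=10):
    # Eigendecomposition of the covariance matrix; eigh returns the
    # eigenvalues in ascending order, so take the last n and flip so
    # that the first column is the leading component.
    covx = np.cov(fvectors_full, rowvar=0)
    N = covx.shape[0]
    _, v = scipy.linalg.eigh(covx, eigvals=(N - n_components, N - 1))
    return np.fliplr(v).tolist()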
Example 3
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
Example 4
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')
    v = principal_components(fvectors_train_full, 40)
    model_data['v'] = v.tolist()
    model_data['mean'] = np.mean(fvectors_train_full).tolist()
    reduced = np.dot((fvectors_train_full - np.mean(fvectors_train_full)), v)
    f = get_ten(reduced, model_data)
    model_data['f'] = f.tolist()
    model_data['fvectors_train'] = reduced[:, f].tolist()

    return model_data
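`principal_components` is not defined in this listing, but Examples 8, 21 and 26 inline the identical computation, so a consistent sketch is:

import numpy as np
import scipy.linalg

def principal_components(data, n):
    # Return the n leading eigenvectors of the covariance matrix of
    # data, ordered from largest to smallest eigenvalue.
    covx = np.cov(data, rowvar=0)
    N = covx.shape[0]
    _, v = scipy.linalg.eigh(covx, eigvals=(N - n, N - 1))
    return np.fliplr(v)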
Example 5
def process_training_data(train_page_names, noise='saltandpepper'):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    noise - String, default is "saltandpepper", other option is "gaussian" and is
        used to determine type of noise to use
    """
    print('- Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)
    print('- Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)
    clean = fvectors_train_full[0::2]
    noisy = fvectors_train_full[1::2]
    model_data = dict()
    model_data['bbox_size'] = bbox_size
    # combine labels differently to match the way we use train data
    model_data['labels_train'] = np.concatenate(
        (labels_train[0::2], labels_train[1::2])).tolist()
    print('- Adding noise to a half of the data')
    if noise == 'gaussian':
        # Gaussian noise
        print('-- Gaussian noise')
        for i in range(noisy.shape[0]):
            gauss = np.random.normal(0, 0.1**0.5, noisy[i].shape[0])
            noisy[i] += gauss
    else:
        # Salt and pepper noise
        print('-- Salt and pepper noise')
        for i in range(noisy.shape[0]):
            # Work on a copy scaled to floats in the range [0, 1]
            copy = np.multiply(noisy[i], 1 / 255)
            # Random integers in [0, 20) used to pick the noisy pixels
            rand = np.random.randint(20, size=copy.shape[0])
            # Where the random value is zero, add a pepper pixel
            copy = np.where(rand == 0, 0, copy)  # pepper (black = 0)
            # Where the random value hits the top value, add a salt pixel
            copy = np.where(rand == 19, 1, copy)  # salt (white = 1)
            # Convert back to the 0-255 scale
            noisy[i] = np.multiply(copy, 255)

    print('- Reducing to 10 dimensions')
    fvectors_train_clean, fvectors_train_noisy = reduce_dimensions(
        np.concatenate((clean, noisy), axis=0), model_data, "Train",
        noisy.shape[0])
    # add training clean and noisy samples together and save in model
    model_data['fvectors_train'] = np.concatenate(
        (fvectors_train_clean, fvectors_train_noisy)).tolist()

    return model_data
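For reference, the per-row salt-and-pepper loop above can be collapsed into a few vectorized lines; this sketch works directly on the 0-255 scale, skipping the intermediate [0, 1] conversion (`noisy` is the array from the code above):

# Uniform integers in [0, 20): a 0 becomes pepper and a 19 becomes
# salt, so roughly 5% of pixels are corrupted each way.
mask = np.random.randint(20, size=noisy.shape)
noisy[mask == 0] = 0      # pepper (black)
noisy[mask == 19] = 255   # salt (white)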
Example 6
def load_test_page(page_name, model):
    """
    Load test data page.

    This function must return each character as a 10-d feature
    vector with the vectors stored as rows of a matrix.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    images_test_final = []

    # for every test image, apply a median filter to reduce noise and
    # store these filtered images as the images to use for testing
    for image in images_test:
        # img_contr = increase_contrast_image(image, 150) -- commented out as it reduces accuracy
        noise_red = ndimage.median_filter(image, 3)
        images_test_final.append(noise_red)
    images_test_final = np.array(images_test_final)

    fvectors_test = images_to_feature_vectors(images_test_final, bbox_size)
    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions_test(fvectors_test, model)

    return fvectors_test_reduced
Example 7
def load_test_page(page_name, model):
    """Load test data page.

    This function must return each character as a 10-d feature
    vector with the vectors stored as rows of a matrix. As the
    noise on the pages is salt-and-pepper noise, a median filter
    is also applied to the test data to reduce some of it.
    Noise-level detection was also attempted, without success; the
    aim was to tune k in the k-nearest-neighbour classifier to the
    noise level, so that the noisier the page the larger the k.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    # For every image in images_test, reduce the noise
    reduced_noise = list(map(noise_reduction, images_test))

    # Tried working out the noise
    # count=0
    # for i in range (len(images_test)):
    #     for x in range (len(images_test[i])):
    #         if(images_test[i][x].shape != (0,) or reduced_noise[i][x].shape != (0,)):
    #             count=count+mean_squared_error(reduced_noise[i][x], images_test[i][x])
    # print(count)

    fvectors_test = images_to_feature_vectors(reduced_noise, bbox_size)

    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions(fvectors_test, model)
    return fvectors_test_reduced
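`noise_reduction` is not defined in this listing; given the docstring, it is presumably a thin wrapper around a median filter, along these lines:

from scipy import ndimage

def noise_reduction(image):
    # A 3x3 median filter is well suited to salt-and-pepper noise.
    return ndimage.median_filter(image, 3)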
Example 8
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    #compute the eigenvectors for the first 20 principal components
    covx = np.cov(fvectors_train_full, rowvar=0)
    N = covx.shape[0]
    w, v = scipy.linalg.eigh(covx, eigvals=(N - 20, N - 1))
    v = np.fliplr(v)

    #store these eigenvectors in the dictionary so they can be reused on the test data
    model_data['eigenvector'] = v.tolist()

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)
    
    #Tried 40 principal components but it did not perform better
    '''
    d12=np.zeros(40)
    indices = 9, 25
    lowercase_list = list(string.ascii_lowercase)
    valid_characters = [i for j, i in enumerate(lowercase_list) if j not in indices]
    #extralist = ['l','’',',','.']
    #finlist =valid_characters+extralist
    for char1 in valid_characters:
        char1_data = fvectors_train[labels_train==char1, :]
        for char2 in valid_characters:
            char2_data = fvectors_train[labels_train==char2, :]
            d12 += divergence(char1_data, char2_data)

    sorted_indexes = np.argsort(-d12)
    features = sorted_indexes[0:10]
    model_data['features'] = features.tolist()

    fvector_train_final = fvectors_train[:, features]
    model_data['fvectors_train'] = fvector_train_final.tolist()
    '''
    
    model_data['fvectors_train'] = fvectors_train.tolist()
    
    return model_data
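The `divergence` helper used in the commented-out block (and again in Example 16) is not shown. A standard choice, and a plausible sketch, is the symmetric 1-D Gaussian divergence computed per feature:

import numpy as np

def divergence(class1, class2):
    # Mean and variance of each feature (column) within each class
    m1 = np.mean(class1, axis=0)
    m2 = np.mean(class2, axis=0)
    v1 = np.var(class1, axis=0)
    v2 = np.var(class2, axis=0)
    # Symmetric divergence between two univariate Gaussians, per feature
    return 0.5 * (v1 / v2 + v2 / v1 - 2) + 0.5 * (m1 - m2) ** 2 * (
        1.0 / v1 + 1.0 / v2)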
Example 9
def load_page(page_name, model):
    """Load raw test data page.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    fvectors_test = images_to_feature_vectors(images_test, bbox_size)
    return fvectors_test
Example 10
def load_test_page(page_name, model):
    """Load test data page.
    This function must return each character as a 10-d feature
    vector with the vectors stored as rows of a matrix.
    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    fvectors_test = images_to_feature_vectors(images_test, bbox_size)
    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions(fvectors_test, model, 1)
    return fvectors_test_reduced
Example 11
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """

    # The lecturer said that the 'reading data' stage does not need to be
    # modified if you decide to place the letter in the box from the left,
    # whereas it would need to be rewritten if you wish to stretch the letter.

    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    # Testing Reading Data
    print("**** print:", labels_train)
    print("shape of labels_train:",
          labels_train.shape)  #14395 labels read in as np.array
    print("length of images_train:",
          len(images_train))  # 14395 images read in as list

    # Extracts all features from training data - images --> featurevectors
    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    # list of np.array
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    # Testing Extracting Features
    print("--- fvectors_train_full", fvectors_train_full, "shape:",
          fvectors_train_full.shape)  # 2D Np array, shape - 14935, 2340
    print("++ fvectors_train_full[0]:", fvectors_train_full[0],
          "image dimensions:", fvectors_train_full[0].shape)
    print("++ fvectors_train_full[1]:", fvectors_train_full[1],
          "image dimensions:", fvectors_train_full[1].shape)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')
    # to be improved - reduce_dimensions
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
Example 12
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    # with open('words.txt') as f:
    #     dictionary = [word.rstrip() for word in f]

    # Subtract mean from all data points
    datamean = np.mean(fvectors_train_full)
    centered = fvectors_train_full - datamean

    # Project points onto the PCA axes (compute the components once)
    components = doPCA(fvectors_train_full, 40)
    fvectors = np.dot(centered, components)

    # Get dictionary of words from text file
    dictionary = use_dictionary('words.txt')
    # Store W matrix from LDA
    model_data['lda'] = doLDA(fvectors, labels_train, 10).tolist()
    # Store PCA components into the model
    model_data['components'] = components.tolist()
    # Create a new field for noise levels
    model_data['noise_levels'] = []
    # Add dictionary of words to the model
    model_data['dict'] = dictionary

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
Example 13
def load_test_page(page_name, model):
    """Load test data page.

    This function must return each character as a 10-d feature
    vector with the vectors stored as rows of a matrix.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)

    if bbox_size is None:
        bbox_size = get_bounding_box_size(images_test)

    bbox_height, bbox_width = bbox_size
    maximum_noise = 0
    # Calculate noise for this page
    for i, image in enumerate(images_test):
        padded_image = np.ones(bbox_size) * 255
        height, width = image.shape
        width = min(width, bbox_width)
        height = min(height, bbox_height)
        # Copy the character into the top-left corner of the white
        # padded image before estimating its noise
        padded_image[0:height, 0:width] = image[0:height, 0:width]

        noise = get_estimateNoise(padded_image)
        if i == 0 or noise > maximum_noise:
            maximum_noise = noise

    # Add this page's noise level estimate to the model
    model['noise_levels'].append(maximum_noise)

    fvectors_test = images_to_feature_vectors(images_test, bbox_size)
    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions(fvectors_test, model)

    return fvectors_test_reduced
Example 14
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    #Create a dictionary to store and return results of training stage
    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')

    #Here I compute the eigenvectors of the covariance matrix using the training data,
    #to compute the first 40 principal components
    covx = np.cov(fvectors_train_full, rowvar=0)
    N = covx.shape[0]
    w, v = scipy.linalg.eigh(covx, eigvals=(N - 40, N - 1))
    v = np.fliplr(v)

    #I then store the principal components "V"
    model_data['Principal_Components'] = v.tolist()

    #Gets a list of the ten chosen features and stores it in the dictionary
    model_data['features'] = choose_features(fvectors_train_full, model_data)
    print(model_data['features'])

    #Performs the dimensionality reduction of the training data
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    #Stores the training data after its dimensions have been reduced
    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
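`choose_features` is not shown, but Example 16 performs the same selection inline, so a consistent sketch would project onto the stored principal components and rank features by summed pairwise divergence (using a `divergence` helper like the one sketched after Example 8):

import string
import numpy as np

def choose_features(fvectors_full, model):
    # Project the centred data onto the stored principal components
    v = np.array(model['Principal_Components'])
    fvectors = np.dot(fvectors_full - np.mean(fvectors_full), v)
    labels = np.array(model['labels_train'])
    # Rank features by summed pairwise divergence over the lowercase
    # letters, excluding the rare 'j' and 'z' as in Example 16
    chars = [c for c in string.ascii_lowercase if c not in ('j', 'z')]
    d12 = np.zeros(fvectors.shape[1])
    for char1 in chars:
        data1 = fvectors[labels == char1, :]
        for char2 in chars:
            data2 = fvectors[labels == char2, :]
            d12 += divergence(data1, data2)
    # Keep the ten most discriminative features
    return np.argsort(-d12)[0:10].tolist()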
Example 15
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.
        
    This function acts as the training stage. The images are loaded,
    noise is added to about a third of the data set and is then reduced
    to simulate the process of noise removal. These images are then
    turned into vectors and PCA is used to reduce the dimensions to 10.
    
    Params:
    train_page_names - list of training page names
    
    Returns:
    model_data - a dictionary that contains all the information
                 needed for the classification stage
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)

    print("Simulating noise removal")
    images = process_noise(images_train)
    fvectors_train_full = images_to_feature_vectors(images, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions via PCA')
    v = principal_components(fvectors_train_full, 11)[:, 1:11]
    model_data['v'] = v.tolist()
    model_data['mean'] = np.mean(fvectors_train_full).tolist()
    model_data['fvectors_train'] = np.dot(
        (fvectors_train_full - np.mean(fvectors_train_full)), v).tolist()

    print("Training has finished")
    return model_data
Example 16
def load_test_page(page_name, model):
    """Load test data page.

    This function must return each character as a 10-d feature
    vector with the vectors stored as rows of a matrix.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    fvectors_test = images_to_feature_vectors(images_test, bbox_size)

    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions(fvectors_test, model)
    
    fvectors_train = np.array(model['fvectors_train'])
    train_label = np.array(model['labels_train'])
    
    #I repeat the divergence step here to select the same feature columns that were used for the train data
    d12 = np.zeros(20)  #empty array to accumulate the divergences below
    indices = 9, 25  #indices of j and z, which are not helpful for the divergence
    lowercase_list = list(string.ascii_lowercase)
    #remove j and z from the list
    valid_characters = [i for j, i in enumerate(lowercase_list) if j not in indices]
    #Tried adding some symbols to the divergence but saw no improvement
    #extralist = ['l','’',',','.']
    #finlist = valid_characters + extralist
    
    for char1 in valid_characters:
        char1_data = fvectors_train[train_label==char1, :]
        for char2 in valid_characters:
            char2_data = fvectors_train[train_label==char2, :]
            d12 += divergence(char1_data, char2_data)
    
    #Find the 10 best features with the divergence calculated above
    sorted_indexes = np.argsort(-d12)
    features = sorted_indexes[0:10]
    
    #always returns 10 columns
    return fvectors_test_reduced[:, features]
Example 17
def load_test_page(page_name, model):
    """Load test data page.

    This function must return each character as a 10-d feature
    vector with the vectors stored as rows of a matrix. It also saves
    in the model a flag recording whether the page was noisy. If a
    page is determined to be noisy, a median filter is applied, which
    has been shown to make the characters on a page clearer for the
    classifier.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    """
    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    fvectors_test = images_to_feature_vectors(images_test, bbox_size)
    # compute how noisy the page is from its mean pixel value
    determine = np.mean(fvectors_test)
    is_noisy = determine < 239.0
    # denoise images by applying a median filter
    if is_noisy:
        for i in range(fvectors_test.shape[0]):
            fvectors_test[i] = ndimage.median_filter(fvectors_test[i], 3)
    # save the fact that the page was noisy, if it was
    if 'test_noisy' in model:
        model['test_noisy'] = model['test_noisy'] + [bool(is_noisy)]
    else:
        model['test_noisy'] = [bool(is_noisy)]
    # Perform the dimensionality reduction.
    fvectors_test_reduced = reduce_dimensions(fvectors_test, model, "Test")
    return fvectors_test_reduced
Example 18
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    :param train_page_names: List of training page names
    :return: Dictionary storing the results
    """

    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    # # Add some noise to the training data (not much overall improvement)
    # for i in range(fvectors_train_full.shape[0]):
    #     noise = np.random.randint(80, size=fvectors_train_full.shape[1])
    #     fvectors_train_full[i][:] = np.add(fvectors_train_full[i][:], noise)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions_train(fvectors_train_full, model_data)
    model_data['fvectors_train'] = fvectors_train.tolist()

    print('Loading the word lists')
    dictionary = []
    with open('data/train/dictionary.txt', 'r') as f:
        for line in f:
            dictionary.append(line.strip('\n'))

    model_data['dict'] = dictionary

    return model_data
Example 19
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['bbox_size'] = bbox_size

    word_lists = [get_word_lists(filename) for filename in WORD_FILE_NAMES]
    model_data['word_lists'] = word_lists

    try:
        fvectors_train_full, labels_train = increase_training_size(fvectors_train_full, labels_train, model_data)
    except Exception:
        print("Failed to increase training set size. Will proceed with base training set size.")

    print('Reducing to 10 dimensions')

    #fvectors_train_full, labels_train = artificially_increase_trainingset_size(fvectors_train_full,labels_train)


    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    model_data['fvectors_train'] = fvectors_train.tolist()
    model_data['labels_train'] = labels_train.tolist()

    return model_data
Example 20
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.
    The eigenvectors are computed here on the training data to be used
    for the PCA dimensionality reduction. The same matrix v is stored
    in the model so that it can be reused on the test data rather than
    being recomputed.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)
    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size
    covx = np.cov(fvectors_train_full, rowvar=0)
    N = covx.shape[0]
    w, v = scipy.linalg.eigh(covx, eigvals=(N - 10, N - 1))
    computed_v = np.fliplr(v)
    model_data['computed_v'] = computed_v.tolist()
    # reading the words from the dictionary
    model_data['dictionary_words'] = [
        word for line in open("wordsEn.txt", 'r') for word in line.split()
    ]
    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
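The matching `reduce_dimensions` is not shown here; since `computed_v` holds the top-10 eigenvectors, a sketch mirroring the projection written out in Example 23's test loader (`np.dot(fvectors_test - mean, v)`) would be:

import numpy as np

def reduce_dimensions(feature_vectors_full, model):
    # Centre the data and project it onto the stored eigenvectors
    v = np.array(model['computed_v'])
    mean = np.mean(feature_vectors_full)
    return np.dot(feature_vectors_full - mean, v)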
Example 21
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    # PCA adapted from the labs
    # Calculating the principal components
    covx = np.cov(fvectors_train_full, rowvar=0)
    N = covx.shape[0]
    w, v = scipy.linalg.eigh(covx, eigvals=(N - 40, N - 1))
    v = np.fliplr(v)

    # Storing the eigenvectors in dictionary
    model_data['eigenvectors'] = v.tolist()

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
Example 22
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size
    model_data['unique_ratio'] = []

    with open('../data/Extra/markov_pmatrix.pickle', 'rb') as infile:
        model_data['markov_states'] = pickle.load(infile)

    with open('../data/Extra/wordlist.txt', 'r') as wordFile:
        model_data['words'] = [i.strip() for i in wordFile.readlines()]

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    model_data['fvectors_train'] = fvectors_train.tolist()
    return model_data
Example 23
def load_test_page(page_name, model):
    """Load test data page.

    This function must return each character as a 10-d feature
    vector with the vectors stored as rows of a matrix.

    Params:
    page_name - name of page file
    model - dictionary storing data passed from training stage
    
    Returns:
    fvectors_test_reduced - a matrix with the 10-d feature vectors
    stored as its rows
    """

    bbox_size = model['bbox_size']
    images_test = utils.load_char_images(page_name)
    n = remove_noise(images_test)
    fvectors_test = images_to_feature_vectors(n, bbox_size)
    # Perform the dimensionality reduction.
    mean = np.array(model['mean'])
    v = np.array(model['v'])
    fvectors_test_reduced = np.dot((fvectors_test - mean), v)
    return fvectors_test_reduced
Example 24
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.
    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size
    # initialise empty array to later update with eigenvectors
    model_data['eigenvector'] = np.array([]).tolist() 
    
    # For PCA Dimension Reduction
    noise_dim = 50 # for noise reduction
    dim = 10 # for final dimension reduction
    
    model_data['noise_dim'] = noise_dim
    model_data['dim'] = dim


    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)   
    model_data['fvectors_train'] = fvectors_train.tolist()
    
    return model_data
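The `reduce_dimensions` used here is not shown. One plausible reading of `noise_dim` is PCA reconstruction denoising before the final 10-dimensional reduction: project onto the top 50 components and map back to pixel space, discarding the low-variance directions where pixel noise tends to live. A sketch of that denoising step, under that assumption:

import numpy as np
import scipy.linalg

def pca_denoise(data, noise_dim=50):
    # Project onto the top noise_dim principal directions and map back;
    # directions with little variance (mostly noise) are discarded.
    mean = np.mean(data)
    covx = np.cov(data, rowvar=0)
    N = covx.shape[0]
    _, v = scipy.linalg.eigh(covx, eigvals=(N - noise_dim, N - 1))
    v = np.fliplr(v)
    return np.dot(np.dot(data - mean, v), v.T) + mean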
Example 25
def process_training_data(train_page_names):
    """Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    images_train = []  #initialise empty lists for train data and train labels
    labels_train = []
    for page_name in train_page_names:
        #gets train data from the page files using utils.py
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)

    #convert to a numpy array - a 14395x1 vector
    labels_train = np.array(labels_train)  #numpy array of train labels

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(
        images_train, bbox_size)  #shape: (14395,2340)

    #model data is a dictionary of key-value associations
    #2 initial keys: labels_train and bbox size
    model_data = dict()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size  #tuple (39,60)

    print('Extracting principal components')
    #store the principal components from the related function
    p_comp = principal_components(fvectors_train_full)

    #store principal components into the model by making them a list
    model_data['principal_comp'] = p_comp.tolist()

    #store external dictionary text file in the model
    #used for the error correction
    print('Getting the dictionary')
    with open('wiki-100k.txt', 'r') as wiki:
        words = wiki.readlines()
    words_length = [word.strip() for word in words]
    model_data['textFile'] = words_length

    #select the best features
    print("Select Best Features")
    selected_features = select_features(fvectors_train_full, model_data)
    model_data['selected_features'] = selected_features

    #fvectors_train_full has shape (14395, 2340)
    print('Initial dimensions: ' + str(fvectors_train_full.shape))

    print('Reducing to 10 dimensions')
    fvectors_train = reduce_dimensions(fvectors_train_full, model_data)

    #fvectors_train has shape (14395, 10)
    print('Reduced dimensions: ' + str(fvectors_train.shape))

    #store fvectors in model_data as list
    model_data['fvectors_train'] = fvectors_train.tolist()

    #return the model
    return model_data
Example 26
def process_training_data(train_page_names):
    """
    Perform the training stage and return results in a dictionary.

    Params:
    train_page_names - list of training page names
    """
    print('Reading data')
    images_train = []
    labels_train = []
    images_train_final = []
    for page_name in train_page_names:
        images_train = utils.load_char_images(page_name, images_train)
        labels_train = utils.load_labels(page_name, labels_train)

    # for every image, increase contrast and store these new images as
    # the images to use for training
    # for image in images_train:
    #     img_contr = increase_contrast_image(image, 150)
    #     images_train_final.append(img_contr)

    # images_train_final = np.array(images_train_final)
    labels_train = np.array(labels_train)

    print('Extracting features from training data')
    bbox_size = get_bounding_box_size(images_train)
    fvectors_train_full = images_to_feature_vectors(images_train, bbox_size)

    # take first half of full training vectors
    fvectors_train_fhalf = fvectors_train_full[:(
        math.floor((fvectors_train_full.shape[0]) / 2)), :]
    # create random 1D array with n features to be used as noise
    np.random.seed(2)
    noise = (np.random.rand(2340) * 100).astype(int)
    # add noise to half of training data images
    fvectors_train_fhalf = np.subtract(fvectors_train_fhalf, noise)

    # any pixel below 0 (black) set to 0
    fvectors_train_fhalf[fvectors_train_fhalf < 0] = 0

    # for every noisy training image, apply a median filter to reduce
    # noise and store these new images as the images to use for training
    # (the same filter is applied to the test images)
    fvectors_train_fhalf_final = []
    for vector in fvectors_train_fhalf:
        # img_contr = increase_contrast_vector(vector, 150) -- commented out as it reduces accuracy
        noise_red = ndimage.median_filter(vector, 3)
        fvectors_train_fhalf_final.append(noise_red)
    fvectors_train_fhalf_final = np.array(fvectors_train_fhalf_final)

    # recreate the full training vectors by stacking the noisy images
    # with the second half of the original full training vectors
    fvectors_train_shalf = fvectors_train_full[(
        math.floor((fvectors_train_full.shape[0]) / 2)):, :]
    fvectors_train_full = np.vstack(
        (fvectors_train_fhalf_final, fvectors_train_shalf))

    model_data = dict()
    model_data['train_mean'] = np.mean(fvectors_train_full).tolist()
    model_data['labels_train'] = labels_train.tolist()
    model_data['bbox_size'] = bbox_size

    print('Reducing to 10 dimensions')

    # use PCA to get 40 eigenvectors of covariance matrix of all training vectors
    covx = np.cov(fvectors_train_full, rowvar=False)
    N = covx.shape[0]
    w, v = linalg.eigh(covx, eigvals=(N - 40, N - 1))
    v = np.fliplr(v)
    model_data['v'] = v.tolist()

    fvectors_train = reduce_dimensions_train(fvectors_train_full, model_data)

    model_data['fvectors_train'] = fvectors_train.tolist()

    return model_data
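`reduce_dimensions_train` is not shown; one possibility consistent with the stored `train_mean` and the 40 stored eigenvectors is to centre, project, and keep the first 10 columns (the selection of exactly the leading 10 is an assumption here):

import numpy as np

def reduce_dimensions_train(fvectors_full, model):
    # Centre with the stored training mean and project onto the
    # stored eigenvectors, keeping the 10 leading components
    mean = np.array(model['train_mean'])
    v = np.array(model['v'])
    return np.dot(fvectors_full - mean, v)[:, 0:10]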