# Example 1 (score: 0)
def model(data, test_image_vector, distance_metric="cosine"):
    """
    Fit a 10-nearest-neighbors model on the feature matrix, pickle it, and
    query it with a test vector. The test vector is a random vector used to
    verify the code is functioning; its value is assigned at the bottom of
    this file.

    Input: final data matrix, test vector to get results, distance metric
           to be used (default "cosine")
    Output: indices of the 10 nearest neighbors of the test vector
    """
    # Bug fix: distance_metric was previously ignored ("cosine" was
    # hard-coded in the NearestNeighbors constructor); pass it through.
    # Also renamed the local so it no longer shadows this function's name.
    knn = NearestNeighbors(n_neighbors=10, metric=distance_metric, algorithm="brute")
    knn.fit(data)
    pickle_this(knn, "2knn_model.pkl")
    # Predict K nearest neighbors for the given vector (indices only)
    predicted_indices = knn.kneighbors(test_image_vector, return_distance=False)
    return predicted_indices
def vectorize(data_path, no_of_images, indicator):
    """
    Walk the numbered image folders under data_path, extract features for
    each existing image, and build the feature matrix. Generally called from
    the Model.py file.

    Takes the path and the total number of folders (not images, because
    folders may be empty). Images are expected at
    "<data_path><i>/<i>.jpg" for i in [0, no_of_images).

    Input: path to files, total image-folder count, print step (how many
           images between progress notifications)
    Output: final dataset to be used in modeling after PCA, and an index
            dictionary mapping matrix row -> original image number
    """
    # Some images may fail to load (missing folders etc.), so index_dict
    # records which matrix row corresponds to which original image number.
    index_dict = {}
    # Count of images successfully loaded so far (= next matrix row).
    loaded = 0
    predata = []

    for i in range(no_of_images):  # was xrange: Python-2-only, NameError on Py3
        image_name = data_path + "{0}/{0}.jpg".format(i)

        if os.path.exists(image_name):
            # Extract the feature vector for this image.
            features = preprocess(image_name)
            predata.append(features)
            index_dict[loaded] = i
            loaded += 1
        # Missing images are skipped silently (deliberate best-effort).

        # Guard against indicator == 0, which would raise ZeroDivisionError.
        if indicator and i % indicator == 0:
            print("%d images finished" % i)

    # Stack the per-image feature vectors into one matrix.
    data = np.array(predata)
    # Removed the debug dump of the full matrix; the shape is what matters.
    print(">>>>>>>>>>>>>>", data.shape)

    pickle_this(data, "Final_Feature_Matrix_For_PCA.pkl")
    pickle_this(index_dict, "Image_model_Index_dict.pkl")

    # Perform PCA
    data = data_pca_pipeline(data)

    return data, index_dict
def pca(
        data,
        n_components=100,
        filename="pca_image_model.pkl",
        plot=False,
        mayipickle=False):
    """
    Standard-scale the feature matrix and reduce its dimensionality with PCA.

    The fitted StandardScaler is always pickled ("SS_model.pkl"); the fitted
    PCA model is pickled to `filename` only when mayipickle is True.

    Input: 2-D feature matrix (rows = samples), number of principal
           components to keep, filename for the pickled PCA object, plot
           boolean to draw the scree plot, boolean deciding whether to
           pickle the PCA object.
    Output: (transformed data, explained-variance-ratio array)
    """
    # StandardScaler works with floats; casting up front avoids the dtype
    # warning it would otherwise emit.
    data = data.astype("float64")

    scaler = StandardScaler()
    img_data_scaled = scaler.fit_transform(data)
    # Removed the debug dump of the full scaled matrix (noisy on stdout).
    print("Scaling the data")
    pickle_this(scaler, "SS_model.pkl")

    print("Performing PCA on scaled data")
    pca_model = PCA(n_components)
    pca_data = pca_model.fit_transform(img_data_scaled)

    if mayipickle:
        pickle_this(pca_model, filename)

    if plot:
        # Scree plot: how much variance each principal component explains.
        scree_plot(pca_model)
        plt.show()

    variance_array = pca_model.explained_variance_ratio_
    return pca_data, variance_array