def model(data, test_image_vector, distance_metric="cosine"):
    """
    Fit a brute-force nearest-neighbour index on data and query it once.

    The fitted index is handed to pickle_this under the name
    "2knn_model.pkl" before the query is run.

    Input: data - final feature matrix the index is fitted on,
           test_image_vector - query vector(s) passed to kneighbors; per the
               original notes this is a random vector used to check that the
               code is functioning,
           distance_metric - metric used for the neighbour search
               (default "cosine")
    Output: indices of the 10 nearest entries of data for the query
    """
    # Honour the caller's metric. It used to be hard-coded to "cosine", so
    # the distance_metric argument was silently ignored.
    knn = NearestNeighbors(
        n_neighbors=10, metric=distance_metric, algorithm="brute")
    knn.fit(data)
    pickle_this(knn, "2knn_model.pkl")
    # Only the neighbour indices are wanted, not the distances
    return knn.kneighbors(test_image_vector, return_distance=False)
def vectorize(data_path, no_of_images, indicator):
    """
    Build the feature matrix for the images found under data_path.

    Image id i is looked up at <data_path>/<i>/<i>.jpg for every i in
    [0, no_of_images). Ids whose file does not exist are skipped, so matrix
    rows are numbered independently of the image ids; index_dict records the
    mapping from row number back to image id.

    Side effects: prints progress and the raw matrix, and hands the raw
    feature matrix and the index dictionary to pickle_this under the names
    "Final_Feature_Matrix_For_PCA.pkl" and "Image_model_Index_dict.pkl".

    Input: data_path - directory holding one sub-folder per image id,
           no_of_images - number of ids to try (folders, not images, since
               a folder may be empty),
           indicator - print a progress line every `indicator` ids; 0 or
               None switches the progress output off
    Output: (data, index_dict) - data is whatever data_pca_pipeline returns
            for the raw feature matrix, index_dict maps row number -> id
    """
    predata = []
    # Ids of the images that were actually loaded, in row order
    loaded_ids = []
    for i in xrange(no_of_images):
        folder = str(i)
        # os.path.join keeps the lookup correct whether or not data_path
        # ends with a path separator
        image_name = os.path.join(data_path, folder, folder + ".jpg")
        if os.path.exists(image_name):
            predata.append(preprocess(image_name))
            loaded_ids.append(i)
        # Guard the modulo: indicator == 0 used to raise ZeroDivisionError
        if indicator and i % indicator == 0:
            # Single-argument print(...) gives the same output whether it is
            # parsed as a statement or as a function call
            print("%d images finished" % i)

    index_dict = dict(enumerate(loaded_ids))
    data = np.array(predata)
    print(data)
    print(">>>>>>>>>>>>>> %s" % (data.shape,))
    pickle_this(data, "Final_Feature_Matrix_For_PCA.pkl")
    pickle_this(index_dict, "Image_model_Index_dict.pkl")

    # Perform PCA
    data = data_pca_pipeline(data)
    return data, index_dict
def pca(
        data,
        n_components=100,
        filename="pca_image_model.pkl",
        plot=False,
        mayipickle=False):
    """
    Standardise data and project it onto its principal components.

    The fitted scaler is always handed to pickle_this as "SS_model.pkl";
    the fitted PCA object is only pickled when mayipickle is set.

    Input: data - array to reduce (the original notes say colour images are
               converted to gray beforehand because the scaler expects at
               most 2 dimensions),
           n_components - number of components passed to PCA,
           filename - name used when pickling the fitted PCA object,
           plot - when True, draw the scree plot and show it,
           mayipickle - when True, pickle the fitted PCA object
    Output: (pca_data, variance_array) - the transformed data and the
            explained_variance_ratio_ of the fitted PCA object
    """
    # The scaler works with floats; casting up front avoids a dtype warning
    scaler = StandardScaler()
    img_data_scaled = scaler.fit_transform(data.astype("float64"))
    print("Scaling the data" + " " + str(img_data_scaled))
    pickle_this(scaler, "SS_model.pkl")

    print("Performing PCA on scaled data")
    reducer = PCA(n_components)
    pca_data = reducer.fit_transform(img_data_scaled)
    if mayipickle:
        pickle_this(reducer, filename)

    if plot:
        # Scree plot: how much variance each principal component explains
        scree_plot(reducer)
        plt.show()

    return pca_data, reducer.explained_variance_ratio_