def vectorize(data_path, no_of_images, indicator):
    '''
    Build the feature matrix for every image found under data_path and
    run it through the PCA pipeline. Generally called from Model.py.

    Image number i is expected at <data_path>/<i>/<i>.jpg. Numbers whose
    file does not exist are skipped silently, so the rows of the matrix
    are not aligned with the image numbers; index_dict records the
    mapping between the two.

    Input:
        data_path    : directory that holds one sub-folder per image
        no_of_images : how many image numbers to probe (0 .. n-1). This
                       is the number of folders, not of images, because
                       a folder may be empty
        indicator    : progress step; a message is printed whenever the
                       image number is a multiple of it. 0 or None
                       turns the progress messages off
    Output:
        (data, index_dict)
        data       : whatever data_pca_pipeline returns for the stacked
                     feature matrix
        index_dict : {row position in the feature matrix: image number}

    Side effects:
        Hands the pre-PCA matrix and index_dict to pickle_this under the
        names "Final_Feature_Matrix_For_PCA.pkl" and
        "Image_model_Index_dict.pkl".
    '''
    # Some images may be missing, so keep track of which image number
    # ended up in which row of the feature matrix.
    index_dict = {}
    predata = []

    for i in range(no_of_images):
        # os.path.join gives the same result as plain concatenation when
        # data_path ends with a separator, and still produces a valid
        # path when it does not.
        image_name = os.path.join(data_path, str(i), "{0}.jpg".format(i))

        if os.path.exists(image_name):
            features = preprocess(image_name)
            # The row this image will occupy is the current length of
            # predata, so no separate counter is needed.
            index_dict[len(predata)] = i
            predata.append(features)

        # Guard against indicator == 0 (or None), which would otherwise
        # raise on the modulo.
        if indicator and i % indicator == 0:
            print("%d images finished" % i)

    # Stack the per-image feature lists into one array.
    data = np.array(predata)
    print(data)
    print(">>>>>>>>>>>>>> %s" % (data.shape,))

    # Persist the raw matrix and the index mapping before PCA.
    pickle_this(data, "Final_Feature_Matrix_For_PCA.pkl")
    pickle_this(index_dict, "Image_model_Index_dict.pkl")

    # Perform PCA
    data = data_pca_pipeline(data)
    return data, index_dict