def load_data(datasetpath, preprocessors):
    """Load an image dataset from disk and return train/test splits.

    Images are loaded through a SimpleDatasetLoader with the supplied
    preprocessors, pixel intensities are scaled to [0, 1], and the data
    is split 75/25 into training and testing sets with one-hot labels.

    Args:
        datasetpath: root directory containing the class-labeled images.
        preprocessors: list of preprocessor objects applied per image.

    Returns:
        ((trainX, trainY), (testX, testY), classNames) where the label
        arrays are one-hot encoded and classNames lists the class labels.
    """
    # enumerate every image file under the dataset root
    print("[INFO] loading images...")
    image_paths = list(paths.list_images(datasetpath))

    # load and preprocess the images, then normalize pixels to [0, 1]
    loader = SimpleDatasetLoader(preprocessors=preprocessors)
    data, labels = loader.load(image_paths, verbose=500)
    data = data.astype("float") / 255.0

    # 75% train / 25% test, fixed seed for reproducibility
    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.25, random_state=42)

    # fit on the full label set so both splits share one class ordering
    binarizer = LabelBinarizer()
    binarizer.fit(labels)
    train_y = binarizer.transform(train_y)
    test_y = binarizer.transform(test_y)
    class_names = binarizer.classes_

    # LabelBinarizer emits a single column for a 2-class problem; widen it
    # to two explicit one-hot columns so downstream code always sees one
    # column per class
    if len(class_names) < 3:
        train_y = np.hstack((train_y, 1 - train_y))
        test_y = np.hstack((test_y, 1 - test_y))

    return ((train_x, train_y), (test_x, test_y), class_names)
def split_17flowers(traindir):
    """Sort the flat 17flowers image dump into 17 class subdirectories.

    The 17flowers dataset names its files with a 1-based four-digit index
    (image_0001 ... image_1360) where each consecutive run of 80 images
    belongs to one class.  Files are moved into traindir/dir_0 through
    traindir/dir_16 accordingly.

    Args:
        traindir: directory holding the unsorted 17flowers images; the
            dir_0..dir_16 subdirectories are created here if missing.
    """
    for dir_id in range(17):
        os.makedirs(os.path.join(traindir, 'dir_' + str(dir_id)), exist_ok=True)
    imagepaths = [(f, os.path.basename(f)) for f in paths.list_images(traindir)]
    # FIX: use a raw string for the pattern — the original '(\d{4})' relies
    # on an invalid escape sequence, which is a DeprecationWarning (and a
    # SyntaxWarning on newer Pythons)
    imagepaths = [(f, os.path.join(traindir, 'dir_' + str((int(i) - 1) // 80), n))
                  for (f, n) in imagepaths
                  for i in re.findall(r'(\d{4})', n)]
    for (f, fn) in imagepaths:
        os.rename(f, fn)
def split_dog_cat_image_files(traindir):
    """Move Kaggle dogs-vs-cats images into per-class subdirectories.

    Files whose basenames start with 'dog' go to traindir/dog; everything
    else goes to traindir/cat.  Both directories are created if missing.

    Args:
        traindir: directory containing the flat dump of training images.
    """
    catdir = os.path.join(traindir, 'cat')
    dogdir = os.path.join(traindir, 'dog')
    for class_dir in (catdir, dogdir):
        os.makedirs(class_dir, exist_ok=True)

    # plan every move first, then execute, so the directory listing is not
    # consumed while files are being relocated
    moves = []
    for src in paths.list_images(traindir):
        name = os.path.basename(src)
        destination = dogdir if name.startswith('dog') else catdir
        moves.append((src, os.path.join(destination, name)))

    for src, dst in moves:
        os.rename(src, dst)
def extract_features(dataset_path, output_file, buffer_size, batch_size):
    """Extract VGG16 convolutional features for a dataset into an HDF5 file.

    Every image under dataset_path is passed through an ImageNet-pretrained
    VGG16 (without its fully-connected head); the final pooling output
    (512 x 7 x 7) is flattened into a feature vector and written, together
    with its integer class label, to an HDF5DatasetWriter.

    Args:
        dataset_path: dataset root; the class name is taken from each
            image's parent directory.
        output_file: path of the HDF5 file to create.
        buffer_size: in-memory buffer size for the HDF5 writer.
        batch_size: number of images fed through the network per batch.
    """
    imagePaths = list(paths.list_images(dataset_path))
    random.shuffle(imagePaths)
    print("[INFO] number of images... {}".format(len(imagePaths)))

    # extract the class labels from the image paths then encode the labels
    labels = [p.split(os.path.sep)[-2] for p in imagePaths]
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    # initialize the HDF5 dataset writer, then store the class label
    # names in the dataset; 512 * 7 * 7 is VGG16's pool5 output volume
    feature_size = 512 * 7 * 7
    dataset = HDF5DatasetWriter((len(imagePaths), feature_size), output_file,
                                dataKey="features", bufSize=buffer_size)
    dataset.storeClassLabels(le.classes_)

    # load the VGG16 network
    print("[INFO] loading network...")
    model = VGG16(weights="imagenet", include_top=False)

    # FIX: VGG16's ImageNet weights were trained on 224x224 input; the
    # original (244, 244) was a typo
    image_size = (224, 224)
    # FIX: precompute the batch total for progress reporting; the original
    # clobbered the batch-size variable with len(batchImages) and printed
    # "batch-index / batch-length" instead of "batch-index / batch-count"
    total_batches = (len(imagePaths) + batch_size - 1) // batch_size
    for i, (batchLabels, batchImages) in enumerate(
            get_data_(imagePaths, labels, batch_size, image_size)):
        print("[INFO] processing batch... {}/{}".format(i, total_batches))

        # pass the images through the network and use the outputs as
        # our actual features
        batchImages = np.vstack(batchImages)
        features = model.predict(batchImages, batch_size=len(batchImages))

        # reshape the features so that each image is represented by
        # a flattened feature vector of the MaxPooling2D outputs
        features = features.reshape((features.shape[0], feature_size))

        # add the features and labels to our HDF5 dataset
        dataset.add(features, batchLabels)

    # close the dataset
    dataset.close()
    print("[INFO] processing completed...")
def getData(self, datasetpath):
    """Randomly sample ten images from the dataset and load them scaled to [0, 1].

    Sets self.classLabels, self.imagePaths, self.data, and self.labels.

    Args:
        datasetpath: root directory of the image dataset to sample from.
    """
    # fixed label set for the animals dataset
    self.classLabels = ["cat", "dog", "panda"]

    # grab every image path, then randomly sample ten indexes into it
    # (sampling is with replacement, matching np.random.randint)
    print("[INFO] sampling images...")
    all_paths = np.array(list(paths.list_images(datasetpath)))
    sample_idxs = np.random.randint(0, len(all_paths), size=(10,))
    self.imagePaths = all_paths[sample_idxs]

    # resize each image to 32x32 and convert it to a Keras-ordered array
    preprocessors = [SimplePreprocessor(32, 32), ImageToArrayPreprocessor()]
    loader = SimpleDatasetLoader(preprocessors=preprocessors)

    # load the sampled images then scale raw pixel intensities to [0, 1]
    (self.data, self.labels) = loader.load(self.imagePaths)
    self.data = self.data.astype("float") / 255.0
from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split from tnmlearn.preprocessing import SimplePreprocessor from tnmlearn.datasets import SimpleDatasetLoader from tnmlearn.other import paths import argparse # construct the argument parse and parse the arguments # C:\Users\hbad483\Documents\Anaconda\datasets\animals\train ap = argparse.ArgumentParser() ap.add_argument("-d", "--dataset", required=True, help="path to input dataset") args = vars(ap.parse_args()) # grab the list of image paths print("[INFO] loading images...") imagePaths = list(paths.list_images(args["dataset"])) sp = SimplePreprocessor(32, 32) sdl = SimpleDatasetLoader(preprocessors=[sp]) (data, labels) = sdl.load(imagePaths, verbose=500) data = data.reshape((data.shape[0], 3072)) # encode the labels as integers le = LabelEncoder() labels = le.fit_transform(labels) # partition the data into training and testing splits using 75% of # the data for training and the remaining 25% for testing (trainX, testX, trainY, testY) = train_test_split(data, labels, test_size=0.25,