def main(in_folder, train_folder, val_folder, labels_file, scale, crop_shape,
         random_draws, valsplit):
    """Run ``main_proc`` over every file found in ``in_folder`` using a pool.

    The file list comes from ``parse_folder(in_folder, "jpeg")``.  Labels are
    read from ``labels_file`` with ``read_csv`` as two columns, ``'image'``
    (used as the index) and ``'label'``; the first row of the file is treated
    as a header.  ``sample_train_val_split`` is then expected to return that
    table with an additional ``'val'`` column: files whose ``'val'`` entry is
    0 are sent to ``train_folder``, all others to ``val_folder``.

    While the pool is running, ``sys.stderr`` is diverted to ``err.log`` in
    the current working directory.  The previous ``sys.stderr`` is always
    restored and the log file closed, even if a worker raises.  After a
    successful run every ``*.log`` file in the current working directory
    (including ``err.log``) is deleted.

    Parameters
    ----------
    in_folder : folder scanned for input files.
    train_folder : output folder passed to ``main_proc`` for training files.
    val_folder : output folder passed to ``main_proc`` for validation files.
    labels_file : path of the CSV file holding the image/label table.
    scale, crop_shape, random_draws : forwarded unchanged to ``main_proc``.
    valsplit : forwarded unchanged to ``sample_train_val_split``.
    """
    names = parse_folder(in_folder, "jpeg")
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print("Total number of files in %s: %d" % (in_folder, len(names)))
    pdlab = read_csv(labels_file, names=['image', 'label'],
                     index_col='image', header=0)

    # Determine the train and val split
    pdlab = sample_train_val_split(names, pdlab, valsplit)

    def _tasks():
        # Build one delayed main_proc call per file, looking the label row up
        # a single time instead of once per argument.
        for i, name in enumerate(names):
            row = pdlab.ix[extract_filename_in_path(name)]
            is_train = bool(row['val'] == 0)
            yield delayed(main_proc)(
                name,
                row['label'],
                train_folder if is_train else val_folder,
                scale,
                crop_shape,
                random_draws,
                is_train,
                False,
                i)

    # Divert stderr to a log file for the duration of the pool only
    # (presumably to keep worker noise off the console).
    saved_stderr = sys.stderr
    errf = open('err.log', 'w')
    sys.stderr = errf
    try:
        # Create a parallel pool
        with Parallel(n_jobs=8) as parallel:
            rets = parallel(_tasks())
    finally:
        # Never leave sys.stderr pointing at a closed file, and close the log
        # before it is deleted below (required where open files cannot be
        # unlinked).
        sys.stderr = saved_stderr
        errf.close()

    print("Done. A total of %d files processed." % len(rets))

    # Clean up log files left in the working directory, err.log included.
    for f in glob.glob("*.log"):
        os.unlink(f)
import numpy as np
import matplotlib.pyplot as plt
import cv2
from joblib import Parallel, delayed

# Notebook-only magic, left disabled in the exported script:
# get_ipython().magic(u'matplotlib inline')

# --- Experiment settings ---------------------------------------------------
num_samples = 20000       # upper bound on how many file names are drawn
resize_size = (25, 25)
train_split = 0.9

# --- Collect the candidate file names --------------------------------------
in_folder = '/media/shared/dr/DiabeticRetinopathy/train_orig/'
names = np.asarray(parse_folder(in_folder, "jpeg"))

# --- Draw a reproducible random subset -------------------------------------
# Shuffle an index array with a fixed-seed generator, then keep the first
# `num_samples` entries (or all of them if fewer names are available).
rng = np.random.RandomState(1234)
idx = np.arange(len(names))
rng.shuffle(idx)
X_names = names[idx[:num_samples]]