def copy_files_to_label_dirs(inp_path, out_path, labels_file, process=None):
    """Distribute labelled images from ``inp_path`` into ``out_path/<level>/``.

    Sub-directories ``0`` .. ``4`` are created under ``out_path``. The file
    stems found in ``inp_path`` are joined with the labels table on its first
    column, so only files that have a label row are handled. Each one is
    written to ``out_path/<level>/<stem>.jpeg``.

    :param inp_path: directory with the source images (read as ``<stem>.jpeg``)
    :param out_path: destination root; handed to the project helper
        ``prep_out_path`` before the label directories are created
    :param labels_file: CSV path; the code reads its ``image`` and ``level``
        columns, and joins on the first column
    :param process: optional callable applied to the image loaded with
        ``cv2.imread``; its result is saved with ``cv2.imwrite``. When omitted
        the file is transferred with the module-level ``copy`` callable.
    """
    prep_out_path(out_path)
    for i in range(5):
        os.makedirs(path.join(out_path, str(i)))

    labels = pd.read_csv(labels_file)
    key = labels.columns[0]

    # One-column frame of extension-less file names, named after the labels'
    # first column so the two frames can be merged on it.
    existing_files = pd.DataFrame(
        [path.splitext(f)[0] for f in os.listdir(inp_path)], columns=[key])
    existing_files = existing_files.merge(labels, on=key)

    def process_and_copy(inp_file, out_file):
        # Load, transform, and write the transformed image to the target.
        cv2.imwrite(out_file, process(cv2.imread(inp_file)))

    # Identity test: ``== None`` could be hijacked by a custom __eq__.
    cp = copy if process is None else process_and_copy

    for f, l in zip(existing_files['image'], existing_files['level']):
        file_name = path.join(out_path, str(l), f + ".jpeg")
        inp_file = path.join(inp_path, f + '.jpeg')
        cp(inp_file, file_name)
        print("copied {0} to {1}".format(inp_file, file_name))
def copy_files_to_label_dirs(inp_path, out_path, labels_file, process=None):
    """Copy (or process and save) labelled images into per-level directories.

    Directories ``0`` through ``4`` are created below ``out_path``. File stems
    listed from ``inp_path`` are merged with the labels table on the table's
    first column; every matching row yields one output file at
    ``out_path/<level>/<image>.jpeg``.

    :param inp_path: source directory; inputs are opened as ``<image>.jpeg``
    :param out_path: output root, first passed to the project helper
        ``prep_out_path``
    :param labels_file: path of a CSV providing ``image`` and ``level`` columns
    :param process: optional image transform. If given, each image is read
        with ``cv2.imread``, passed through ``process`` and written with
        ``cv2.imwrite``; otherwise the module-level ``copy`` callable is used.
    """
    prep_out_path(out_path)
    for level in range(5):
        os.makedirs(path.join(out_path, str(level)))

    labels = pd.read_csv(labels_file)
    join_column = labels.columns[0]

    # File names without extension, in a frame whose single column carries the
    # same name as the labels' first column (the merge key).
    stems = [path.splitext(f)[0] for f in os.listdir(inp_path)]
    existing_files = pd.DataFrame(stems, columns=[join_column])
    existing_files = existing_files.merge(labels, on=join_column)

    def process_and_copy(inp_file, out_file):
        im = cv2.imread(inp_file)
        im = process(im)
        cv2.imwrite(out_file, im)

    # ``is None`` is the correct identity check for the "no transform" case.
    if process is None:
        cp = copy
    else:
        cp = process_and_copy

    for f, l in zip(existing_files['image'], existing_files['level']):
        file_name = path.join(out_path, str(l), f + ".jpeg")
        inp_file = path.join(inp_path, f + '.jpeg')
        cp(inp_file, file_name)
        print("copied {0} to {1}".format(inp_file, file_name))
def copy_train_files(inp_path, train_path, labels_file):
    """Copy the files of ``inp_path`` that are listed in the labels CSV.

    The values of the first column of ``labels_file`` are compared verbatim
    with the directory entries of ``inp_path``; only exact name matches are
    copied into ``train_path``.

    :param inp_path: directory holding the candidate files
    :param train_path: destination directory, first passed to the project
        helper ``prep_out_path``
    :param labels_file: CSV path readable by ``pandas.read_csv``
    """
    prep_out_path(train_path)

    labels = pd.read_csv(labels_file)
    # ``.values`` instead of ``Series.as_matrix()``, which is deprecated and
    # absent from current pandas releases.
    file_names = set(labels[labels.columns[0]].values)

    all_files = set(os.listdir(inp_path))
    train_files = all_files.intersection(file_names)

    for f in train_files:
        copy(path.join(inp_path, f), path.join(train_path, f))
def copy_test_files(inp_path, test_path, labels_file):
    """Copy the files of ``inp_path`` that are NOT listed in the labels CSV.

    Every directory entry of ``inp_path`` whose name does not appear verbatim
    in the first column of ``labels_file`` is copied into ``test_path``.

    :param inp_path: directory holding the candidate files
    :param test_path: destination directory, first passed to the project
        helper ``prep_out_path``
    :param labels_file: CSV path readable by ``pandas.read_csv``
    """
    prep_out_path(test_path)

    labels = pd.read_csv(labels_file)
    # ``.values`` instead of ``Series.as_matrix()``, which is deprecated and
    # absent from current pandas releases.
    file_names = set(labels[labels.columns[0]].values)

    all_files = set(os.listdir(inp_path))
    test_files = all_files.difference(file_names)

    for f in test_files:
        copy(path.join(inp_path, f), path.join(test_path, f))
def get_areal_features(root, features_path, masks_dir, n_bins=100):
    """Write a CSV of region-size histograms for every image in ``root``.

    For each file, the regions predicted for ``Labels.Drusen`` and
    ``Labels.Haemorage`` are split into connected components. Component sizes
    are divided by the number of non-zero mask pixels and binned into
    ``n_bins`` bins over ``(0, 1e-3)``. One output row holds the drusen
    histogram, the blood histogram, and the first labels row matching the
    file's stem. The table is saved as ``<features_path>/<prefix>.csv``.

    NOTE(review): ``labels_file`` and ``prefix`` are not parameters; they are
    resolved from the enclosing/module scope at call time. Confirm they are
    defined wherever this function is used.

    :param root: directory with the images to process
    :param features_path: output directory, first passed to the project
        helper ``prep_out_path``
    :param masks_dir: masks location forwarded to ``ImageReader``
    :param n_bins: number of histogram bins per region type
    :raises IndexError: if a file has no row in the labels table
        (``label.values[0]`` on an empty selection)
    """
    prep_out_path(features_path)
    files = os.listdir(root)

    # list(...) keeps the concatenation valid when range() is not a list.
    df = pd.DataFrame(columns=list(range(n_bins * 2)) + ['name', 'level'])
    names = pd.read_csv(labels_file)

    # Structuring element passed to mahotas' labelling; loop-invariant.
    Bc = np.ones((5, 5))

    print("Starting extraction:  {0}".format(time_now_str()))

    for j, f in enumerate(files):
        label = names.loc[names['image'] == path.splitext(f)[0]]
        start = time.time()

        imr = ImageReader(root, f, masks_dir, gray_scale=True)

        drusen = get_predicted_region(imr.image, Labels.Drusen)
        blood = get_predicted_region(imr.image, Labels.Haemorage)

        labels_drusen, n_drusen = mh.label(drusen, Bc)
        labels_blood, n_blood = mh.label(blood, Bc)

        # Normalise component sizes by the count of non-zero mask pixels.
        area = float(cv2.countNonZero(imr.mask))

        # sizes excluding background
        sizes_drusen = mhl.labeled_size(labels_drusen)[1:] / area
        sizes_blood = mhl.labeled_size(labels_blood)[1:] / area

        hist_drusen, _ = np.histogram(sizes_drusen, n_bins, (0, 1e-3))
        hist_blood, _ = np.histogram(sizes_blood, n_bins, (0, 1e-3))

        # Row layout: drusen bins, blood bins, then the matching labels row.
        df.loc[j] = np.r_[hist_drusen, hist_blood, label.values[0]]
        print("Extracted: {0}, took {1:02.2f} sec ".format(f, time.time() - start))

    # write out the csv
    df.to_csv(path.join(features_path, prefix + ".csv"), index=False, header=True)
    print("Extracted:  {0} @ {1}".format(prefix, time_now_str()))
def copy_files_to_label_dirs(inp_path, out_path, labels_file):
    """Copy files into one sub-directory of ``out_path`` per label value.

    The labels CSV is read with the first column as file name and the second
    column as label. One directory is created per distinct label, and each
    file is copied from ``inp_path`` to ``out_path/<label>/<file name>``.
    Files that raise ``IOError`` during the copy are reported, collected, and
    printed as a list at the end; they do not abort the run.

    :param inp_path: directory holding the source files
    :param out_path: destination root, first passed to the project helper
        ``prep_out_path``
    :param labels_file: CSV path with at least two columns (file name, label)
    """
    prep_out_path(out_path)

    labels = pd.read_csv(labels_file)
    name_col = labels.columns[0]
    splitter = labels.columns[1]

    # ``.values`` instead of the deprecated ``Series.as_matrix()``; the loop
    # variable no longer shadows the ``dir`` builtin.
    for label_dir in np.unique(labels[splitter].values):
        os.makedirs(path.join(out_path, label_dir))

    bad = []
    for f, l in zip(labels[name_col], labels[splitter]):
        file_name = path.join(out_path, l, f)
        inp_file = path.join(inp_path, f)
        try:
            shutil.copy(inp_file, file_name)
        except IOError:
            # Best effort: remember the failure and move on to the next file.
            print("Cannot copy: {0}".format(f))
            bad.append(f)
            continue

        print("copied {0} to {1}".format(inp_file, file_name))

    print(bad)
# Script fragment: sets up per-label feature extraction over the
# sub-directories '0'..'4' of `preprocessed`.
from kobra.tr_utils import prep_out_path, time_now_str
import os
from os import path
import mahotas as mh
import mahotas.labeled as mhl
import cv2
import time

# Hard-coded locations; `masks` and `orig` are not referenced in the
# visible part of this fragment.
preprocessed = '/kaggle/retina/train/labelled'
masks = '/kaggle/retina/train/masks'
orig = '/kaggle/retina/train/sample/split'
output = '/kaggle/retina/train/sample/features'
n_bins = 100

prep_out_path(output)

for i in range(0, 5):
    # The directory name doubles as the prefix for this pass.
    prefix = str(i)

    print "Starting extraction @ ", time_now_str()
    files = os.listdir(path.join(preprocessed, prefix))

    # intermediate output will be stored here
    # we will save all the files first then join them into one csv file
    # NOTE(review): `pd` has no visible import in this fragment -- confirm
    # pandas is imported elsewhere.
    # NOTE(review): 2 * n_bins + 1 columns -- presumably two histograms plus
    # one label column; confirm.
    df = pd.DataFrame(columns = range(n_bins * 2 + 1))
    j = 0
    for f in files:
        # Only the per-file timer start is visible here; the rest of the loop
        # body is outside this view.
        start = time.time()
# NOTE(review): the two statements below are the tail of a function whose
# header lies outside this view; they are reproduced unchanged.
    cv2.imwrite(out_im_name, toSave)
    return out_im_name

def resize_only(image_name):
    """Resize one image and save it under ``out_path`` with its base name.

    ``out_path`` and ``size`` are read from the enclosing scope; ``size`` is
    passed as the target size to ``cv2.resize``. Nothing is returned.
    """
    image = cv2.imread(image_name)
    # Keep only the file-name component of the input path.
    out_name = path.split(image_name)[1]
    out_im_name = path.join(out_path, out_name)
    toSave = cv2.resize(image, size)
    cv2.imwrite(out_im_name, toSave)

def kmeans_only(image_name, K=10):
    """Run ``kmeans`` on one image and save the first element of its result.

    The output path comes from ``get_output_name(image_name)``; ``kmeans`` is
    called with the loaded image and ``K`` and returns three values, of which
    only the first is written out. Nothing is returned.
    """
    out_im_name = get_output_name(image_name)
    image = cv2.imread(image_name)
    toSave, _, _ = kmeans(image, K)
    cv2.imwrite(out_im_name, toSave)

prep_out_path(out_path)

# Map kmeans_only over every entry of image_paths through a load-balanced
# view. NOTE(review): Client is presumably the IPython parallel client --
# confirm against the file's imports.
dv = Client().load_balanced_view()
fs = dv.map(kmeans_only, np.array(image_paths))
print "Started: ", time_now_str()
# Wait on the mapped result before reporting the finish time.
fs.wait()
print "Finished: ", time_now_str()
# Script fragment: sets up per-label feature extraction over the
# sub-directories '0'..'4' of `preprocessed`.
from kobra.tr_utils import prep_out_path, time_now_str
import os
from os import path
import mahotas as mh
import mahotas.labeled as mhl
import cv2
import time

# Hard-coded locations; `masks` and `orig` are not referenced in the
# visible part of this fragment.
preprocessed = '/kaggle/retina/train/labelled'
masks = '/kaggle/retina/train/masks'
orig = '/kaggle/retina/train/sample/split'
output = '/kaggle/retina/train/sample/features'
n_bins = 100

prep_out_path(output)

for i in range(0, 5):
    # The directory name doubles as the prefix for this pass.
    prefix = str(i)

    print "Starting extraction @ ", time_now_str()
    files = os.listdir(path.join(preprocessed, prefix))

    # intermediate output will be stored here
    # we will save all the files first then join them into one csv file
    # NOTE(review): `pd` has no visible import in this fragment -- confirm
    # pandas is imported elsewhere.
    # NOTE(review): 2 * n_bins + 1 columns -- presumably two histograms plus
    # one label column; confirm.
    df = pd.DataFrame(columns=range(n_bins * 2 + 1))
    j = 0
    for f in files:
        # Only the per-file timer start is visible here; the rest of the loop
        # body is outside this view.
        start = time.time()