def process_data(info, features=None):
    '''Process images listed in a given info object into usable feature data.

    Args:
        info (DataFrame): info object containing 'source', 'label' and
            'params' columns.
        features (list, optional): list of features to include in the output
            data. Defaults to
            ['r', 'g', 'b', 'h', 's', 'v', 'fft_std', 'fft_max'].

    Returns:
        DataFrame: processed image data — one row per generated sample,
        shuffled and with feature columns standardized
        (zero mean, unit variance).
    '''
    # None sentinel instead of a mutable default list shared across calls.
    if features is None:
        features = ['r', 'g', 'b', 'h', 's', 'v', 'fft_std', 'fft_max']

    # create data from info
    data = info.copy()
    data.reset_index(drop=True, inplace=True)
    data = data[['source', 'label', 'params']]
    # keep the raw source paths around purely for error reporting below
    sources = data.source.tolist()
    data.source = data.source.apply(lambda x: PIL.Image.open(x))
    data = data.apply(
        lambda x: generate_samples(x['source'], x['label'], x['params']),
        axis=1
    )

    # create new expanded dataframe (one row per generated sample)
    data = list(chain(*data.tolist()))
    data = DataFrame(data, columns=['x', 'y', 'params'])
    data['bgr'] = data.x.apply(pil_to_opencv)
    del data['x']

    # create feature lists
    # NOTE: these must be lists, not filter() iterators — under Python 3 a
    # filter object is always truthy (so the guards below never skip), and
    # the `in` membership tests would consume it, silently dropping features.
    rgb = [f for f in features if f in list('rgb')]
    hsv = [f for f in features if f in list('hsv')]
    fft = [f for f in features if f in ['fft_std', 'fft_max']]

    # rgb distributions
    if rgb:
        temp = data[['bgr', 'params']].apply(
            lambda x: (x['bgr'], x['params']), axis=1)
        for chan in rgb:
            chan_data = temp.apply(
                lambda x: get_channel_histogram(x[0], chan, **x[1]))
            create_histogram_stats(data, chan_data, chan)

    # hsv distributions
    if hsv:
        try:
            data['hsv'] = data.bgr.apply(
                lambda x: cv2.cvtColor(x, cv2.COLOR_BGR2HSV))
        except cv2.error:
            # Surface which source files were being processed, then re-raise
            # the original OpenCV error instead of masking it with an
            # unrelated exception type.
            print(sources)
            raise
        temp = data[['hsv', 'params']].apply(
            lambda x: (x['hsv'], x['params']), axis=1)
        for chan in hsv:
            chan_data = temp.apply(
                lambda x: get_channel_histogram(x[0], chan, **x[1]))
            create_histogram_stats(data, chan_data, chan)
        del data['hsv']

    # grain frequency: histogram of the half-spectrum FFT of the gray image
    if fft:
        data['gray'] = data.bgr.apply(
            lambda x: cv2.cvtColor(x, cv2.COLOR_BGR2GRAY))
        data.gray = data.gray.apply(lambda x: np.fft.hfft(x).astype(float))
        data.gray = data.gray.apply(
            lambda x: np.histogram(x.ravel(), bins=256)[0])
        if 'fft_std' in fft:
            data['fft_std'] = data.gray.apply(lambda x: x.std())
        if 'fft_max' in fft:
            data['fft_max'] = data.gray.apply(lambda x: x.max())
        del data['gray']

    del data['bgr']
    del data['params']

    # shuffle data to destroy serial correlations
    index = data.index.tolist()
    np.random.shuffle(index)
    # .loc replaces the long-deprecated (and removed) .ix indexer; the index
    # is a fresh RangeIndex here, so label-based lookup is equivalent.
    data = data.loc[index]
    data.reset_index(drop=True, inplace=True)

    # Normalize features (every column except the label 'y')
    cols = data.drop('y', axis=1).columns.tolist()
    ss = StandardScaler()
    data[cols] = ss.fit_transform(data[cols])
    return data