def readin(self):
    """readin()

    Read in image data from a directory.
    """
    # Read in the initialization data (images) from initdirname, if present.
    # This variable is called 'initfilename', but it's a directory here.
    if self.initfilename != '':
        printt('Reading initialization data set from %s' % self.initfilename)
        (self.initdata, unused_labels, imshape) = \
            ImageData.read_image_dir(self.initfilename)
        self.initdata = np.asarray(self.initdata)
        self.initdata = self.initdata.T  # pixels x images
        print 'Initializing with %d images (%s).' % \
            (self.initdata.shape[1], str(imshape))

    ########## Read in the data to analyze
    # Labels are individual filenames
    (self.data, self.labels, self.imshape) = \
        ImageData.read_image_dir(self.filename)
    self.data = np.asarray(self.data)

    if len(self.data) == 0:
        print 'Error: no image files found.'
        sys.exit(1)

    self.data = self.data.T  # pixels x images
    print 'Read %d images (%s).' % \
        (self.data.shape[1], str(self.imshape))
def __init__(self, inputname=None, initfilename=None,
             startsol=-1, endsol=-1, initpriorsols=False, shotnoisefilt=0):
    """LIBSData(inputname="", sol=-1)

    Read in LIBS (ChemCam) data in CSV format from inputname.
    If inputname ends in .csv, treat it as a CSV file.
    If inputname ends in .pkl, treat it as a pickled file.
    Otherwise, treat it as a directory and look for a .pkl file inside;
    if not found, generate it with contents from all .csv files present.

    If present, also read in data from initfilename (must be .csv).
    This data will be used to initialize the DEMUD model.

    Optionally, specify a sol range (startsol-endsol) for data to analyze.
    Optionally, use data prior to startsol to initialize the model.
    Optionally, specify the width of a median filter to apply.
    """
    input_type = inputname[-3:]

    if input_type == 'csv':
        filename = inputname
        expname = 'libs-' + \
            os.path.splitext(os.path.basename(filename))[0]
    elif input_type == 'pkl':
        if shotnoisefilt > 0:
            filename = os.path.splitext(inputname)[0] + \
                ('-snf%d.pkl' % shotnoisefilt)
        else:
            filename = inputname
        expname = 'libs-' + \
            os.path.splitext(os.path.basename(filename))[0]
    else:  # assume directory
        input_type = 'dir'
        filename = os.path.join(inputname, 'libs-mean-norm.pkl')
        if shotnoisefilt > 0:
            # Derive the filtered name from the .pkl file inside the
            # directory, not from the directory name itself.
            filename = os.path.splitext(filename)[0] + \
                ('-snf%d.pkl' % shotnoisefilt)
        expname = 'libs-' + os.path.basename(inputname)

    Dataset.__init__(self, filename, expname, initfilename)

    printt('Reading %s data from %s.' % (input_type, self.filename))

    if input_type == 'dir' and not os.path.exists(filename):
        LIBSData.read_dir(inputname, filename, shotnoisefilt)

    self.readin(startsol, endsol, initpriorsols, shotnoisefilt)
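# A minimal standalone sketch of the input-routing logic above, using
# hypothetical paths. This mirrors, but does not call, the constructor.
import os

def resolve_libs_input(inputname, shotnoisefilt=0):
    if inputname.endswith('.csv'):
        return inputname
    if inputname.endswith('.pkl'):
        if shotnoisefilt > 0:
            return os.path.splitext(inputname)[0] + ('-snf%d.pkl' % shotnoisefilt)
        return inputname
    # Directory: look for (or generate) the pickled mean-normalized data
    filename = os.path.join(inputname, 'libs-mean-norm.pkl')
    if shotnoisefilt > 0:
        filename = os.path.splitext(filename)[0] + ('-snf%d.pkl' % shotnoisefilt)
    return filename

print resolve_libs_input('sol90.csv')                   # sol90.csv
print resolve_libs_input('libs.pkl', shotnoisefilt=5)   # libs-snf5.pkl
print resolve_libs_input('/data/ccam', shotnoisefilt=5)
# /data/ccam/libs-mean-norm-snf5.pkl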
def medfilter(cls, data, L, fw=[]):
    """medfilter(cls, data, L, fw=[])

    Filter each column of data using a window of width L.
    Replace each value with its median from the surrounding window.
    Inspired by http://staff.washington.edu/bdjwww/medfilt.py .

    Optionally, specify feature weights so they can factor in to the
    median calculation. Any zero-valued weights make the median
    calculation ignore those items. Values greater than zero are NOT
    weighted; they all participate normally.
    """
    if len(data) == 0:
        print 'Error: empty data; cannot filter.'
        return data

    if L < 3:
        print 'Error: L (%d) is too small; minimum 3.' % L
        return data

    printt('Filtering shot noise with a width of %d (this may take some time).' % L)

    Lwing = (L - 1) / 2
    (d, n) = data.shape  # assume items are column vectors
    data2 = np.zeros_like(data)

    for j in range(n):
        for i in range(d):
            # Specify the range over which to compute the median
            if i < Lwing:
                ind = range(0, i + Lwing + 1)
            elif i >= d - Lwing:
                ind = range(i - Lwing, d)
            else:
                ind = range(i - Lwing, i + Lwing + 1)

            # If feature weights are specified,
            # adjust ind to only include the nonzero ones.
            # (Use a distinct comprehension variable: in Python 2 a
            # comprehension over 'i' leaks out and clobbers the loop index.)
            if len(fw) > 0:
                # If there aren't any features with nonzero weights,
                # this won't use anything (set data value to 0)
                ind = [w for w in ind if fw[w] > 0]

            # Perform the median filter.
            # If there are no valid features to use, set this point to 0
            # (it won't be used later anyway).
            if len(ind) == 0:
                data2[i, j] = 0
            else:
                data2[i, j] = np.median(data[ind, j])

    return data2
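# A small self-contained demonstration of medfilter on synthetic data,
# assuming LIBSData.medfilter is exposed as a classmethod as defined above.
# The single impulsive spike is suppressed, and the zero-weighted feature
# (index 2) is excluded from every window's median.
import numpy as np

col = np.ones((9, 1))
col[4] = 100.0              # impulsive "shot noise" spike
fw = np.ones((9, 1))
fw[2] = 0                   # ignore feature 2 entirely
smoothed = LIBSData.medfilter(col, 3, fw)
print smoothed[4]           # [ 1.]: the spike is replaced by its
                            # neighborhood median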
def read_image_dir(cls, dirname):
    """read_image_dir(dirname)

    Read in all of the images in dirname and return
    - a list of data
    - a list of labels
    - imshape: (width, height) or (width, height, depth) tuple
    """
    data = []
    labels = []  # Save the individual file names

    imshape = (-1, -1, -1)

    # Read in the image data
    files = sorted(os.listdir(dirname))
    numimages = len(files)
    printt('Loading files:')

    counter = 0
    for idx, f in enumerate(files):
        # Unix-style wildcards
        if (fnmatch.fnmatch(f, '*.jpg') or
            fnmatch.fnmatch(f, '*.JPG') or
            fnmatch.fnmatch(f, '*.png')):
            # Read in the image
            filename = os.path.join(dirname, f)
            im = imread(filename)

            if imshape[0] == -1:
                # Allocate one row per file; unused rows are trimmed below
                data = np.zeros([numimages, np.prod(im.shape)],
                                dtype=np.float32)
                imshape = im.shape
            else:
                # Ensure that all images are the same dimensions
                if imshape != im.shape:
                    if len(im.shape) == 2:
                        # Convert grayscale to rgb
                        # (assumes matching width and height)
                        im = np.dstack((im, im, im))
                    else:
                        raise ValueError('Images must all have the same dimensions.')

            data[counter] = im.reshape(1, np.prod(im.shape))
            labels.append(f)

            progbar(idx, len(files))
            counter += 1

    # Trim rows allocated for any non-image files that were skipped
    return (data[:counter], labels, imshape)
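# The flattening convention used above, in isolation: each image becomes
# one row of length prod(im.shape), and readin() transposes so each image
# is one column. A synthetic 2x2 RGB "image" stands in for a real file.
import numpy as np

im = np.arange(12).reshape((2, 2, 3))       # height x width x depth
row = im.reshape(1, np.prod(im.shape))      # 1 x 12, as stored in data
print row.shape                             # (1, 12)
print np.all(row.reshape(im.shape) == im)   # True: reversible given imshape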
def readin(self, nskip):
    """readin(nskip)

    Read in CSV data from self.filename via FloatDataset.read_csv
    (nskip is passed through), and store it as features x samples.
    """
    (self.xvals, self.data, self.labels) = \
        FloatDataset.read_csv(self.filename, nskip)
    self.data = self.data.T  # features x samples

    # If there was no header with feature names,
    # default xvals to the feature indices
    if self.xvals == []:
        self.xvals = numpy.arange(self.data.shape[0]).reshape(-1, 1)

    # Read in the init data file, if present
    if self.initfilename != '':
        printt('Reading initialization data set from %s' % self.initfilename)
        (_, self.initdata, _) = FloatDataset.read_csv(self.initfilename,
                                                      nskip)
        self.initdata = self.initdata.T  # features x samples
def prune_and_normalize(cls, data, wavelengths, shotnoisefilt):
    """prune_and_normalize(cls, data, wavelengths, shotnoisefilt)

    Subset LIBS data to only use wavelengths between 270 and 820 nm,
    set negative values to zero, then normalize responses for each
    of the three spectrometers.

    If shotnoisefilt >= 3, run a median filter on the data with width
    as specified.

    Return the pruned and normalized data.
    """
    print 'Pruning and normalizing the data.'

    # Only use data between 270 and 820 nm (ends are noisy)
    use = np.where(np.logical_and(wavelengths >= 270,
                                  wavelengths < 820))[0]

    # Instead of stripping these bands out now, first do shot noise
    # filtering (if desired). Then strip the bands later.

    # If desired, apply a median filter to strip out impulsive noise
    # Note: this is slow. :) Probably can be optimized in some awesome fashion.
    if shotnoisefilt >= 3:
        printt('Filtering out shot noise with width %d.' % shotnoisefilt)
        # Create a vector of 0's (to ignore) and 1's (to use).
        fw = np.zeros((data.shape[0], 1))
        fw[use] = 1
        data = LIBSData.medfilter(data, shotnoisefilt, fw)

    # Now remove bands we do not use
    wavelengths = wavelengths[use]
    data = data[use, :]

    # Replace negative values with 0
    negvals = (data < 0)
    printt('--- %d negative values.' % len(np.where(negvals)[0]))
    data[negvals] = 0
    printt('--- %d negative values remain.' % len(np.where(data < 0)[0]))

    # Normalize the emission values for each of the
    # three spectrometers independently.
    # Nina: VIS begins at 382.13812 nm; VNIR starts at 473.1842 nm.
    vis_spec = 382.13812
    vnir_spec = 473.1842
    spec1 = np.where(np.logical_and(wavelengths >= 0,
                                    wavelengths < vis_spec))[0]
    spec2 = np.where(np.logical_and(wavelengths >= vis_spec,
                                    wavelengths < vnir_spec))[0]
    spec3 = np.where(wavelengths >= vnir_spec)[0]
    for waves in [spec1, spec2, spec3]:
        data[waves, :] = data[waves, :] / np.sum(data[waves, :], axis=0)

    return (data, wavelengths)
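# Per-spectrometer normalization in miniature: within each spectrometer's
# wavelength range, every column is scaled to sum to 1, so intensities are
# comparable across the three detectors. Synthetic values only.
import numpy as np

wavelengths = np.array([300.0, 350.0, 400.0, 450.0, 500.0, 600.0])
data = np.array([[2.0], [2.0], [3.0], [1.0], [5.0], [5.0]])  # one sample
spec1 = np.where(wavelengths < 382.13812)[0]
spec2 = np.where(np.logical_and(wavelengths >= 382.13812,
                                wavelengths < 473.1842))[0]
spec3 = np.where(wavelengths >= 473.1842)[0]
for waves in [spec1, spec2, spec3]:
    data[waves, :] = data[waves, :] / np.sum(data[waves, :], axis=0)
print data[spec1, 0]   # [ 0.5   0.5 ]  -- sums to 1
print data[spec2, 0]   # [ 0.75  0.25]  -- sums to 1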
def read_from_scratch(self, filename, shotnoisefilt=0, fwfile=''):
    """read_from_scratch()

    Read in ENVI (hyperspectral) data from filename.
    Assume header file is filename.hdr.

    Optionally, specify the width of a median filter to apply.
    Optionally, specify a file containing per-feature weights.

    Strongly inspired by enviread.m from Ian Howat, [email protected].

    See ROI_utils.py for full development and testing.
    """
    envi_file = filename

    # Read in the header file. Try a few options to find the .hdr file.
    hdrfilenames = [envi_file + '.hdr',
                    envi_file[0:envi_file.rfind('.IMG')] + '.hdr',
                    envi_file[0:envi_file.rfind('.img')] + '.hdr']
    for hdrfile in hdrfilenames:
        if os.path.exists(hdrfile):
            break

    info = ENVIData.read_envihdr(hdrfile)
    self.lines = info['lines']
    self.samples = info['samples']
    print '%d lines, %d samples, %d bands.' % (self.lines, self.samples,
                                               info['bands'])

    # Set binary format parameters
    byte_order = info['byte order']
    if byte_order == 0:
        machine = 'ieee-le'
    elif byte_order == 1:
        machine = 'ieee-be'
    else:
        machine = 'n'

    dtype = info['data type']
    if dtype == 1:
        format = 'uint8'
    elif dtype == 2:
        format = 'int16'
    elif dtype == 3:
        format = 'int32'
    elif dtype == 4:
        format = 'float32'
    elif dtype == 5:
        format = 'float64'  # Note: 'float' is the same as 'float64'
    elif dtype == 6:
        print ':: Sorry, Complex (2x32 bits) data currently not supported.'
        print ':: Importing as double-precision instead.'
        format = 'float64'
    elif dtype == 9:
        print ':: Sorry, double-precision complex (2x64 bits) data currently not supported.'
        return
    elif dtype == 12:
        format = 'uint16'
    elif dtype == 13:
        format = 'uint32'
    elif dtype == 14:
        format = 'int64'
    elif dtype == 15:
        format = 'uint64'
    else:
        print 'Error: File type number: %d not supported' % dtype
        return None

    print 'Reading data format %s' % format

    # Read in the data (binary mode)
    try:
        dfile = open(envi_file, 'rb')
    except IOError:
        print ":: Error: data file '%s' not found." % envi_file
        return None

    self.data = np.zeros((info['bands'], info['lines'] * info['samples']),
                         format)
    raw_data = np.fromfile(dfile, format, -1)
    dfile.close()

    band_format = info['interleave'].lower()

    # Reorder into bands x pixels. These per-band reshapes are equivalent
    # to iterating over every (line, sample) pair for each band.
    if band_format == 'bsq':
        print "Reading BSQ: Band, Row, Col; %s" % machine
        raw_data = raw_data.reshape((info['bands'], info['lines'], info['samples']))
        for b in range(info['bands']):
            self.data[b, :] = raw_data[b, :, :].reshape(-1)
    elif band_format == 'bil':
        print "Reading BIL: Row, Band, Col; %s" % machine
        raw_data = raw_data.reshape((info['lines'], info['bands'], info['samples']))
        for b in range(info['bands']):
            self.data[b, :] = raw_data[:, b, :].reshape(-1)
    elif band_format == 'bip':
        print "Reading BIP: Row, Col, Band; %s" % machine
        raw_data = raw_data.reshape((info['lines'], info['samples'], info['bands']))
        for b in range(info['bands']):
            self.data[b, :] = raw_data[:, :, b].reshape(-1)

    # Determine whether we need to swap byte order
    little_endian = (struct.pack('=f', 2.3) == struct.pack('<f', 2.3))
    if (little_endian and machine == 'ieee-be') or \
       (not little_endian and machine == 'ieee-le'):
        self.data.byteswap(True)

    self.xlabel = 'Wavelength (nm)'
    self.xvals = info['wavelength']
    self.ylabel = 'Reflectance'

    # Let labels be x,y coordinates
    self.labels = []
    for l in range(info['lines']):
        for s in range(info['samples']):
            self.labels += ['%d,%d' % (l, s)]

    # Data pre-processing (UCIS specific)
    if 'mars_yard' in envi_file:
        printt('Filtering out water absorption and known noisy bands,')
        printt('  from %d' % len(self.xvals))
        # Water: 1.38 and 1.87 um
        # Also prune out the first 10 and last 3 bands
        waves_use = [w for w in self.xvals
                     if ((w > 480 and w < 1330) or
                         (w > 1400 and w < 1800) or
                         (w > 1900 and w < 2471))]
        bands_use = [np.where(self.xvals == w)[0][0] for w in waves_use]
        self.data = self.data[bands_use, :]
        self.xvals = self.xvals[bands_use]
        printt('  to %d bands.' % len(self.xvals))

    # Filter out shot noise (median filter)
    # warning: this is slow... (should be optimized?)
    from demud import read_feature_weights
    if shotnoisefilt >= 3:
        # Read in feature weights, if needed
        fw = read_feature_weights(fwfile, self.xvals)
        self.data = LIBSData.medfilter(self.data, shotnoisefilt, fw)

    # Store the RGB data for later use
    self.rgb_data = self.get_RGB()
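# How the three ENVI interleave orders map onto the same bands x pixels
# matrix, shown on a tiny synthetic cube with no file I/O. The transposes
# below are equivalent to the per-band reshapes above.
import numpy as np

lines, samples, bands = 2, 3, 4
raw = np.arange(lines * samples * bands)
bsq = raw.reshape((bands, lines, samples))   # band, row, col
bil = raw.reshape((lines, bands, samples))   # row, band, col
bip = raw.reshape((lines, samples, bands))   # row, col, band
data_bsq = bsq.reshape((bands, lines * samples))
data_bil = np.transpose(bil, (1, 0, 2)).reshape((bands, lines * samples))
data_bip = np.transpose(bip, (2, 0, 1)).reshape((bands, lines * samples))
print data_bsq.shape, data_bil.shape, data_bip.shape   # all (4, 6)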
def readin(self, startsol=-1, endsol=-1, initpriorsols=False, shotnoisefilt=0):
    """readin()

    Read in LIBS data from self.filename.
    Read in initialization data from self.initfilename.
    Normalize according to Nina's instructions.

    Optionally, specify a sol range (startsol-endsol) for data to analyze.
    Optionally, use data prior to startsol to initialize the model.
    Optionally, specify the width of a median filter to apply.
    """
    input_type = os.path.splitext(self.filename)[1][1:]

    self.data = []
    self.initdata = []
    self.xlabel = 'Wavelength (nm)'
    self.ylabel = 'Intensity'

    if input_type == 'csv':
        (self.data, self.labels) = LIBSData.read_csv_data(self.filename)

        # Prune off first column (wavelengths)
        wavelengths = self.data[:, 0]
        self.xvals = wavelengths.reshape(-1, 1)
        self.data = self.data[:, 1:]  # features x samples

        (self.data, self.xvals) = \
            LIBSData.prune_and_normalize(self.data, self.xvals, shotnoisefilt)

        (self.data, self.labels) = self.filter_data(self.data, self.labels)

    elif input_type == 'pkl':
        inf = open(self.filename, 'r')
        (self.data, self.labels, self.xvals) = pickle.load(inf)
        inf.close()

        # Temporary: until I re-run full extraction on shiva
        use = np.where(np.logical_and(self.xvals >= 270,
                                      self.xvals < 820))[0]
        self.xvals = self.xvals[use]
        self.data = self.data[use, :]

        (self.data, self.labels) = self.filter_data(self.data, self.labels)

    else:  # Unknown format
        printt(' Error: Unknown input type for %s; no data read in' % \
               self.filename)

    # Read in the init data file, if present
    if self.initfilename != '':
        printt('Reading initialization data set from %s' % self.initfilename)
        (self.initdata, unused_labels) = LIBSData.read_csv_data(self.initfilename)

        # Prune off first column (wavelengths)
        wavelengths = self.initdata[:, 0]
        self.initdata = self.initdata[:, 1:]  # features x samples

        (self.initdata, unused_xvals) = \
            LIBSData.prune_and_normalize(self.initdata, wavelengths,
                                         shotnoisefilt)
        (self.initdata, unused_labels) = self.filter_data(self.initdata,
                                                          unused_labels)
        print 'Initialization data:', self.initdata.shape

    ########## Subselect by sol, if specified ##########
    # (endsol was previously compared with >= -1, which is always true
    # and selected nothing when endsol was left at its default)
    if startsol > -1 and endsol > -1:
        printt("Analyzing data from sols %d-%d only." % (startsol, endsol))
        current_sols = [i for (i, s) in enumerate(self.labels)
                        if (int(s.split('_')[0][3:]) >= startsol and
                            int(s.split('_')[0][3:]) <= endsol)]
        if initpriorsols:
            previous_sols = [i for (i, s) in enumerate(self.labels)
                             if int(s.split('_')[0][3:]) < startsol]
            printt("Putting previous sols' (before %d) data in initialization model." % startsol)
            # Concatenate initdata with data from all previous sols
            if len(self.initdata) > 0:
                self.initdata = np.hstack((self.initdata,
                                           self.data[:, previous_sols]))
            else:
                self.initdata = self.data[:, previous_sols]

        # Prune analysis data set to only include data from the sols of interest
        self.data = self.data[:, current_sols]
        self.labels = self.labels[current_sols]
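# The sol-range subselection above assumes labels of the form
# 'Sol<sol>_<sclk>_<target>...' as built by read_dir(). Parsing one
# hypothetical label in isolation:
label = 'Sol123_405371234_Rocknest:1_mean'
sol = int(label.split('_')[0][3:])   # 'Sol123' -> 123
print sol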
def plot_item(self, m, ind, x, r, k, label, U, rerr, feature_weights):
    """plot_item(self, m, ind, x, r, k, label, U, rerr, feature_weights)

    Plot selection m (index ind, data in x) and its reconstruction r,
    with k and label to annotate the plot.

    Use fancy ChemCam elemental annotations.

    If feature_weights are specified, omit any 0-weighted features
    from the plot.
    """
    if len(x) == 0 or len(r) == 0:
        print 'Error: No data in x and/or r.'
        return

    # Select the features to plot
    if len(feature_weights) > 0:
        goodfeat = [f for f in range(len(feature_weights))
                    if feature_weights[f] > 0]
    else:
        goodfeat = range(len(self.xvals))

    pylab.clf()
    # xvals, x, and r need to be column vectors
    pylab.plot(self.xvals[goodfeat], r[goodfeat], 'r-', linewidth=0.5)
    pylab.plot(self.xvals[goodfeat], x[goodfeat], 'k-', linewidth=1)
    # Boost font sizes for axis and tick labels
    pylab.xlabel(self.xlabel, fontsize=16)
    pylab.ylabel(self.ylabel, fontsize=16)
    pylab.xticks(fontsize=16)
    pylab.yticks(fontsize=16)
    pylab.title('DEMUD selection %d (%s), item %d, using K=%d' % \
                (m, label, ind, k))

    # Read in the emission bands
    emissions = {}
    with open('LIBS-elts-RCW-NL.txt') as f:
        for line in f:
            vals = line.strip().split()
            if len(vals) < 2:
                break
            (wave, elt) = vals
            emissions[wave] = elt

    # Get unique elements
    elts = list(set(emissions.values()))

    # Generate per-element colors
    colors = ['#ff0000', '#00ff00', '#0000ff', '#00ffff', '#ff00ff',
              '#ffff00', '#aa0000', '#00aa00', '#0000aa', '#00aaaa',
              '#aa00aa', '#aaaa00', '#550000', '#005500', '#000055',
              '#005555', '#550055', '#555500']
    elt_color = {}
    for (i, e) in enumerate(elts):
        elt_color[e] = colors[i % len(colors)]

    # Record the selection number
    outdir = os.path.join('results', self.name)
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    selfile = os.path.join(outdir, 'sels-%s.txt' % self.name)
    # Start a fresh file for the first selection; append afterwards
    with open(selfile, 'w' if m == 0 else 'a') as f:
        f.write('%d\n' % ind)

    res = x - r
    abs_res = np.absolute(res)
    mx = abs_res.max()
    mn = abs_res.min()
    if mn == mx and mx == 0:
        return

    sorted_abs_res = np.sort(abs_res, 0)
    frac_annotate = 0.004
    width = 8
    min_match_nm = 2
    # Annotate at least one band, even for very short spectra
    num_annotate = max(1, int(math.floor(frac_annotate * len(abs_res))))
    thresh = sorted_abs_res[-num_annotate]

    band_ind = (np.where(abs_res >= thresh)[0]).tolist()
    for band in band_ind:
        w = float(self.xvals[band])
        [b, elt] = LIBSData.find_closest(w, emissions, min_match_nm)
        reproj = r[band]
        if b == -1:
            printt('No match for %.2f nm (%f)' % (w, r[band]))
            # Draw the triangle using gray, but don't add to legend
            pylab.fill([w - width, w + width, w],
                       [reproj, reproj, x[band]],
                       '0.6', zorder=1, label='_nolegend_')
        else:
            sn = '+' if x[band] > reproj else '-'
            pylab.fill([w - width, w + width, w],
                       [reproj, reproj, x[band]],
                       elt_color[elt], zorder=2,
                       label='%s%s %.2f' % (sn, elt, w))

    pylab.legend(fontsize=8)

    figfile = '%s/%s-sel-%d.pdf' % (outdir, self.name, m)
    pylab.savefig(figfile)
    print 'Wrote plot to %s' % figfile
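# The annotation step above assumes 'LIBS-elts-RCW-NL.txt' holds
# whitespace-separated "wavelength element" pairs. Parsing one
# hypothetical line the same way the loop does:
line = '279.55 Mg'
(wave, elt) = line.strip().split()
print wave, elt   # 279.55 Mg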
def readin(self): """readin() Also read in segmentation map from segmapfile and average the data, re-storing it in reduced form in self.data. Set self.labels to record the segment ids. """ super(SegENVIData, self).readin() # data is wavelengths x pixels # Segmentation maps from SLIC are "raster scan, 32-bit float" (per Hua) # Actually, nicer to read them as ints. self.segmap = np.fromfile(self.segmapfile, dtype='int32', count=-1) if self.lines * self.samples != self.segmap.shape[0]: printt('Error: mismatch in number of pixels between image and segmap.') return goodbands = range(len(self.xvals)) # For AVIRIS data: if 'f970619' in self.name: printt('Removing known bad bands, assuming AVIRIS data.') # Per Hua's email of July 3, 2013, use a subset of good bands: # Indexing from 1: [10:100 116:150 180:216] # Subtract 1 to index from 0, but not to the end values # because range() is not inclusive of end goodbands = range(9,100) + range(115,150) + range(179,216) # For UCIS data: elif 'mars_yard' in self.name: printt('Removing known bad bands, assuming UCIS data.') # Per Hua's email of May 8, 2014, use a subset of good bands. # Exclude 1.4-1.9 um (per Diana). waterband_min = np.argmin([abs(x-1400) for x in self.xvals]) waterband_max = np.argmin([abs(x-1900) for x in self.xvals]) waterbands = range(waterband_min, waterband_max+1) # Based on Hua's visual examination, exclude bands # 1-6, 99-105, and 145-155. # Good bands are therefore 7-98, 106-144, and 156-maxband. # Subtract 1 to index from 0, but not to the end values # because range() is not inclusive of end maxband = len(self.xvals) goodbands = range(6,98) + range(105,144) + range(155,maxband) # Remove the water bands printt('Removing water absorption bands.') printt('%d good bands -> ' % len(goodbands)) goodbands = list(set(goodbands) - set(waterbands)) printt(' %d good bands' % len(goodbands)) self.data = self.data[goodbands,:] self.xvals = self.xvals[goodbands] #self.segmap = self.segmap.reshape(self.lines, self.samples) self.labels = np.unique(self.segmap) printt('Found %d segments.' % len(self.labels)) newdata = np.zeros((self.data.shape[0], len(self.labels))) for i, s in enumerate(self.labels): pixels = np.where(self.segmap == s)[0] #print '%d: %s: %d pixels' % (i, str(s), len(pixels)) # Compute and store the mean newdata[:,i] = self.data[:,pixels].mean(1) printt('Finished averaging the spectra.') # Update data with the averaged version self.data = newdata self.name = self.name + '-seg'
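# Segment averaging in isolation: for each unique segment id, average the
# spectra of its member pixels. A synthetic 3-band, 4-pixel example with
# two segments; this mirrors the loop above.
import numpy as np

data = np.array([[1.0, 3.0, 10.0, 20.0],
                 [1.0, 3.0, 10.0, 20.0],
                 [1.0, 3.0, 10.0, 20.0]])   # bands x pixels
segmap = np.array([7, 7, 9, 9])             # segment id per pixel
labels = np.unique(segmap)
newdata = np.zeros((data.shape[0], len(labels)))
for i, s in enumerate(labels):
    newdata[:, i] = data[:, np.where(segmap == s)[0]].mean(1)
print newdata[0]   # [  2.  15.]: per-segment means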
def read_dir(cls, dirname, outfile, shotnoisefilt=0):
    """read_dir(dirname, outfile)

    Read in raw LIBS data from .csv files in dirname.
    Pickle the result and save it to outfile.
    Note: does NOT update object fields.
    Follow this with a call to readin().
    """
    # First read in the target names and sol numbers.
    targets = {}
    sols = {}
    # Location of this file is hard-coded!
    # My latest version goes through sol 707.
    metafile = 'msl_ccam_obs.csv'
    with open(os.path.join(dirname, metafile)) as f:
        datareader = csv.reader(f)
        # Index targets, sols by spacecraft clock value
        for row in datareader:
            [sol, edr_type, sclk, target] = [row[i] for i in [0, 1, 2, 5]]
            if edr_type != 'CL5':
                continue
            prior_targets = [t for t in targets.values() if target in t]
            n_prior = len(prior_targets)
            # Add 1 so shots are indexed from 1, not 0
            targets[sclk] = target + ':%d' % (n_prior + 1)
            sols[sclk] = sol

    print 'Read %d target names from %s.' % (len(targets), metafile)
    print 'Now reading LIBS data from %s.' % dirname

    data = []
    labels = []
    wavelengths = []

    files = os.listdir(dirname)
    f_ind = 0
    # Select only CL5 CSV files
    for f in fnmatch.filter(files, 'CL5_*.csv') + \
             fnmatch.filter(files, 'cl5_*.csv'):
        # Extract the spacecraft clock and other fields from the filename
        filename = os.path.basename(f)
        printt(' Processing %s.' % filename)
        sclk = filename[4:13]
        site = filename[18:21]
        drive = filename[21:25]
        seqid = filename[29:34]
        target = targets[sclk]
        sol = sols[sclk]

        # If it's a cal target, skip it
        if 'Cal Target' in target:
            print 'Skipping %s' % target
            continue

        sol_sclk_target = 'Sol%s_%s_%s' % (sol, sclk, target)
        print(' Spacecraft clock %s, identifier %s.' % \
              (sclk, sol_sclk_target))

        with open(os.path.join(dirname, f), 'r') as csvfile:
            datareader = csv.reader(csvfile)
            lastrow = []
            row = datareader.next()
            while row[0][0] == '#':
                # Save the last row (comment line)
                lastrow = row
                row = datareader.next()

            # The last comment line contains the header strings
            # starting with 'wave' or 'nm'
            mylabels = [l.strip() for l in lastrow]
            mydata = [[float(x) for x in row]]
            for row in datareader:
                # Skip over empty lines
                if row[0] == '':
                    continue
                mydata += [[float(x) for x in row]]

        mydata = np.array(mydata)

        # Store the wavelengths
        waveind = [ind for (ind, name) in enumerate(mylabels)
                   if 'wave' in name]
        if len(waveind) != 1:
            printt('Expected 1 match on "wave"; got %d.' % len(waveind))
            sys.exit(1)
        mywaves = mydata[:, waveind[0]]

        # Keep only the mean of the shots
        shots = [ind for (ind, name) in enumerate(mylabels)
                 if 'mean' in name]
        myshotnames = ['%s_%s' % (sol_sclk_target, mylabels[shot])
                       for shot in shots]
        mydata = mydata[:, shots]

        printt('  Read %d new items, %d features.' % mydata.shape[::-1])

        if len(wavelengths) > 0 and np.any(wavelengths != mywaves):
            printt('Error: wavelengths in file %d do not match previous.'
                   % f_ind)

        if f_ind == 0:
            data = mydata
            wavelengths = mywaves
        else:
            data = np.concatenate((data, mydata), 1)
        labels += myshotnames
        f_ind = f_ind + 1
        printt('Total so far: %d items, %d files.' % (data.shape[1], f_ind))
        print

    if len(data) == 0:
        printt('No data files found, exiting.')
        sys.exit()

    printt('Read a total of %d items, %d features.' % data.shape[::-1])
    labels = np.array(labels)

    # Prune and normalize
    (data, wavelengths) = LIBSData.prune_and_normalize(data, wavelengths,
                                                       shotnoisefilt)

    printt('Saving to %s.' % outfile)
    outf = open(outfile, 'wb')  # binary mode for pickle
    pickle.dump((data, labels, wavelengths), outf)
    outf.close()
    print 'Done!'
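# Reading back the pickle written above, assuming the same three-tuple
# layout; readin() does the equivalent for the 'pkl' input type.
# The path is hypothetical.
import pickle

inf = open('libs-mean-norm.pkl', 'rb')
(data, labels, wavelengths) = pickle.load(inf)
inf.close()
print data.shape   # features x samples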
def plot_item(self, m, ind, x, r, k, label, U=[], scores=[], feature_weights=[]):
    """plot_item(self, m, ind, x, r, k, label, U, scores, feature_weights)

    Plot selection m (index ind, data in x) and its reconstruction r,
    with k and label to annotate the plot.

    Also show a spatial plot indicating where the selected pixel is
    and an abundance plot of similarity across the data set.

    U and scores are optional; ignored in this method, used in some
    classes' submethods.

    If feature_weights are specified, omit any 0-weighted features
    from the plot.
    """
    if len(x) == 0 or len(r) == 0:
        printt('Error: No data in x and/or r.')
        return

    (l, s) = [int(v) for v in label.split(',')]

    # Select the features to plot
    if len(feature_weights) > 0:
        goodfeat = [f for f in range(len(feature_weights))
                    if feature_weights[f] > 0]
    else:
        goodfeat = range(len(self.xvals))

    # Set up the subplots
    pylab.figure()
    pylab.subplots_adjust(wspace=0.05)

    # Plot #1: expected vs. observed feature vectors
    # xvals, x, and r need to be column vectors
    pylab.subplot(2, 2, 1)
    pylab.plot(self.xvals[goodfeat], r[goodfeat], 'r-', label='Expected')
    pylab.plot(self.xvals[goodfeat], x[goodfeat], 'b.-', label='Observations')
    pylab.ylim([0.0, max(1.0, x.max())])
    pylab.xlabel(self.xlabel)
    pylab.ylabel(self.ylabel)
    pylab.legend(fontsize=10, loc=2)

    # Plot #2: zoom of selected pixel, 20x20 context
    pylab.subplot(2, 2, 2)
    winwidth = 20
    minl = max(0, l - winwidth / 2)
    mins = max(0, s - winwidth / 2)
    maxl = min(self.lines, l + winwidth / 2)
    maxs = min(self.samples, s + winwidth / 2)
    rgb_data = self.get_RGB()
    pylab.imshow(rgb_data[minl:maxl, mins:maxs], interpolation='none')
    pylab.gca().add_patch(Rectangle((min(winwidth / 2, s) - 1,
                                     min(winwidth / 2, l) - 1),
                                    2, 2, fill=None, alpha=1))
    pylab.axis('off')
    pylab.title('Zoom')

    # Plot #3: spatial selection plot
    pylab.subplot(2, 2, 3)
    # Use alpha to lighten the RGB data
    pylab.imshow(rgb_data, interpolation='none', alpha=0.85)
    pylab.plot(s, l, 'x', markeredgewidth=2, scalex=False, scaley=False)
    pylab.axis('off')
    pylab.title('Selection')

    # Also update the priority map.
    self.pr_map[l, s] = m + 1
    n_tot = self.lines * self.samples
    n_pri = len(self.pr_map.nonzero()[0])
    n_unp = n_tot - n_pri
    printt('  %d prioritized; %d (%.2f%%) unprioritized remain' % \
           (n_pri, n_unp, n_unp * 100.0 / n_tot))

    # Plot #4: abundance map.
    # Compute spectral angle distance from selected x to all other items.
    abund = np.zeros((self.lines, self.samples))
    for l_ind in range(self.lines):
        for s_ind in range(self.samples):
            if l_ind == l and s_ind == s:
                abund[l_ind, s_ind] = 0
                continue
            d = self.data[:, l_ind * self.samples + s_ind]
            num = np.dot(x, d)
            denom = np.linalg.norm(x) * np.linalg.norm(d)
            if num > denom:  # ensure math.acos() doesn't freak out; clip to 1.0
                num = denom
            abund[l_ind, s_ind] = math.acos(num / denom)

            # Propagate current priority to similar items (not yet
            # prioritized). This threshold is subjectively chosen:
            # 0.10 for the Mars yard UCIS cube from Diana, 0.13 for the
            # micro-UCIS cubes from Bethany (see Evernote notes).
            if self.pr_map[l_ind, s_ind] == 0 and abund[l_ind, s_ind] <= 0.10:
                self.pr_map[l_ind, s_ind] = m + 1

    printt('Abundance: %g to %g' % (abund.min(), abund.max()))
    pylab.subplot(2, 2, 4)
    # Use colormap jet_r so smallest value is red and largest is blue
    pylab.imshow(abund, interpolation='none', cmap='jet_r', vmin=0, vmax=0.15)
    pylab.axis('off')
    pylab.title('Abundance')

    pylab.suptitle('DEMUD selection %d (%s), item %d, using K=%d' % \
                   (m, label, ind, k))

    # Write the plot to a file.
    outdir = os.path.join('results', self.name)
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    figfile = os.path.join(outdir, 'sel-%d-k-%d-(%s).pdf' % (m, k, label))
    pylab.savefig(figfile)
    print 'Wrote plot to %s' % figfile
    pylab.close()

    # Write the priority map to an image file.
    pylab.figure()
    # Start with colormap jet_r so smallest value is red and largest is blue.
    # max_c must be at least 2 and no greater than 255; values greater than
    # 255 are mapped to the last color. (Imposed because we then save this
    # out as an ENVI classification map with bytes. May want to be more
    # flexible in the future, but I can't imagine really wanting to see
    # more than 255 distinct colors.)
    max_c = 255 if m > 254 else m + 2
    max_c = 2 if max_c < 2 else max_c
    cmap = matplotlib.cm.get_cmap('jet_r', max_c)
    jet_map_v = cmap(np.arange(max_c))
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list('jet_map_white',
                                                               jet_map_v)
    pr_map_plot = np.copy(self.pr_map)
    # Set unprioritized items to one shade darker than most recent
    pr_map_plot[pr_map_plot == 0] = m + 2
    pylab.imshow(pr_map_plot, interpolation='none', cmap=cmap)
    prmapfig = os.path.join(outdir, 'prmap-k-%d.pdf' % k)
    pylab.savefig(prmapfig)
    if (m % 10) == 0:
        prmapfig = os.path.join(outdir, 'prmap-k-%d-m-%d.pdf' % (k, m))
        pylab.savefig(prmapfig)
    print 'Wrote priority map figure to %s (max_c %d)' % (prmapfig, max_c)
    pylab.close()

    # Write the priority map contents to a file as a 64-bit float map
    # (retained for backward compatibility for Hua,
    # but superseded by ENVI data file next)
    prmapfile = os.path.join(outdir, 'prmap-k-%d-hua.dat' % k)
    fid = open(prmapfile, 'wb')
    self.pr_map.astype('float64').tofile(fid)
    fid.close()
    print "Wrote Hua's priority map (data) to %s" % prmapfile

    # Write an ENVI data file and header file
    prmapfile = os.path.join(outdir, 'prmap-k-%d.dat' % k)
    fid = open(prmapfile, 'wb')
    self.pr_map.astype('uint8').tofile(fid)  # save out as bytes
    fid.close()

    # This is a class map header file
    prmaphdr = os.path.join(outdir, 'prmap-k-%d.dat.hdr' % k)
    fid = open(prmaphdr, 'w')
    fid.write('ENVI\n')
    fid.write('description = { DEMUD prioritization map }\n')
    fid.write('samples = %d\n' % self.samples)
    fid.write('lines = %d\n' % self.lines)
    fid.write('bands = 1\n')
    fid.write('header offset = 0\n')          # 0 bytes
    fid.write('file type = Classification\n')
    fid.write('data type = 1\n')              # byte (max 255 priorities)
    fid.write('interleave = bip\n')           # Irrelevant for single 'band'
    fid.write('byte order = 0\n')             # Least-significant byte first
    fid.write('classes = %d\n' % k)           # Number of classes
    # Classes include None (0) and then integers up to number of classes.
    fid.write("class names = {'None', " +
              ', '.join(["'%d'" % a for a in range(1, max_c)]) + '}\n')
    fid.write('class lookup = {' +
              ',\n '.join([' %d, %d, %d' % (r * 255, g * 255, b * 255)
                           for (r, g, b, a) in jet_map_v]) + ' }\n')
    fid.close()
    print 'Wrote ENVI data/header to priority map figure to %s[.hdr]' % prmapfile

    # Write the selections (spectra) in ASCII format
    selfile = os.path.join(outdir, 'selections-k%d.txt' % k)
    if m == 0:
        # First selection: open for write to clear out any previous run,
        # and output a header
        fid = open(selfile, 'w')
        fid.write('# Index, Score')
        for w in self.xvals.tolist():
            fid.write(', %.3f' % w)
        fid.write('\n')
    else:
        fid = open(selfile, 'a')
    # If scores is empty, the (first) selection was pre-specified,
    # so there are no scores. Output 0 for this item.
    if scores == []:
        fid.write('%d,0.0,' % m)
    else:
        fid.write('%d,%f,' % (m, scores[m]))
    # Now output the feature vector itself.
    # Have to reshape x because it's a 1D column vector
    np.savetxt(fid, x.reshape(1, x.shape[0]), fmt='%.5f', delimiter=',')
    fid.close()
def filter_data(self, data, labels):
    """filter_data(data, labels)

    Filter out bad quality data, using criteria provided by Nina Lanza:
    1) Large, broad features (don't correspond to narrow peaks)
    2) Low SNR

    (Per-item diagnostic plots of filtered items are currently disabled.)

    Returns updated (filtered) data and label arrays.
    """
    n = data.shape[1]
    newdata = data
    remove_ind = []
    printt('Filtering out data with large, broad features.')
    for i in range(n):
        waves = range(data.shape[0])
        this_data = data[waves, i]
        peak_ind = this_data.argmax()

        # Set min peak to examine as 15% of max
        min_peak = 0.15 * this_data[peak_ind]

        # Track red_waves: indices of bands that contribute to deciding
        # to filter out this item (if indeed it is).
        # These same wavelengths will be removed from further consideration
        # regardless of the filtering decision.
        red_waves = []

        # Iterate over peaks sufficiently big to be of interest
        while this_data[peak_ind] >= min_peak:
            red_waves = [waves[peak_ind]]

            # Set the low value to look for (indicates nice narrow peak)
            low_value = 0.1 * this_data[peak_ind]
            filter_item = True  # guilty until proven innocent

            # Note: band resolution/spacing is not the same for diff ranges?
            # Sweep left and right up to 400 bands (10 nm), looking for low_value
            min_wave_ind = peak_ind
            max_wave_ind = peak_ind
            for j in range(1, 401):
                min_wave_ind = max(min_wave_ind - 1, 0)
                max_wave_ind = min(max_wave_ind + 1, len(waves) - 1)
                red_waves += [waves[min_wave_ind]]
                red_waves += [waves[max_wave_ind]]
                # If there's a data gap, ignore it
                if ((self.xvals[waves[min_wave_ind] + 1] -
                     self.xvals[waves[min_wave_ind]]) > 1):
                    min_wave_ind += 1
                if ((self.xvals[waves[max_wave_ind]] -
                     self.xvals[waves[max_wave_ind] - 1]) > 1):
                    max_wave_ind -= 1
                # Stop if we've gone more than 10 nm
                if (((self.xvals[waves[peak_ind]] -
                      self.xvals[waves[min_wave_ind]]) > 10) or
                    ((self.xvals[waves[max_wave_ind]] -
                      self.xvals[waves[peak_ind]]) > 10)):
                    filter_item = True
                    break

                if this_data[min_wave_ind] <= low_value or \
                   this_data[max_wave_ind] <= low_value:
                    # Success! The signal drops off sharply to one side,
                    # so this is a narrow peak and the data is good.
                    filter_item = False
                    break

            # Remove the wavelengths we've considered
            [waves.remove(w) for w in red_waves if w in waves]

            if filter_item:
                # Filter the item out
                print 'Filter item %d (%s) due to [%.2f, %.2f] nm' % (
                    i, labels[i],
                    self.xvals[min(red_waves)],
                    self.xvals[max(red_waves)])
                # Record it for later removal
                remove_ind += [i]
                break
            else:
                # Keep going: update this_data to ignore
                # previously considered wavelengths
                this_data = data[waves, i]
                peak_ind = this_data.argmax()

    # Remove all filtered items
    newdata = np.array([data[:, i] for i in range(data.shape[1])
                        if i not in remove_ind]).T
    newlabels = np.array([labels[i] for i in range(len(labels))
                          if i not in remove_ind])
    printt(' ... from %d to %d items (%d removed).' %
           (n, newdata.shape[1], n - newdata.shape[1]))
    n = newdata.shape[1]

    printt('Filtering out low-SNR data.')
    # Filter out any item left that has a max peak value < 0.01
    # (these are normalized probabilities now)
    remove_ind = []
    for i in range(n):
        if max(newdata[:, i]) < 0.01:
            remove_ind += [i]

    # Remove all filtered items
    newdata = np.array([newdata[:, i] for i in range(newdata.shape[1])
                        if i not in remove_ind]).T
    newlabels = np.array([newlabels[i] for i in range(len(newlabels))
                          if i not in remove_ind])
    print ' ... from %d to %d items (%d removed).' % (
        n, newdata.shape[1], n - newdata.shape[1])

    return (newdata, newlabels)
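# The narrow-peak criterion in isolation: a peak passes if the signal falls
# below 10% of its height within ~10 nm to either side. A sharp synthetic
# line passes; a broad feature of the same height is filtered.
import numpy as np

x = np.arange(100.0)                              # pretend 1 band = 1 nm
narrow = np.exp(-0.5 * ((x - 50) / 1.0) ** 2)     # ~2 nm wide line
broad = np.exp(-0.5 * ((x - 50) / 20.0) ** 2)     # ~40 nm wide feature
for name, y in [('narrow', narrow), ('broad', broad)]:
    peak = y.argmax()
    low_value = 0.1 * y[peak]
    keep = (y[max(peak - 10, 0)] <= low_value or
            y[min(peak + 10, len(y) - 1)] <= low_value)
    print name, 'keep' if keep else 'filter'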