def reconstruct_audio(matches, distances, bounds, target_file, source_files, p, win, hop, beta=2.0):
    """
    Make a new audio signal based on matches and source media.

    inputs:
        matches  - matches from match_sequences; row i holds the matches
                   for target sequence i
        distances - distances from match_sequences, indexed [i, j] in step
                   with matches
        bounds - handed to _bounds_to_locator / _bounds_to_index to turn a
                   match into an index into source_files and a position
                   within that source
        target_file - file name of target media (the one to reconstruct)
        source_files - list of file names of source media (the database)
        p - feature parameters (only p['nhop'] is read here)
        win - sequence length
        hop - sequence hop
        beta - stiffness coefficient for mixing based on distances [2.0]
    returns:
        y - the reconstructed audio signal
    """
    nhop = p['nhop']
    # Rising half of a Hamming window, used to fade both ends of a frame.
    hamm = hamming(nhop * 2)[:nhop]
    y_list = []
    for i in range(len(matches)):
        x = _fetch_audio(target_file, i, p, win, hop)
        y = zeros((win * nhop))
        for j, m in enumerate(matches[i, :]):
            yy = _fetch_audio(source_files[_bounds_to_locator(m, bounds)],
                              _bounds_to_index(m, bounds), p, win, hop)
            # weight match contribution by distance prior
            y += yy * exp(-beta * distances[i, j])
        # Energy balance: give the output frame the RMS of the input frame.
        # A silent mix has zero RMS; scaling it would divide by zero and
        # fill the frame with NaN, so it is left as silence instead.
        y_rms = rms_flat(y)
        if y_rms > 0:
            y *= rms_flat(x) / y_rms
        if win > 1 and hop < win:
            # Overlapping frames: fade in the head and fade out the tail.
            y[:nhop] *= hamm
            y[:-nhop - 1:-1] *= hamm
        y_list.append(y)
    return _sequence_overlap_add(y_list, p, win, hop)
def verify_formation(self, html_writer, thermodynamics, name=None):
    """
    Write two HTML tables of prediction errors: per compound and per reaction.

    For every row returned by self.SelectRowsFromNist() the observed
    row_data.dG0_r is compared with
    row_data.PredictReactionEnergy(thermodynamics); rows whose prediction
    is NaN are skipped.  The error (observed - predicted) is attributed to
    every compound of the row and to the row's reaction.  Both tables are
    sorted by decreasing RMSE.

    Args:
        html_writer: object providing write_table(rowdicts, headers, decimal).
        thermodynamics: passed through to PredictReactionEnergy.
        name: unused; kept so existing callers keep working.
    """
    def summarize(err_list, refs):
        """Return the statistics and reference-link columns shared by both tables."""
        urls = ', '.join(['<a href="%s">%s</a>' % (url, ref_id)
                          for ref_id, url in refs])
        return {'RMSE': rms_flat(err_list),
                'E[err]': np.mean(err_list),
                '#err': len(err_list),
                'std[err]': np.std(err_list),
                'URLs': urls}

    cid2errors = defaultdict(list)
    cid2refs = defaultdict(set)
    reaction2errors = defaultdict(list)
    reaction2refs = defaultdict(set)

    for row_data in self.SelectRowsFromNist():
        dG0_est = row_data.PredictReactionEnergy(thermodynamics)
        if np.isnan(dG0_est):
            continue
        err = row_data.dG0_r - dG0_est
        reference = (row_data.ref_id, row_data.url)
        for cid in row_data.GetAllCids():
            cid2errors[cid].append(err)
            cid2refs[cid].add(reference)
        reaction2errors[row_data.reaction].append(err)
        reaction2refs[row_data.reaction].add(reference)

    # Per-compound table.  items() (rather than the Python-2-only
    # iteritems()) keeps this working on both Python 2 and 3.
    rowdicts = []
    for cid, err_list in cid2errors.items():
        rowdict = summarize(err_list, cid2refs[cid])
        rowdict['cid'] = 'C%05d' % cid
        rowdict['name'] = self.kegg.cid2name(cid)
        rowdicts.append(rowdict)
    rowdicts.sort(key=lambda row: row['RMSE'], reverse=True)
    html_writer.write_table(
        rowdicts,
        ['#', 'cid', 'name', 'RMSE', '#err', 'E[err]', 'std[err]', 'URLs'],
        decimal=1)

    # Per-reaction table.
    rowdicts = []
    for reaction, err_list in reaction2errors.items():
        rowdict = summarize(err_list, reaction2refs[reaction])
        rowdict['reaction'] = reaction.to_hypertext(show_cids=False)
        rowdicts.append(rowdict)
    rowdicts.sort(key=lambda row: row['RMSE'], reverse=True)
    html_writer.write_table(
        rowdicts,
        ['#', 'reaction', 'RMSE', '#err', 'E[err]', 'std[err]', 'URLs'],
        decimal=1)
def Report(self, est, title):
    """
    Write an HTML report comparing the estimates `est` with self.b.

    The report has a heading, a CDF figure of the absolute residuals over
    the finite estimates (its title shows the RMSE and the number of
    residuals), and a collapsible table with one row per column of self.b,
    sorted by decreasing absolute error.

    Args:
        est: estimates, indexed as est[0, i] in step with self.b[0, i].
        title: heading text written above the figure.
    """
    self.html_writer.write('</br><b>%s</b><br>\n' % title)

    finite = np.isfinite(est)
    resid = abs(self.b[finite] - est[finite])

    fig = plt.figure(figsize=(5, 5), dpi=60)
    cdf(list(resid.flat), figure=fig)
    # resid.size counts the residuals whether the boolean selection gave a
    # 1 x N matrix or a flat array; shape[1] only exists for the former.
    plt.title("RMSE = %.1f, N = %d" % (rms_flat(resid.flat), resid.size))
    plt.xlabel(r"$|\Delta_r G^{'\circ} obs - \Delta_r G^{'\circ} est|$ [kJ/mol]")
    plt.ylabel(r"CDF")
    self.html_writer.embed_matplotlib_figure(fig)

    rowdicts = []
    for i in range(self.b.shape[1]):
        observed = self.b[0, i]
        estimated = est[0, i]
        rowdicts.append({
            'row': i,
            'type': self.obs_types[i],
            'reaction': UnifiedGroupContribution.row2hypertext(
                self.S[:, i], self.cids),
            'anchored': self.anchored[0, i],
            'obs': observed,
            'est': estimated,
            # Non-finite estimates get error 0, which sorts them last.
            '|err|': abs(observed - estimated) if np.isfinite(estimated) else 0,
        })
    rowdicts.sort(key=lambda row: row['|err|'], reverse=True)

    self.html_writer.insert_toggle(start_here=True, label="Show table")
    self.html_writer.write_table(
        rowdicts,
        headers=['row', 'type', 'reaction', 'anchored', 'obs', 'est', '|err|'],
        decimal=1)
    self.html_writer.div_end()
def noise_brown(ncols, nrows=1, weight=1, filter=None, filterargs=None):
    '''Return 1/f^2 noise of shape (nrows, ncols), obtained by taking the
    cumulative sum of gaussian white noise, with rms `weight`.

    If filter is not None, this function will apply the filter coefficients
    obtained by:

    ::

        >>> b, a = filter(**filterargs)
        >>> signal = scipy.signal.lfilter(b, a, signal)

    Parameters
    ----------
    ncols : int
        Number of samples per row.
    nrows : int
        Number of independent noise rows.
    weight : float
        RMS value every returned row is scaled to.
    filter : callable or None
        Called as ``filter(**filterargs)``; must return ``(b, a)``.
    filterargs : dict or None
        Keyword arguments for `filter`; None means no arguments.

    Returns
    -------
    noise : ndarray of shape (nrows, ncols)
    '''
    if filter is not None:
        # filterargs defaults to None, which cannot be **-unpacked.
        coeff_b, coeff_a = filter(**(filterargs or {}))

    noise = np.empty((nrows, ncols))
    for i in range(nrows):
        # 10000 extra leading samples are generated and then discarded.
        signal = np.random.normal(size=ncols + 10000).cumsum()
        if filter is not None:
            signal = ss.lfilter(coeff_b, coeff_a, signal)
        row = signal[10000:]
        # RMS computed with numpy: matplotlib.mlab.rms_flat, used here
        # before, no longer exists in current Matplotlib releases.
        rms = np.sqrt(np.mean(np.abs(row) ** 2))
        noise[i, :] = row / rms * weight

    return noise
def noise_brown(ncols, nrows=1, weight=1, filter=None, filterargs=None):
    '''Return 1/f^2 noise of shape (nrows, ncols), obtained by taking the
    cumulative sum of gaussian white noise, with rms `weight`.

    If filter is not None, this function will apply the filter coefficients
    obtained by:

    ::

        >>> b, a = filter(**filterargs)
        >>> signal = scipy.signal.lfilter(b, a, signal)

    Parameters
    ----------
    ncols : int
        Number of samples per row.
    nrows : int
        Number of independent noise rows.
    weight : float
        RMS value every returned row is scaled to.
    filter : callable or None
        Called as ``filter(**filterargs)``; must return ``(b, a)``.
    filterargs : dict or None
        Keyword arguments for `filter`; None means no arguments.

    Returns
    -------
    noise : ndarray of shape (nrows, ncols)
    '''
    if filter is not None:
        # filterargs defaults to None, which cannot be **-unpacked.
        coeff_b, coeff_a = filter(**(filterargs or {}))

    noise = np.empty((nrows, ncols))
    for i in range(nrows):
        # 10000 extra leading samples are generated and then discarded.
        signal = np.random.normal(size=ncols + 10000).cumsum()
        if filter is not None:
            signal = ss.lfilter(coeff_b, coeff_a, signal)
        row = signal[10000:]
        # RMS computed with numpy: matplotlib.mlab.rms_flat, used here
        # before, no longer exists in current Matplotlib releases.
        rms = np.sqrt(np.mean(np.abs(row) ** 2))
        noise[i, :] = row / rms * weight

    return noise
def binned_plot(x, y, bins, y_type='mean', figure=None, plot_counts=True):
    """
    Draw a bar chart of a statistic of y, grouped by bins of x.

    The bin edges are min(x), the sorted values of `bins`, and max(x), so
    there are len(bins) + 1 bars.  Empty bins are drawn with height 0.

    Args:
        x: sequence of values used to assign each sample to a bin.
        y: sequence of values to summarise, indexed in step with x.
        bins: interior bin edges.
        y_type: 'mean', 'rmse' or 'std' - the statistic shown per bin.
        figure: matplotlib figure to use; a new one is created if falsy.
        plot_counts: if true, write the sample count above each
            non-empty bar.

    Raises:
        ValueError: if y_type is not one of the supported statistics.
    """
    reducers = {'mean': np.mean, 'rmse': rms_flat, 'std': np.std}
    if y_type not in reducers:
        raise ValueError("unknown y_type %r (expected 'mean', 'rmse' or 'std')"
                         % (y_type,))
    reduce_bin = reducers[y_type]

    n_bins = len(bins) + 1
    # The outer edges are nudged outwards so that min(x) and max(x) fall
    # strictly inside the first and last bin.
    bins_array = np.array([min(x) - 1e-14] + sorted(bins) + [max(x) + 1e-14])

    binned_y = {}
    for i, x_i in enumerate(x):
        below = np.nonzero(bins_array < x_i)[0]
        # The nudge can vanish in floating point for large |x|; clamp the
        # index so the extreme samples still land in the outermost bins.
        bin_index = min(below[-1], n_bins - 1) if len(below) else 0
        binned_y.setdefault(bin_index, []).append(y[i])

    y_count = []
    y_vec = []
    for j in range(n_bins):
        if j in binned_y:
            values = np.array(binned_y[j])
            y_count.append(len(values))
            y_vec.append(reduce_bin(values))
        else:
            y_count.append(0)
            y_vec.append(0.0)

    bin_width = bins_array[1:] - bins_array[0:-1]
    bin_center = (bins_array[1:] + bins_array[0:-1]) / 2

    if not figure:
        figure = plt.figure()
    # Figure.hold only exists in old Matplotlib releases.
    if hasattr(figure, 'hold'):
        figure.hold(True)
    # Positional left edges plus align='edge' place each bar on its bin in
    # every Matplotlib version (the `left=` keyword was renamed to `x`).
    plt.bar(bins_array[0:-1], y_vec, width=bin_width, align='edge',
            figure=figure)
    if plot_counts:
        for i in range(n_bins):
            if y_count[i] > 0:
                plt.text(bin_center[i], y_vec[i], '%d' % y_count[i],
                         horizontalalignment='center', fontsize='small')
def crest_factor(signal):
    """
    Return the crest factor of a 1D signal: peak magnitude divided by RMS.

    An RMS of exactly zero is replaced by 1e-6 so the division is defined.
    """
    peak = np.amax(np.absolute(signal))
    level = rms_flat(signal)
    divisor = .000001 if level == 0 else level
    return peak / divisor
def analyse_rotation(img, plot=False, line_spacing=False):
    """
    Estimate the rotation of the text lines in an image.

    The image is converted to grayscale, demeaned and passed through
    radon(); the projection (sinogram column) with the largest RMS is the
    "busiest" one, where the transform lines up with the alternating dark
    text and white gaps.

    Args:
        img: image accepted by cv2.cvtColor(..., cv2.COLOR_BGR2GRAY).
        plot: if true, print the rotation and show diagnostic plots.
        line_spacing: if true, print the line spacing (in pixels) estimated
            from the spectrum of the busiest projection.

    Returns:
        90 minus the index of the busiest projection.
        NOTE(review): this is degrees only if radon() samples one
        projection per degree - confirm against the radon in use.
    """
    # converting to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Demean; make the brightness extend above and below zero
    I = gray - mean(gray)

    with warnings.catch_warnings():
        # Some warning inside the radon
        warnings.simplefilter("ignore")
        sinogram = radon(I)

    with warnings.catch_warnings():
        # rms_flat is deprecated; silence its warning
        warnings.simplefilter("ignore")
        r = array([rms_flat(line) for line in sinogram.transpose()])

    # Keep the raw index separate from the reported rotation: the index is
    # what addresses the sinogram, 90 - index is what callers receive.
    busiest = argmax(r)
    rotation = 90 - busiest

    if line_spacing or plot:
        row = sinogram[:, busiest]
        N = len(row)

        # Take spectrum of busy row and find line spacing
        window = blackman(N)
        spectrum = rfft(row * window)
        frequency = argmax(abs(spectrum))

        if line_spacing:
            spacing = N / frequency  # pixels
            print('Line spacing: {:.2f} pixels'.format(spacing))

        if plot:
            print('Rotation: {:.2f} degrees'.format(rotation))
            plt.subplot(2, 2, 1)
            plt.imshow(I)
            plt.subplot(2, 2, 2)
            plt.imshow(sinogram.T, aspect='auto')
            plt.gray()
            # Rows of sinogram.T are projection indices, so mark the index.
            plt.axhline(busiest, color='r')
            plt.subplot(2, 2, 3)
            plt.plot(row)
            plt.plot(row * window)
            plt.subplot(2, 2, 4)
            plt.plot(abs(spectrum))
            plt.axvline(frequency, color='r')
            plt.yscale('log')
            plt.show()

    return rotation
def rms_spectrum_test(song='tainted', tuning_f0=110., channel=0):
    """
    Extract spectral RMS power for equal temperament pitches.

    inputs:
        song - directory name of song (contains: song/mix_000.wav and
               song/mix_100.wav, the non-autotuned and autotuned mixes)
        tuning_f0 - lowest frequency to track melody (110Hz = A440Hz/4) [110]
        channel - whether to use 0=left, 1=right, or 2=both channels [0]
    outputs:
        dict {'nontuned_rms': ..., 'autotuned_rms': ...} holding the RMS of
        the per-band energy curve of each mix at the ideal tuning freqs
    """
    def pick_channel(samples):
        """Reduce multi-channel audio to one channel; pass mono through."""
        if len(samples.shape) <= 1:
            return samples
        if channel == 2:
            return samples.mean(1)  # mix the channels
        return samples[:, channel]  # extract given channel

    untuned, sr, fmt = wavread(os.sep.join([song, 'mix_000.wav']))
    tuned, sr, fmt = wavread(os.sep.join([song, 'mix_100.wav']))
    untuned = pick_channel(untuned)
    tuned = pick_channel(tuned)

    # Short-time Fourier analysis
    stft_args = dict(nfft=8192, wfft=8192, nhop=2048)
    F0 = LinearFrequencySpectrum(untuned, **stft_args)
    F1 = LinearFrequencySpectrum(tuned, **stft_args)

    # Five octaves of equal-tempered semitones above tuning_f0, each mapped
    # to the nearest analysis bin of the first spectrum.
    eq_freqs = tuning_f0 * 2 ** (arange(0, 5, 1 / 12.))
    eq_bins = array([argmin(abs(F0._fftfrqs - f)) for f in eq_freqs])

    def band_rms(spectrum):
        """RMS over the second axis of the normalized spectrum at the ET bins."""
        return (normalize(spectrum.X)[eq_bins] ** 2).mean(1) ** 0.5

    df0 = band_rms(F0)
    df1 = band_rms(F1)

    figure()
    band_freqs = F0._fftfrqs[eq_bins]
    for curve in (df0, df1):
        semilogx(band_freqs, curve)
    legend(['Original vocals', 'Autotuned vocals'], loc=0)
    title(song + ': ET bands untuned/tuned vocals mixed with background',
          fontsize=20)
    xlabel('Equal Temperament Bands (Hz)', fontsize=20)
    ylabel('Power', fontsize=20)
    grid()
    return {'nontuned_rms': rms_flat(df0), 'autotuned_rms': rms_flat(df1)}
def compute_skew(cls, image):
    """
    Estimate the skew of `image` from its radon transform.

    The image is demeaned so the brightness extends above and below zero,
    then the projection with the largest RMS is located - the "busiest"
    rotation, where the transform lines up with the alternating dark text
    and white lines.

    Returns (90 - index of that projection) / 100.
    NOTE(review): the division by 100 is not explained by this code -
    confirm the unit callers expect.
    """
    centered = image - np.mean(image)
    sinogram = radon(centered)
    projection_rms = np.array(
        [rms_flat(projection) for projection in sinogram.transpose()])
    busiest = np.argmax(projection_rms)
    return (90 - busiest) / 100
def rms_spectrum_test(song='tainted', tuning_f0=110., channel=0):
    """
    Extract spectral RMS power for equal temperament pitches.

    inputs:
        song - directory name of song (contains: song/mix_000.wav and
               song/mix_100.wav, the non-autotuned and autotuned mixes)
        tuning_f0 - lowest frequency to track melody (110Hz = A440Hz/4) [110]
        channel - whether to use 0=left, 1=right, or 2=both channels [0]
    outputs:
        dict {'nontuned_rms': ..., 'autotuned_rms': ...} holding the RMS of
        the per-band energy curve of each mix at the ideal tuning freqs
    """
    def pick_channel(samples):
        """Reduce multi-channel audio to one channel; pass mono through."""
        if len(samples.shape) <= 1:
            return samples
        if channel == 2:
            return samples.mean(1)  # mix the channels
        return samples[:, channel]  # extract given channel

    untuned, sr, fmt = wavread(os.sep.join([song, 'mix_000.wav']))
    tuned, sr, fmt = wavread(os.sep.join([song, 'mix_100.wav']))
    untuned = pick_channel(untuned)
    tuned = pick_channel(tuned)

    # Short-time Fourier analysis
    stft_args = dict(nfft=8192, wfft=8192, nhop=2048)
    F0 = LinearFrequencySpectrum(untuned, **stft_args)
    F1 = LinearFrequencySpectrum(tuned, **stft_args)

    # Five octaves of equal-tempered semitones above tuning_f0, each mapped
    # to the nearest analysis bin of the first spectrum.
    eq_freqs = tuning_f0 * 2 ** (arange(0, 5, 1 / 12.))
    eq_bins = array([argmin(abs(F0._fftfrqs - f)) for f in eq_freqs])

    def band_rms(spectrum):
        """RMS over the second axis of the normalized spectrum at the ET bins."""
        return (normalize(spectrum.X)[eq_bins] ** 2).mean(1) ** 0.5

    df0 = band_rms(F0)
    df1 = band_rms(F1)

    figure()
    band_freqs = F0._fftfrqs[eq_bins]
    for curve in (df0, df1):
        semilogx(band_freqs, curve)
    legend(['Original vocals', 'Autotuned vocals'], loc=0)
    title(song + ': ET bands untuned/tuned vocals mixed with background',
          fontsize=20)
    xlabel('Equal Temperament Bands (Hz)', fontsize=20)
    ylabel('Power', fontsize=20)
    grid()
    return {'nontuned_rms': rms_flat(df0), 'autotuned_rms': rms_flat(df1)}
def Report(self, est, title):
    """
    Write an HTML report comparing the estimates `est` with self.b.

    The report has a heading, a CDF figure of the absolute residuals over
    the finite estimates (its title shows the RMSE and the number of
    residuals), and a collapsible table with one row per column of self.b,
    sorted by decreasing absolute error.

    Args:
        est: estimates, indexed as est[0, i] in step with self.b[0, i].
        title: heading text written above the figure.
    """
    self.html_writer.write('</br><b>%s</b><br>\n' % title)

    finite = np.isfinite(est)
    resid = abs(self.b[finite] - est[finite])

    fig = plt.figure(figsize=(5, 5), dpi=60)
    cdf(list(resid.flat), figure=fig)
    # resid.size counts the residuals whether the boolean selection gave a
    # 1 x N matrix or a flat array; shape[1] only exists for the former.
    plt.title("RMSE = %.1f, N = %d" % (rms_flat(resid.flat), resid.size))
    plt.xlabel(r"$|\Delta_r G^{'\circ} obs - \Delta_r G^{'\circ} est|$ [kJ/mol]")
    plt.ylabel(r"CDF")
    self.html_writer.embed_matplotlib_figure(fig)

    rowdicts = []
    for i in range(self.b.shape[1]):
        observed = self.b[0, i]
        estimated = est[0, i]
        rowdicts.append({
            'row': i,
            'type': self.obs_types[i],
            'reaction': UnifiedGroupContribution.row2hypertext(
                self.S[:, i], self.cids),
            'anchored': self.anchored[0, i],
            'obs': observed,
            'est': estimated,
            # Non-finite estimates get error 0, which sorts them last.
            '|err|': abs(observed - estimated) if np.isfinite(estimated) else 0,
        })
    rowdicts.sort(key=lambda row: row['|err|'], reverse=True)

    self.html_writer.insert_toggle(start_here=True, label="Show table")
    self.html_writer.write_table(
        rowdicts,
        headers=['row', 'type', 'reaction', 'anchored', 'obs', 'est', '|err|'],
        decimal=1)
    self.html_writer.div_end()
def main(argv):
    """
    Print the rotation of the text lines in an image file.

    Command line: rotation.py -f <filename>   (also --file=<filename>)

    The image is loaded as grayscale, demeaned and radon-transformed; the
    projection with the largest RMS is the "busiest" rotation, where the
    transform lines up with the alternating dark text and white lines.
    The printed value is -(90 - index of that projection), two decimals.

    Args:
        argv: command-line arguments without the program name.
    """
    usage = 'Usage: rotation.py -f <filename>'
    try:
        opts, _ = getopt.getopt(argv, "hf:", ["file="])
    except getopt.GetoptError:
        print('rotation.py -f <filename>')
        sys.exit(2)

    filename = ''
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-f", "--file"):
            filename = arg

    # Decide on the parsed result rather than on len(sys.argv): the
    # one-token forms --file=<name> and -f<name> are valid, and argv need
    # not be sys.argv when main() is called from other code.
    if not filename:
        print(usage)
        sys.exit()

    # Prefer sub-sample peak interpolation when the optional `parabolic`
    # module is available; otherwise use numpy's integer argmax.
    from numpy import argmax as integer_argmax
    try:
        from parabolic import parabolic
    except ImportError:
        argmax = integer_argmax
    else:
        def argmax(x):
            return parabolic(x, integer_argmax(x))[0]

    # Load file, converting to grayscale
    I = asarray(Image.open(filename).convert('L'))
    # Demean; make the brightness extend above and below zero
    I = I - mean(I)

    # Do the radon transform
    sinogram = radon(I)

    # Find the RMS value of each row and find "busiest" rotation
    r = array([rms_flat(line) for line in sinogram.transpose()])
    rotation = argmax(r)
    print('{:.2f}'.format(-(90 - rotation)))
def verify_formation(self, html_writer, thermodynamics, name=None):
    """
    Write two HTML tables of prediction errors: per compound and per reaction.

    For every row returned by self.SelectRowsFromNist() the observed
    row_data.dG0_r is compared with
    row_data.PredictReactionEnergy(thermodynamics); rows whose prediction
    is NaN are skipped.  The error (observed - predicted) is attributed to
    every compound of the row and to the row's reaction.  Both tables are
    sorted by decreasing RMSE.

    Args:
        html_writer: object providing write_table(rowdicts, headers, decimal).
        thermodynamics: passed through to PredictReactionEnergy.
        name: unused; kept so existing callers keep working.
    """
    def summarize(err_list, refs):
        """Return the statistics and reference-link columns shared by both tables."""
        urls = ', '.join(['<a href="%s">%s</a>' % (url, ref_id)
                          for ref_id, url in refs])
        return {'RMSE': rms_flat(err_list),
                'E[err]': np.mean(err_list),
                '#err': len(err_list),
                'std[err]': np.std(err_list),
                'URLs': urls}

    cid2errors = defaultdict(list)
    cid2refs = defaultdict(set)
    reaction2errors = defaultdict(list)
    reaction2refs = defaultdict(set)

    for row_data in self.SelectRowsFromNist():
        dG0_est = row_data.PredictReactionEnergy(thermodynamics)
        if np.isnan(dG0_est):
            continue
        err = row_data.dG0_r - dG0_est
        reference = (row_data.ref_id, row_data.url)
        for cid in row_data.GetAllCids():
            cid2errors[cid].append(err)
            cid2refs[cid].add(reference)
        reaction2errors[row_data.reaction].append(err)
        reaction2refs[row_data.reaction].add(reference)

    # Per-compound table.  items() (rather than the Python-2-only
    # iteritems()) keeps this working on both Python 2 and 3.
    rowdicts = []
    for cid, err_list in cid2errors.items():
        rowdict = summarize(err_list, cid2refs[cid])
        rowdict['cid'] = 'C%05d' % cid
        rowdict['name'] = self.kegg.cid2name(cid)
        rowdicts.append(rowdict)
    rowdicts.sort(key=lambda row: row['RMSE'], reverse=True)
    html_writer.write_table(
        rowdicts,
        ['#', 'cid', 'name', 'RMSE', '#err', 'E[err]', 'std[err]', 'URLs'],
        decimal=1)

    # Per-reaction table.
    rowdicts = []
    for reaction, err_list in reaction2errors.items():
        rowdict = summarize(err_list, reaction2refs[reaction])
        rowdict['reaction'] = reaction.to_hypertext(show_cids=False)
        rowdicts.append(rowdict)
    rowdicts.sort(key=lambda row: row['RMSE'], reverse=True)
    html_writer.write_table(
        rowdicts,
        ['#', 'reaction', 'RMSE', '#err', 'E[err]', 'std[err]', 'URLs'],
        decimal=1)
def main(argv):
    """
    Print the rotation of the text lines in an image file.

    Command line: rotation_spacing.py -f <filename>   (also --file=<filename>)

    The image is loaded as grayscale, demeaned and radon-transformed; the
    projection with the largest RMS is the "busiest" rotation, where the
    transform lines up with the alternating dark text and white lines.
    The printed value is -(90 - index of that projection), two decimals.
    The line spacing is then estimated from the spectrum of that
    projection.

    Args:
        argv: command-line arguments without the program name.
    """
    usage = 'Usage: rotation_spacing.py -f <filename>'
    try:
        opts, _ = getopt.getopt(argv, "hf:", ["file="])
    except getopt.GetoptError:
        print('rotation_spacing.py -f <filename>')
        sys.exit(2)

    filename = ''
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-f", "--file"):
            filename = arg

    # Decide on the parsed result rather than on len(sys.argv): the
    # one-token forms --file=<name> and -f<name> are valid, and argv need
    # not be sys.argv when main() is called from other code.
    if not filename:
        print(usage)
        sys.exit()

    # More accurate peak finding from
    # https://gist.github.com/endolith/255291#file-parabolic-py
    # when that optional module is available; numpy's argmax otherwise.
    from numpy import argmax as integer_argmax
    try:
        from parabolic import parabolic
    except ImportError:
        argmax = integer_argmax
    else:
        def argmax(x):
            return parabolic(x, integer_argmax(x))[0]

    # Load file, converting to grayscale
    I = asarray(Image.open(filename).convert('L'))
    # Demean; make the brightness extend above and below zero
    I = I - mean(I)

    # Do the radon transform
    sinogram = radon(I)

    # Find the RMS value of each row and find "busiest" rotation
    r = array([rms_flat(line) for line in sinogram.transpose()])
    rotation = argmax(r)
    print('{:.2f}'.format(-(90 - rotation)))

    # The interpolated peak position may be fractional; an array index
    # must be an integer, so use the nearest projection.
    row = sinogram[:, int(round(rotation))]
    N = len(row)

    # Take spectrum of busy row and find line spacing
    window = blackman(N)
    spectrum = rfft(row * window)
    frequency = argmax(abs(spectrum))
    # NOTE(review): the spacing is computed but neither printed nor
    # returned - confirm whether it should be reported.
    line_spacing = N / frequency  # pixels
def main():
    """
    Fit formation energies to the NIST reaction data with a few anchored
    compounds, and write the results as HTML, CSV and LaTeX.

    Steps:
      1. Define the anchored ("fixed") compounds and read, per compound,
         the most abundant pseudoisomer from the Alberty table.
      2. Build the stoichiometric matrix S, the observations dG0 and the
         compound list with NistRegression.ReverseTransform, caching them
         in ../res/prc_*.txt; an existing cache is loaded instead.
      3. Solve the least-squares problem with the anchors held fixed.
      4. Report which compounds are fully determined, compare with
         Alberty, and express every free compound in terms of the sparse
         kernel of the reduced stoichiometric matrix.

    Outputs: ../res/regression_fast.html, ../res/prc_results.csv and
    ../res/prc_latex.txt.
    """
    kegg = Kegg.getInstance()
    prefix = '../res/prc_'

    # a dictionary from CID to pairs of (nH, dG0)
    fixed_cids = {}

    # Alberty formation energies directly measured, linearly independent:
    fixed_cids[1] = (2, -237.19)    # H2O
    fixed_cids[9] = (1, -1096.1)    # HPO3(-2)
    fixed_cids[14] = (4, -79.31)    # NH4(+1)
    fixed_cids[59] = (0, -744.53)   # SO4(-2)
    fixed_cids[288] = (1, -586.77)  # HCO3(-1)

    # Alberty zeros:
    fixed_cids[3] = (26, 0.0)       # NAD(ox)
    fixed_cids[10] = (32, 0.0)      # CoA
    fixed_cids[127] = (30, 0.0)     # glutathione(ox)
    fixed_cids[376] = (28, 0.0)     # retinal(ox)

    # Directly measured values
    fixed_cids[4] = (27, 22.65)     # NAD(red) -- relative to NAD(ox)
    fixed_cids[212] = (13, -194.5)  # adenosine
    # Not anchored: inosine (C00294: 12, -409.2) is linearly dependent on
    # other anchors.  Alberty zeros which are not in NIST: cytochrome c(ox)
    # C00524, FAD(ox) C00016, ferredoxin(ox) C00139, FMN(ox) C00061,
    # thioredoxin(ox) C00343, ubiquinone(ox) C00399.

    public_db = SqliteDatabase("../data/public_data.sqlite")
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(
        public_db, 'alberty_pseudoisomers', label=None, name='Alberty')
    alberty_cid2dG0 = {}
    alberty_cid2nH = {}
    for cid in alberty.get_all_cids():
        pmap = alberty.cid2PseudoisomerMap(cid)
        dG0, _dG0_tag, nH, _z, _nMg = pmap.GetMostAbundantPseudoisomer(
            pH=default_pH, I=default_I, pMg=default_pMg, T=default_T)
        alberty_cid2nH[cid] = nH
        alberty_cid2dG0[cid] = dG0

    if not os.path.exists(prefix + 'S.txt'):
        db = SqliteDatabase("../res/gibbs.sqlite")
        nist_regression = NistRegression(db)

        # nH per compound: anchors first, then Alberty, then the
        # dissociation table, and 0 as a last resort.
        cid2nH = {}
        for cid in nist_regression.nist.GetAllCids():
            if cid in fixed_cids:
                cid2nH[cid] = fixed_cids[cid][0]
            elif cid in alberty_cid2nH:
                cid2nH[cid] = alberty_cid2nH[cid]
            else:
                tmp = nist_regression.dissociation.GetMostAbundantPseudoisomer(
                    cid, pH=default_pH, I=default_I, pMg=default_pMg,
                    T=default_T)
                if tmp is not None:
                    cid2nH[cid] = tmp[0]
                else:
                    logging.warning(
                        'The most abundant pseudoisomer of %s (C%05d) '
                        'cannot be resolved. Using nH = 0.' %
                        (kegg.cid2name(cid), cid))
                    cid2nH[cid] = 0

        S, dG0, cids = nist_regression.ReverseTransform(cid2nH=cid2nH)

        # export the raw data matrices to text files
        C = np.array([[cid, cid2nH.get(cid, 0)] for cid in cids])
        np.savetxt(prefix + 'CID.txt', C, fmt='%d', delimiter=',')
        np.savetxt(prefix + 'S.txt', S, fmt='%g', delimiter=',')
        np.savetxt(prefix + 'dG0.txt', dG0, fmt='%.2f', delimiter=',')
    else:
        C = np.loadtxt(prefix + 'CID.txt', delimiter=',')
        cids = [int(cid) for cid in C[:, 0]]
        cid2nH = {}
        for i, cid in enumerate(cids):
            cid2nH[cid] = int(C[i, 1])
        S = np.loadtxt(prefix + 'S.txt', delimiter=',')
        dG0 = np.loadtxt(prefix + 'dG0.txt', delimiter=',')
        # turn the loaded values into a column vector
        dG0 = np.reshape(dG0, (dG0.shape[0], 1))

    html_writer = HtmlWriter('../res/regression_fast.html')
    html_writer.write("<h1>Pseudoisomeric Reactant Contributions</h1>\n")
    html_writer.write("<p>The stoichiometric matrix (S):")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, S, cids)
    html_writer.div_end()
    html_writer.write('</p>')

    # Map the column index of every anchored compound to its fixed dG0.
    # cids.index raises ValueError if an anchor is missing from the data.
    index2value = {}
    for cid in fixed_cids.keys():
        i = cids.index(cid)
        _nH, dG0_fixed = fixed_cids[cid]
        index2value[i] = dG0_fixed

    x, _K = LinearRegression.LeastSquaresWithFixedPoints(S, dG0, index2value)
    cid2dG0 = {}
    for i, cid in enumerate(cids):
        cid2dG0[cid] = x[i]

    # Calculate the Kernel of the reduced stoichiometric matrix (after
    # removing the columns of the fixed compounds).
    cids_red = [cid for cid in cids if cid not in fixed_cids]
    index_red = [i for i in range(len(cids)) if i not in index2value]
    S_red = S[:, index_red]
    K_red = LinearRegression.Kernel(S_red)

    # Find all CIDs that are completely determined and do not depend on
    # any free variable, i.e. all zero-columns of the reduced kernel.
    determined_indices = np.where(np.sum(abs(K_red), 0) < 1e-10)[0]
    determined_cids = set(cids_red[i] for i in determined_indices)

    dict_list = []
    plot_data = []
    for cid in cids:
        d = {
            'CID': 'C%05d' % cid,
            'Compound': kegg.cid2name(cid),
            'nH': '%d' % cid2nH[cid],
            'dG0 (PRC)': '%.1f' % cid2dG0[cid],
        }
        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
            if cid not in fixed_cids:
                plot_data.append(
                    (alberty_cid2dG0[cid], cid2dG0[cid], kegg.cid2name(cid)))
        else:
            d['dG0 (Alberty)'] = ''

        if cid in fixed_cids:
            d['Depends on'] = 'anchored'
        elif cid in determined_cids:
            d['Depends on'] = 'fixed compounds'
        else:
            d['Depends on'] = 'kernel dimensions'
        dict_list.append(d)

    dict_list.sort(key=lambda row: (row['Depends on'], row['CID']))
    html_writer.write(
        "<p>Formation energies determined by the linear constraints:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(
        dict_list,
        headers=['#', 'Compound', 'CID', 'nH', 'dG0 (PRC)', 'dG0 (Alberty)',
                 'Depends on'])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    # Plot a comparison between PRC and Alberty formation energies
    fig = plt.figure(figsize=(8, 8), dpi=80)
    plt.plot([point[0] for point in plot_data],
             [point[1] for point in plot_data], 'b.', figure=fig)
    for alberty_dG0, prc_dG0, compound_name in plot_data:
        plt.text(alberty_dG0, prc_dG0, compound_name, fontsize=6)
    # raw strings: the backslashes belong to the label markup
    plt.xlabel(r'Alberty $\Delta_f G^\circ$')
    plt.ylabel(r'PRC $\Delta_f G^\circ$')
    html_writer.write("<p>Plot comparing PRC and Alberty results:")
    html_writer.insert_toggle(start_here=True)
    html_writer.embed_matplotlib_figure(fig)
    html_writer.div_end()
    html_writer.write("</p>")

    K_sparse = SparseKernel(S_red).Solve()
    html_writer.write(
        "<p>The sparse null-space of the reduced stoichiometric matrix:")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, K_sparse, cids_red)
    html_writer.div_end()
    html_writer.write("</p>")

    # Express every non-anchored compound as its fitted value plus a
    # combination of the kernel vectors V0, V1, ...
    dict_list = []
    n_vectors = K_sparse.shape[0]
    index2string_html = dict(
        (i, "V<sub>%02d</sub>" % i) for i in range(n_vectors))
    index2string = dict((i, "V%d" % i) for i in range(n_vectors))
    for i, cid in enumerate(cids_red):
        d = {}
        d['KEGG ID'] = '<a href="%s">C%05d</a>' % (kegg.cid2link(cid), cid)
        d['KEGG ID plain'] = 'C%05d' % cid
        d['Compound'] = kegg.cid2name(cid)
        d['nH'] = '%d' % cid2nH[cid]

        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
        else:
            d['dG0 (Alberty)'] = ''

        d['dG0 (PRC)'] = '%.1f' % cid2dG0[cid]
        d['dG0 (PRC) plain'] = '%.1f' % cid2dG0[cid]

        # Sort key: which kernel vectors touch this compound, last first.
        indic = np.where(abs(K_sparse[:, i]) > 1e-10, 1, 0).tolist()
        indic.reverse()
        d['order_key'] = indic
        if mlab.rms_flat(K_sparse[:, i]) > 1e-10:
            d['dG0 (PRC)'] += " + (" + vector2string(
                K_sparse[:, i], index2string_html) + ")"
            d['dG0 (PRC) plain'] += " + (" + vector2string(
                K_sparse[:, i], index2string) + ")"
        dict_list.append(d)

    dict_list.sort(key=lambda row: (row['order_key'], row['KEGG ID plain']))

    # Export the results to CSV; the with-block closes (and flushes) the
    # file, which was previously left open.
    with open('../res/prc_results.csv', 'w') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(
            ['KEGG ID', 'Compound', 'nH', 'dG0 (PRC)', 'dG0 (Alberty)'])
        for d in dict_list:
            csv_writer.writerow([
                d['KEGG ID plain'], d['Compound'], d['nH'],
                d['dG0 (PRC) plain'], d['dG0 (Alberty)']
            ])

    html_writer.write(
        "<p>All formation energies as a function of the free variables:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(
        dict_list,
        headers=['#', 'KEGG ID', 'Compound', 'nH', 'dG0 (PRC)',
                 'dG0 (Alberty)'])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    with open('../res/prc_latex.txt', 'w') as fp:
        fp.write(
            latex.table2LaTeX(
                dict_list,
                headers=['#', 'KEGG ID plain', 'Compound', 'nH',
                         'dG0 (PRC) plain', 'dG0 (Alberty)']))
if LinearRegression.MatrixRank(self.K) < self.dimension + 1: self.K[self.dimension, :] = 0 else: # normalize the kernel vector so that it will have nice coefficients g = min(abs(coeffs[nonzero_indices])) self.K[self.dimension, :] /= g #if sum(self.K[:, self.dimension] < 0.0): # self.K[:, self.dimension] *= -1.0 v = self.K[self.dimension, :] self.AddLinearConstraint(v) self.dimension += 1 return v def Solve(self): if self.dimension == 0: for _ in self: pass return self.K if __name__ == '__main__': A = np.array([[1, 0, 1, 1, 2, 1, 1], [0, 1, 1, 1, 2, 1, 1], [1, 1, 2, 2, 4, 2, 2]]) K = SparseKernel(A) print A for v in K: print "nullvector: ", ', '.join(['%g' % x for x in v]) print "RMS(A*K.T) =", mlab.rms_flat(np.dot(A, K.Solve().T))
def AnalyzeTrainingSet(self, skip_formations=True):
    """
    Report the goodness of fit and a leave-one-out cross validation.

    For every observation that is not of type acid-base, Mg or redox (and,
    when skip_formations is true, not a formation observation) the fit
    residual is recorded and the regression is repeated without that
    observation to get a leave-one-out (LOO) estimate.  If removing the
    observation enlarges the null space, it is not determined by the
    others and its LOO columns stay NaN.

    Written to self.html_writer: the two RMSE values, a table sorted by
    decreasing |LOO residual|, and a scatter plot of residual against
    observed value on which points with |residual| > 2 * RMSE are labelled.

    Args:
        skip_formations: leave formation-energy observations out.
    """
    n_obs = self.group_matrix.shape[1]
    rowdicts = []
    fit_results = np.dot(self.group_contributions, self.group_matrix)
    residuals = fit_results - self.obs_values

    if self.transformed:
        sym = symbol_d_G0_prime
    else:
        sym = symbol_d_G0
    loo_fit_key = 'LOO ' + sym + ' (fit)'
    loo_res_key = 'LOO ' + sym + ' (res)'

    skipped_types = [KeggObservation.TYPE_ACID_BASE,
                     KeggObservation.TYPE_MG,
                     KeggObservation.TYPE_REDOX]
    for i in range(n_obs):
        if self.obs_types[i] in skipped_types:
            continue
        if skip_formations and self.obs_types[i] == KeggObservation.TYPE_FORMATION:
            continue

        # The row is appended right away and completed in place below, so
        # it stays in the table even when the LOO estimate is unavailable.
        rowdict = {'Observation': self.obs_ids[i]}
        rowdict[sym + ' (obs)'] = self.obs_values[0, i]
        rowdict[sym + ' (fit)'] = fit_results[0, i]
        rowdict[sym + ' (res)'] = residuals[0, i]
        rowdict[loo_fit_key] = np.nan
        rowdict[loo_res_key] = np.nan
        rowdict['sortkey'] = 0
        rowdicts.append(rowdict)
        logging.info('Fit Error = %.1f' % residuals[0, i])

        # leave out the column corresponding with observation 'i'
        logging.info('Cross validation, leaving-one-out: ' + self.obs_ids[i])
        subset = [k for k in range(n_obs) if k != i]
        loo_group_contributions, loo_nullspace = LinearRegression.LeastSquares(
            self.group_matrix[:, subset], self.obs_values[:, subset])

        if loo_nullspace.shape[1] > self.group_nullspace.shape[1]:
            logging.warning('example %d is not linearly dependent in the other examples' % i)
            continue

        rowdict[loo_fit_key] = float(
            np.dot(loo_group_contributions, self.group_matrix[:, i]))
        rowdict[loo_res_key] = rowdict[loo_fit_key] - self.obs_values[0, i]
        rowdict['sortkey'] = abs(rowdict[loo_res_key])
        logging.info('LOO Error = %.1f' % rowdict[loo_res_key])

    logging.info("writing the table of estimation errors for each compound")
    self.html_writer.write('</br><b>Cross validation table</b>')
    self.html_writer.insert_toggle(start_here=True)
    self.html_writer.write('<font size="1">\n')

    obs_vec = np.matrix([row[sym + ' (obs)'] for row in rowdicts])
    resid_vec = np.matrix([row[sym + ' (res)'] for row in rowdicts])
    rmse = rms_flat(resid_vec.flat)

    # Only observations with a LOO estimate (finite residual) count.
    loo_resid_vec = np.matrix([row[loo_res_key] for row in rowdicts])
    loo_rmse = rms_flat(loo_resid_vec[np.isfinite(loo_resid_vec)].flat)

    self.html_writer.write_ul(['fit RMSE = %.1f [kJ/mol]' % rmse,
                               'leave-one-out RMSE = %.1f [kJ/mol]' % loo_rmse])
    logging.info("Goodness of fit: RMSE = %.1f [kJ/mol]" % rmse)
    logging.info("Leave-one-out test: RMSE = %.1f [kJ/mol]" % loo_rmse)

    headers = ['Observation',
               sym + ' (obs)',
               sym + ' (fit)',
               sym + ' (res)',
               loo_fit_key,
               loo_res_key]
    rowdicts.sort(key=lambda row: row['sortkey'], reverse=True)
    self.html_writer.write_table(rowdicts, headers, decimal=1)
    self.html_writer.write('</font>\n')
    self.html_writer.div_end()

    self.html_writer.write('</br><b>Cross-validation figure</b>')
    self.html_writer.insert_toggle(start_here=True)

    obs_vs_err_fig = plt.figure(figsize=[6.0, 6.0], dpi=100)
    plt.plot(obs_vec.T, resid_vec.T, '.')
    plt.xlabel('Observation')
    plt.ylabel('Estimated (PGC) Residuals')
    # pyplot.hold only exists in old Matplotlib releases.
    if hasattr(plt, 'hold'):
        plt.hold(True)
    for row in rowdicts:
        if abs(row[sym + ' (res)']) > 2 * rmse:
            plt.text(row[sym + ' (obs)'], row[sym + ' (res)'],
                     row['Observation'], fontsize=4, figure=obs_vs_err_fig)
    plt.title('Observed vs. Fitted (PGC) Residuals', figure=obs_vs_err_fig)
    self.html_writer.embed_matplotlib_figure(obs_vs_err_fig)
    self.html_writer.div_end()
def two_way_comparison(html_writer, thermo1, thermo2, reaction_list, name=None):
    """
    Compare the estimation errors of two different evaluation methods.
    Write results to HTML.

    Each reaction is predicted with both methods at pH 7, pMg 14, I 0.1 and
    T 298.15; reactions for which a prediction raises MissingReactionEnergy
    are skipped.  A scatter plot of the two predictions (annotated with the
    RMSE of their difference over the finite pairs) and a table sorted by
    decreasing |difference| are written to html_writer.

    Args:
        html_writer: writer providing embed_matplotlib_figure and
            write_table.
        thermo1: a Thermodynamics object that provides dG estimates.
        thermo2: a Thermodynamics object that provides dG estimates.
        reaction_list: reactions to compare.
        name: optional prefix; the figure is embedded as name + "_eval".
            When None the figure is embedded without a name.

    Returns:
        (0, 0) if no reaction could be predicted by both methods,
        otherwise None.
    """
    pH, pMg, I, T = (7, 14, 0.1, 298.15)

    total_list = []
    for reaction in reaction_list:
        try:
            dG0_pred1 = reaction.PredictReactionEnergy(
                thermo1, pH=pH, pMg=pMg, I=I, T=T)
            dG0_pred2 = reaction.PredictReactionEnergy(
                thermo2, pH=pH, pMg=pMg, I=I, T=T)
        except MissingReactionEnergy:
            continue
        total_list.append([dG0_pred1, dG0_pred2, reaction])

    if not total_list:
        return 0, 0

    # plot the profile graph
    plt.rcParams['text.usetex'] = False
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.size'] = 8
    plt.rcParams['lines.linewidth'] = 2
    plt.rcParams['lines.markersize'] = 2
    plt.rcParams['figure.dpi'] = 100

    data_mat = np.array([(row[0], row[1]) for row in total_list])
    # rows in which both predictions are finite
    non_nan = list(np.isfinite(data_mat.sum(1)).nonzero()[0].flat)

    fig2 = plt.figure(figsize=(5, 5))
    plt.plot(data_mat[non_nan, 0], data_mat[non_nan, 1], 'b.')
    rmse = rms_flat((data_mat[non_nan, 0] - data_mat[non_nan, 1]).flat)
    plt.text(-50, 40, r'RMSE = %.1f [kJ/mol]' % (rmse))
    plt.xlabel(r'$\Delta G_r^\circ$ from %s [kJ/mol]' % thermo1.name)
    plt.ylabel(r'$\Delta G_r^\circ$ from %s [kJ/mol]' % thermo2.name)
    plt.plot([-200, 200], [-200, 200], 'k--')
    plt.axis([-200, 200, -200, 200])

    # name defaults to None, and None + "_eval" raises TypeError.
    if name is None:
        html_writer.embed_matplotlib_figure(fig2)
    else:
        html_writer.embed_matplotlib_figure(fig2, name=name + "_eval")

    col1 = "dG'0 (%s)" % thermo1.name
    col2 = "dG'0 (%s)" % thermo2.name
    table_headers = ["#", '|diff|', col1, col2, "reaction", "rid"]
    dict_list = []
    for pred1, pred2, reaction in total_list:
        d = {}
        # A missing prediction counts as difference 0, sorting it last.
        if np.isnan(pred1) or np.isnan(pred2):
            d["|diff|"] = 0
        else:
            d["|diff|"] = abs(pred1 - pred2)
        d[col1] = pred1
        d[col2] = pred2
        d['reaction'] = reaction.to_hypertext(show_cids=True)
        if reaction.rid is not None:
            d['rid'] = '<a href="%s">R%05d</a>' % (reaction.get_link(),
                                                   reaction.rid)
        else:
            d['rid'] = ''
        dict_list.append(d)
    dict_list.sort(key=lambda d: d['|diff|'], reverse=True)
    html_writer.write_table(dict_list, table_headers, decimal=1)
def LinearRegression(self, S, obs_dG0_r, cids, cid2nH_nMg, prior_thermodynamics=None):
    """
        Fit formation energies to the observed reaction energies by least
        squares, report the fit in HTML and store the resulting pseudoisomer
        maps in self.cid2pmap_dict.

        Args:
            S: matrix whose rows follow 'cids' and whose columns are the
               observed reactions.
            obs_dG0_r: row of observed reaction energies, one per column of S.
            cids: compound IDs, in the row order of S.
            cid2nH_nMg: maps each cid to its (nH, nMg) pair.
            prior_thermodynamics: optional source of formation energies.  When
                given, compounds for which it has a pseudoisomer with matching
                nH and nMg keep that value, and only the residual is fitted.
    """
    logging.info("Regression matrix is %d x %d" % \
                 (S.shape[0], S.shape[1]))

    # every compound is referenced as 'PRC' unless its value comes from the prior
    cid2ref = dict.fromkeys(cids, 'PRC')

    if not prior_thermodynamics:
        est_dG0_f, _ = LinearRegression.LeastSquares(S, obs_dG0_r)
    else:
        # Normalize the contribution of compounds which have formation energies
        # given in the prior. Perform the regression only on the residuals
        # remaining after the normalization (note that the stoichiometric
        # matrix must also be trimmed).
        prior_rows = []
        prior_values = []
        for row_idx, cid in enumerate(cids):
            nH, nMg = cid2nH_nMg[cid]
            try:
                prior_pmap = prior_thermodynamics.cid2PseudoisomerMap(cid)
            except MissingCompoundFormationEnergy:
                continue
            for p_nH, p_z, p_nMg, dG0 in prior_pmap.ToMatrix():
                if nH != p_nH or p_nMg != nMg:
                    continue
                # first pseudoisomer matching the requested nH and nMg wins
                prior_rows.append(row_idx)
                prior_values.append(dG0)
                cid2ref[cid] = prior_pmap.GetRef(p_nH, p_z, p_nMg)
                break

        # one indicator column per compound taken from the prior
        S_prior = np.matrix(np.zeros((len(cids), len(prior_rows))))
        for col_idx, row_idx in enumerate(prior_rows):
            S_prior[row_idx, col_idx] = 1
        dG0_prior = np.matrix(prior_values)

        g, _ = LinearRegression.LeastSquares(S_prior, dG0_prior)
        P_C, P_L = LinearRegression.ColumnProjection(S_prior)
        prior_dG0_r = g * P_C * S
        new_obs_dG0_r = obs_dG0_r - prior_dG0_r
        new_S = P_L * S

        # Find all reactions in new_S which are completely zero. This means that
        # they are completely determined by the prior.
        zero_cols = (abs(new_S).sum(0) < 1e-10).nonzero()[1]
        prior_rowdicts = []
        for col_idx in zero_cols.flat:
            prior_rowdicts.append({
                'reaction': NistRegression.row2hypertext(S[:, col_idx], cids),
                '|error|': abs(new_obs_dG0_r[0, col_idx]),
                'error': new_obs_dG0_r[0, col_idx],
                'NIST': obs_dG0_r[0, col_idx],
                'prior': prior_dG0_r[0, col_idx],
            })
        prior_rowdicts.sort(key=lambda entry: entry['|error|'], reverse=True)
        self.html_writer.write('</br><b>Alberty Errors</b>\n')
        self.html_writer.write_table(prior_rowdicts,
                                     headers=['reaction', 'error', 'NIST', 'prior'],
                                     decimal=1)

        est_dG0_f, _ = LinearRegression.LeastSquares(new_S, new_obs_dG0_r)
        # the compounds taken from the prior keep their prior values
        for col_idx, row_idx in enumerate(prior_rows):
            est_dG0_f[0, row_idx] = dG0_prior[0, col_idx]

    est_dG0_r = est_dG0_f * S
    residuals = est_dG0_r - obs_dG0_r
    rmse = rms_flat(residuals.flat)
    logging.info("Regression results for reverse transformed data:")
    logging.info("N = %d, RMSE = %.1f" % (S.shape[1], rmse))

    self.html_writer.write('<p>RMSE = %.1f [kJ/mol]</p>\n' % rmse)

    obs_header = symbol_dr_G0 + ' (obs)'
    fit_header = symbol_dr_G0 + ' (fit)'
    res_header = symbol_dr_G0 + ' (res)'
    headers = ['#', 'Reaction', obs_header, fit_header, res_header]
    fit_rowdicts = []
    for col_idx in range(S.shape[1]):
        fit_rowdicts.append({
            'Reaction': NistRegression.row2hypertext(S[:, col_idx], cids),
            obs_header: obs_dG0_r[0, col_idx],
            fit_header: est_dG0_r[0, col_idx],
            res_header: residuals[0, col_idx],
        })
    # worst-fitted reactions first
    fit_rowdicts.sort(key=lambda entry: abs(entry[res_header]), reverse=True)
    self.html_writer.write_table(fit_rowdicts, headers, decimal=1)

    # copy the solution into the diss_tables of all the compounds,
    # and then generate their PseudoisomerMaps.
    for row_idx, cid in enumerate(cids):
        nH, nMg = cid2nH_nMg[cid]
        diss_table = self.GetDissociation().GetDissociationTable(cid)
        z = diss_table.min_charge + (nH - diss_table.min_nH)
        diss_table.SetFormationEnergyByNumHydrogens(est_dG0_f[0, row_idx], nH, nMg)
        pmap = diss_table.GetPseudoisomerMap(nH, nMg)
        pmap.SetRef(nH, z, nMg, cid2ref[cid])
        self.cid2pmap_dict[cid] = pmap
nonzero_indices = np.nonzero(g_plus > 0.5)[0].tolist() + np.nonzero(g_minus > 0.5)[0].tolist() self.K[self.dimension, nonzero_indices] = coeffs[nonzero_indices] if LinearRegression.MatrixRank(self.K) < self.dimension+1: self.K[self.dimension, :] = 0 else: # normalize the kernel vector so that it will have nice coefficients g = min(abs(coeffs[nonzero_indices])) self.K[self.dimension, :] /= g #if sum(self.K[:, self.dimension] < 0.0): # self.K[:, self.dimension] *= -1.0 v = self.K[self.dimension, :] self.AddLinearConstraint(v) self.dimension += 1 return v def Solve(self): if self.dimension == 0: for _ in self: pass return self.K if __name__ == '__main__': A = np.array([[1, 0, 1, 1, 2, 1, 1],[0, 1, 1, 1, 2, 1, 1],[1, 1, 2, 2, 4, 2, 2]]) K = SparseKernel(A) print A for v in K: print "nullvector: ", ', '.join(['%g' % x for x in v]) print "RMS(A*K.T) =", mlab.rms_flat(np.dot(A, K.Solve().T))
def verify_results(self, key, thermodynamics, html_writer):
    """Calculate all the dG0_r for the reaction from NIST and compare to
        the measured data. Write results to HTML.

        Rows of self.data that contain a compound unknown to 'thermodynamics',
        or whose prediction raises MissingCompoundFormationEnergy, are skipped.
        The report consists of an observed-vs-estimated scatter plot (one
        series per K_type), a histogram of the residuals, and a hidden section
        with a plot of the error against the minimal number of measurements
        among the reaction's compounds plus a table of all compared rows,
        largest error first.

        Args:
            key: The name of this group of results (used as the heading and as
                 the id of the hidden section).
            thermodynamics: a Thermodynamics object that provides dG estimates.
            html_writer: to write HTML.

        Returns:
            None. Nothing is written when no row can be compared.
    """
    logging.info("calculate the correlation between %s's predictions and the NIST database" % key)

    known_cid_set = thermodynamics.get_all_cids()
    dG0_obs_vec = []
    dG0_est_vec = []

    # A mapping from each K_type label to the (observed, estimated) values of
    # the relevant measurements
    evaluation_map = {}
    total_list = []

    # number of NIST rows in which each compound takes part
    cid2count = {}
    for row_data in self.data:
        for cid in row_data.GetAllCids():
            cid2count[cid] = cid2count.setdefault(cid, 0) + 1

    for row_data in self.data:
        unknown_set = set(row_data.GetAllCids()).difference(known_cid_set)
        if unknown_set:
            logging.debug("a compound in (%s) doesn't have a dG0_f" % row_data.origin)
            continue

        label = row_data.K_type

        try:
            dG0_pred = row_data.PredictReactionEnergy(thermodynamics)
        except MissingCompoundFormationEnergy:
            logging.debug("a compound in (%s) doesn't have a dG0_f" % row_data.origin)
            continue

        dG0_obs_vec.append(row_data.dG0_r)
        dG0_est_vec.append(dG0_pred)
        # the series is created only once a prediction succeeded, so that the
        # legend never shows an empty series with an undefined RMSE
        measured, predicted = evaluation_map.setdefault(label, ([], []))
        measured.append(row_data.dG0_r)
        predicted.append(dG0_pred)
        n_measurements = min([cid2count[cid] for cid in row_data.GetAllCids()])
        error = abs(row_data.dG0_r - dG0_pred)

        total_list.append([error, row_data.dG0_r, dG0_pred,
                           row_data.sparse, row_data.pH, row_data.pMg,
                           row_data.I, row_data.T, row_data.evaluation,
                           n_measurements])

    if not total_list:
        # min()/max() below are undefined for an empty data set
        logging.warning("no NIST measurement can be compared to %s's predictions" % key)
        return

    # plot the profile graph
    rcParams['text.usetex'] = False
    rcParams['legend.fontsize'] = 12
    rcParams['font.family'] = 'sans-serif'
    rcParams['font.size'] = 16
    rcParams['lines.linewidth'] = 2
    rcParams['lines.markersize'] = 3
    rcParams['figure.figsize'] = [8.0, 6.0]
    rcParams['figure.dpi'] = 100

    fig1 = figure()
    hold(True)

    colors = ['purple', 'orange', 'lightgreen', 'red', 'cyan']
    for i, e in enumerate(sorted(evaluation_map.keys())):
        measured, predicted = evaluation_map[e]
        resid = np.array(measured) - np.array(predicted)
        label = '%s (N = %d, RMSE = %.2f [kJ/mol])' % (e, len(measured), rms_flat(resid.flat))
        # cycle through the palette instead of failing on a sixth label
        c = colors[i % len(colors)]
        plot(measured, predicted, marker='.', linestyle='None',
             markerfacecolor=c, markeredgecolor=c, markersize=5, label=label)

    legend(loc='upper left')

    resid = np.array(dG0_obs_vec) - np.array(dG0_est_vec)
    rmse = rms_flat(resid.flat)
    title(r'N = %d, RMSE = %.1f [kJ/mol]' % (len(dG0_obs_vec), rmse), fontsize=14)
    xlabel(r'$\Delta_{obs} G^\circ$ [kJ/mol]', fontsize=14)
    ylabel(r'$\Delta_{est} G^\circ$ [kJ/mol]', fontsize=14)
    min_x = min(dG0_obs_vec)
    max_x = max(dG0_obs_vec)
    plot([min_x, max_x], [min_x, max_x], 'k--')
    axis([-60, 60, -60, 60])

    fig2 = figure()
    hist([(row[1] - row[2]) for row in total_list], bins=arange(-50, 50, 0.5))
    title(r'RMSE = %.1f [kJ/mol]' % rmse, fontsize=14)
    xlabel(r'$\Delta_{obs} G^\circ - \Delta_{est} G^\circ$ [kJ/mol]', fontsize=14)
    ylabel(r'no. of measurements', fontsize=14)

    fig3 = figure()
    plot([row[9] for row in total_list], [abs(row[1] - row[2]) for row in total_list], '.')
    # this title has no placeholder, so nothing is interpolated into it
    title(r'The effect of the number of measurements on the estimation error', fontsize=14)
    xlabel(r'minimum no. of measurements among reaction compounds', fontsize=14)
    ylabel(r'$|| \Delta_{obs} G^\circ - \Delta_{est} G^\circ ||$ [kJ/mol]', fontsize=14)
    xscale('log')

    html_writer.write("<h2>%s</h2>" % key)
    html_writer.embed_matplotlib_figure(fig1, width=400, height=300)
    html_writer.embed_matplotlib_figure(fig2, width=400, height=300)
    html_writer.write('<input type="button" class="button" onclick="return toggleMe(\'%s\')" value="Show">\n' % (key))
    html_writer.write('<div id="%s" style="display:none">' % key)
    html_writer.embed_matplotlib_figure(fig3, width=400, height=300)

    table_headers = ["|error|", "dG0(obs)", "dG0(pred)", "reaction", "pH", "pMg", "I", "T", "evaluation", "min_num_measurements"]
    html_writer.write("<table>\n")
    html_writer.write("<tr><td>" + "</td><td>".join(table_headers) + "</td></tr>\n")

    # sort on the error only; comparing whole rows would fall through to the
    # reaction objects whenever two errors are equal
    for row in sorted(total_list, key=lambda r: r[0], reverse=True):
        row[3] = self.kegg.sparse_to_hypertext(row[3], show_cids=False)
        html_writer.write("<tr><td>" + "</td><td>".join(["%.1f" % x for x in row[:3]] + [str(x) for x in row[3:]]) + "</td></tr>\n")
    html_writer.write("</table>\n")
    html_writer.write("</div><br>\n")
def verify_results(self, html_writer, thermodynamics, name=None):
    """Compare the estimated dG0_r of every selected NIST row with the
        measured value and write the comparison to HTML.

        Three figures are embedded (observed vs. estimated, RMSE binned by pH,
        histogram of the residuals), followed by a table of all rows in which
        the rows with a finite estimate come first, largest error first.

        Args:
            html_writer: receives the figures and the table.
            thermodynamics: a Thermodynamics object that provides dG estimates.
            name: optional prefix; when given the figures are embedded as
                  name + "_eval", name + "_pH" and name + "_hist".

        Returns:
            (0, 0) when no row has a finite estimate; otherwise the number of
            rows with a finite estimate and the RMS of the per-reaction RMSEs.
    """
    def embed(fig, suffix):
        # the figure gets an explicit name only when a prefix was supplied
        if name:
            html_writer.embed_matplotlib_figure(fig, name=name + suffix)
        else:
            html_writer.embed_matplotlib_figure(fig)

    eval_to_label = {'A': 'high quality'}
    for low_grade in ('B', 'C', 'D', 'E'):
        eval_to_label[low_grade] = 'low quality'

    observed = []
    estimated = []

    # A mapping from each quality label to the (observed, estimated) values
    # of the relevant measurements
    evaluation_map = {}
    rowdicts = []
    finite_rowdicts = []

    for nist_row in self.SelectRowsFromNist():
        label = eval_to_label[nist_row.evaluation]
        if label not in evaluation_map:
            evaluation_map[label] = ([], [])

        entry = {}
        entry[symbol_dr_G0_prime + ' (obs)'] = np.round(nist_row.dG0_r, 1)
        entry['_reaction'] = nist_row.reaction
        entry['reaction'] = nist_row.reaction.to_hypertext(show_cids=False)
        if nist_row.reaction.rid is None:
            entry['rid'] = ''
        else:
            entry['rid'] = '<a href="%s">R%05d</a>' % (
                nist_row.reaction.get_link(), nist_row.reaction.rid)
        entry['pH'] = nist_row.pH
        entry['pMg'] = nist_row.pMg
        entry['I'] = nist_row.I
        entry['T'] = nist_row.T
        entry['eval.'] = nist_row.evaluation
        entry['url'] = '<a href="%s">%s</a>' % (nist_row.url, nist_row.ref_id)

        dG0_est = nist_row.PredictReactionEnergy(thermodynamics)
        if not np.isfinite(dG0_est):
            # rows without an estimate are listed after all the others
            entry['sort_key'] = 1
        else:
            observed.append(nist_row.dG0_r)
            estimated.append(dG0_est)
            evaluation_map[label][0].append(nist_row.dG0_r)
            evaluation_map[label][1].append(dG0_est)
            entry[symbol_dr_G0_prime + ' (est)'] = np.round(dG0_est, 1)
            entry['residual'] = np.round(nist_row.dG0_r - dG0_est, 3)
            entry['|error|'] = abs(entry['residual'])
            entry['sort_key'] = -entry['|error|']
            finite_rowdicts.append(entry)
        rowdicts.append(entry)

    rowdicts.sort(key=lambda entry: entry['sort_key'])

    if not observed:
        return 0, 0

    # group the errors by reaction, so that a reaction measured many times
    # does not dominate the reported RMSE
    errors_by_reaction = defaultdict(list)
    for entry in finite_rowdicts:
        errors_by_reaction[entry['_reaction']].append(entry['|error|'])
    unique_rmse = rms_flat([rms_flat(errors)
                            for errors in errors_by_reaction.values()])

    rmse = rms_flat((np.array(observed) - np.array(estimated)).flat)

    # plot the profile graph
    for rc_key, rc_value in (('text.usetex', False),
                             ('legend.fontsize', 10),
                             ('font.family', 'sans-serif'),
                             ('font.size', 12),
                             ('lines.linewidth', 1),
                             ('lines.markersize', 3)):
        plt.rcParams[rc_key] = rc_value

    fig1 = plt.figure(figsize=(6, 6), dpi=90)
    plt.hold(True)

    colors = ['purple', 'orange']
    for i, label in enumerate(sorted(evaluation_map)):
        measured, predicted = evaluation_map[label]
        color = colors[i]
        plt.plot(measured, predicted, marker='.', linestyle='None',
                 markerfacecolor=color, markeredgecolor=color,
                 markersize=5, label=label, figure=fig1)

    plt.legend(loc='lower right')

    plt.text(-50, 40, r'RMSE = %.1f [kJ/mol]' % (unique_rmse), fontsize=14,
             figure=fig1)
    plt.xlabel(r'observed $\Delta_r G^{\'\circ}$ [kJ/mol]', fontsize=14,
               figure=fig1)
    plt.ylabel(r'estimated $\Delta_r G^{\'\circ}$ [kJ/mol]', fontsize=14,
               figure=fig1)
    plt.plot([-60, 60], [-60, 60], 'k--', figure=fig1)
    plt.axis([-60, 60, -60, 60])
    embed(fig1, "_eval")

    fig2 = plt.figure(figsize=(6, 6), dpi=90)
    binned_plot(x=[entry['pH'] for entry in finite_rowdicts],
                y=[entry['|error|'] for entry in finite_rowdicts],
                bins=[5, 6, 7, 8, 9],
                y_type='rmse',
                figure=fig2)
    plt.xlim((4, 11))
    plt.ylim((0, 12))
    plt.title(r'effect of pH', fontsize=14, figure=fig2)
    plt.xlabel('pH', fontsize=14, figure=fig2)
    plt.ylabel(r'RMSE ($\Delta_r G^{\'\circ}$) [kJ/mol]', fontsize=14,
               figure=fig2)
    embed(fig2, "_pH")

    fig3 = plt.figure(figsize=(6, 6), dpi=90)
    plt.hist([entry['residual'] for entry in finite_rowdicts],
             bins=np.arange(-50, 50, 0.5))
    plt.title(r'RMSE = %.1f [kJ/mol]' % rmse, fontsize=14, figure=fig3)
    plt.xlabel(r'residual $\Delta_r G^{\'\circ}$ [kJ/mol]', fontsize=14,
               figure=fig3)
    plt.ylabel(r'no. of measurements', fontsize=14, figure=fig3)
    embed(fig3, "_hist")

    table_headers = ["#", "|error|",
                     symbol_dr_G0_prime + " (obs)",
                     symbol_dr_G0_prime + " (est)",
                     "reaction", "rid", "pH", "pMg", "I", "T",
                     "eval.", "url"]
    html_writer.write_table(rowdicts, table_headers, decimal=1)

    return len(observed), unique_rmse
def two_way_comparison(self, html_writer, thermo1, thermo2, name=None):
    """
        Compare the estimation errors of two different evaluation methods.
        Write results to HTML.

        Every selected NIST row is estimated with both sources; rows for which
        either estimate raises MissingReactionEnergy are skipped.  The report
        holds a plot of the two errors against each other, an
        observed-vs-estimated plot per source (annotated with its RMSE), and a
        table sorted by decreasing disagreement between the two estimates.

        Args:
            html_writer: receives the figures and the table.
            thermo1: a Thermodynamics object that provides dG estimates.
            thermo2: a Thermodynamics object that provides dG estimates.
            name: optional prefix; when given the figures are embedded as
                  name + "_corr" and name + "_eval", otherwise they are
                  embedded without an explicit name.

        Returns:
            (0, 0) when no row could be estimated by both sources,
            otherwise None.
    """
    def embed(fig, suffix):
        # 'name' is optional: None + suffix would raise a TypeError
        if name:
            html_writer.embed_matplotlib_figure(fig, name=name + suffix)
        else:
            html_writer.embed_matplotlib_figure(fig)

    total_list = []

    for row_data in self.SelectRowsFromNist():
        try:
            dG0_pred1 = row_data.PredictReactionEnergy(thermo1)
            dG0_pred2 = row_data.PredictReactionEnergy(thermo2)
        except MissingReactionEnergy as e:
            logging.debug("the reaction in (%s) cannot be estimated: %s" %
                          (row_data.ref_id, str(e)))
            continue

        total_list.append([row_data.dG0_r, dG0_pred1, dG0_pred2,
                           row_data.reaction, row_data.pH, row_data.pMg,
                           row_data.I, row_data.T, row_data.evaluation,
                           row_data.url])

    if not total_list:
        return 0, 0

    # plot the profile graph
    plt.rcParams['text.usetex'] = False
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.size'] = 8
    plt.rcParams['lines.linewidth'] = 2
    plt.rcParams['lines.markersize'] = 2
    plt.rcParams['figure.dpi'] = 100

    # only the three energy columns are numeric; keep the reaction objects
    # and strings out of the array so that it has a float dtype
    data_mat = np.array([row[:3] for row in total_list], dtype=float)

    fig1 = plt.figure(figsize=(4, 4))
    plt.hold(True)
    error1 = data_mat[:, 0] - data_mat[:, 1]
    error2 = data_mat[:, 0] - data_mat[:, 2]

    max_err = max(error1.max(), error2.max())
    min_err = min(error1.min(), error2.min())
    plt.plot([min_err, max_err], [min_err, max_err], 'k--', figure=fig1)
    plt.plot(error1, error2, '.', figure=fig1)
    plt.title("Error Comparison per Reaction (in kJ/mol)")
    plt.xlabel(thermo1.name, figure=fig1)
    plt.ylabel(thermo2.name, figure=fig1)
    embed(fig1, "_corr")

    fig2 = plt.figure(figsize=(7, 3))
    for i, thermo in enumerate([thermo1, thermo2]):
        fig2.add_subplot(1, 2, i + 1)
        plt.plot(data_mat[:, 0], data_mat[:, i + 1], 'b.')
        rmse = rms_flat((data_mat[:, 0] - data_mat[:, i + 1]).flat)
        plt.text(-50, 40, r'RMSE = %.1f [kJ/mol]' % (rmse))
        plt.xlabel(r'observed $\Delta G_r^\circ$ from NIST [kJ/mol]')
        plt.ylabel(r'estimated $\Delta G_r^\circ$ using %s [kJ/mol]' % thermo.name)
        plt.plot([-60, 60], [-60, 60], 'k--')
        plt.axis([-60, 60, -60, 60])
    embed(fig2, "_eval")

    table_headers = ["dG'0 (obs)",
                     "dG'0 (%s)" % thermo1.name,
                     "dG'0 (%s)" % thermo2.name,
                     "reaction", "rid", "pH", "pMg", "I", "T",
                     "eval.", "url"]
    dict_list = []
    # rows on which the two sources disagree the most come first
    for row in sorted(total_list, key=lambda x: abs(x[1] - x[2]), reverse=True):
        d = {}
        d["dG'0 (obs)"] = '%.1f' % row[0]
        d["dG'0 (%s)" % thermo1.name] = '%.1f' % row[1]
        d["dG'0 (%s)" % thermo2.name] = '%.1f' % row[2]
        d['reaction'] = row[3].to_hypertext(show_cids=False)
        if row[3].rid is not None:
            d['rid'] = '<a href="%s">R%05d</a>' % (row[3].get_link(), row[3].rid)
        else:
            d['rid'] = ''
        d['pH'] = '%.1f' % row[4]
        d['pMg'] = '%.1f' % row[5]
        d['I'] = '%.2f' % row[6]
        d['T'] = '%.1f' % row[7]
        d['eval.'] = row[8]
        if row[9]:
            d['url'] = '<a href="%s">link</a>' % row[9]
        else:
            d['url'] = ''
        dict_list.append(d)
    html_writer.write_table(dict_list, table_headers)
# Read the comma-separated table and compare columns 1 and 2 against
# column 3, using only the rows in which all three are finite.
data = np.loadtxt(DATA_FNAME, dtype='float', delimiter=',')

feist_idx = set(np.isfinite(data[:, 1]).nonzero()[0].flat)
ugcm_idx = set(np.isfinite(data[:, 2]).nonzero()[0].flat)
nist_idx = set(np.isfinite(data[:, 3]).nonzero()[0].flat)
comp_idx = list(feist_idx & ugcm_idx & nist_idx)

# the diagonal of both panels spans the range of column 0
minG = np.min(data[comp_idx, 0])
maxG = np.max(data[comp_idx, 0])

err_feist_nist = data[comp_idx, 1] - data[comp_idx, 3]
rms_feist_nist = rms_flat(err_feist_nist)
err_ugcm_nist = data[comp_idx, 2] - data[comp_idx, 3]
rms_ugcm_nist = rms_flat(err_ugcm_nist)

plt.figure(figsize=(10, 5), dpi=90)

# left panel: column 1 against column 3
plt.subplot(1, 2, 1)
plt.plot(data[comp_idx, 1], data[comp_idx, 3], '.g')
plt.plot([minG, maxG], [minG, maxG], ':k')
plt.ylabel('TECRDB observation [kJ/mol]')
plt.xlabel('value in iAF1260 [kJ/mol]')
plt.title('N = %d, RMSE = %.1f [kJ/mol]' % (len(comp_idx), rms_feist_nist))

# right panel: column 2 against column 3
plt.subplot(1, 2, 2)
plt.plot(data[comp_idx, 2], data[comp_idx, 3], '.g')
plt.plot([minG, maxG], [minG, maxG], ':k')
plt.ylabel('TECRDB observation [kJ/mol]')
plt.xlabel('UGCM estimation [kJ/mol]')
plt.title('N = %d, RMSE = %.1f [kJ/mol]' % (len(comp_idx), rms_ugcm_nist))

plt.tight_layout()
def Loo(self, no_anchoring=True):
    """
        Leave-one-out analysis: re-estimate every non-anchored, non-empty
        observation of type 'reaction' from all the other observations and
        report the errors.

        For each left-out column, _GetChemicalReactionEnergies returns three
        estimate components (reported as ANCH, PRC and PGC), four 'parts'
        (reported as ANCH, PRC, PGC and NULL) and a PGC-only estimate.  The
        parts determine the class of the observation; per-class RMSEs and a
        table sorted by decreasing error are written to self.html_writer.

        Args:
            no_anchoring: when true, the anchoring flags of the remaining
                          observations are all cleared before estimating.
    """
    n = self.S.shape[1]
    dG0_r_ugc = np.matrix(np.zeros((3, n))) * np.nan
    dG0_r_pgc = np.matrix(np.zeros((1, n))) * np.nan
    rowdicts = []
    class2ugc_err = defaultdict(list)
    class2pgc_err = defaultdict(list)
    for i in range(n):
        if self.obs_types[i] != 'reaction':
            continue
        if self.anchored[0, i]:
            continue
        if abs(self.S[:, i]).sum(0) < self.epsilon:  # empty reaction
            continue

        # all the column indices except i (a plain list, so that it can be
        # used for indexing on both Python 2 and Python 3)
        no_i = [j for j in range(n) if j != i]
        obs_S = self.S[:, no_i].copy()
        obs_anchored = self.anchored[0, no_i]
        if no_anchoring:
            obs_anchored = obs_anchored * 0
        obs_b = self.b[:, no_i].copy()
        est_S = self.S[:, i].copy()
        dG0_r_ugc[:, i], parts, dG0_r_pgc[0, i] = self._GetChemicalReactionEnergies(
            obs_S, self.cids, obs_b, obs_anchored, est_S, self.cids)

        # classify the observation by the parts that are not negligible
        if parts[3, 0] > self.epsilon:
            classification = 'kernel'
        elif parts[1, 0] > self.epsilon and parts[2, 0] > self.epsilon:
            classification = 'PRC + PGC'
        elif parts[1, 0] > self.epsilon:
            classification = 'PRC'
        elif parts[2, 0] > self.epsilon:
            classification = 'PGC'
        else:
            classification = 'anchored'

        # the full estimate is the sum of the three components
        est_b = float(dG0_r_ugc[:, i].sum(0))
        ugc_err = self.b[0, i] - est_b
        class2ugc_err[classification].append(ugc_err)

        pgc_err = self.b[0, i] - dG0_r_pgc[0, i]
        class2pgc_err[classification].append(pgc_err)

        rowdict = {}
        rowdict['row'] = i
        rowdict['type'] = self.obs_types[i]
        rowdict['reaction'] = UnifiedGroupContribution.row2hypertext(self.S[:, i], self.cids)
        rowdict['obs'] = self.b[0, i]
        rowdict['est'] = est_b
        rowdict['est(PGC)'] = dG0_r_pgc[0, i]
        # a non-finite error is shown as 0 so that the row sorts last
        if np.isfinite(ugc_err):
            rowdict['|err|'] = abs(ugc_err)
        else:
            rowdict['|err|'] = 0
        rowdict['est_ANCH'] = dG0_r_ugc[0, i]
        rowdict['est_PRC'] = dG0_r_ugc[1, i]
        rowdict['est_PGC'] = dG0_r_ugc[2, i]
        rowdict['part_ANCH'] = parts[0, 0]
        rowdict['part_PRC'] = parts[1, 0]
        rowdict['part_PGC'] = parts[2, 0]
        rowdict['part_NULL'] = parts[3, 0]
        rowdict['class'] = classification
        rowdicts.append(rowdict)

    class_errors = []
    for classification, ugc_err_list in class2ugc_err.items():
        pgc_err_list = class2pgc_err[classification]
        class_errors.append('%s: N = %d, rmse(UGC) = %.1f kJ/mol, rmse(PGC) = %.1f kJ/mol'
                            % (classification, len(ugc_err_list),
                               rms_flat(ugc_err_list), rms_flat(pgc_err_list)))

    self.Report(dG0_r_ugc.sum(0), 'UGC - Leave one out')
    self.Report(dG0_r_pgc, 'PGC - Leave one out')

    rowdicts.sort(key=lambda x: x['|err|'], reverse=True)
    self.html_writer.write('<h2>Linear Regression Leave-One-Out Analysis</h2>\n')
    self.html_writer.insert_toggle(start_here=True, label="Show table")
    self.html_writer.write_ul(class_errors)
    self.html_writer.write_table(rowdicts,
        headers=['row', 'type', 'reaction', 'class', 'obs', 'est',
                 'est(PGC)', '|err|', 'est_ANCH', 'est_PRC', 'est_PGC',
                 'part_ANCH', 'part_PRC', 'part_PGC', 'part_NULL'],
        decimal=1)
    self.html_writer.div_end()
def main(argv):
    """Print the rotation of the text lines in an image file.

    The image is converted to grayscale and demeaned, its radon transform is
    taken, and the transposed-sinogram row with the largest RMS value is taken
    as the rotation.  The value printed is -(90 - rotation) with two
    decimals.  The spectrum of the selected sinogram column is then used to
    compute the line spacing, which is currently not reported.

    Args:
        argv: command-line arguments without the program name;
              '-f <filename>' or '--file=<filename>' selects the image and
              '-h' prints the usage.

    Raises:
        SystemExit: after printing the usage ('-h' or no file given), or with
            status 2 when the options cannot be parsed.
    """
    filename = ''
    try:
        opts, args = getopt.getopt(argv, "hf:", ["file="])
    except getopt.GetoptError:
        print('rotation_spacing.py -f <filename>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('Usage: rotation_spacing.py -f <filename>')
            sys.exit()
        elif opt in ("-f", "--file"):
            filename = arg

    # Validate the parsed result instead of counting the entries of
    # sys.argv: this honours the 'argv' parameter and also accepts the
    # single-token form --file=<filename>.
    if not filename:
        print('Usage: rotation_spacing.py -f <filename>')
        sys.exit()

    try:
        # More accurate peak finding from
        # https://gist.github.com/endolith/255291#file-parabolic-py
        from parabolic import parabolic
        from numpy import argmax as coarse_argmax

        def argmax(x):
            return parabolic(x, coarse_argmax(x))[0]
    except ImportError:
        from numpy import argmax

    # Load file, converting to grayscale
    I = asarray(Image.open(filename).convert('L'))
    I = I - mean(I)  # Demean; make the brightness extend above and below zero

    # Do the radon transform
    sinogram = radon(I)

    # Find the RMS value of each row and find "busiest" rotation,
    # where the transform is lined up perfectly with the alternating dark
    # text and white lines
    r = array([rms_flat(line) for line in sinogram.transpose()])
    rotation = argmax(r)
    print('{:.2f}'.format(-(90-rotation)))

    # The interpolated peak position may be fractional, and array indices
    # must be integers: select the nearest column, kept inside the array.
    busiest = int(round(rotation))
    busiest = min(max(busiest, 0), sinogram.shape[1] - 1)
    row = sinogram[:, busiest]
    N = len(row)

    # Take spectrum of busy row and find line spacing
    window = blackman(N)
    spectrum = rfft(row * window)
    frequency = argmax(abs(spectrum))
    # true division on Python 2 as well; a peak at the DC bin has no period
    line_spacing = float(N) / frequency if frequency else float('inf')  # pixels
def verify_results(self, key, thermodynamics, html_writer):
    """Calculate all the dG0_r for the reaction from NIST and compare to
        the measured data. Write results to HTML.

        Rows of self.data that contain a compound unknown to 'thermodynamics',
        or whose prediction raises MissingCompoundFormationEnergy, are skipped.
        The report consists of an observed-vs-estimated scatter plot (one
        series per K_type), a histogram of the residuals, and a hidden section
        with a plot of the error against the minimal number of measurements
        among the reaction's compounds plus a table of all compared rows,
        largest error first.

        Args:
            key: The name of this group of results (used as the heading and as
                 the id of the hidden section).
            thermodynamics: a Thermodynamics object that provides dG estimates.
            html_writer: to write HTML.

        Returns:
            None. Nothing is written when no row can be compared.
    """
    logging.info(
        "calculate the correlation between %s's predictions and the NIST database"
        % key)

    known_cid_set = thermodynamics.get_all_cids()
    dG0_obs_vec = []
    dG0_est_vec = []

    # A mapping from each K_type label to the (observed, estimated) values of
    # the relevant measurements
    evaluation_map = {}
    total_list = []

    # number of NIST rows in which each compound takes part
    cid2count = {}
    for row_data in self.data:
        for cid in row_data.GetAllCids():
            cid2count[cid] = cid2count.setdefault(cid, 0) + 1

    for row_data in self.data:
        unknown_set = set(row_data.GetAllCids()).difference(known_cid_set)
        if unknown_set:
            logging.debug("a compound in (%s) doesn't have a dG0_f" %
                          row_data.origin)
            continue

        label = row_data.K_type

        try:
            dG0_pred = row_data.PredictReactionEnergy(thermodynamics)
        except MissingCompoundFormationEnergy:
            logging.debug("a compound in (%s) doesn't have a dG0_f" %
                          row_data.origin)
            continue

        dG0_obs_vec.append(row_data.dG0_r)
        dG0_est_vec.append(dG0_pred)
        # the series is created only once a prediction succeeded, so that the
        # legend never shows an empty series with an undefined RMSE
        measured, predicted = evaluation_map.setdefault(label, ([], []))
        measured.append(row_data.dG0_r)
        predicted.append(dG0_pred)
        n_measurements = min(
            [cid2count[cid] for cid in row_data.GetAllCids()])
        error = abs(row_data.dG0_r - dG0_pred)

        total_list.append([
            error, row_data.dG0_r, dG0_pred, row_data.sparse, row_data.pH,
            row_data.pMg, row_data.I, row_data.T, row_data.evaluation,
            n_measurements
        ])

    if not total_list:
        # min()/max() below are undefined for an empty data set
        logging.warning(
            "no NIST measurement can be compared to %s's predictions" % key)
        return

    # plot the profile graph
    rcParams['text.usetex'] = False
    rcParams['legend.fontsize'] = 12
    rcParams['font.family'] = 'sans-serif'
    rcParams['font.size'] = 16
    rcParams['lines.linewidth'] = 2
    rcParams['lines.markersize'] = 3
    rcParams['figure.figsize'] = [8.0, 6.0]
    rcParams['figure.dpi'] = 100

    fig1 = figure()
    hold(True)

    colors = ['purple', 'orange', 'lightgreen', 'red', 'cyan']
    for i, e in enumerate(sorted(evaluation_map.keys())):
        measured, predicted = evaluation_map[e]
        resid = np.array(measured) - np.array(predicted)
        label = '%s (N = %d, RMSE = %.2f [kJ/mol])' % (
            e, len(measured), rms_flat(resid.flat))
        # cycle through the palette instead of failing on a sixth label
        c = colors[i % len(colors)]
        plot(measured,
             predicted,
             marker='.',
             linestyle='None',
             markerfacecolor=c,
             markeredgecolor=c,
             markersize=5,
             label=label)

    legend(loc='upper left')

    resid = np.array(dG0_obs_vec) - np.array(dG0_est_vec)
    rmse = rms_flat(resid.flat)
    title(r'N = %d, RMSE = %.1f [kJ/mol]' % (len(dG0_obs_vec), rmse),
          fontsize=14)
    xlabel(r'$\Delta_{obs} G^\circ$ [kJ/mol]', fontsize=14)
    ylabel(r'$\Delta_{est} G^\circ$ [kJ/mol]', fontsize=14)
    min_x = min(dG0_obs_vec)
    max_x = max(dG0_obs_vec)
    plot([min_x, max_x], [min_x, max_x], 'k--')
    axis([-60, 60, -60, 60])

    fig2 = figure()
    hist([(row[1] - row[2]) for row in total_list],
         bins=arange(-50, 50, 0.5))
    title(r'RMSE = %.1f [kJ/mol]' % rmse, fontsize=14)
    xlabel(r'$\Delta_{obs} G^\circ - \Delta_{est} G^\circ$ [kJ/mol]',
           fontsize=14)
    ylabel(r'no. of measurements', fontsize=14)

    fig3 = figure()
    plot([row[9] for row in total_list],
         [abs(row[1] - row[2]) for row in total_list], '.')
    # this title has no placeholder, so nothing is interpolated into it
    title(
        r'The effect of the number of measurements on the estimation error',
        fontsize=14)
    xlabel(r'minimum no. of measurements among reaction compounds',
           fontsize=14)
    ylabel(r'$|| \Delta_{obs} G^\circ - \Delta_{est} G^\circ ||$ [kJ/mol]',
           fontsize=14)
    xscale('log')

    html_writer.write("<h2>%s</h2>" % key)
    html_writer.embed_matplotlib_figure(fig1, width=400, height=300)
    html_writer.embed_matplotlib_figure(fig2, width=400, height=300)
    html_writer.write(
        '<input type="button" class="button" onclick="return toggleMe(\'%s\')" value="Show">\n'
        % (key))
    html_writer.write('<div id="%s" style="display:none">' % key)
    html_writer.embed_matplotlib_figure(fig3, width=400, height=300)

    table_headers = [
        "|error|", "dG0(obs)", "dG0(pred)", "reaction", "pH", "pMg", "I",
        "T", "evaluation", "min_num_measurements"
    ]
    html_writer.write("<table>\n")
    html_writer.write("<tr><td>" + "</td><td>".join(table_headers) +
                      "</td></tr>\n")

    # sort on the error only; comparing whole rows would fall through to the
    # reaction objects whenever two errors are equal
    for row in sorted(total_list, key=lambda r: r[0], reverse=True):
        row[3] = self.kegg.sparse_to_hypertext(row[3], show_cids=False)
        html_writer.write("<tr><td>" +
                          "</td><td>".join(["%.1f" % x for x in row[:3]] +
                                           [str(x) for x in row[3:]]) +
                          "</td></tr>\n")
    html_writer.write("</table>\n")
    html_writer.write("</div><br>\n")
def Loo(self, no_anchoring=True):
    """
        Leave-one-out analysis: re-estimate every non-anchored, non-empty
        observation of type 'reaction' from all the other observations and
        report the errors.

        For each left-out column, _GetChemicalReactionEnergies returns three
        estimate components (reported as ANCH, PRC and PGC), four 'parts'
        (reported as ANCH, PRC, PGC and NULL) and a PGC-only estimate.  The
        parts determine the class of the observation; per-class RMSEs and a
        table sorted by decreasing error are written to self.html_writer.

        Args:
            no_anchoring: when true, the anchoring flags of the remaining
                          observations are all cleared before estimating.
    """
    n = self.S.shape[1]
    dG0_r_ugc = np.matrix(np.zeros((3, n))) * np.nan
    dG0_r_pgc = np.matrix(np.zeros((1, n))) * np.nan
    rowdicts = []
    class2ugc_err = defaultdict(list)
    class2pgc_err = defaultdict(list)
    for i in range(n):
        if self.obs_types[i] != 'reaction':
            continue
        if self.anchored[0, i]:
            continue
        if abs(self.S[:, i]).sum(0) < self.epsilon: # empty reaction
            continue

        # all the column indices except i (a plain list, so that it can be
        # used for indexing on both Python 2 and Python 3)
        no_i = [j for j in range(n) if j != i]
        obs_S = self.S[:, no_i].copy()
        obs_anchored = self.anchored[0, no_i]
        if no_anchoring:
            obs_anchored = obs_anchored * 0
        obs_b = self.b[:, no_i].copy()
        est_S = self.S[:, i].copy()
        dG0_r_ugc[:, i], parts, dG0_r_pgc[0, i] = self._GetChemicalReactionEnergies(
                    obs_S, self.cids, obs_b, obs_anchored, est_S, self.cids)

        # classify the observation by the parts that are not negligible
        if parts[3, 0] > self.epsilon:
            classification = 'kernel'
        elif parts[1, 0] > self.epsilon and parts[2, 0] > self.epsilon:
            classification = 'PRC + PGC'
        elif parts[1, 0] > self.epsilon:
            classification = 'PRC'
        elif parts[2, 0] > self.epsilon:
            classification = 'PGC'
        else:
            classification = 'anchored'

        # the full estimate is the sum of the three components
        est_b = float(dG0_r_ugc[:, i].sum(0))
        ugc_err = self.b[0, i] - est_b
        class2ugc_err[classification].append(ugc_err)

        pgc_err = self.b[0, i] - dG0_r_pgc[0, i]
        class2pgc_err[classification].append(pgc_err)

        rowdict = {}
        rowdict['row'] = i
        rowdict['type'] = self.obs_types[i]
        rowdict['reaction'] = UnifiedGroupContribution.row2hypertext(self.S[:, i], self.cids)
        rowdict['obs'] = self.b[0, i]
        rowdict['est'] = est_b
        rowdict['est(PGC)'] = dG0_r_pgc[0, i]
        # a non-finite error is shown as 0 so that the row sorts last
        if np.isfinite(ugc_err):
            rowdict['|err|'] = abs(ugc_err)
        else:
            rowdict['|err|'] = 0
        rowdict['est_ANCH'] = dG0_r_ugc[0, i]
        rowdict['est_PRC'] = dG0_r_ugc[1, i]
        rowdict['est_PGC'] = dG0_r_ugc[2, i]
        rowdict['part_ANCH'] = parts[0, 0]
        rowdict['part_PRC'] = parts[1, 0]
        rowdict['part_PGC'] = parts[2, 0]
        rowdict['part_NULL'] = parts[3, 0]
        rowdict['class'] = classification
        rowdicts.append(rowdict)

    class_errors = []
    for classification, ugc_err_list in class2ugc_err.items():
        pgc_err_list = class2pgc_err[classification]
        class_errors.append('%s: N = %d, rmse(UGC) = %.1f kJ/mol, rmse(PGC) = %.1f kJ/mol'
                            % (classification, len(ugc_err_list),
                               rms_flat(ugc_err_list), rms_flat(pgc_err_list)))

    self.Report(dG0_r_ugc.sum(0), 'UGC - Leave one out')
    self.Report(dG0_r_pgc, 'PGC - Leave one out')

    rowdicts.sort(key=lambda x:x['|err|'], reverse=True)
    self.html_writer.write('<h2>Linear Regression Leave-One-Out Analysis</h2>\n')
    self.html_writer.insert_toggle(start_here=True, label="Show table")
    self.html_writer.write_ul(class_errors)
    self.html_writer.write_table(rowdicts,
        headers=['row', 'type', 'reaction', 'class', 'obs', 'est',
                 'est(PGC)', '|err|', 'est_ANCH', 'est_PRC', 'est_PGC',
                 'part_ANCH', 'part_PRC', 'part_PGC', 'part_NULL'],
        decimal=1)
    self.html_writer.div_end()
def AnalyzeTrainingSet(self, skip_formations=True):
    """
        Report how well the group contributions fit the training set, both
        directly and by leave-one-out cross validation.

        Acid-base, Mg and redox observations are always skipped.  For every
        other observation the fit residual is recorded and the regression is
        repeated without it; when leaving it out enlarges the nullspace, the
        leave-one-out columns of that observation stay NaN.  A table sorted by
        decreasing leave-one-out error and a figure of the residuals against
        the observed values are written to self.html_writer.

        Args:
            skip_formations: when true, observations of type
                             KeggObservation.TYPE_FORMATION are skipped too.
    """
    n_obs = self.group_matrix.shape[1]
    rowdicts = []
    fit_results = np.dot(self.group_contributions, self.group_matrix)
    residuals = fit_results - self.obs_values

    if self.transformed:
        sym = symbol_d_G0_prime
    else:
        sym = symbol_d_G0
    for i in range(n_obs):
        if self.obs_types[i] in [KeggObservation.TYPE_ACID_BASE,
                                 KeggObservation.TYPE_MG,
                                 KeggObservation.TYPE_REDOX]:
            continue
        if skip_formations and self.obs_types[i] == KeggObservation.TYPE_FORMATION:
            continue
        # The row is appended right away; the leave-one-out fields keep
        # these defaults when the observation cannot be cross-validated.
        rowdict = {'Observation': self.obs_ids[i]}
        rowdict[sym + ' (obs)'] = self.obs_values[0, i]
        rowdict[sym + ' (fit)'] = fit_results[0, i]
        rowdict[sym + ' (res)'] = residuals[0, i]
        rowdict['LOO ' + sym + ' (fit)'] = np.nan
        rowdict['LOO ' + sym + ' (res)'] = np.nan
        rowdict['sortkey'] = 0
        rowdicts.append(rowdict)
        logging.info('Fit Error = %.1f' % residuals[0, i])

        # leave out the row corresponding with observation 'i'
        logging.info('Cross validation, leaving-one-out: ' + self.obs_ids[i])
        # a plain list of the other indices (works on Python 2 and 3)
        subset = [j for j in range(n_obs) if j != i]
        loo_group_contributions, loo_nullspace = LinearRegression.LeastSquares(
            self.group_matrix[:, subset], self.obs_values[:, subset])

        if loo_nullspace.shape[1] > self.group_nullspace.shape[1]:
            logging.warning('example %d is not linearly dependent in the other examples' % i)
            continue
        rowdict['LOO ' + sym + ' (fit)'] = float(
            np.dot(loo_group_contributions, self.group_matrix[:, i]))
        rowdict['LOO ' + sym + ' (res)'] = \
            rowdict['LOO ' + sym + ' (fit)'] - self.obs_values[0, i]
        rowdict['sortkey'] = abs(rowdict['LOO ' + sym + ' (res)'])
        logging.info('LOO Error = %.1f' % rowdict['LOO ' + sym + ' (res)'])

    logging.info("writing the table of estimation errors for each compound")
    self.html_writer.write('</br><b>Cross validation table</b>')
    self.html_writer.insert_toggle(start_here=True)
    self.html_writer.write('<font size="1">\n')
    obs_vec = np.matrix([row[sym + ' (obs)'] for row in rowdicts])
    resid_vec = np.matrix([row[sym + ' (res)'] for row in rowdicts])
    rmse = rms_flat(resid_vec.flat)

    loo_resid_vec = np.matrix([row['LOO ' + sym + ' (res)']
                               for row in rowdicts])
    # observations without a leave-one-out estimate (NaN) are left out
    loo_rmse = rms_flat(loo_resid_vec[np.isfinite(loo_resid_vec)].flat)

    self.html_writer.write_ul(['fit RMSE = %.1f [kJ/mol]' % rmse,
                               'leave-one-out RMSE = %.1f [kJ/mol]' % loo_rmse])
    logging.info("Goodness of fit: RMSE = %.1f [kJ/mol]" % rmse)
    logging.info("Leave-one-out test: RMSE = %.1f [kJ/mol]" % loo_rmse)

    headers = ['Observation',
               sym + ' (obs)',
               sym + ' (fit)',
               sym + ' (res)',
               'LOO ' + sym + ' (fit)',
               'LOO ' + sym + ' (res)']
    rowdicts.sort(key=lambda x: x['sortkey'], reverse=True)
    self.html_writer.write_table(rowdicts, headers, decimal=1)
    self.html_writer.write('</font>\n')
    self.html_writer.div_end()

    self.html_writer.write('</br><b>Cross-validation figure</b>')
    self.html_writer.insert_toggle(start_here=True)

    obs_vs_err_fig = plt.figure(figsize=[6.0, 6.0], dpi=100)
    plt.plot(obs_vec.T, resid_vec.T, '.')
    plt.xlabel('Observation')
    plt.ylabel('Estimated (PGC) Residuals')
    plt.hold(True)
    # label the observations whose fit residual exceeds twice the RMSE
    for row in rowdicts:
        if abs(row[sym + ' (res)']) > 2 * rmse:
            plt.text(row[sym + ' (obs)'], row[sym + ' (res)'],
                     row['Observation'], fontsize=4,
                     figure=obs_vs_err_fig)
    plt.title('Observed vs. Fitted (PGC) Residuals', figure=obs_vs_err_fig)
    self.html_writer.embed_matplotlib_figure(obs_vs_err_fig)
    self.html_writer.div_end()
def verify_results(self, html_writer, thermodynamics, name=None):
    """Compare estimated reaction energies with the selected NIST rows.

    For each row returned by ``self.SelectRowsFromNist()`` the reaction
    energy is predicted with ``thermodynamics`` and compared to the measured
    ``dG0_r``.  Three figures (estimated vs. observed, RMSE binned by pH and
    a residual histogram) and a table of all rows are written to HTML.

    Args:
        html_writer: the writer that receives the figures and the table.
        thermodynamics: a Thermodynamics object that provides dG estimates.
        name: optional prefix for the embedded figure names ("_eval", "_pH"
            and "_hist" are appended); figures are embedded without a name
            when it is empty or None.

    Returns:
        A pair (N, rmse): the number of rows with a finite estimate and the
        RMSE computed over the per-reaction RMSE values.  (0, 0) is returned
        when no row has a finite estimate.
    """
    def _embed(fig, suffix):
        # Embed a figure, naming it only when the caller supplied a prefix.
        if name:
            html_writer.embed_matplotlib_figure(fig, name=name + suffix)
        else:
            html_writer.embed_matplotlib_figure(fig)

    obs_key = symbol_dr_G0_prime + ' (obs)' if False else None  # placeholder, set below

    dG0_obs_vec = []
    dG0_est_vec = []

    # A mapping from each evaluation method (NIST calls separates them to
    # A, B, C and D) to the results of the relevant measurements
    evaluation_map = {}
    rowdicts = []         # every row, for the table
    finite_rowdicts = []  # only rows with a finite estimate, for the plots

    eval_to_label = {'A': 'high quality',
                     'B': 'low quality',
                     'C': 'low quality',
                     'D': 'low quality',
                     'E': 'low quality'}

    for row_data in self.SelectRowsFromNist():
        obs_key = symbol_dr_G0_prime + ' (obs)'
        est_key = symbol_dr_G0_prime + ' (est)'

        label = eval_to_label[row_data.evaluation]
        measured_and_predicted = evaluation_map.setdefault(label, ([], []))

        reaction = row_data.reaction
        rowdict = {}
        rowdict[obs_key] = np.round(row_data.dG0_r, 1)
        rowdict['_reaction'] = reaction
        rowdict['reaction'] = reaction.to_hypertext(show_cids=False)
        if reaction.rid is not None:
            rowdict['rid'] = '<a href="%s">R%05d</a>' % (reaction.get_link(),
                                                         reaction.rid)
        else:
            rowdict['rid'] = ''
        rowdict['pH'] = row_data.pH
        rowdict['pMg'] = row_data.pMg
        rowdict['I'] = row_data.I
        rowdict['T'] = row_data.T
        rowdict['eval.'] = row_data.evaluation
        rowdict['url'] = '<a href="%s">%s</a>' % (row_data.url,
                                                  row_data.ref_id)

        dG0_est = row_data.PredictReactionEnergy(thermodynamics)
        if np.isfinite(dG0_est):
            dG0_obs_vec.append(row_data.dG0_r)
            dG0_est_vec.append(dG0_est)
            measured_and_predicted[0].append(row_data.dG0_r)
            measured_and_predicted[1].append(dG0_est)

            rowdict[est_key] = np.round(dG0_est, 1)
            rowdict['residual'] = np.round(row_data.dG0_r - dG0_est, 3)
            rowdict['|error|'] = abs(rowdict['residual'])
            # negative key: the largest errors come first in the table
            rowdict['sort_key'] = -rowdict['|error|']
            finite_rowdicts.append(rowdict)
        else:
            # positive key: rows without an estimate go to the bottom
            rowdict['sort_key'] = 1
        rowdicts.append(rowdict)

    rowdicts.sort(key=lambda row: row['sort_key'])

    if not dG0_obs_vec:
        return 0, 0

    # RMSE where every distinct reaction counts once, regardless of how many
    # measurements it has.
    unique_reaction_dict = defaultdict(list)
    for rowdict in finite_rowdicts:
        unique_reaction_dict[rowdict['_reaction']].append(rowdict['|error|'])
    unique_rmse_list = [rms_flat(error_list)
                        for error_list in unique_reaction_dict.values()]
    unique_rmse = rms_flat(unique_rmse_list)

    # RMSE over all individual measurements
    resid = np.array(dG0_obs_vec) - np.array(dG0_est_vec)
    rmse = rms_flat(resid.flat)

    # plot the profile graph
    plt.rcParams['text.usetex'] = False
    plt.rcParams['legend.fontsize'] = 10
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.size'] = 12
    plt.rcParams['lines.linewidth'] = 1
    plt.rcParams['lines.markersize'] = 3

    fig1 = plt.figure(figsize=(6,6), dpi=90)
    # NOTE: plt.hold(True) was dropped; matplotlib 3.0 removed it and
    # successive plot calls always accumulate on the same axes.
    colors = ['purple', 'orange']
    for i, label in enumerate(sorted(evaluation_map.keys())):
        measured, predicted = evaluation_map[label]
        plt.plot(measured, predicted, marker='.', linestyle='None',
                 markerfacecolor=colors[i], markeredgecolor=colors[i],
                 markersize=5, label=label, figure=fig1)

    plt.legend(loc='lower right')
    plt.text(-50, 40, r'RMSE = %.1f [kJ/mol]' % (unique_rmse),
             fontsize=14, figure=fig1)
    plt.xlabel(r'observed $\Delta_r G^{\'\circ}$ [kJ/mol]',
               fontsize=14, figure=fig1)
    plt.ylabel(r'estimated $\Delta_r G^{\'\circ}$ [kJ/mol]',
               fontsize=14, figure=fig1)
    plt.plot([-60, 60], [-60, 60], 'k--', figure=fig1)
    plt.axis([-60, 60, -60, 60])
    _embed(fig1, "_eval")

    fig2 = plt.figure(figsize=(6,6), dpi=90)
    binned_plot(x=[rowdict['pH'] for rowdict in finite_rowdicts],
                y=[rowdict['|error|'] for rowdict in finite_rowdicts],
                bins=[5,6,7,8,9],
                y_type='rmse',
                figure=fig2)
    plt.xlim((4, 11))
    plt.ylim((0, 12))
    plt.title(r'effect of pH', fontsize=14, figure=fig2)
    plt.xlabel('pH', fontsize=14, figure=fig2)
    plt.ylabel(r'RMSE ($\Delta_r G^{\'\circ}$) [kJ/mol]',
               fontsize=14, figure=fig2)
    _embed(fig2, "_pH")

    fig3 = plt.figure(figsize=(6,6), dpi=90)
    plt.hist([rowdict['residual'] for rowdict in finite_rowdicts],
             bins=np.arange(-50, 50, 0.5))
    plt.title(r'RMSE = %.1f [kJ/mol]' % rmse, fontsize=14, figure=fig3)
    plt.xlabel(r'residual $\Delta_r G^{\'\circ}$ [kJ/mol]',
               fontsize=14, figure=fig3)
    plt.ylabel(r'no. of measurements', fontsize=14, figure=fig3)
    _embed(fig3, "_hist")

    table_headers = ["#", "|error|",
                     symbol_dr_G0_prime + " (obs)",
                     symbol_dr_G0_prime + " (est)",
                     "reaction", "rid", "pH", "pMg", "I", "T",
                     "eval.", "url"]
    html_writer.write_table(rowdicts, table_headers, decimal=1)

    return len(dG0_obs_vec), unique_rmse
def show_rotation_spacing(filename):
    """Detect and display the rotation and line spacing of an image of text.

    The image is demeaned and Radon-transformed; the projection angle whose
    profile has the largest RMS is taken as the text direction, and the
    strongest peak in the spectrum of that profile gives the line spacing.
    Both values are printed and four diagnostic subplots are shown.

    If the image is rotated by the inverse of the printed rotation, the lines
    will be horizontal (though they may be upside-down depending on the
    original image).

    It doesn't work with black borders.

    Args:
        filename: path of the image file; it is converted to grayscale.
    """
    from skimage.transform import radon
    from PIL import Image
    from numpy import asarray, mean, array, blackman
    import numpy
    from numpy.fft import rfft
    import matplotlib.pyplot as plt

    def rms_flat(a):
        # Root-mean-square of all elements.  Defined locally because
        # matplotlib.mlab.rms_flat was removed in matplotlib 3.1.
        return numpy.sqrt(numpy.mean(numpy.abs(a) ** 2))

    try:
        # More accurate peak finding from
        # https://gist.github.com/endolith/255291#file-parabolic-py
        from parabolic import parabolic

        def argmax(x):
            return parabolic(x, numpy.argmax(x))[0]
    except ImportError:
        from numpy import argmax

    # Load file, converting to grayscale
    I = asarray(Image.open(filename).convert('L'))
    I = I - mean(I)  # Demean; make the brightness extend above and below zero
    plt.subplot(2, 2, 1)
    plt.imshow(I)

    # Do the radon transform and display the result
    sinogram = radon(I)

    plt.subplot(2, 2, 2)
    plt.imshow(sinogram.T, aspect='auto')
    plt.gray()

    # Find the RMS value of each row and find "busiest" rotation,
    # where the transform is lined up perfectly with the alternating dark
    # text and white lines
    r = array([rms_flat(line) for line in sinogram.transpose()])
    rotation = argmax(r)
    print('Rotation: {:.2f} degrees'.format(90 - rotation))
    plt.axhline(rotation, color='r')

    # Plot the busy row.  `rotation` is a float when the parabolic peak
    # interpolation is in use, so the column is selected with the integer
    # peak position instead of the interpolated one.
    row = sinogram[:, int(numpy.argmax(r))]
    N = len(row)
    plt.subplot(2, 2, 3)
    plt.plot(row)

    # Take spectrum of busy row and find line spacing
    window = blackman(N)
    spectrum = rfft(row * window)
    plt.plot(row * window)
    frequency = argmax(abs(spectrum))
    # float() forces true division; on Python 2 an integer peak index would
    # otherwise truncate the spacing to whole pixels.
    line_spacing = float(N) / frequency  # pixels
    print('Line spacing: {:.2f} pixels'.format(line_spacing))

    plt.subplot(2, 2, 4)
    plt.plot(abs(spectrum))
    plt.axvline(frequency, color='r')
    plt.yscale('log')
    plt.show()
def two_way_comparison(self, html_writer, thermo1, thermo2, name=None):
    """Compare the estimation errors of two different evaluation methods.

    Every row from ``self.SelectRowsFromNist()`` is predicted with both
    estimators; rows for which a prediction raises MissingReactionEnergy are
    skipped.  Two figures (error vs. error, and estimated vs. observed for
    each estimator) and a table sorted by the disagreement between the two
    estimators are written to HTML.

    Args:
        html_writer: the writer that receives the figures and the table.
        thermo1: a Thermodynamics object that provides dG estimates.
        thermo2: a Thermodynamics object that provides dG estimates.
        name: optional prefix for the embedded figure names ("_corr" and
            "_eval" are appended); when None the figures are embedded
            without a name.

    Returns:
        (0, 0) when no row could be estimated by both methods, otherwise
        None.
    """
    def _embed(fig, suffix):
        # name defaults to None, and None + "_corr" raises TypeError, so the
        # figure is embedded unnamed in that case.
        if name is None:
            html_writer.embed_matplotlib_figure(fig)
        else:
            html_writer.embed_matplotlib_figure(fig, name=name + suffix)

    total_list = []
    for row_data in self.SelectRowsFromNist():
        try:
            dG0_pred1 = row_data.PredictReactionEnergy(thermo1)
            dG0_pred2 = row_data.PredictReactionEnergy(thermo2)
        except MissingReactionEnergy as e:
            logging.debug("the reaction in (%s) cannot be estimated: %s" %
                          (row_data.ref_id, str(e)))
            continue

        total_list.append([row_data.dG0_r, dG0_pred1, dG0_pred2,
                           row_data.reaction, row_data.pH, row_data.pMg,
                           row_data.I, row_data.T, row_data.evaluation,
                           row_data.url])

    if not total_list:
        return 0, 0

    # plot the profile graph
    plt.rcParams['text.usetex'] = False
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.size'] = 8
    plt.rcParams['lines.linewidth'] = 2
    plt.rcParams['lines.markersize'] = 2
    plt.rcParams['figure.dpi'] = 100

    # columns: 0 = observed, 1 = estimate of thermo1, 2 = estimate of thermo2
    data_mat = np.array(total_list)

    fig1 = plt.figure(figsize=(4,4))
    # NOTE: plt.hold(True) was dropped; matplotlib 3.0 removed it and
    # successive plot calls always accumulate on the same axes.
    error1 = data_mat[:,0]-data_mat[:,1]
    error2 = data_mat[:,0]-data_mat[:,2]

    max_err = max(error1.max(), error2.max())
    min_err = min(error1.min(), error2.min())
    plt.plot([min_err, max_err], [min_err, max_err], 'k--', figure=fig1)
    plt.plot(error1, error2, '.', figure=fig1)
    plt.title("Error Comparison per Reaction (in kJ/mol)")
    plt.xlabel(thermo1.name, figure=fig1)
    plt.ylabel(thermo2.name, figure=fig1)
    _embed(fig1, "_corr")

    fig2 = plt.figure(figsize=(7,3))
    for i, thermo in enumerate([thermo1, thermo2]):
        fig2.add_subplot(1,2,i+1)
        plt.plot(data_mat[:,0], data_mat[:,i+1], 'b.')
        rmse = rms_flat((data_mat[:,0] - data_mat[:,i+1]).flat)
        plt.text(-50, 40, r'RMSE = %.1f [kJ/mol]' % (rmse))
        plt.xlabel(r'observed $\Delta G_r^\circ$ from NIST [kJ/mol]')
        plt.ylabel(r'estimated $\Delta G_r^\circ$ using %s [kJ/mol]'
                   % thermo.name)
        plt.plot([-60, 60], [-60, 60], 'k--')
        plt.axis([-60, 60, -60, 60])
    _embed(fig2, "_eval")

    key1 = "dG'0 (%s)" % thermo1.name
    key2 = "dG'0 (%s)" % thermo2.name
    table_headers = ["dG'0 (obs)", key1, key2, "reaction", "rid",
                     "pH", "pMg", "I", "T", "eval.", "url"]

    dict_list = []
    # Largest disagreement between the two estimators first.
    # ('lambda(x): ...' is a syntax error on Python 3.)
    for row in sorted(total_list, key=lambda r: abs(r[1] - r[2]),
                      reverse=True):
        d = {}
        d["dG'0 (obs)"] = '%.1f' % row[0]
        d[key1] = '%.1f' % row[1]
        d[key2] = '%.1f' % row[2]
        d['reaction'] = row[3].to_hypertext(show_cids=False)
        if row[3].rid is not None:
            d['rid'] = '<a href="%s">R%05d</a>' % (row[3].get_link(),
                                                   row[3].rid)
        else:
            d['rid'] = ''
        d['pH'] = '%.1f' % row[4]
        d['pMg'] = '%.1f' % row[5]
        d['I'] = '%.2f' % row[6]
        d['T'] = '%.1f' % row[7]
        d['eval.'] = row[8]
        if row[9]:
            d['url'] = '<a href="%s">link</a>' % row[9]
        else:
            d['url'] = ''
        dict_list.append(d)
    html_writer.write_table(dict_list, table_headers)
# Script fragment: estimate the text rotation of the image in `filename` from
# its Radon transform and take the spectrum of the strongest projection.
# NOTE(review): `asarray`, `Image`, `mean`, `radon`, `array`, `rms_flat`,
# `argmax`, `blackman`, `rfft`, `plt` and `filename` are not defined in this
# fragment; presumably they are imported/assigned earlier -- confirm.

# Load the file as a grayscale array
I = asarray(Image.open(filename).convert('L'))
I = I - mean(I)  # Demean; make the brightness extend above and below zero
plt.subplot(2, 2, 1)
plt.imshow(I)

# Do the radon transform and display the result
sinogram = radon(I)

plt.subplot(2, 2, 2)
plt.imshow(sinogram.T, aspect='auto')
plt.gray()

# Find the RMS value of each row and find "busiest" rotation,
# where the transform is lined up perfectly with the alternating dark
# text and white lines
r = array([rms_flat(line) for line in sinogram.transpose()])
rotation = argmax(r)
print('Rotation: {:.2f} degrees'.format(90 - rotation))
plt.axhline(rotation, color='r')

# Plot the busy row
# NOTE(review): `rotation` is used as an array index here, so `argmax` must
# return an integer -- confirm which `argmax` is in scope.
row = sinogram[:, rotation]
N = len(row)
plt.subplot(2, 2, 3)
plt.plot(row)

# Take spectrum of busy row and find line spacing
window = blackman(N)
spectrum = rfft(row * window)
plt.plot(row * window)  # the windowed row is overlaid on the same subplot
frequency = argmax(abs(spectrum))
def main():
    """Run the Pseudoisomeric Reactant Contributions (PRC) regression.

    Formation energies are fitted to the stoichiometric data by least squares
    with a set of anchored ("fixed") compounds.  The stoichiometric matrix,
    the observed energies and the compound list are cached in text files
    under ``../res/prc_*.txt`` and reused when ``prc_S.txt`` already exists.

    Outputs:
        ../res/regression_fast.html -- the report (matrices, tables, plot)
        ../res/prc_results.csv      -- formation energies per compound
        ../res/prc_latex.txt        -- the same table in LaTeX form
    """
    kegg = Kegg.getInstance()
    prefix = "../res/prc_"

    # a dictionary from CID to pairs of (nH, dG0)
    fixed_cids = {
        # Alberty formation energies directly measured, linearly independent:
        1: (2, -237.19),    # H2O
        9: (1, -1096.1),    # HPO3(-2)
        14: (4, -79.31),    # NH4(+1)
        59: (0, -744.53),   # SO4(-2)
        288: (1, -586.77),  # HCO3(-1)
        # Alberty zeros:
        3: (26, 0.0),       # NAD(ox)
        10: (32, 0.0),      # CoA
        127: (30, 0.0),     # glutathione(ox)
        376: (28, 0.0),     # retinal(ox)
        # Directly measured values
        4: (27, 22.65),     # NAD(red) -- relative to NAD(ox)
        212: (13, -194.5),  # adenosine
    }
    # Not anchored:
    #   294: (12, -409.2)  inosine - linearly dependent on other 'anchors'
    # Alberty zeros which are not in NIST:
    #   524: ( 0, 0.0)  cytochrome c(ox)
    #    16: (31, 0.0)  FAD(ox)
    #   139: ( 0, 0.0)  ferredoxin(ox)
    #    61: (19, 0.0)  FMN(ox)
    #   343: ( 0, 0.0)  thioredoxin(ox)
    #   399: (90, 0.0)  ubiquinone(ox)

    public_db = SqliteDatabase("../data/public_data.sqlite")
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(
        public_db, "alberty_pseudoisomers", label=None, name="Alberty"
    )
    alberty_cid2dG0 = {}
    alberty_cid2nH = {}
    for cid in alberty.get_all_cids():
        pmap = alberty.cid2PseudoisomerMap(cid)
        dG0, _dG0_tag, nH, _z, _nMg = pmap.GetMostAbundantPseudoisomer(
            pH=default_pH, I=default_I, pMg=default_pMg, T=default_T
        )
        alberty_cid2nH[cid] = nH
        alberty_cid2dG0[cid] = dG0

    if not os.path.exists(prefix + "S.txt"):
        db = SqliteDatabase("../res/gibbs.sqlite")
        nist_regression = NistRegression(db)

        # nH per compound: anchors first, then Alberty, then the
        # dissociation table, and 0 as the last resort.
        cid2nH = {}
        for cid in nist_regression.nist.GetAllCids():
            if cid in fixed_cids:
                cid2nH[cid] = fixed_cids[cid][0]
            elif cid in alberty_cid2nH:
                cid2nH[cid] = alberty_cid2nH[cid]
            else:
                tmp = nist_regression.dissociation.GetMostAbundantPseudoisomer(
                    cid, pH=default_pH, I=default_I, pMg=default_pMg, T=default_T
                )
                if tmp is not None:
                    cid2nH[cid] = tmp[0]
                else:
                    logging.warning(
                        "The most abundant pseudoisomer of %s (C%05d) "
                        "cannot be resolved. Using nH = 0." % (kegg.cid2name(cid), cid)
                    )
                    cid2nH[cid] = 0

        S, dG0, cids = nist_regression.ReverseTransform(cid2nH=cid2nH)

        # export the raw data matrices to text files
        C = np.array([[cid, cid2nH.get(cid, 0)] for cid in cids])
        np.savetxt(prefix + "CID.txt", C, fmt="%d", delimiter=",")
        np.savetxt(prefix + "S.txt", S, fmt="%g", delimiter=",")
        np.savetxt(prefix + "dG0.txt", dG0, fmt="%.2f", delimiter=",")
    else:
        # reuse the cached matrices
        C = np.loadtxt(prefix + "CID.txt", delimiter=",")
        cids = [int(cid) for cid in C[:, 0]]
        cid2nH = {}
        for i, cid in enumerate(cids):
            cid2nH[cid] = int(C[i, 1])
        S = np.loadtxt(prefix + "S.txt", delimiter=",")
        dG0 = np.loadtxt(prefix + "dG0.txt", delimiter=",")
        dG0 = np.reshape(dG0, (dG0.shape[0], 1))

    html_writer = HtmlWriter("../res/regression_fast.html")
    html_writer.write("<h1>Pseudoisomeric Reactant Contributions</h1>\n")
    html_writer.write("<p>The stoichiometric matrix (S):")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, S, cids)
    html_writer.div_end()
    html_writer.write("</p>")

    # Map the column index of every anchored compound to its fixed dG0.
    # (The extended stoichiometric matrix that used to be stacked here was
    # never read afterwards, so it is no longer built.)
    index2value = {}
    for cid in fixed_cids:
        _nH, dG0_fixed = fixed_cids[cid]
        index2value[cids.index(cid)] = dG0_fixed

    x, _K = LinearRegression.LeastSquaresWithFixedPoints(S, dG0, index2value)
    cid2dG0 = {}
    for i, cid in enumerate(cids):
        cid2dG0[cid] = x[i]

    # Calculate the Kernel of the reduced stoichiometric matrix (after removing
    # the columns of the fixed compounds).
    cids_red = [cid for cid in cids if cid not in fixed_cids]
    index_red = [i for i in range(len(cids)) if i not in index2value]
    S_red = S[:, index_red]
    K_red = LinearRegression.Kernel(S_red)

    # Find all CIDs that are completely determined and do not depend on any
    # free variable. In other words, all zero-columns in the reduced kernel.
    determined_indices = np.where(np.sum(abs(K_red), 0) < 1e-10)[0]
    determined_cids = [cids_red[i] for i in determined_indices]

    dict_list = []
    plot_data = []
    for i, cid in enumerate(cids):
        d = {
            "CID": "C%05d" % cid,
            "Compound": kegg.cid2name(cid),
            "nH": "%d" % cid2nH[cid],
            "dG0 (PRC)": "%.1f" % cid2dG0[cid],
        }

        if cid in alberty_cid2dG0:
            d["dG0 (Alberty)"] = "%.1f" % alberty_cid2dG0[cid]
            # anchored compounds agree with Alberty by construction, so they
            # are left out of the comparison plot
            if cid not in fixed_cids:
                plot_data.append((alberty_cid2dG0[cid], cid2dG0[cid], kegg.cid2name(cid)))
        else:
            d["dG0 (Alberty)"] = ""

        if cid in fixed_cids:
            d["Depends on"] = "anchored"
        elif cid in determined_cids:
            d["Depends on"] = "fixed compounds"
        else:
            d["Depends on"] = "kernel dimensions"

        dict_list.append(d)

    # 'lambda (x): ...' is a syntax error on Python 3; this form runs on both.
    dict_list.sort(key=lambda row: (row["Depends on"], row["CID"]))
    html_writer.write("<p>Formation energies determined by the linear constraints:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(
        dict_list, headers=["#", "Compound", "CID", "nH", "dG0 (PRC)", "dG0 (Alberty)", "Depends on"]
    )
    html_writer.write("</font>")
    html_writer.div_end()
    html_writer.write("</p>")

    # Plot a comparison between PRC and Alberty formation energies
    fig = plt.figure(figsize=(8, 8), dpi=80)
    plt.plot([p[0] for p in plot_data], [p[1] for p in plot_data], "b.", figure=fig)
    for px, py, label in plot_data:
        plt.text(px, py, label, fontsize=6)
    # raw strings: "\D" and "\c" are invalid escape sequences otherwise
    plt.xlabel(r"Alberty $\Delta_f G^\circ$")
    plt.ylabel(r"PRC $\Delta_f G^\circ$")
    html_writer.write("<p>Plot comparing PRC and Alberty results:")
    html_writer.insert_toggle(start_here=True)
    html_writer.embed_matplotlib_figure(fig)
    html_writer.div_end()
    html_writer.write("</p>")

    K_sparse = SparseKernel(S_red).Solve()
    html_writer.write("<p>The sparse null-space of the reduced stoichiometric matrix:")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, K_sparse, cids_red)
    html_writer.div_end()
    html_writer.write("</p>")

    # Names of the free variables, one per row of the sparse kernel.
    n_free = K_sparse.shape[0]
    index2string_html = dict((i, "V<sub>%02d</sub>" % i) for i in range(n_free))
    index2string = dict((i, "V%d" % i) for i in range(n_free))

    dict_list = []
    for i, cid in enumerate(cids_red):
        d = {}
        d["KEGG ID"] = '<a href="%s">C%05d</a>' % (kegg.cid2link(cid), cid)
        d["KEGG ID plain"] = "C%05d" % cid
        d["Compound"] = kegg.cid2name(cid)
        d["nH"] = "%d" % cid2nH[cid]

        if cid in alberty_cid2dG0:
            d["dG0 (Alberty)"] = "%.1f" % alberty_cid2dG0[cid]
        else:
            d["dG0 (Alberty)"] = ""

        d["dG0 (PRC)"] = "%.1f" % cid2dG0[cid]
        d["dG0 (PRC) plain"] = "%.1f" % cid2dG0[cid]

        # 0/1 flags of the free variables this compound depends on, reversed
        # so that sorting groups compounds by their highest-numbered variable
        indic = np.where(abs(K_sparse[:, i]) > 1e-10, 1, 0).tolist()
        indic.reverse()
        d["order_key"] = indic

        # append the free-variable combination only when it is non-zero
        if mlab.rms_flat(K_sparse[:, i]) > 1e-10:
            d["dG0 (PRC)"] += " + (" + vector2string(K_sparse[:, i], index2string_html) + ")"
            d["dG0 (PRC) plain"] += " + (" + vector2string(K_sparse[:, i], index2string) + ")"
        dict_list.append(d)

    dict_list.sort(key=lambda row: (row["order_key"], row["KEGG ID plain"]))

    # Export the results to CSV.  The file is opened in a `with` block so it
    # is flushed and closed even if writing a row fails.
    with open("../res/prc_results.csv", "w") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["KEGG ID", "Compound", "nH", "dG0 (PRC)", "dG0 (Alberty)"])
        for d in dict_list:
            csv_writer.writerow(
                [d["KEGG ID plain"], d["Compound"], d["nH"], d["dG0 (PRC) plain"], d["dG0 (Alberty)"]]
            )

    html_writer.write("<p>All formation energies as a function of the free variables:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(dict_list, headers=["#", "KEGG ID", "Compound", "nH", "dG0 (PRC)", "dG0 (Alberty)"])
    html_writer.write("</font>")
    html_writer.div_end()
    html_writer.write("</p>")

    with open("../res/prc_latex.txt", "w") as fp:
        fp.write(
            latex.table2LaTeX(
                dict_list, headers=["#", "KEGG ID plain", "Compound", "nH", "dG0 (PRC) plain", "dG0 (Alberty)"]
            )
        )
data = np.loadtxt(DATA_FNAME, dtype='float', delimiter=',') #plt.plot(data[:, 0], data[:, 1], '.') feist_idx = set(np.nonzero(np.isfinite(data[:, 1]))[0].flat) ugcm_idx = set(np.nonzero(np.isfinite(data[:, 2]))[0].flat) nist_idx = set(np.nonzero(np.isfinite(data[:, 3]))[0].flat) comp_idx = list(feist_idx.intersection(ugcm_idx).intersection(nist_idx)) minG, maxG = (np.min(data[comp_idx, 0]), np.max(data[comp_idx, 0])) plt.figure(figsize=(10, 5), dpi=90) plt.subplot(1,2,1) err_feist_nist = data[comp_idx, 1] - data[comp_idx, 3] rms_feist_nist = rms_flat(err_feist_nist) plt.plot(data[comp_idx, 1], data[comp_idx, 3], '.g') plt.plot([minG, maxG], [minG, maxG], ':k') plt.ylabel('TECRDB observation [kJ/mol]') plt.xlabel('value in iAF1260 [kJ/mol]') plt.title('N = %d, RMSE = %.1f [kJ/mol]' % (len(comp_idx), rms_feist_nist)) plt.subplot(1,2,2) err_ugcm_nist = data[comp_idx, 2] - data[comp_idx, 3] rms_ugcm_nist = rms_flat(err_ugcm_nist) plt.plot(data[comp_idx, 2], data[comp_idx, 3], '.g') plt.plot([minG, maxG], [minG, maxG], ':k') plt.ylabel('TECRDB observation [kJ/mol]') plt.xlabel('UGCM estimation [kJ/mol]') plt.title('N = %d, RMSE = %.1f [kJ/mol]' % (len(comp_idx), rms_ugcm_nist)) plt.tight_layout()
def two_way_comparison(html_writer, thermo1, thermo2, reaction_list, name=None):
    """Compare the reaction energies predicted by two estimation methods.

    Every reaction is predicted with both estimators at pH 7, pMg 14,
    I = 0.1 and T = 298.15; reactions for which a prediction raises
    MissingReactionEnergy are skipped.  A scatter plot of the finite pairs
    and a table sorted by the absolute difference are written to HTML.

    Args:
        html_writer: the writer that receives the figure and the table.
        thermo1: a Thermodynamics object that provides dG estimates.
        thermo2: a Thermodynamics object that provides dG estimates.
        reaction_list: the reactions to compare.
        name: optional prefix for the embedded figure name ("_eval" is
            appended); when None the figure is embedded without a name.

    Returns:
        (0, 0) when no reaction could be estimated by both methods,
        otherwise None.
    """
    pH, pMg, I, T = (7, 14, 0.1, 298.15)

    total_list = []
    for reaction in reaction_list:
        try:
            dG0_pred1 = reaction.PredictReactionEnergy(thermo1, pH=pH, pMg=pMg, I=I, T=T)
            dG0_pred2 = reaction.PredictReactionEnergy(thermo2, pH=pH, pMg=pMg, I=I, T=T)
        except MissingReactionEnergy:
            continue
        total_list.append([dG0_pred1, dG0_pred2, reaction])

    if not total_list:
        return 0, 0

    # plot the profile graph
    plt.rcParams['text.usetex'] = False
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.size'] = 8
    plt.rcParams['lines.linewidth'] = 2
    plt.rcParams['lines.markersize'] = 2
    plt.rcParams['figure.dpi'] = 100

    data_mat = np.array([(row[0], row[1]) for row in total_list])
    # rows where both predictions are finite (a NaN makes the row sum NaN)
    non_nan = list(np.isfinite(data_mat.sum(1)).nonzero()[0].flat)

    fig2 = plt.figure(figsize=(5, 5))
    plt.plot(data_mat[non_nan, 0], data_mat[non_nan, 1], 'b.')
    rmse = rms_flat((data_mat[non_nan, 0] - data_mat[non_nan, 1]).flat)
    plt.text(-50, 40, r'RMSE = %.1f [kJ/mol]' % (rmse))
    plt.xlabel(r'$\Delta G_r^\circ$ from %s [kJ/mol]' % thermo1.name)
    plt.ylabel(r'$\Delta G_r^\circ$ from %s [kJ/mol]' % thermo2.name)
    plt.plot([-200, 200], [-200, 200], 'k--')
    plt.axis([-200, 200, -200, 200])

    # name defaults to None, and None + "_eval" raises TypeError, so the
    # figure is embedded unnamed in that case.
    if name is None:
        html_writer.embed_matplotlib_figure(fig2)
    else:
        html_writer.embed_matplotlib_figure(fig2, name=name + "_eval")

    key1 = "dG'0 (%s)" % thermo1.name
    key2 = "dG'0 (%s)" % thermo2.name
    table_headers = ["#", '|diff|', key1, key2, "reaction", "rid"]

    dict_list = []
    for pred1, pred2, reaction in total_list:
        d = {}
        # rows with a missing prediction get a zero difference, which puts
        # them at the bottom of the table
        if np.isnan(pred1) or np.isnan(pred2):
            d["|diff|"] = 0
        else:
            d["|diff|"] = abs(pred1 - pred2)
        d[key1] = pred1
        d[key2] = pred2
        d['reaction'] = reaction.to_hypertext(show_cids=True)
        if reaction.rid is not None:
            d['rid'] = '<a href="%s">R%05d</a>' % (reaction.get_link(), reaction.rid)
        else:
            d['rid'] = ''
        dict_list.append(d)
    dict_list.sort(key=lambda d: d['|diff|'], reverse=True)
    html_writer.write_table(dict_list, table_headers, decimal=1)