def loadData(design):
    param_bounds, param_names, params_no, problem = setupProblem(design)
    samples, rows_to_keep = getSamples(design, params_no, param_bounds)
    df = pd.DataFrame({'mu0': samples[:, 0], 'sigma0': samples[:, 1],
                       'mu1': samples[:, 2], 'sigma1': samples[:, 3],
                       'p00': samples[:, 4], 'p11': samples[:, 5]})
    return df
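# --- Illustrative sketch (not part of the original scripts): the bounds
# filtering that getSamples() is assumed to perform, based on the recurring
# comment "find which samples are still in param_bounds after flipping
# misidentified wet and dry states". It assumes param_bounds is a
# (params_no, 2) array of [lower, upper] pairs; rows of the sample matrix
# falling outside any bound are dropped and their indices returned as
# rows_to_keep. The helper name is hypothetical.
import numpy as np

def _filter_samples_sketch(samples, param_bounds):
    """Return the in-bounds samples and their row indices."""
    bounds = np.asarray(param_bounds)
    lower, upper = bounds[:, 0], bounds[:, 1]
    in_bounds = np.all((samples >= lower) & (samples <= upper), axis=1)
    rows_to_keep = np.where(in_bounds)[0]
    return samples[rows_to_keep], rows_to_keep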
def main():
    if len(argv) != 8:
        print(
            'Usage: FsPlusRpPlusGfs NormalFilePath TumorFilePath TargetDimensionalityFS TargetDimensionalityRP TargetDimensionalityGFS NumberOfNormalSamples NumberOfTumorSamples'
        )
        exit()

    """Parameters for executing the script"""
    # File Path of Normal People's Data
    normalFilePath = argv[1]
    # File Path of Tumor People's Data
    tumorFilePath = argv[2]
    # The Dimensionality of Subspace after Feature Selection Stage
    targetDimensionalityFS = int(argv[3])
    # The Dimensionality of Subspace after Random Projection Stage
    targetDimensionalityRP = int(argv[4])
    # The Dimensionality of Subspace after Greedy Feature Selection Stage
    targetDimensionalityGFS = int(argv[5])
    # Number of Testing Samples for Normal People
    numberOfNormalTestingSamples = int(argv[6])
    # Number of Testing Samples for Tumor People
    numberOfTumorTestingSamples = int(argv[7])

    """Read data from files"""
    # Samples of Normal People
    normalSamples = getSamples(normalFilePath)
    numberOfNormalSamples = len(normalSamples)
    # Samples of Tumor People
    tumorSamples = getSamples(tumorFilePath)
    numberOfTumorSamples = len(tumorSamples)
    # All Samples
    samples = normalSamples + tumorSamples
    numberOfSamples = len(samples)
    # Sample Indexes in List: samples
    normalSampleIndexes = range(0, numberOfNormalSamples)
    tumorSampleIndexes = range(numberOfNormalSamples, numberOfSamples)
    numberOfGenes = len(samples[0]) if len(samples) != 0 else 0
    print('Original Data Matrix: {} Samples with {} Genes'.format(
        numberOfSamples, numberOfGenes))

    """Data preprocessing"""
    zScoreNormalization(samples)

    """Runtime Result"""
    trainingErrorSamples, tp, fp, fn, tn = \
        featureSelectionPlusRandomProjectionPlusGreedyFeatureSelectionProceduce(
            samples, normalSampleIndexes, tumorSampleIndexes, numberOfGenes,
            targetDimensionalityFS, targetDimensionalityRP,
            targetDimensionalityGFS, numberOfNormalTestingSamples,
            numberOfTumorTestingSamples)
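# --- Illustrative sketch (not part of the original script): a minimal
# in-place z-score normalization consistent with how zScoreNormalization()
# is called above, assuming `samples` is a list of equal-length lists with
# one expression value per gene. Each gene (column) is centered on its mean
# and scaled by its standard deviation across all samples; the population
# standard deviation is an assumption here.
import statistics

def _z_score_normalization_sketch(samples):
    numberOfGenes = len(samples[0]) if samples else 0
    for g in range(numberOfGenes):
        column = [sample[g] for sample in samples]
        mean = statistics.mean(column)
        stdev = statistics.pstdev(column)
        for sample in samples:
            # guard against constant genes (zero variance)
            sample[g] = (sample[g] - mean) / stdev if stdev != 0 else 0.0
    # modifies `samples` in place, mirroring the call in main()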
def makeFigure8_ResponseSurfaces():
    sns.set_style("white")

    # constants, vectors
    design = 'LHsamples_wider_1000_AnnQonly'
    structure = '53_ADC022'
    short_idx = np.arange(2, 22, 2)
    demand_idx = np.arange(1, 21, 2)
    percentiles = [50, 90]
    nrealizations = 10
    nyears = 105
    nmonths = 12

    # plotting characteristics
    shortage_cmap = mpl.cm.get_cmap('RdBu_r')
    colors = ['#de2d26', '#fb6a4a', '#3182bd', '#6baed6', '#a50f15',
              '#08519c', '#9e9ac8']

    # find which samples are still in param_bounds after flipping misidentified wet and dry states
    param_bounds, param_names, params_no, problem = setupProblem(design)
    samples, rows_to_keep = getSamples(design, params_no, param_bounds)
    nsamples = len(rows_to_keep)

    # load historical shortage data and convert acre-ft to m^3
    hist_short = np.loadtxt('../Simulation_outputs/' + structure +
                            '_info_hist.txt')[:, 2] * 1233.48
    hist_demand = np.loadtxt('../Simulation_outputs/' + structure +
                             '_info_hist.txt')[:, 1] * 1233.48
    # replace failed runs with np.nan (currently -999.9)
    hist_short[hist_short < 0] = np.nan

    # load shortage data for this experimental design
    SYN = np.load('../Simulation_outputs/' + design + '/' + structure +
                  '_info.npy')
    # extract columns for yearly shortage and demand and convert acre-ft to m^3
    SYN_short = SYN[:, short_idx, :] * 1233.48
    SYN_demand = SYN[:, demand_idx, :] * 1233.48
    # use just the samples within the experimental design
    SYN_short = SYN_short[:, :, rows_to_keep]
    SYN_demand = SYN_demand[:, :, rows_to_keep]
    # replace failed runs with np.nan (currently -999.9)
    SYN_short[SYN_short < 0] = np.nan

    # identify droughts at percentiles
    syn_magnitude = calc_syn_magnitude(nyears, nmonths, nrealizations,
                                       nsamples, percentiles, SYN_short)
    # reshape synthetic shortage data into 12*nyears x nsamples*nrealizations
    SYN_short = SYN_short.reshape([np.shape(SYN_short)[0],
                                   np.shape(SYN_short)[1] * np.shape(SYN_short)[2]])
    SYN_demand = SYN_demand.reshape([np.shape(SYN_demand)[0],
                                     np.shape(SYN_demand)[1] * np.shape(SYN_demand)[2]])

    # create data frames of shortage and SOWs
    CMIPsamples = np.loadtxt('../Qgen/CMIPunscaled_SOWs.txt')[:, 7:13]
    PaleoSamples = np.loadtxt('../Qgen/Paleo_SOWs.txt')[:, 7:13]
    CMIP = pd.DataFrame(data=np.repeat(CMIPsamples, nrealizations, axis=0),
                        columns=param_names)
    Paleo = pd.DataFrame(data=np.repeat(PaleoSamples, nrealizations, axis=0),
                         columns=param_names)
    dta = pd.DataFrame(data=np.repeat(samples, nrealizations, axis=0),
                       columns=param_names)
    R2_scores = pd.read_csv('../Simulation_outputs/' + design + '/' +
                            structure + '_R2.csv')

    fig, axes = plt.subplots(2, 3, figsize=(19.2, 9.5))
    fig.subplots_adjust(hspace=0.3, wspace=0.3)

    # plot shortage distribution for this structure under all-encompassing experiment
    ax1 = axes[0, 0]
    handles, labels = plotSDC(ax1, SYN_short, SYN_demand, hist_short,
                              hist_demand, nsamples, nrealizations)
    ax1.set_ylim([0, 6200000])
    ax1.ticklabel_format(style='sci', axis='y', scilimits=(6, 6))
    ax1.tick_params(axis='both', labelsize=14)
    ax1.set_ylabel('Shortage (m' + r'$^3$' + ')', fontsize=14)
    # add lines at percentiles
    for percentile in percentiles:
        ax1.plot([percentile, percentile], [0, 6200000], c='k')

    # plot variance decomposition for this structure under all-encompassing experiment
    ax2 = axes[1, 0]
    S1_values = pd.read_csv('../Simulation_outputs/' + design + '/' +
                            structure + '_S1.csv')
    plotSums(S1_values, ax2, colors)
    ax2.set_ylim([0, 1])
    ax2.tick_params(axis='both', labelsize=14)
    ax2.set_ylabel('Portion of Variance', fontsize=14)
    ax2.set_xlabel('Shortage Percentile', fontsize=14)
    # add lines at percentiles
    for percentile in percentiles:
        ax2.plot([percentile, percentile], [0, 1], c='k')

    for i in range(len(percentiles)):
        # get shortage magnitudes at this percentile
        dta['Shortage'] = syn_magnitude[i, :]
        # find average shortage across realizations in each SOW
        avg_dta = dta.groupby(['mu0', 'mu1', 'sigma0', 'sigma1', 'p00', 'p11'],
                              as_index=False)[['Shortage']].mean()
        percentile_scores = R2_scores[str(int(percentiles[i] - 1))]
        if percentile_scores[0] > 0:
            # get top two predictors of shortage
            top_two = list(np.argsort(percentile_scores)[::-1][:2])
            predictors = [param_names[top_two[0]], param_names[top_two[1]]]
            avg_dta['Interaction'] = avg_dta[predictors[0]] * avg_dta[predictors[1]]
            # fit OLS model with top two predictors and their interaction
            result = fitOLS_interact(avg_dta, predictors)
            xgrid = np.arange(param_bounds[top_two[0]][0],
                              param_bounds[top_two[0]][1],
                              np.around((param_bounds[top_two[0]][1] -
                                         param_bounds[top_two[0]][0]) / 100,
                                        decimals=4))
            ygrid = np.arange(param_bounds[top_two[1]][0],
                              param_bounds[top_two[1]][1],
                              np.around((param_bounds[top_two[1]][1] -
                                         param_bounds[top_two[1]][0]) / 100,
                                        decimals=4))
            # plot average shortage in each SOW and prediction from regression
            plotResponseSurface(axes[0, i + 1], result, avg_dta, CMIP, Paleo,
                                shortage_cmap, shortage_cmap, xgrid, ygrid,
                                predictors[0], predictors[1], otherSOWs=False)
            axes[0, i + 1].set_title(str(percentiles[i]) + 'th Percentile',
                                     fontsize=16)
            # plot prediction from regression with CMIP and Paleo samples on top
            plotResponseSurface(axes[1, i + 1], result, avg_dta, CMIP, Paleo,
                                shortage_cmap, shortage_cmap, xgrid, ygrid,
                                predictors[0], predictors[1], otherSOWs=True)

    fig.savefig('Figure8_ResponseSurfaces.pdf')
    fig.clf()
    return None
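# --- Illustrative sketch (not part of the original scripts): one plausible
# implementation of fitOLS_interact() as called above, assuming it fits an
# ordinary least squares model of 'Shortage' on the two top predictors plus
# the precomputed 'Interaction' column using statsmodels. The helper name
# with the leading underscore is hypothetical.
import statsmodels.api as sm

def _fit_ols_interact_sketch(df, predictors):
    """Regress Shortage on predictors and their interaction; return the fit."""
    X = sm.add_constant(df[predictors + ['Interaction']])
    return sm.OLS(df['Shortage'], X).fit()

# Usage mirroring the call site: result = _fit_ols_interact_sketch(avg_dta, predictors),
# after which result.predict() can be evaluated over the xgrid/ygrid mesh.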
def makeFigureS18_FactorMaps_User3():
    sns.set_style("white")

    # constants, vectors
    design = 'LHsamples_wider_1000_AnnQonly'
    structure = '3704614'
    short_idx = np.arange(2, 22, 2)
    demand_idx = np.arange(1, 21, 2)
    percentiles = [40, 90]
    nrealizations = 10

    # plotting characteristics
    probability_cmap = mpl.cm.get_cmap('RdBu')
    success_cmap = mpl.colors.ListedColormap(
        np.array([[227, 26, 28], [166, 206, 227]]) / 255.0)
    contour_levels = np.arange(0.0, 1.05, 0.1)

    # find which samples are still in param_bounds after flipping misidentified wet and dry states
    param_bounds, param_names, params_no, problem = setupProblem(design)
    samples, rows_to_keep = getSamples(design, params_no, param_bounds)
    nsamples = len(rows_to_keep)

    # load historical shortage data and convert acre-ft to m^3
    hist_short = np.loadtxt('../Simulation_outputs/' + structure +
                            '_info_hist.txt')[:, 2] * 1233.48
    hist_demand = np.loadtxt('../Simulation_outputs/' + structure +
                             '_info_hist.txt')[:, 1] * 1233.48
    # replace failed runs with np.nan (currently -999.9)
    hist_short[hist_short < 0] = np.nan

    # load shortage data for this experimental design
    SYN = np.load('../Simulation_outputs/' + design + '/' + structure +
                  '_info.npy')
    # extract columns for yearly shortage and demand and convert acre-ft to m^3
    SYN_short = SYN[:, short_idx, :] * 1233.48
    SYN_demand = SYN[:, demand_idx, :] * 1233.48
    # use just the samples within the experimental design
    SYN_short = SYN_short[:, :, rows_to_keep]
    SYN_demand = SYN_demand[:, :, rows_to_keep]
    # replace failed runs with np.nan (currently -999.9)
    SYN_short[SYN_short < 0] = np.nan
    # reshape synthetic shortage data into 12*nyears x nsamples*nrealizations
    SYN_short = SYN_short.reshape([np.shape(SYN_short)[0],
                                   np.shape(SYN_short)[1] * np.shape(SYN_short)[2]])
    SYN_demand = SYN_demand.reshape([np.shape(SYN_demand)[0],
                                     np.shape(SYN_demand)[1] * np.shape(SYN_demand)[2]])

    # create data frames of shortage and SOWs
    dta = pd.DataFrame(data=np.repeat(samples, nrealizations, axis=0),
                       columns=param_names)

    fig, axes = plt.subplots(2, 4, figsize=(24.3, 9.1))
    fig.subplots_adjust(hspace=0.5, right=0.8, wspace=0.5)

    # plot shortage distribution for this structure under all-encompassing experiment
    ax1 = axes[0, 0]
    handles, labels = plotSDC(ax1, SYN_short, SYN_demand, hist_short,
                              hist_demand, nsamples, nrealizations, True)
    ax1.set_ylim([0, 1])
    ax1.tick_params(axis='both', labelsize=14)
    ax1.set_ylabel('Shortage/Demand', fontsize=14)
    ax1.set_xlabel('Shortage Percentile', fontsize=14)
    # add lines at percentiles
    for percentile in percentiles:
        ax1.plot([percentile, percentile], [0, 1], c='k')

    # plot failure heatmap for this structure under all-encompassing experiment
    ax2 = axes[1, 0]
    allSOWs, historic_percents, frequencies, magnitudes, gridcells, im = \
        plotFailureHeatmap(ax2, design, structure)
    addPercentileBlocks(historic_percents, gridcells, percentiles, ax2)
    allSOWsperformance = allSOWs / 100
    historic_percents = [roundup(x) for x in historic_percents]
    #all_pseudo_r_scores = calcPseudoR2(frequencies, magnitudes, params_no, allSOWsperformance, dta, structure, design)
    all_pseudo_r_scores = pd.read_csv('../Simulation_outputs/' + design + '/' +
                                      structure + '_pseudo_r_scores.csv')

    for i in range(len(percentiles)):
        for j in range(3):
            # magnitude of shortage at this percentile to plot
            h = np.where(np.array(historic_percents) == 100 - percentiles[i])[0][0]
            if j == 0:
                h -= 2
            elif j == 2:
                h += 2
            # find out if each realization was a success or failure at this magnitude/frequency combination
            dta['Success'] = allSOWsperformance[
                list(frequencies).index(100 - percentiles[i]), h, :]
            # consider each SOW a success if 50% or more realizations were a success
            avg_dta = dta.groupby(['mu0', 'mu1', 'sigma0', 'sigma1', 'p00', 'p11'],
                                  as_index=False)[['Success']].mean()
            avg_dta.loc[np.where(avg_dta['Success'] >= 0.5)[0], 'Success'] = 1
            avg_dta.loc[np.where(avg_dta['Success'] < 0.5)[0], 'Success'] = 0
            # load pseudo R^2 of predictors for this magnitude/frequency combination
            pseudo_r_scores = all_pseudo_r_scores[
                str(100 - percentiles[i]) + 'yrs_' + str(magnitudes[h]) + 'prc'].values
            if pseudo_r_scores.any():
                top_predictors = np.argsort(pseudo_r_scores)[::-1][:2]
                ranges = param_bounds[top_predictors]
                # define grid of x (1st predictor) and y (2nd predictor) dimensions
                # to plot contour map over
                xgrid = np.arange(param_bounds[top_predictors[0]][0],
                                  param_bounds[top_predictors[0]][1],
                                  np.around((ranges[0][1] - ranges[0][0]) / 100,
                                            decimals=4))
                ygrid = np.arange(param_bounds[top_predictors[1]][0],
                                  param_bounds[top_predictors[1]][1],
                                  np.around((ranges[1][1] - ranges[1][0]) / 100,
                                            decimals=4))
                all_predictors = [dta.columns.tolist()[k] for k in top_predictors]
                # fit logistic regression model with top two predictors of success and their interaction
                # (note: the interaction is computed within avg_dta; multiplying by the
                # unaggregated dta would misalign rows)
                avg_dta['Interaction'] = avg_dta[all_predictors[0]] * avg_dta[all_predictors[1]]
                result = fitLogit_interact(avg_dta, all_predictors)
                # plot success/failure for each SOW on top of logistic regression estimate of probability of success
                contourset = plotFactorMap(axes[i, j + 1], result, avg_dta,
                                           probability_cmap, success_cmap,
                                           contour_levels, xgrid, ygrid,
                                           all_predictors[0], all_predictors[1])
                axes[i, j + 1].set_title(
                    'Success if ' + str(magnitudes[h]) + '% shortage\n<' +
                    str(100 - percentiles[i]) + '% of the time', fontsize=16)

    cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    cbar = fig.colorbar(contourset, cax=cbar_ax)
    cbar.ax.set_ylabel('Predicted Probability of Success', rotation=-90,
                       va='bottom', fontsize=16)
    cbar.ax.tick_params(labelsize=16)
    fig.savefig('FigureS18_FactorMaps_User3.pdf')
    fig.clf()
    return None
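# --- Illustrative sketch (not part of the original scripts): one plausible
# implementation of fitLogit_interact() as called above, assuming it fits a
# logistic regression of the binary 'Success' column on the two top
# predictors plus the precomputed 'Interaction' column using statsmodels.
# The helper name with the leading underscore is hypothetical.
import statsmodels.api as sm

def _fit_logit_interact_sketch(df, predictors):
    """Regress Success on predictors and their interaction; return the fit."""
    X = sm.add_constant(df[predictors + ['Interaction']])
    return sm.Logit(df['Success'], X).fit()

# The fitted result's predict() would then supply the probability-of-success
# surface that plotFactorMap() contours over xgrid and ygrid.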
def makeFigureS11_VarianceDecomposition_User3():
    sns.set_style("white")

    # constants, vectors
    designs = ['LHsamples_original_1000_AnnQonly', 'CMIPunscaled_SOWs',
               'Paleo_SOWs', 'LHsamples_wider_1000_AnnQonly']
    nsamples = [1000, 97, 366, 1000]  # before removing those out of bounds
    titles = ['Box Around Historical', 'CMIP', 'Paleo', 'All-Encompassing']
    structure = '3704614'
    nrealizations = 10
    short_idx = np.arange(2, 22, 2)
    demand_idx = np.arange(1, 21, 2)

    # legend proxy artists for the variance decomposition colors
    colors = ["#de2d26", "#fb6a4a", "#3182bd", "#6baed6", "#a50f15",
              "#08519c", "#9e9ac8"]
    mu0 = plt.Rectangle((0, 0), 1, 1, fc=colors[0], edgecolor='none')
    sigma0 = plt.Rectangle((0, 0), 1, 1, fc=colors[1], edgecolor='none')
    mu1 = plt.Rectangle((0, 0), 1, 1, fc=colors[2], edgecolor='none')
    sigma1 = plt.Rectangle((0, 0), 1, 1, fc=colors[3], edgecolor='none')
    p00 = plt.Rectangle((0, 0), 1, 1, fc=colors[4], edgecolor='none')
    p11 = plt.Rectangle((0, 0), 1, 1, fc=colors[5], edgecolor='none')
    Interact = plt.Rectangle((0, 0), 1, 1, fc=colors[6], edgecolor='none')

    # perform variance decomposition
    #for i, design in enumerate(designs):
    #    Sobol_per_structure(design, structure)

    # plot shortage distributions
    fig = plt.figure()
    count = 1  # subplot counter

    # load historical shortage and demand data and convert acre-ft to m^3
    hist_short = np.loadtxt('../Simulation_outputs/' + structure +
                            '_info_hist.txt')[:, 2] * 1233.48 / 1E6
    hist_demand = np.loadtxt('../Simulation_outputs/' + structure +
                             '_info_hist.txt')[:, 1] * 1233.48 / 1E6
    # replace failed runs with np.nan (currently -999.9)
    hist_short[hist_short < 0] = np.nan

    for i, design in enumerate(designs):
        # find which samples are still in param_bounds after flipping misidentified wet and dry states
        param_bounds, param_names, params_no, problem = setupProblem(design)
        _, rows_to_keep = getSamples(design, params_no, param_bounds)
        nsamples[i] = len(rows_to_keep)  # after removing those out of bounds after reclassification

        # load shortage data for this experimental design
        SYN = np.load('../Simulation_outputs/' + design + '/' + structure +
                      '_info.npy')
        # extract columns for yearly shortage and demand and convert acre-ft to m^3
        SYN_short = SYN[:, short_idx, :] * 1233.48 / 1E6
        SYN_demand = SYN[:, demand_idx, :] * 1233.48 / 1E6
        # use just the samples within the experimental design
        SYN_short = SYN_short[:, :, rows_to_keep]
        SYN_demand = SYN_demand[:, :, rows_to_keep]
        # reshape into 12*nyears x nsamples*nrealizations
        SYN_short = SYN_short.reshape([np.shape(SYN_short)[0],
                                       np.shape(SYN_short)[1] * np.shape(SYN_short)[2]])
        SYN_demand = SYN_demand.reshape([np.shape(SYN_demand)[0],
                                         np.shape(SYN_demand)[1] * np.shape(SYN_demand)[2]])
        # replace failed runs with np.nan (currently -999.9)
        SYN_short[SYN_short < 0] = np.nan

        # plot shortage distribution
        ax = fig.add_subplot(2, 4, count)
        handles, labels = plotSDC(ax, SYN_short, SYN_demand, hist_short,
                                  hist_demand, nsamples[i], nrealizations)
        # only put labels on left column, make y ranges consistent, title experiment
        if count == 1:
            ax.tick_params(axis='y', labelsize=14)
            ax.set_ylabel('Annual Shortage\n(millions of m' + r'$^3$' + ')',
                          fontsize=16)
        else:
            ax.tick_params(axis='y', labelleft='off')
        ax.set_title(titles[count - 1], fontsize=16)
        ax.tick_params(axis='x', labelbottom='off')
        # iterate subplot counter
        count += 1

    # plot variance decomposition
    for design in designs:
        # load sensitivity indices
        S1_values = pd.read_csv('../Simulation_outputs/' + design + '/' +
                                structure + '_S1.csv')
        # plot variance decomposition
        ax = fig.add_subplot(2, 4, count)
        plotSums(S1_values, ax, colors)
        ax.tick_params(axis='x', labelsize=14)
        if count == 5:
            ax.tick_params(axis='y', labelsize=14)
            ax.set_ylabel('Portion of\nVariance Explained', fontsize=16)
        else:
            ax.tick_params(axis='y', labelleft='off')
        # iterate subplot counter
        count += 1

    fig.set_size_inches([16, 8])
    fig.subplots_adjust(bottom=0.22)
    fig.text(0.5, 0.15, 'Percentile of Shortage', ha='center', fontsize=16)
    fig.savefig('FigureS11_VarianceDecomposition_User3.pdf')
    fig.clf()
    return None
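# --- Illustrative sketch (not part of the original scripts): the reshape
# used throughout these functions. With C-ordered data of shape
# (time, nrealizations, nsamples), collapsing the last two axes yields one
# column per realization-sample pair, with the sample index varying fastest.
import numpy as np

_demo = np.arange(2 * 3 * 4).reshape(2, 3, 4)   # (time, realizations, samples)
_flat = _demo.reshape(_demo.shape[0], _demo.shape[1] * _demo.shape[2])
assert _flat.shape == (2, 12)
# the first realization's samples occupy the first block of columns
assert (_flat[:, :4] == _demo[:, 0, :]).all()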
def makeFigure6_ShortageDistns():
    sns.set_style("white")

    # constants, vectors
    designs = ['LHsamples_original_1000_AnnQonly', 'CMIPunscaled_SOWs',
               'Paleo_SOWs', 'LHsamples_wider_1000_AnnQonly']
    nsamples = [1000, 97, 366, 1000]  # before removing those out of bounds
    titles = ['Box Around Historical', 'CMIP', 'Paleo', 'All-Encompassing']
    structures = ['53_ADC022', '7200645']
    nrealizations = 10
    short_idx = np.arange(2, 22, 2)
    demand_idx = np.arange(1, 21, 2)

    fig = plt.figure()
    count = 1  # subplot counter
    for structure in structures:
        # load historical shortage and demand data and convert acre-ft to m^3
        hist_short = np.loadtxt('../Simulation_outputs/' + structure +
                                '_info_hist.txt')[:, 2] * 1233.48 / 1E6
        hist_demand = np.loadtxt('../Simulation_outputs/' + structure +
                                 '_info_hist.txt')[:, 1] * 1233.48 / 1E6
        # replace failed runs with np.nan (currently -999.9)
        hist_short[hist_short < 0] = np.nan
        for i, design in enumerate(designs):
            # find which samples are still in param_bounds after flipping misidentified wet and dry states
            param_bounds, param_names, params_no, problem = setupProblem(design)
            _, rows_to_keep = getSamples(design, params_no, param_bounds)
            nsamples[i] = len(rows_to_keep)  # after removing those out of bounds after reclassification

            # load shortage data for this experimental design
            SYN = np.load('../Simulation_outputs/' + design + '/' + structure +
                          '_info.npy')
            # extract columns for yearly shortage and demand and convert acre-ft to m^3
            SYN_short = SYN[:, short_idx, :] * 1233.48 / 1E6
            SYN_demand = SYN[:, demand_idx, :] * 1233.48 / 1E6
            # use just the samples within the experimental design
            SYN_short = SYN_short[:, :, rows_to_keep]
            SYN_demand = SYN_demand[:, :, rows_to_keep]
            # reshape into 12*nyears x nsamples*nrealizations
            SYN_short = SYN_short.reshape([np.shape(SYN_short)[0],
                                           np.shape(SYN_short)[1] * np.shape(SYN_short)[2]])
            SYN_demand = SYN_demand.reshape([np.shape(SYN_demand)[0],
                                             np.shape(SYN_demand)[1] * np.shape(SYN_demand)[2]])
            # replace failed runs with np.nan (currently -999.9)
            SYN_short[SYN_short < 0] = np.nan

            # plot shortage distribution
            ax = fig.add_subplot(2, 4, count)
            handles, labels = plotSDC(ax, SYN_short, SYN_demand, hist_short,
                                      hist_demand, nsamples[i], nrealizations)
            # only put labels on bottom row/left column, make y ranges consistent, title experiment
            if count == 1 or count == 5:
                ax.tick_params(axis='y', labelsize=14)
            else:
                ax.tick_params(axis='y', labelleft='off')
            if count <= 4:
                ax.tick_params(axis='x', labelbottom='off')
                ax.set_title(titles[count - 1], fontsize=16)
                ax.set_ylim(0, 6.2)
                #ax.ticklabel_format(style='sci', axis='y', scilimits=(6,6))
            else:
                ax.tick_params(axis='x', labelsize=14)
                ax.set_ylim(0, 370)
                #ax.ticklabel_format(style='sci', axis='y', scilimits=(8,8))
            # iterate subplot counter
            count += 1

    fig.set_size_inches([16, 8])
    fig.text(0.5, 0.15, 'Percentile', ha='center', fontsize=16)
    fig.text(0.05, 0.5, 'Annual Shortage (millions of m' + r'$^3$' + ')',
             va='center', rotation=90, fontsize=16)
    fig.subplots_adjust(bottom=0.22)
    # reorder legend entries so frequencies read across columns
    labels_transposed = [labels[9], labels[4], labels[8], labels[3], labels[7],
                         labels[2], labels[6], labels[1], labels[5], labels[0]]
    handles_transposed = [handles[9], handles[4], handles[8], handles[3],
                          handles[7], handles[2], handles[6], handles[1],
                          handles[5], handles[0]]
    legend = fig.legend(handles=handles_transposed, labels=labels_transposed,
                        fontsize=16, loc='lower center',
                        title='Cumulative frequency in experiment', ncol=5)
    plt.setp(legend.get_title(), fontsize=16)
    fig.savefig('Figure6_ShortageDistns.pdf')
    fig.clf()
    return None
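# --- Illustrative usage (not part of the original scripts): the figure
# functions above take no arguments and read their inputs from
# ../Simulation_outputs/ and ../Qgen/, so a driver is assumed to simply
# call them in sequence once those directories are populated.
if __name__ == '__main__':
    makeFigure6_ShortageDistns()
    makeFigure8_ResponseSurfaces()
    makeFigureS11_VarianceDecomposition_User3()
    makeFigureS18_FactorMaps_User3()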