def optimize_trials(max_exponent, desired_count=3, thread_count=10,
                    high_percentile=95, low_percentile=5, tolerance=0.01):
    queue = multiprocessing.Queue()
    start_time = time.time()
    for exponent in xrange(1, max_exponent + 1):
        trials = 10 ** exponent
        threads = []
        for t in xrange(thread_count):
            thread = multiprocessing.Process(
                target=get_probability,
                kwargs={"trials": trials, "desired_heads": desired_count, "queue": queue}
            )
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()
        data = []
        while not queue.empty():
            data.append(queue.get())
        percentile_range = numpy.percentile(data, high_percentile) - numpy.percentile(data, low_percentile)
        if percentile_range <= tolerance:
            return DataTuple(
                probability=numpy.median(data),
                exponent=exponent,
                range=percentile_range,
                elapsed=time.time() - start_time,
            )
    return DataTuple(
        probability=numpy.median(data),
        exponent=exponent,
        range=percentile_range,
        elapsed=time.time() - start_time
    )
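# Illustrative sketch (not part of the original): the stopping rule used by
# optimize_trials, reduced to plain numpy without multiprocessing -- repeat an
# estimate at increasing sample sizes and stop once the 5th-95th percentile
# spread of the repeated estimates falls below the tolerance.
import numpy as np

def converged_estimate(estimator, tolerance=0.01, repeats=10, max_exponent=7):
    for exponent in range(1, max_exponent + 1):
        trials = 10 ** exponent
        estimates = [estimator(trials) for _ in range(repeats)]
        spread = np.percentile(estimates, 95) - np.percentile(estimates, 5)
        if spread <= tolerance:
            break
    return np.median(estimates), exponent, spread

# Example: probability of exactly 3 heads in 10 fair coin flips.
estimate = lambda n: np.mean(np.random.binomial(10, 0.5, size=n) == 3)
print(converged_estimate(estimate))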
def TL_from_sample(dat_sample, analysis = 'partition', out_folder = './out_files/'): """Obtain the empirical and simulated TL relationship given the output file from sample_var(). Here only the summary statistics are recorded for each study, instead of results from each individual sample, because the analysis can be quickly re-done given the input file, without going through the time-limiting step of generating samples from partitions. The input dat_sample is in the same format as defined by get_var_sample_file(). The output file has the following columns: study, empirical b, empirical intercept, empirical R-squared, empirical p-value, mean b, intercept, R-squared from samples, percentage of significant TL in samples (at alpha = 0.05), z-score between empirical and sample b, 2.5 and 97.5 percentile of sample b, z-score between empirical and sample intercept, 2.5 and 97.5 percentile of sample intercept. """ study_list = sorted(np.unique(dat_sample['study'])) for study in study_list: dat_study = dat_sample[dat_sample['study'] == study] emp_b, emp_inter, emp_r, emp_p, emp_std_err = stats.linregress(np.log(dat_study['mean']), np.log(dat_study['var'])) b_list = [] inter_list = [] psig = 0 R2_list = [] for i_sim in dat_sample.dtype.names[5:]: var_sim = dat_study[i_sim][dat_study[i_sim] > 0] # Omit samples of zero variance mean_list = dat_study['mean'][dat_study[i_sim] > 0] sim_b, sim_inter, sim_r, sim_p, sim_std_error = stats.linregress(np.log(mean_list), np.log(var_sim)) b_list.append(sim_b) inter_list.append(sim_inter) R2_list.append(sim_r ** 2) if sim_p < 0.05: psig += 1 psig /= len(dat_sample.dtype.names[5:]) out_file = open(out_folder + 'TL_form_' + analysis + '.txt', 'a') print>>out_file, study, emp_b, emp_inter, emp_r ** 2, emp_p, np.mean(b_list), np.mean(inter_list), np.mean(R2_list), \ psig, get_z_score(emp_b, b_list), np.percentile(b_list, 2.5), np.percentile(b_list, 97.5), get_z_score(emp_inter, inter_list), \ np.percentile(inter_list, 2.5), np.percentile(inter_list, 97.5) out_file.close()
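# Illustrative sketch (synthetic data): get_z_score is not defined in this
# snippet, but the comparison above amounts to scoring the empirical slope
# against the mean and standard deviation of the simulated slopes, alongside
# their 2.5th and 97.5th percentiles.
import numpy as np

def z_score(empirical_value, sample_values):
    sample_values = np.asarray(sample_values, dtype=float)
    return (empirical_value - sample_values.mean()) / sample_values.std()

b_list = np.random.normal(loc=2.0, scale=0.1, size=1000)   # simulated slopes
emp_b = 1.85                                                # empirical slope
print(z_score(emp_b, b_list))
print(np.percentile(b_list, 2.5), np.percentile(b_list, 97.5))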
def signmag_plot(a, b, z, ref): imdata1 = np.sign(ref) cmap1 = plt.cm.RdBu cmap1.set_bad('k', 1) imdata2 = np.log10(np.abs(ref)) cmap2 = plt.cm.YlOrRd cmap2.set_bad('k', 1) fig, axarr = plt.subplots(ncols=2, figsize=(12, 6)) axarr[0].pcolormesh(a, b, imdata1, cmap=cmap1, vmin=-1, vmax=1) im = axarr[1].pcolormesh(a, b, imdata2, cmap=cmap2, vmin=np.percentile(imdata2, 5), vmax=np.percentile(imdata2, 95)) for ax in axarr: ax.set_xlim((np.min(a), np.max(a))) ax.set_ylim((np.min(b), np.max(b))) ax.set_xlabel("a") ax.set_ylabel("b") ax.set(adjustable='box-forced', aspect='equal') fig.subplots_adjust(right=0.8) cbar_ax = fig.add_axes([0.85, 0.15, 0.03, 0.7]) fig.colorbar(im, cax=cbar_ax) axarr[0].set_title("Sign of hyp1f1") axarr[1].set_title("Magnitude of hyp1f1") plt.suptitle("z = {:.2e}".format(np.float64(z))) return fig
def iqr(data):
    return ",".join(
        digits.format(numpy.percentile(data[column], 75) - numpy.percentile(data[column], 25))
        for column in data.columns
    )
def main(argv): map_utilizations = [] reduce_utilizations = [] all_utilizations = [] dirname = argv[0] for filename in os.listdir(dirname): full_name = os.path.join(dirname, filename) if os.path.isfile(full_name) and filename.endswith("job_log"): print "Reading %s" % filename analyzer = parse_logs.Analyzer(full_name) for (id, stage) in analyzer.stages.iteritems(): for task in stage.tasks: for name, block_device_numbers in task.disk_utilization.iteritems(): if name in ["xvdb", "xvdf"]: effective_util = 0 if block_device_numbers[0] > 0: effective_util = (block_device_numbers[1] + block_device_numbers[2]) / block_device_numbers[0] all_utilizations.append(effective_util) if task.has_fetch: reduce_utilizations.append(effective_util) else: map_utilizations.append(effective_util) output_filename = os.path.join(dirname, "disk_utilization_cdf") f = open(output_filename, "w") for percent in range(100): f.write("%s\t%s\t%s\t%s\n" % (percent / 100., numpy.percentile(map_utilizations, percent), numpy.percentile(reduce_utilizations, percent), numpy.percentile(all_utilizations, percent))) f.close()
def bppd_filter(self, images): """ 1. RBG --> HSV 2. Set minimum saturation equal to the mean saturation 3. Set minimum value equal to the mean value 4. Take hues within range from green-yellow to green-blue """ if self.config['VERBOSE']: self.log_msg('BPPD', 'NOTE: Filtering for plants ...') if images == []: raise Exception("No input image(s)!", important=True) a = time.time() masks = [] threshold_min = np.array([self.config['HUE_MIN'], self.config['SAT_MIN'], self.config['VAL_MIN']], np.uint8) threshold_max = np.array([self.config['HUE_MAX'], self.config['SAT_MAX'], self.config['VAL_MAX']], np.uint8) for bgr in images: if bgr is not None: try: hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV) threshold_min[1] = np.percentile(hsv[:,:,1], 100 * self.config['SAT_MIN'] / 255.0) threshold_min[2] = np.percentile(hsv[:,:,2], 100 * self.config['VAL_MIN'] / 255.0) threshold_max[1] = np.percentile(hsv[:,:,1], 100 * self.config['SAT_MAX'] / 255.0) threshold_max[2] = np.percentile(hsv[:,:,2], 100 * self.config['VAL_MAX'] / 255.0) mask = cv2.inRange(hsv, threshold_min, threshold_max) kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(self.config['KERNEL_XY'],self.config['KERNEL_XY'])) mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel) masks.append(mask) if self.config['VERBOSE']: self.log_msg('BPPD', 'OK: Mask #%d was successful' % len(masks)) except Exception as error: self.log_msg('BPPD', str(error), important=True) else: if self.config['VERBOSE']: self.log_msg('BPPD', 'WARN: Mask #%d is blank' % len(masks), important=True) masks.append(None) b = time.time() if self.config['VERBOSE']: self.log_msg('BPPD', '... %.2f ms' % ((b - a) * 1000)) return masks
def _auto_limits(self):
    if self.component_data is None:
        return

    exclude = (100 - self.percentile) / 2.

    # For subsets in 'data' mode, we want to compute the limits based on
    # the full dataset, not just the subset.
    if isinstance(self.data, Subset):
        data_values = self.data.data[self.component_id]
    else:
        data_values = self.data[self.component_id]

    try:
        lower = np.nanpercentile(data_values, exclude)
        upper = np.nanpercentile(data_values, 100 - exclude)
    except AttributeError:  # Numpy < 1.9
        data_values = data_values[~np.isnan(data_values)]
        lower = np.percentile(data_values, exclude)
        upper = np.percentile(data_values, 100 - exclude)

    if isinstance(self.data, Subset):
        lower = 0

    self.set_limits(lower, upper)
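# Minimal sketch of the fallback above: np.nanpercentile (NumPy >= 1.9)
# ignores NaNs directly, while older NumPy needs the NaNs masked out first.
import numpy as np

values = np.array([1.0, np.nan, 3.0, 5.0, np.nan, 9.0])
try:
    lower = np.nanpercentile(values, 2.5)
    upper = np.nanpercentile(values, 97.5)
except AttributeError:  # NumPy < 1.9
    finite = values[~np.isnan(values)]
    lower = np.percentile(finite, 2.5)
    upper = np.percentile(finite, 97.5)
print(lower, upper)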
def _determine_cmap_params(self, plot_data, vmin, vmax, cmap, center, robust):
    """Use some heuristics to set good defaults for colorbar and range."""
    calc_data = plot_data.data[~np.isnan(plot_data.data)]
    if vmin is None:
        vmin = np.percentile(calc_data, 2) if robust else calc_data.min()
    if vmax is None:
        vmax = np.percentile(calc_data, 98) if robust else calc_data.max()
    self.vmin, self.vmax = vmin, vmax

    # Choose default colormaps if not provided
    if cmap is None:
        if center is None:
            self.cmap = cm.rocket
        else:
            self.cmap = cm.icefire
    elif isinstance(cmap, string_types):
        self.cmap = mpl.cm.get_cmap(cmap)
    elif isinstance(cmap, list):
        self.cmap = mpl.colors.ListedColormap(cmap)
    else:
        self.cmap = cmap

    # Recenter a divergent colormap
    if center is not None:
        vrange = max(vmax - center, center - vmin)
        normlize = mpl.colors.Normalize(center - vrange, center + vrange)
        cmin, cmax = normlize([vmin, vmax])
        cc = np.linspace(cmin, cmax, 256)
        self.cmap = mpl.colors.ListedColormap(self.cmap(cc))
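# Short sketch of the "robust" default above, on synthetic data: clipping the
# colour range to the 2nd and 98th percentiles keeps a few extreme values from
# stretching the whole scale.
import numpy as np

data = np.random.standard_normal((100, 100))
data[0, 0] = 50.0                       # one extreme outlier
calc_data = data[~np.isnan(data)]
vmin, vmax = np.percentile(calc_data, 2), np.percentile(calc_data, 98)
print(vmin, vmax)                       # roughly -2 .. +2 despite the outlier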
def plot_wavenvelope(self, ax, w_start, w_end): """ This function plots the envelope of the recording. :param ax: The axis in which you wish to plot. :param w_start: Start of the best window. :param w_end: End of the best window. """ window_size = int(0.05 * self._sample_rate) # 0.050 are 50 milliseconds for the envelope window! w = 1.0 * np.ones(window_size) / window_size envelope = (np.sqrt((np.correlate(self._eod ** 2, w, mode='same') - np.correlate(self._eod, w, mode='same') ** 2)).ravel()) * np.sqrt(2.) upper_bound = np.max(envelope) + np.percentile(envelope, 1) ax.fill_between(self._time[::500], y1=-envelope[::500], y2=envelope[::500], color='purple', alpha=0.5) ax.plot((w_start, w_start), (-upper_bound, upper_bound), 'k--', linewidth=2) ax.plot((w_end, w_end), (-upper_bound, upper_bound), 'k--', linewidth=2) ax.text((w_start + w_end) / 2., upper_bound - np.percentile(envelope, 10), 'Analysis Window', rotation='horizontal', horizontalalignment='center', verticalalignment='center', fontsize=14) ax.set_ylim(-upper_bound, upper_bound) ax.set_xlabel('Time [s]', fontsize=16) ax.set_ylabel('Signal Amplitude [au]', fontsize=16) ax.tick_params(axis='both', which='major', labelsize=14) pass
def evaluate(im, algo, gt_illuminant, i, range_thresh, bin_num, dst_folder): new_im = None start_time = timeit.default_timer() if algo=="grayworld": new_im = cv2.xphoto.autowbGrayworld(im, 0.95) elif algo=="nothing": new_im = im elif algo=="learning_based": new_im = cv2.xphoto.autowbLearningBased(im, None, range_thresh, 0.98, bin_num) elif algo=="GT": gains = gt_illuminant / min(gt_illuminant) g1 = float(1.0 / gains[2]) g2 = float(1.0 / gains[1]) g3 = float(1.0 / gains[0]) new_im = cv2.xphoto.applyChannelGains(im, g1, g2, g3) time = 1000*(timeit.default_timer() - start_time) #time in ms if len(dst_folder)>0: if not os.path.exists(dst_folder): os.makedirs(dst_folder) im_name = ("%04d_" % i) + algo + ".jpg" cv2.imwrite(os.path.join(dst_folder, im_name), stretch_to_8bit(new_im)) #recover the illuminant from the color balancing result, assuming the standard model: estimated_illuminant = [0, 0, 0] eps = 0.01 estimated_illuminant[2] = np.percentile((im[:,:,0] + eps) / (new_im[:,:,0] + eps), 50) estimated_illuminant[1] = np.percentile((im[:,:,1] + eps) / (new_im[:,:,1] + eps), 50) estimated_illuminant[0] = np.percentile((im[:,:,2] + eps) / (new_im[:,:,2] + eps), 50) res = np.arccos(np.dot(gt_illuminant,estimated_illuminant)/ (np.linalg.norm(gt_illuminant) * np.linalg.norm(estimated_illuminant))) return (time, (res / np.pi) * 180)
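# Hedged sketch of the illuminant-recovery step above, on synthetic data:
# under the standard diagonal model, each channel gain is estimated as the
# median (50th percentile) of the ratio between the input image and the
# white-balanced result. The BGR-to-RGB index flip of the original is omitted
# here for brevity.
import numpy as np

im = np.random.rand(64, 64, 3).astype(np.float32)
gains = np.array([0.8, 1.0, 1.2], dtype=np.float32)
new_im = im * gains                      # stand-in for a colour-balanced image
eps = 0.01
estimated = [np.percentile((im[:, :, c] + eps) / (new_im[:, :, c] + eps), 50)
             for c in range(3)]
print(estimated)                         # approximately 1/gain per channel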
def get_mean_vmax(): hostvmaxs = [] hostvmax25s = [] hostvmax75s = [] twentyfifth, fifty, seventyfifth = get_percentile() rootdir = "/Users/catherinefielder/Documents/Research_Halos/HaloDetail" for subdir, dirs, files in os.walk(rootdir): head, tail = os.path.split(subdir) haloname = tail for file in files: if file.endswith("_columnsadded_final"): values = ascii.read( os.path.join(subdir, file), format="commented_header" ) # Get full path and access file hostvmax = values[1]["host_vmax"] hostvmaxs = np.append(hostvmaxs, hostvmax) twentyfifth = np.percentile(hostvmaxs, 25) seventyfifth = np.percentile(hostvmaxs, 75) for i in range(0, len(hostvmaxs)): if hostvmaxs[i] >= seventyfifth: hostvmax75s = np.append(hostvmax75s, hostvmaxs[i]) elif hostvmaxs[i] < twentyfifth: hostvmax25s = np.append(hostvmax25s, hostvmaxs[i]) else: continue sumvmax = np.sum(hostvmaxs) meanvmax = np.divide(sumvmax, len(hostvmaxs)) mean75 = np.mean(hostvmax75s) mean25 = np.mean(hostvmax25s) print "mean" print meanvmax print mean75 print mean25 return meanvmax, mean75, mean25
def plotKineticsScatter(kinArr, outputFileName): handles = [] colors = ['red', 'green', 'blue', 'magenta'] bases = ['A', 'C', 'G', 'T'] fig, ax = _createFigTemplate(dims=(10, 8)) for i in xrange(4): baseHits = kinArr[kinArr['base'] == bases[i]] if baseHits.shape[0] > 0: # Add a bit of scatter to avoid ugly aliasing in plot due to # integer quantization cov = baseHits['coverage'] + 0.25 * \ np.random.randn(baseHits.shape[0]) score = baseHits['score'] + 0.25 * \ np.random.randn(baseHits.shape[0]) pl = ax.scatter(cov, score, c=colors[i], label=bases[ i], lw=0, alpha=0.3, s=12) handles.append(pl) ax.set_xlabel('Per-Strand Coverage') ax.set_ylabel('Modification QV') plt.legend(handles, bases, loc='upper left') if kinArr.shape[0] > 0: ax.set_xlim(0, np.percentile(kinArr['coverage'], 95.0) * 1.4) ax.set_ylim(0, np.percentile(kinArr['score'], 99.9) * 1.3) fig.savefig(outputFileName, dpi=72) plt.close(fig)
def updateStats(self): if self.current_layer is not None: current_attribute = self.dlg.getCurrentAttribute() if current_attribute >= 0: attribute = self.layer_attributes[current_attribute] # check if stats have been calculated before idx = self.checkValuesAvailable(attribute) if idx == -1: self.retrieveAttributeValues(attribute) idx = len(self.attribute_statistics)-1 stats = self.attribute_statistics[idx] # calculate stats of selected objects only select_stats = dict() if self.current_layer.selectedFeatureCount() > 0: self.selection_values, self.selection_ids = uf.getFieldValues(self.current_layer, attribute['name'], null=False, selection=True) sel_values = [val for val in self.selection_values if val != NULL] select_stats['Number'] = len(sel_values) select_stats['Mean'] = uf.truncateNumber(np.mean(sel_values)) select_stats['Std Dev'] = uf.truncateNumber(np.std(sel_values)) select_stats['Variance'] = uf.truncateNumber(np.var(sel_values)) select_stats['Median'] = uf.truncateNumber(np.median(sel_values)) select_stats['Minimum'] = np.min(sel_values) select_stats['Maximum'] = np.max(sel_values) select_stats['Range'] = uf.truncateNumber(select_stats['Maximum']-select_stats['Minimum']) select_stats['1st Quart'] = uf.truncateNumber(np.percentile(sel_values,25)) select_stats['3rd Quart'] = uf.truncateNumber(np.percentile(sel_values,75)) select_stats['IQR'] = uf.truncateNumber(select_stats['3rd Quart']-select_stats['1st Quart']) select_stats['Gini'] = uf.roundNumber(uf.calcGini(sel_values)) else: self.selection_values = [] self.selection_ids = [] # update the dialog self.dlg.setStats(stats, select_stats)
def stat_info(data):
    import matplotlib.pyplot as plt
    D = np.ravel(data)
    U = np.unique(D)
    if len(U) > 1:
        sep = np.min(U[1:] - U[:-1])
        N = min(100, int(np.ceil((np.max(D) - np.min(D)) / sep)))
    else:
        N = 1
    mean = np.mean(D)
    std = np.std(D)
    fig, ax = plt.subplots(2, 1, figsize=(21, 4))
    ax[0].boxplot(D, 0, 'ro', 0)
    ax[1].hist(D, N, density=True)
    ax[1].axvline(mean, color='r', label='mean')
    ax[1].axvline(mean+std, color='r', linestyle='--', label='1$\\sigma$')
    ax[1].axvline(mean-std, color='r', linestyle='--', label='1$\\sigma$')
    if mean-2*std >= U[0]:
        ax[1].axvline(mean-2*std, color='r', linestyle=':', label='2$\\sigma$')
    if mean+2*std <= U[-1]:
        ax[1].axvline(mean+2*std, color='r', linestyle=':', label='2$\\sigma$')
    ax[1].legend()
    print("Stats")
    print("\tAverage:", mean)
    print("\tStandard-deviation:", std)
    print("\tMinimum:", np.min(D))
    print("\tQ1:", np.percentile(D, 25))
    print("\tMedian:", np.percentile(D, 50))
    print("\tQ3:", np.percentile(D, 75))
    print("\tMaximum:", np.max(D))
def getMed(x):
    if len(x) == 0:
        x = np.array([0])
    median = np.percentile(x, 50)
    sigma_min = median - np.percentile(x, 16)
    sigma_max = np.percentile(x, 84) - median
    return median, sigma_min, sigma_max
def _plot_distribution(z, lat, lev, fig, ax, figpath, titlestr, xstr, xl=None, xu=None, bins=None): """Plots a stack of histograms of log10(data) at all levels""" # Initialize the bins and the frequency num_bins = 100 if bins is None: bins = np.linspace(np.percentile(z, .02), np.percentile(z, 99.98), num_bins+1) n = np.zeros((num_bins, lev.size)) # Calculate distribution at each level for i in range(lev.size): n[:, i], _ = np.histogram(z[:, i], bins=bins) # Take a logarithm and deal with case where we take log of 0 n = np.log10(n) n_small = np.amin(n[np.isfinite(n)]) n[np.isinf(n)] = n_small # Plot histogram ca = ax.contourf(bins[:-1], lev, n.T) ax.set_ylim(1, 0) if xl is not None: ax.set_xlim(xl, xu) plt.colorbar(ca, ax=ax) ax.set_xlabel(xstr) ax.set_ylabel(r'$\sigma$') ax.set_title(titlestr) xl, xr = ax.set_xlim() return xl, xr, bins
def handle_data(self):
    current_time = self.current_datetime
    try:
        location = self.date_index.get_loc(current_time)
    except KeyError:
        return
    if location >= 99:
        histories = self.signals.factor[location-99:location]
        current_level = histories[-1]
        upper = np.percentile(histories, 95)
        lower = np.percentile(histories, 5)
        mid_upper = np.percentile(histories, 75)
        mid_lower = np.percentile(histories, 25)
        if current_level > upper:
            self.order_to('ru.xsge', 1, 1)
        elif current_level < lower:
            self.order_to('ru.xsge', -1, 1)
        #elif mid_lower < current_level < mid_upper:
        #    self.order_to('ru.cffex', 1, 0)
        self.keep('factor', current_level)
        self.keep('factor (95%)', upper)
        self.keep('factor (5%)', lower)
        self.keep('factor (75%)', mid_upper)
        self.keep('factor (25%)', mid_lower)
        self.keep('ru.xsge', self.close['ru.xsge'])
    else:
        return
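# Rough, self-contained sketch of the band logic above (synthetic factor
# series): the 5th/25th/75th/95th percentiles of the last 100 observations
# define the trading bands that the current level is compared against.
import numpy as np

factor = np.cumsum(np.random.standard_normal(500))
histories = factor[-100:]
current_level = histories[-1]
upper, mid_upper, mid_lower, lower = np.percentile(histories, [95, 75, 25, 5])
if current_level > upper:
    signal = 1        # go long
elif current_level < lower:
    signal = -1       # go short
else:
    signal = 0        # stay flat
print(signal, lower, mid_lower, mid_upper, upper)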
def meanPlot(self, scale, xIndex=0, yIndex=1): i = 0 nxti = 0 num = 0 sumTimes = 0 numItems = 0 setNxt = False x = [] y = [] error = [[], []] tmp = [] while i < len(self.log[xIndex]): if self.log[xIndex][i] > (num + 1) * scale: if numItems != 0: x.append(num * scale) y.append(np.percentile(tmp, 50)) error[0].append(np.percentile(tmp, 25)) error[1].append(np.percentile(tmp, 75)) i = nxti num += 1 tmp = [] numItems = 0 setNxt = False if self.log[xIndex][i] >= (num - 1) * scale: tmp.append(self.log[yIndex][i]) numItems += 1 if not setNxt: setNxt = True nxti = i i += 1 c = plt.plot(x, y, zorder=10)[0].get_color() plt.fill_between(x, error[0], error[1], color=c, alpha="0.25", zorder=0) plt.show(block=False)
def viz_docwordfreq_sidebyside(P1, P2, title1='', title2='', vmax=None, aspect=None, block=False): from matplotlib import pylab pylab.figure() if vmax is None: vmax = 1.0 P1limit = np.percentile(P1.flatten(), 97) if P2 is not None: P2limit = np.percentile(P2.flatten(), 97) else: P2limit = P1limit while vmax > P1limit and vmax > P2limit: vmax = 0.8 * vmax if aspect is None: aspect = float(P1.shape[1])/P1.shape[0] pylab.subplot(1, 2, 1) pylab.imshow(P1, aspect=aspect, interpolation='nearest', vmin=0, vmax=vmax) if len(title1) > 0: pylab.title(title1) if P2 is not None: pylab.subplot(1, 2, 2) pylab.imshow(P2, aspect=aspect, interpolation='nearest', vmin=0, vmax=vmax) if len(title2) > 0: pylab.title(title2) pylab.show(block=block)
def BootstrapSc(Method, Data, n=10000): """ Bootstrap the calculation of the best fit Sc value n times to get the 95% confidence interval for the best fit Sc. Values of n larger than 10000 will take a long time to run. """ tmp = [] # need to convert the LH,R,CHT data into a serial 1D array before bootstrapping if Method == "raw": for i in range(len(Data[0])): tmp.append(SerializeData(Data[2][i], Data[4][i], Data[3][i])) if Method == "patches": for i in range(len(Data[0])): tmp.append(SerializeData(Data[2][i], Data[10][i], Data[6][i])) if Method == "basins": for i in range(len(Data[0])): tmp.append(SerializeData(Data[5][i], Data[7][i], Data[6][i])) ToSample = np.array(tmp) Scs = [] i = 0 while i < n: print i sample = np.random.choice(ToSample, len(ToSample), replace=True) LH, R, CHT = UnserializeList(sample) sc, _, _, _, _ = optimize.leastsq(Residuals, 0.8, args=(R, LH, CHT), full_output=True) if sc < 2.0: Scs.append(sc[0]) i += 1 # mean upper bound lower bound return np.mean(Scs), np.percentile(Scs, 97.5) - np.mean(Scs), np.mean(Scs) - np.percentile(Scs, 2.5)
def init_model(dataset, metadata, model_path, surprise_depth, experiment): #Initialise the VAE from the given file. dataset_changed = False print "Initalising a VAE from the model file at",model_path+"." #model = globals()[metadata['model_class']](dataset, "data/",selected_hypers=metadata["experiments"][experiment]["best_hypers"]) model = globals()[metadata['model_class']](dataset, "data/",selected_hypers=metadata["best_hypers"]) if "monary_type" in metadata.keys(): model.set_monary_type(metadata["monary_type"]) model.load(model_path) model.init_model_functions() model.metadata = metadata conditional_dists_file = model_path[:-4]+"_surpdist_"+str(surprise_depth)+".csv" metadata["experiments"][experiment]["surprise_distribution"] = model.precalculate_conditional_dists(from_file=True,file_path=conditional_dists_file, depth=surprise_depth) if any(k not in metadata["experiments"][experiment].keys() for k in ["plausibility_distribution","errors_by_length","hidden_rep_averages"]): print "Generating distribution over plausibility for each design in the dataset." plausibilities, errors_by_length,hidden_rep_averages = model.get_dataset_errors(metadata, return_averages_by_length=True, return_hidden_rep_averages=True) print "plausibilities.shape",plausibilities.shape plaus_dist = {} plaus_dist["min"] = float(np.amin(plausibilities)) plaus_dist["max"] = float(np.amax(plausibilities)) plaus_dist["5%"] = float(np.percentile(plausibilities, 5)) plaus_dist["95%"] = float(np.percentile(plausibilities, 95)) plaus_dist["mean"] = float(np.average(plausibilities)) print "plaus_dist",plaus_dist metadata["experiments"][experiment]["plausibility_distribution"] = plaus_dist metadata["experiments"][experiment]["errors_by_length"] = errors_by_length metadata["experiments"][experiment]["hidden_rep_averages"] = hidden_rep_averages dataset_changed = True if dataset_changed: print "Saving updates to the",dataset,"dataset entry." client = pymongo.MongoClient() db = client.creeval db.datasets.save(metadata) return model
def test_random_posterior(self): ndraws = 100000 ssqr_draws = np.empty(ndraws) for i in xrange(ndraws): ssqr_draws[i] = self.sigsqr.random_posterior() nu = self.sigsqr.nu prior_ssqr = self.sigsqr.lamb post_dof = nu + len(self.y) post_ssqr = (nu * prior_ssqr + self.y.size * np.var(self.sigsqr.bart_step.resids)) / post_dof igam_shape = post_dof / 2.0 igam_scale = post_dof * post_ssqr / 2.0 igamma = stats.distributions.invgamma(igam_shape, scale=igam_scale) # test draws from conditional posterior by comparing 1st and 2nd moments to true values true_mean = igamma.moment(1) frac_diff = np.abs(true_mean - ssqr_draws.mean()) / true_mean rpmsg = "Fractional difference in mean from BartVariance.random_posterior() is greater than 2%" self.assertLess(frac_diff, 0.02, msg=rpmsg) true_ssqr = igamma.moment(2) frac_diff = np.abs(true_ssqr - (ssqr_draws.var() + ssqr_draws.mean() ** 2)) / true_ssqr rpmsg = "Fractional difference in 2nd moment from BartVariance.random_posterior() is greater than 2%" self.assertLess(frac_diff, 0.02, msg=rpmsg) # make sure gibbs sampler constrains the correct value ssqr_low = np.percentile(ssqr_draws, 1.0) ssqr_high = np.percentile(ssqr_draws, 99.0) rpmsg = "Value of Variance parameter returned by Gibbs sampler is outside of 99% credibility interval." self.assertGreater(self.true_sigsqr, ssqr_low, msg=rpmsg) self.assertLess(self.true_sigsqr, ssqr_high, msg=rpmsg)
def show_bootstrap_statistics(clf, X, y, features): num_features = len(features) coefs = [] for i in range(num_features): coefs.append([]) for _ in range(BOOTSTRAP_ITERATIONS): X_sample, y_sample = resample(X, y) clf.fit(X_sample, y_sample) for i, c in enumerate(get_normalized_coefs(clf)): coefs[i].append(c) poi_index = features.index('POI') building_index = features.index('Building') coefs[building_index] = coefs[poi_index] intervals = [] print() print('***** Bootstrap statistics *****') print('{:<20}{:<20}{:<10}{:<10}'.format('Feature', '95% interval', 't-value', 'Pr(>|t|)')) print() for i, cs in enumerate(coefs): values = np.array(cs) lo = np.percentile(values, 2.5) hi = np.percentile(values, 97.5) interval = '({:.3f}, {:.3f})'.format(lo, hi) tv = np.mean(values) / np.std(values) pr = (1.0 - t.cdf(x=abs(tv), df=len(values))) * 0.5 stv = '{:.3f}'.format(tv) spr = '{:.3f}'.format(pr) print('{:<20}{:<20}{:<10}{:<10}'.format(features[i], interval, stv, spr))
def test_quantile(self): from numpy import percentile q = self.ts.quantile(0.1) self.assertEqual(q, percentile(self.ts.valid(), 10)) q = self.ts.quantile(0.9) self.assertEqual(q, percentile(self.ts.valid(), 90)) # object dtype q = Series(self.ts, dtype=object).quantile(0.9) self.assertEqual(q, percentile(self.ts.valid(), 90)) # datetime64[ns] dtype dts = self.ts.index.to_series() q = dts.quantile(.2) self.assertEqual(q, Timestamp('2000-01-10 19:12:00')) # timedelta64[ns] dtype tds = dts.diff() q = tds.quantile(.25) self.assertEqual(q, pd.to_timedelta('24:00:00')) # GH7661 result = Series([np.timedelta64('NaT')]).sum() self.assertTrue(result is pd.NaT) msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with tm.assertRaisesRegexp(ValueError, msg): self.ts.quantile(invalid)
def ampDiffStats(ampIm1, ampIm2, osIm1, osIm2, exptime=0.0): stats = np.zeros(shape=(1,), dtype=statDtype) a_i = 0 _s1 = np.median(ampIm1) - np.median(osIm1) _s2 = np.median(ampIm2) - np.median(osIm2) stats[a_i]['signal'] = signal = (_s1 + _s2)/2 stats[a_i]['npix'] = ampIm1.size stats[a_i]['sqrtSig'] = np.sqrt(signal) stats[a_i]['bias'] = (np.median(osIm1) + np.median(osIm2))/2 ampIm = ampIm2.astype('f4') - ampIm1 osIm = osIm2.astype('f4') - osIm1 sig1 = (0.741/np.sqrt(2)) * np.subtract.reduce(np.percentile(ampIm, [75,25])) sig2 = (0.741/np.sqrt(2)) * np.subtract.reduce(np.percentile(osIm, [75,25])) _, trusig1, _ = geom.clippedStats(ampIm) / np.sqrt(2) _, trusig2, _ = geom.clippedStats(osIm) / np.sqrt(2) stats[a_i]['readnoise'] = sig2 stats[a_i]['readnoiseM'] = trusig2 stats[a_i]['shotnoise'] = sig = np.sqrt(np.abs(sig1**2 - sig2**2)) stats[a_i]['shotnoiseM'] = trusig = np.sqrt(np.abs(trusig1**2 - trusig2**2)) stats[a_i]['gain'] = gain = signal/sig**2 stats[a_i]['gainM'] = signal/trusig**2 stats[a_i]['noise'] = sig2*gain stats[a_i]['flux'] = signal/exptime if exptime != 0 else 0.0 return stats, ampIm, osIm
def get_stat_function(statistics, perc=None):
    # Define personalized functions for binned_statistics
    if (statistics == 'mean') | (statistics == 'median'):
        stat_func = statistics
    elif statistics == 'std':
        stat_func = np.std
    elif statistics == 'mse':
        stat_func = lambda x: np.mean(x**2)
    elif statistics == 'frac':
        # stat_func = lambda x: 100.0*np.abs(np.mean(x))/(np.abs(np.mean(x)) + np.std(x)) Wrong decomposition
        stat_func = lambda x: np.sign(np.mean(x))*100.0*np.mean(x)**2/(np.mean(x)**2 + np.std(x)**2)
    elif statistics == 'cv':
        stat_func_ratio = lambda x: np.std(x)/np.mean(x)
        stat_func_diff = lambda x: np.std(x) - np.abs(np.mean(x))  # To compute the CV for an already multiplicative variable (GD)
        stat_func = stat_func_ratio  # default to the ratio definition so the 'cv' branch returns a callable
    elif statistics == 'iqr':
        stat_func = lambda x: np.percentile(x, 75) - np.percentile(x, 25)
    elif statistics == 'percentile':
        if perc is None:
            print('Do not forget to pass the wanted percentile. I will use 50 by default...')
            perc = 50
        stat_func = lambda x: np.percentile(x, perc)
    else:
        print('Wrong statistics asked:', statistics)
        sys.exit(1)
    return stat_func
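# Usage sketch (synthetic data, assumes scipy is available): the callables
# returned by get_stat_function are meant to be handed to
# scipy.stats.binned_statistic as its statistic argument, e.g. the IQR.
import numpy as np
from scipy import stats

x = np.random.rand(1000)
values = np.random.standard_normal(1000)
iqr = lambda v: np.percentile(v, 75) - np.percentile(v, 25)
result = stats.binned_statistic(x, values, statistic=iqr, bins=10)
print(result.statistic)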
def descriptive_stats(array, verbose=True, label='', mean=False, plot=False): """ Simple statistics from vector. """ if mean: mean_ = np.mean(array) median = np.median(array) mini = np.min(array) maxi = np.max(array) first_qu = np.percentile(array, 25) third_qu = np.percentile(array, 75) if verbose: if mean: label += 'min={:.1f} / 1st QU={:.1f} / ave={:.1f} / med={:.1f} / ' label += '3rd QU={:.1f} / max={:.1f}' print(label.format(mini, first_qu, mean_, median, third_qu, maxi)) else: label += 'min={:.1f} / 1st QU={:.1f} / med={:.1f} / 3rd QU={:.1f} ' label += '/ max={:.1f}' print(label.format(mini, first_qu, median, third_qu, maxi)) if plot: boxplot(array, vert=False, meanline=mean, showfliers=True, sym='.') if mean: return mini, first_qu, mean_, median, third_qu, maxi else: return mini, first_qu, median, third_qu, maxi
def write_parameters_outputvalues(self, P): Mstar, SFR_opt, _ = model.stellar_info_array(self.chain.flatchain_sorted, self.data, self.out['realizations2int']) column_names = np.transpose(np.array(["P025","P16","P50","P84","P975"], dtype='|S3')) chain_pars = np.column_stack((self.chain.flatchain_sorted, Mstar, SFR_opt)) # np.mean(chain_pars, axis[0]), # np.std(chain_pars, axis[0]), if self.out['calc_intlum']: SFR_IR = model.sfr_IR(self.int_lums[0]) #check that ['intlum_names'][0] is always L_IR(8-100) chain_others =np.column_stack((self.int_lums.T, SFR_IR)) outputvalues = np.column_stack((np.transpose(map(lambda v: (v[0],v[1],v[2],v[3],v[4]), zip(*np.percentile(chain_pars, [2.5,16, 50, 84,97.5], axis=0)))), np.transpose(map(lambda v: (v[0],v[1],v[2],v[3],v[4]), zip(*np.percentile(chain_others, [2.5,16, 50, 84,97.5], axis=0)))) )) outputvalues_header= ' '.join([ i for i in np.hstack((P.names, 'Mstar', 'SFR_opt', self.out['intlum_names'], 'SFR_IR',))] ) else: outputvalues = np.column_stack((map(lambda v: (v[1], v[2]-v[1], v[1]-v[0]), zip(*np.percentile(chain_pars, [16, 50, 84], axis=0))))) outputvalues_header=' '.join( [ i for i in P.names] ) return outputvalues, outputvalues_header
def arrivals(self, stories, state=6): ''' Chart a plot point for every arrival time in state ''' arrivals = self.release.kanban().state_arrival_interval(state) dates = [a['date'] for a in arrivals] arrivals = [round(a['interval']/60./60., 1) for a in arrivals] average = numpy.median([arrivals]) std = numpy.std([arrivals]) iql = numpy.percentile([arrivals], 25) iqh = numpy.percentile([arrivals], 75) nsul = [] nsuw = [] nsll = [] nslw = [] avg = [] for x in arrivals: nsul.append(average + (iqh * 3)) nsuw.append(average + (iqh * 2)) nslw.append(average - (iql * 2)) nsll.append(average - (iql * 3)) avg.append(average) pyplot.plot(dates, arrivals, '*', color='g') pyplot.plot(dates, nsul, 'o', linestyle='-', color='r') pyplot.plot(dates, nsuw, '.', linestyle=':', color='y') pyplot.plot(dates, nslw, '.', linestyle=':', color='y') pyplot.plot(dates, nsll, 'o', linestyle='-', color='r') pyplot.plot(dates, avg, '',linestyle='-.', markerfacecolor='None') pyplot.show(block=False)
def test_random_posterior(self): # first get values of mu drawn from their conditional posterior ndraws = 100000 nleaves = len(self.mu.value) mu_draws = np.empty((ndraws, nleaves)) for i in xrange(ndraws): mu_draws[i, :] = self.mu.random_posterior() l_idx = 0 for leaf in self.mu.treeparam.value.terminalNodes: ny = leaf.npts ybar = leaf.ybar post_var = 1.0 / (1.0 / self.mu.prior_var + ny / self.mu.sigsqr.value) post_mean = post_var * (self.mu.mubar / self.mu.prior_var + ny * ybar / self.mu.sigsqr.value) # test draws from conditional posterior by comparing 1st and 2nd moments to true values zscore = np.abs((post_mean - mu_draws[:, l_idx].mean())) / np.sqrt(post_var / ndraws) rpmsg = "Sample mean from BartMeanParameter.random_posterior() differs by more than 3-sigma." self.assertLess(zscore, 3.0, msg=rpmsg) frac_diff = np.abs(np.sqrt(post_var) - mu_draws[:, l_idx].std()) / np.sqrt(post_var) rpmsg = "Fractional difference in standard deviation from BartMeanParameter.random_posterior() is greater" \ + " than 2%" self.assertLess(frac_diff, 0.02, msg=rpmsg) # make sure gibbs sampler constrains the correct value mu_low = np.percentile(mu_draws[:, l_idx], 1.0) mu_high = np.percentile(mu_draws[:, l_idx], 99.0) rpmsg = "Value of Terminal Node output parameter returned by Gibbs sampler is outside of 99% credibility" \ + " interval.\n Violated: " + str(mu_low) + ' < ' + str(self.true_mu[l_idx]) + ' < ' + str(mu_high) self.assertGreater(self.true_mu[l_idx], mu_low, msg=rpmsg) self.assertLess(self.true_mu[l_idx], mu_high, msg=rpmsg) l_idx += 1
def run(data_path, str_class, algs=["DT", "MLP", "RF"]): data = pd.read_csv(data_path) folds = 10 rep = 1 X = data.drop([str_class], axis=1).values y = data[str_class].values perceltil_inf = [0, 1.5, 2.5, 3.5] + [5 * i for i in range(1, 7)] perceltil_sup = [(100 - perceltil_inf[i]) for i in range(len(perceltil_inf))] range_high_TG = [ np.percentile(y, perceltil_sup[i]) for i in range(len(perceltil_sup)) ] # range_high_TG.reverse() range_low_TG = [ np.percentile(y, perceltil_inf[i]) for i in range(len(perceltil_inf)) ] range_high_TG = np.round(range_high_TG, 2) range_low_TG = np.round(range_low_TG, 2) print(range_low_TG) print(perceltil_inf) print(range_high_TG) print(perceltil_sup) dic_oracle = {} data = [] for low, low_perc in zip(range_low_TG, perceltil_inf): for high, high_perc in zip(range_high_TG, perceltil_sup): for alg_low in algs: for alg_middle in algs: for alg_high in algs: line = [] line = [ low_perc, 100 - (low_perc + 100 - high_perc), 100 - high_perc ] for i in [alg_low, alg_middle, alg_high]: line = line + [ 1 if i == "MLP" else 0, 1 if i == "RF" else 0, 1 if i == "DT" else 0 ] res = oracle(low, high, alg_low, alg_middle, alg_high) line = line + res.tolist() dic_oracle["{0}-{1}_{2}_{3}_{4}".format( low, high, alg_low, alg_middle, alg_high)] = line data.append(line) cols_name = [ "S", "M", "E", "S_MLP", "S_RF", "S_DT", "M_MLP", "M_RF", "M_DT", "E_MLP", "E_RF", "E_DT", "Global_mean_MAE", "Global_mean_MSE", "Global_mean_R2_S", "Global_mean_RRMSE", "Global_mean_RMSE", "Global_mean_MARE", "Global_mean_R2", "Global_sd_MAE", "Global_sd_MSE", "Global_sd_R2_S", "Global_sd_RRMSE", "Global_sd_RMSE", "Global_sd_MARE", "Global_sd_R2", "Local_S_mean_MAE", "Local_S_mean_MSE", "Local_S_mean_R2_S", "Local_S_mean_RRMSE", "Local_S_mean_RMSE", "Local_S_mean_MARE", "Local_S_mean_R2", "Local_S_sd_MAE", "Local_S_sd_MSE", "Local_S_sd_R2_S", "Local_S_sd_RRMSE", "Local_S_sd_RMSE", "Local_S_sd_MARE", "Local_S_sd_R2", "Local_M_mean_MAE", "Local_M_mean_MSE", "Local_M_mean_R2_S", "Local_M_mean_RRMSE", "Local_M_mean_RMSE", "Local_M_mean_MARE", "Local_M_mean_R2", "Local_M_sd_MAE", "Local_M_sd_MSE", "Local_M_sd_R2_S", "Local_M_sd_RRMSE", "Local_M_sd_RMSE", "Local_M_sd_MARE", "Local_M_sd_R2", "Local_E_mean_MAE", "Local_E_mean_MSE", "Local_E_mean_R2_S", "Local_E_mean_RRMSE", "Local_E_mean_RMSE", "Local_E_mean_MARE", "Local_E_mean_R2", "Local_E_sd_MAE", "Local_E_sd_MSE", "Local_E_sd_R2_S", "Local_E_sd_RRMSE", "Local_E_sd_RMSE", "Local_E_sd_MARE", "Local_E_sd_R2" ] df = pd.DataFrame(data, columns=cols_name) df.to_csv('../result/evaluating_range/ranges2.csv') return df
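# Minimal sketch of the range construction above, on synthetic data: a pair of
# percentile cut points splits the target into low / middle / high regions,
# each of which can then be handled by a different regressor.
import numpy as np

y = np.random.lognormal(mean=0.0, sigma=1.0, size=1000)
low_cut = np.round(np.percentile(y, 5), 2)      # e.g. 5th percentile
high_cut = np.round(np.percentile(y, 95), 2)    # e.g. 95th percentile
low_mask = y <= low_cut
high_mask = y >= high_cut
middle_mask = ~(low_mask | high_mask)
print(low_cut, high_cut, low_mask.sum(), middle_mask.sum(), high_mask.sum())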
def combine_flat( files, instrument, mode, extension=None, bhead=None, bias=None, plot=False, plot_title=None, bias_scaling="number_of_files", **kwargs, ): """ Combine several flat files into one master flat Parameters ---------- files : list(str) flat files instrument : str instrument mode for modinfo extension: {int, str}, optional fits extension to use (default: 1) bias: array(int, float), optional bias image to subtract from master flat (default: 0) xr: 2-tuple(int), optional x range to use (default: None, i.e. whole image) yr: 2-tuple(int), optional y range to use (default: None, i.e. whole image) dtype : np.dtype, optional datatype of the combined bias frame (default: float32) Returns ------- flat, fhead image and header of master flat """ flat, fhead = combine_frames(files, instrument, mode, extension, **kwargs) # Subtract master dark # TODO: Why do we scale with number of files and not exposure time? if bias is not None: if bias_scaling == "number_of_files": flat -= bias * len(files) elif bias_scaling == "exposure_time": flat -= bias * fhead["exptime"] / bhead["exptime"] elif bias_scaling == "mean": flat -= bias * np.ma.mean(flat) / np.ma.mean(bias) elif bias_scaling == "median": flat -= bias * np.ma.median(flat) / np.ma.median(bias) else: raise ValueError( "Unexpected value for 'bias_scaling', expected one of ['number_of_files', 'exposure_time'], but got %s" % bias_scaling) if plot: # pragma: no cover title = "Master Flat" if plot_title is not None: title = f"{plot_title}\n{title}" plt.title(title) plt.xlabel("x [pixel]") plt.ylabel("y [pixel]") bot, top = np.percentile(flat, (10, 90)) plt.imshow(flat, vmin=bot, vmax=top, origin="lower") plt.show() return flat, fhead
print('Too many labels: ', too_many_labels) print('no_labelled_data: ', no_labelled_data) tpl = [(bidirectional_flow_lengths, '_bidir.csv'), (forward_flow_lengths, '_fwd.csv'), (backward_flow_lengths, '_bwd.csv')] for flow_lengths, output_ext in tpl: percentile = np.array([ 'Label', 'num_flows', 'Min', '20-th', '50-th', '90-th percentile', '95-th percentile', '99-th percentile', '99.9-th percentile', '100-th percentile' ]) percentile = percentile.reshape((1, -1)) for label in label_names: flow_lengths_for_label = flow_lengths[label] print("{:40s}-->{:10d}".format(label, len(flow_lengths_for_label))) if len(flow_lengths_for_label) < 1: continue flow_lengths_for_label = np.array(flow_lengths_for_label) row = np.array([label,len(flow_lengths_for_label),np.min(flow_lengths_for_label),\ np.percentile(flow_lengths_for_label,20),np.percentile(flow_lengths_for_label,50),\ np.percentile(flow_lengths_for_label,90),np.percentile(flow_lengths_for_label,95),np.percentile(flow_lengths_for_label,99),\ np.percentile(flow_lengths_for_label,99.9), np.percentile(flow_lengths_for_label,100)]) percentile = np.concatenate((percentile, row.reshape((1, -1))), axis=0) np.savetxt(output_filename.replace('.csv', output_ext), percentile, fmt='%s', delimiter=',')
@author: rian-van-den-ander
"""

import numpy as np
import pandas as pd

dataset = pd.read_csv('personality_data.csv', header=0, sep='\t')

"""
Data cleansing
----------
"""

dataset = dataset.dropna()  # drop any null data
dataset = dataset[dataset.age < 100]  # Removing bogus ages
dataset = dataset[dataset.gender.isin([1, 2])]  # removing non specific genders
dataset = dataset[dataset.accuracy > np.percentile(dataset.accuracy, 5)]  # removing very low accuracies
dataset = dataset[dataset.accuracy <= 100]  # Removing impossibly high accuracies
dataset = dataset[dataset.elapsed <= 5000]  # Removing very long completion times
dataset = dataset[dataset.elapsed > 300]  # Removing very short completion times

X = dataset.iloc[:, 0:-6].values
y_age = dataset.iloc[:, -6].values
y_gender = dataset.iloc[:, -5].values
y_accuracy = dataset.iloc[:, -4].values
y_elapsed = dataset.iloc[:, -1].values

"""
Data engineering
----------
"""

# Adding interaction between personality items to X
def vertprofileplot(ifiles, args): if args.variables is None: raise ValueError('User must specify variable(s) to plot:\n%s' % '\n\t'.join(ifiles[0].variables.keys())) from PseudoNetCDF.coordutil import getsigmamid, getpresmid, gettimes import pylab as pl from pylab import figure, NullFormatter, close, rcParams rcParams['text.usetex'] = False from matplotlib.colors import LinearSegmentedColormap, BoundaryNorm, LogNorm scale = args.scale minmax = eval(args.minmax) minmaxq = eval(args.minmaxq) sigma = args.sigma maskzeros = args.maskzeros outunit = args.outunit tespaths = args.tespaths omipaths = args.omipaths edges = args.edges try: f, = ifiles except: raise ValueError( 'curtain plot expects one file when done. Try stack time --stack=time to concatenate' ) # Add CF conventions if necessary if 'latitude_bounds' not in f.variables.keys(): try: from PseudoNetCDF import getvarpnc from PseudoNetCDF.conventions.ioapi import add_cf_from_ioapi f = getvarpnc(f, None) add_cf_from_ioapi(f) except: pass if sigma: vertcrd = getsigmamid(f) else: vertcrd = getpresmid(f, pref=101325., ptop=getattr(f, 'VGTOP', 10000)) if vertcrd.max() > 2000: vertcrd /= 100. try: lonb = f.variables['geos_longitude_bounds'] latb = f.variables['geos_latitude_bounds'] except: lonb = f.variables['longitude_bounds'] latb = f.variables['latitude_bounds'] for var_name in args.variables: temp = defaultdict(lambda: 1) try: eval(var_name, None, temp) var = eval(var_name, None, f.variables)[:] except: temp[var_name] var = f.variables[var_name][:] if maskzeros: var = np.ma.masked_values(var, 0) vkeys = [k for k in temp.keys()] unit = f.variables[vkeys[0]].units.strip() if unit in unitconvert: var = unitconvert.get((unit, outunit), lambda x: x)(var) else: outunit = unit bmap = None vmin, vmax = np.percentile( np.ma.compressed(var).ravel(), list(minmaxq)) if minmax[0] is not None: vmin = minmax[0] if minmax[1] is not None: vmax = minmax[1] if edges: fig = pl.figure(figsize=(16, 4)) offset = 0.05 ax = fig.add_axes([.1 - offset, .15, .22, .725]) ax = fig.add_axes([.325 - offset, .15, .22, .725]) ax = fig.add_axes([.55 - offset, .15, .22, .725]) ax = fig.add_axes([.775 - offset, .15, .22, .725]) ss = 0 se = ss + f.NCOLS + 1 es = se ee = se + f.NROWS + 1 ns = ee ne = ee + f.NCOLS + 1 ws = ne we = ws + f.NROWS + 1 axs = fig.axes for ax in fig.axes[1:]: ax.yaxis.set_major_formatter(pl.NullFormatter()) vars = [ var[:, :, ss:se], var[:, :, es:ee], var[:, :, ns:ne][:, :, ::-1], var[:, :, ws:we][:, :, ::-1] ] lonbss = [ lonb[ss:se], lonb[es:ee], lonb[ns:ne][::-1], lonb[ws:we][::-1] ] latbss = [ latb[ss:se], latb[es:ee], latb[ns:ne][::-1], latb[ws:we][::-1] ] else: fig = pl.figure() ax = fig.add_subplot(111) axs = fig.axes vars = [var] if lonb.dimensions == ('longitude', 'nv') and latb.dimensions == ('latitude', 'nv'): lonbss = [lonb[:][None, :, :]] latbss = [latb[:][:, None, :]] else: lonbss = [lonb[:]] latbss = [latb[:]] for ax, var, lonbs, latbs in zip(axs, vars, lonbss, latbss): vals = var.swapaxes(0, 1).reshape(var.shape[1], -1) modl, modr = minmaxmean(ax, vals, vertcrd, facecolor='k', edgecolor='k', alpha=.2, zorder=4, label='mod (%d)' % vals.shape[1], ls='-', lw=2, color='k') llines = [(modl, modr)] ymin, ymax = vertcrd.min(), vertcrd.max() ax.set_ylim(ymax, ymin) ax.set_xscale(scale) ax.set_xlim(vmin, vmax) #if scale == 'log': # ax.set_xticklabels(['%.1f' % (10**x) for x in ax.get_xticks()]) if 'TFLAG' in f.variables.keys(): SDATE = f.variables['TFLAG'][:][0, 0, 0] EDATE = f.variables['TFLAG'][:][-1, 0, 0] STIME = f.variables['TFLAG'][:][0, 
0, 1] ETIME = f.variables['TFLAG'][:][-1, 0, 1] if SDATE == 0: SDATE = 1900001 EDATE = 1900001 sdate = datetime.strptime('%07d %06d' % (SDATE, STIME), '%Y%j %H%M%S') edate = datetime.strptime('%07d %06d' % (EDATE, ETIME), '%Y%j %H%M%S') elif 'tau0' in f.variables.keys(): sdate = datetime(1985, 1, 1, 0) + timedelta(hours=f.variables['tau0'][0]) edate = datetime(1985, 1, 1, 0) + timedelta(hours=f.variables['tau1'][-1]) else: times = gettimes(f) sdate = times[0] edate = times[-1] if len(tespaths) > 0: tesl, tesr = plot_tes(ax, lonbs, latbs, tespaths) if not tesl is None: llines.append((tesl, tesr)) if len(omipaths) > 0: omil, omir = plot_omi( ax, lonbs, latbs, omipaths, airden=f.variables['AIRDEN'][:].mean(0).mean(1), airdenvert=vertcrd) if not omil is None: llines.append((omil, omir)) try: title = '%s to %s' % (sdate.strftime('%Y-%m-%d'), edate.strftime('%Y-%m-%d')) except: title = var_name if sigma: axs[0].set_ylabel('sigma') else: axs[0].set_ylabel('pressure') xmax = -np.inf xmin = np.inf for ax in fig.axes: tmp_xmin, tmp_xmax = ax.get_xlim() xmax = max(tmp_xmax, xmax) xmin = min(tmp_xmin, xmin) for ax in fig.axes: ax.set_xlim(xmin, xmax) if len(axs) == 1: axs[0].set_xlabel('%s %s' % (var_name, outunit)) else: axs[0].set_xlabel('South') axs[1].set_xlabel('East') axs[2].set_xlabel('North') axs[3].set_xlabel('West') fig.text(.5, .90, '%s %s' % (var_name, outunit), horizontalalignment='center', fontsize=16) nl = 0 for ax in axs: if len(ax.get_lines()) > nl: nl = len(ax.get_lines()) pl.sca(ax) llabels = [l[0].get_label() for l in llines] pl.legend(llines, llabels, bbox_to_anchor=(.1, 1), loc='upper left', bbox_transform=fig.transFigure, ncol=6) if edges: fig.text(0.95, 0.975, title, horizontalalignment='right', verticalalignment="top", fontsize=16) else: fig.text(0.95, 0.025, title, horizontalalignment='right', verticalalignment="bottom", fontsize=16) figpath = args.outpath + var_name + '.' + args.figformat fig.savefig(figpath) if args.verbose > 0: print('Saved fig', figpath) #pl.close(fig) return fig
def make_score_vs_rmsd_plot(self, loop): """ Create a score vs RMSD plot for the given loop. In fact two plots are made: one which includes every model and one which includes only the top 75% best scoring models. Normally the second plot is of more interest, because it focuses better on the interesting lower-left region of the plot. The full plots often have outliers that really scale the score axis. """ # This method would be much more concise if it used matplotlib. if not loop.has_data: return tsv_path = os.path.join(loop.latex_dir, 'score_vs_rmsd.tsv') gnu_path = os.path.join(loop.latex_dir, 'score_vs_rmsd.gnu') pdf_path_100 = os.path.join(loop.latex_dir, 'score_vs_rmsd_all.pdf') pdf_path_75 = os.path.join(loop.latex_dir, 'score_vs_rmsd_third_quartile.pdf') tsv_row = '{0.id}\t{0.rmsd}\t{0.score}\n' sorted_models = loop.models_sorted_by_score scores = loop.scores min_score, max_score = min(scores), max(scores) third_quartile = numpy.percentile(scores, 75) native_score = 0 # This isn't stored in the database yet. # Write score vs RMSD data to a tab-separated value (TSV) file that can # easily be parsed by gnuplot. with open(tsv_path, 'w') as file: file.write('#Model\tLoop_rmsd\tTotal_score\n') file.write('input_structure\t0.0\t{0}\n'.format(native_score)) # All models file.write('\n\n') for model in sorted_models: file.write(tsv_row.format(model)) # Top X scoring models file.write('\n\n') for model in sorted_models[:top_x]: file.write(tsv_row.format(model)) # Top scoring model file.write('\n\n') file.write(tsv_row.format(sorted_models[0])) # Write the gnuplot script and generate the EPS plots. gnuplot_script = '''\ set autoscale set border 31 set tics out set terminal pdf set xtics autofreq set xtics nomirror set ytics autofreq set ytics nomirror set noy2tics set nox2tics set style line 1 lt 1 lc rgb "dark-magenta" lw 2 set style line 2 lt 1 lc rgb "{loop.benchmark.color}" lw 2 ps 0.5 pt 7 set style line 3 lt 1 lc rgb "forest-green" lw 2 ps 2 pt 13 set style line 4 lt 1 lc rgb "dark-gray" lw 2 ps 0.5 pt 7 set style line 5 lt 1 lc rgb "black" lw 2 ps 0.8 pt 13 set style line 6 lt 1 lc rgb "black" lw 2 set style line 7 lt 1 lc rgb "dark-gray" lw 2 set style line 8 lt 1 lc rgb "gray" lw 2 set style line 9 lt 2 lc rgb "dark-gray" lw 5 set boxwidth 0.75 set key below right set xrange [0:] set encoding iso_8859_1 set title "{loop.pdb_id}: {loop.percent_subangstrom:0.2f}% sub-\305 models" set xlabel "r.m.s. deviation to crystal loop [\305]" set arrow from 1, graph 0 to 1, graph 1 ls 9 nohead set ylabel "Rosetta all-atom score" set output "{pdf_path_100}" plot "{tsv_path}" index 1 using ($2):($3) with points ls 2 title "all models" axes x1y1, \\ "{tsv_path}" index 2 using ($2):($3) with points ls 4 title "5 lowest energy models" axes x1y1, \\ "{tsv_path}" index 3 using ($2):($3) with points ls 5 title "top 5 best model" axes x1y1 set yrange [:{third_quartile}] set output "{pdf_path_75}" set xrange [0:] plot "{tsv_path}" index 1 using ($2):($3) with points ls 2 title "75% lowest-scoring models" axes x1y1, \\ "{tsv_path}" index 2 using ($2):($3) with points ls 4 title "5 lowest energy models" axes x1y1, \\ "{tsv_path}" index 3 using ($2):($3) with points ls 5 title "top 5 best model" axes x1y1 ''' with open(gnu_path, 'w') as file: file.write(gnuplot_script.format(**locals())) utilities.run_gnuplot(gnu_path, verbose=self.verbose) return pdf_path_100, pdf_path_75
def save(idstr, tractor, nlscale=1., debug=False, plotAll=False, imgi=0, chilo=-10., chihi=10., roi=None): #print "Index: ", imgi mod = tractor.getModelImage(imgi) chi = tractor.getChiImage(imgi=imgi) synthfn = 'synth-%s.fits' % idstr print('Writing synthetic image to', synthfn) fitsio.write(synthfn, mod, clobber=True) pfn = 'tractor-%s.pickle' % idstr print('Saving state to', pfn) pickle_to_file(tractor, pfn) plt.clf() plt.hist(chi.ravel(), range=(-10,10), bins=100) plt.savefig('chi2.png') timg = tractor.getImage(imgi) data = timg.getImage() print('Mod type:', mod.dtype) print('Chi type:', chi.dtype) print('Data type:', data.dtype) zr = timg.zr print('zr', zr) # Set up nonlinear mapping based on the statistics of the data image. #sigma = np.median(timg.getInvError()) #print 'sigma', sigma ima = dict(interpolation='nearest', origin='lower') if nlscale == 0.: ima.update(vmin=zr[0], vmax=zr[1]) else: print(data.shape) q1,q2,q3 = np.percentile(data.ravel(), [25, 50, 75]) print('Data quartiles:', q1, q2, q3) ima.update(norm = ArcsinhNormalize(mean=q2, std=(1./nlscale) * (q3-q1)/2., vmin=zr[0], vmax=zr[1])) if roi is not None: ima.update(extent=roi) imchi = ima.copy() if nlscale == 0.: imchi.update(vmin=chilo, vmax=chihi, norm=None) else: imchi.update(norm = ArcsinhNormalize(mean=0., std=1./nlscale, vmin=chilo, vmax=chihi)) imdiff = ima.copy() dzr = (zr[1] - zr[0])/2. if nlscale == 0.: imdiff.update(vmin=-dzr, vmax=+dzr, norm=None) else: imdiff.update(norm = ArcsinhNormalize(mean=0., std=1./nlscale, vmin=-dzr, vmax=dzr)) if debug: sources = tractor.getCatalog() wcs = timg.getWcs() allobjx = [] allobjy = [] allobjc = [] pointx = [] pointy = [] xplotx = [] xploty = [] for obj in sources: if (isinstance(obj,PointSource)): xt,yt = wcs.positionToPixel(obj.getPosition(), obj) pointx.append(xt) pointy.append(yt) continue print(type(obj)) shapes = [] attrType = [] if (isinstance(obj,st.CompositeGalaxy)): for attr in 'shapeExp', 'shapeDev': shapes.append(getattr(obj, attr)) attrType.append(attr) else: shapes.append(getattr(obj,'shape')) attrType.append(' ') x0,y0 = wcs.positionToPixel(obj.getPosition(), obj) cd = timg.getWcs().cdAtPixel(x0,y0) print("CD",cd) for i,shape in enumerate(shapes): xplotx.append(x0) xploty.append(y0) T=np.linalg.inv(shape.getTensor(cd)) print("Inverted tensor:",T) print(obj.getPosition()) print(i) x,y = [],[] for theta in np.linspace(0,2*np.pi,100): ux = np.cos(theta) uy = np.sin(theta) dx,dy = np.dot(T,np.array([ux,uy])) x.append(x0+dx) y.append(y0+dy) allobjx.append(x) allobjy.append(y) if (attrType[i] == 'shapeExp'): allobjc.append('b') elif attrType[i] == 'shapeDev': allobjc.append('g') else: allobjc.append('r') def savepng(pre, img, title=None, **kwargs): fn = '%s-%s.png' % (pre, idstr) print('Saving', fn) plt.clf() plt.imshow(img, **kwargs) ax = plt.axis() if debug: print(len(xplotx),len(allobjx)) for i,(objx,objy,objc) in enumerate(zip(allobjx,allobjy,allobjc)): plt.plot(objx,objy,'-',c=objc) tempx = [] tempx.append(xplotx[i]) tempx.append(objx[0]) tempy = [] tempy.append(xploty[i]) tempy.append(objy[0]) plt.plot(tempx,tempy,'-',c='purple') plt.plot(pointx,pointy,'y.') plt.plot(xplotx,xploty,'xg') plt.axis(ax) if title is not None: plt.title(title) plt.colorbar() plt.gray() plt.savefig(fn) savepng('data', data, title='Data ' + timg.name, **ima) savepng('model', mod, title='Model ' + timg.name, **ima) savepng('diff', data - mod, title='Data - Model, ' + timg.name, **imdiff) savepng('chi', chi, title='Chi ' + timg.name, **imchi) if plotAll: debug = False for i,src in 
enumerate(tractor.getCatalog()): savepng('data-s%i'%(i+1),data - sky, title='Data '+timg.name,**ima) modelimg = tractor.getModelImage(timg, srcs=[src]) savepng('model-s%i'%(i+1), modelimg - sky, title='Model-s%i'%(i+1),**ima) savepng('diff-s%i'%(i+1), data - modelimg, title='Model-s%i'%(i+1),**imdiff) savepng('chi-s%i'%(i+1),tractor.getChiImage(imgi,srcs=[src]),title='Chi',**imchi)
range_spread_cc_list = np.empty((0, k), float) for test_sample in range(0, distances_order.shape[0]): range_spread = y_train['Spread bps'].iloc[distances_order.iloc[ test_sample, :].values] range_spread_cc = y_train['Spread bps'].iloc[ d_closest_cluster.iloc[test_sample, :].values] range_spread_list = \ np.vstack((range_spread_list, range_spread)) range_spread_cc_list = \ np.vstack((range_spread_cc_list, range_spread_cc)) ############ df_range_spread = pd.DataFrame(range_spread_list) df_range_spread_cc = pd.DataFrame(range_spread_cc_list) df_range_spread_75p = df_range_spread.apply( lambda x: np.percentile(x, 75), axis=1) df_range_spread_50p = df_range_spread.apply( lambda x: np.percentile(x, 50), axis=1) df_range_spread_25p = df_range_spread.apply( lambda x: np.percentile(x, 25), axis=1) df_range_spread_75p_include_pred = np.column_stack( (df_range_spread_75p, predictions)).max(axis=1) df_range_spread_25p_include_pred = np.column_stack( (df_range_spread_25p, predictions)).min(axis=1) df_coverage_real_spread = \ sum((df_range_spread_25p < y_test['Spread bps'].values) & (df_range_spread_25p < y_test['Spread bps'].values))/n_test_samples df_coverage_predicted_spread = \ sum((df_range_spread_25p < predictions) &
# plotting the cumulative distribution (_cdf)
plt.hist(sep, bins, prange, color='cyan', histtype='step', rwidth=2,
         cumulative=True, density=True, label='BCG - SZ')

print(np.median(sep))
print(np.median(sep1))
print(np.median(simuk))
print(np.percentile(sep1, .1))
print(np.percentile(sep, .1))
print(np.percentile(simuk, .1))

# x-axis label
plt.xlabel('Separation Value (kpc)')
# frequency label
plt.ylabel('Cumulative Value (%)')
# plot title
def raw_chunkify_with_remap_main(args): """ Main function for `chunkify.py raw_remap` producing batch file for model training """ if not args.overwrite: if os.path.exists(args.output): print("Cowardly refusing to overwrite {}".format(args.output)) sys.exit(1) if os.path.exists(args.output_strand_list): print("Cowardly refusing to overwrite {}".format( args.output_strand_list)) sys.exit(2) fast5_files = fast5.iterate_fast5(args.input_folder, paths=True, limit=args.limit, strand_list=args.input_strand_list) references = util.fasta_file_to_dict(args.references) print('* Processing data using', args.jobs, 'threads') kwarg_names = [ 'trim', 'min_prob', 'kmer_len', 'min_length', 'prior', 'slip', 'chunk_len', 'normalisation', 'downsample_factor', 'interpolation', 'open_pore_fraction' ] kwargs = util.get_kwargs(args, kwarg_names) kwargs['references'] = references i = 0 compiled_file = helpers.compile_model(args.model, args.compile) output_strand_list_entries = [] bad_list = [] chunk_list = [] label_list = [] with open(args.output_strand_list, 'w') as slfh: slfh.write(u'\t'.join([ 'filename', 'nblocks', 'score', 'nstay', 'seqlen', 'start', 'end' ]) + u'\n') for res in imap_mp( raw_chunk_remap_worker, fast5_files, threads=args.jobs, fix_kwargs=kwargs, unordered=True, init=batch.init_chunk_remap_worker, initargs=[compiled_file, args.kmer_len, args.alphabet]): if res is not None: i = util.progress_report(i) read, score, nblocks, path, seq, chunks, labels, bad_ev = res chunk_list.append(chunks) label_list.append(labels) bad_list.append(bad_ev) strand_data = [ read, nblocks, -score / nblocks, np.sum(np.ediff1d(path, to_begin=1) == 0), len(seq), min(path), max(path) ] slfh.write('\t'.join([str(x) for x in strand_data]) + '\n') if compiled_file != args.compile: os.remove(compiled_file) if chunk_list == []: print("no chunks were produced", file=sys.stderr) sys.exit(1) else: print('\n* Writing out to HDF5') hdf5_attributes = { 'chunk': args.chunk_len, 'downsample_factor': args.downsample_factor, 'input_type': 'raw', 'interpolation': args.interpolation, 'kmer': args.kmer_len, 'normalisation': args.normalisation, 'section': 'template', 'trim': args.trim, 'alphabet': args.alphabet, } blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list]) blanks = np.percentile(blanks_per_chunk, args.blanks_percentile) util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes, chunk_list, label_list, bad_list)
def binning_data_split(df, var, global_bt, global_gt, min_sample, alpha=0.01): """ Specify the data split level and return the split value list :return: """ iv_var = InfoValue() # Calculates the IV of the current node before splitted gd = calulate_iv(df, var, global_bt, global_gt) woei, ivi = gd['woei'], gd['ivi'] if np.unique(df[var]).__len__() <= 8: # print 'running into if' split = list(np.unique(df[var])) split.sort() # print 'split:',split #Segmentation point checking and processing split = check_point(df, var, split, min_sample) split.sort() # print 'after check:',split iv_var.split_list = split return node(split_point=split, iv=ivi) percent_value = list(np.unique(np.percentile(df[var], range(100)))) percent_value.sort() if percent_value.__len__() <= 2: iv_var.split_list = list(np.unique(percent_value)).sort() return node(split_point=percent_value, iv=ivi) # A sentry that attempts to split the current node # Init bestSplit_iv with zero bestSplit_iv = 0 bestSplit_woel = [] bestSplit_woer = [] bestSplit_ivl = 0 bestSplit_ivr = 0 bestSplit_point = [] #remove max value and min value in case dataset_r or dataset_l will be null for point in percent_value[0:percent_value.__len__() - 1]: # If there is only a sample or a negative sample, skip if set(df[df[var] > point]['target']).__len__() == 1 or set(df[df[var] <= point]['target']).__len__() == 1 \ or df[df[var] > point].shape[0] < min_sample or df[df[var] <= point].shape[0] < min_sample : continue woel, woer, iv, dataset_l, dataset_r, ivl, ivr = calculate_iv_split( df, var, point, global_bt, global_gt) if iv > bestSplit_iv: bestSplit_woel = woel bestSplit_woer = woer bestSplit_iv = iv bestSplit_point = point bestSplit_dataset_r = dataset_r bestSplit_dataset_l = dataset_l bestSplit_ivl = ivl bestSplit_ivr = ivr # If the IV after division is greater than the IV value before the current segmentation, the segmentation is valid and recursive # specified step learning rate 0.01 if bestSplit_iv > ivi * (1 + alpha) and bestSplit_dataset_r.shape[ 0] > min_sample and bestSplit_dataset_l.shape[0] > min_sample: presplit_right = node() presplit_left = node() # Determine whether the right node satisfies the segmentation prerequisite if bestSplit_dataset_r.shape[0] < min_sample or set( bestSplit_dataset_r['target']).__len__() == 1: presplit_right.iv = bestSplit_ivr right = presplit_right else: right = binning_data_split(bestSplit_dataset_r, var, global_bt, global_gt, min_sample, alpha=0.01) # Determine whether the left node satisfies the segmentation prerequisite if bestSplit_dataset_l.shape[0] < min_sample or np.unique( bestSplit_dataset_l['target']).__len__() == 1: presplit_left.iv = bestSplit_ivl left = presplit_left else: left = binning_data_split(bestSplit_dataset_l, var, global_bt, global_gt, min_sample, alpha=0.01) return node(var_name=var, split_point=bestSplit_point, iv=ivi, left=left, right=right) else: # Returns the current node as the final leaf node return node(var_name=var, iv=ivi)
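# Illustrative sketch (synthetic data): the candidate split points above are
# the unique integer percentiles of the variable, which bounds the number of
# split candidates at 100 no matter how many distinct values the column has.
import numpy as np

values = np.random.exponential(scale=2.0, size=10000)
percent_value = list(np.unique(np.percentile(values, range(100))))
percent_value.sort()
print(len(percent_value), percent_value[:5])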
def RSF_bootstrap(fp, num=False): df = pd.read_csv(fp, index_col=0) # configure bootstrap (sampling 50% of data) n_iterations = 100 n_size = int(len(df) * 0.50) # parameters NUMESTIMATORS = 100 TESTSIZE = 0.20 random_state = 20 # calculate population of statistics metrics = [] for i in range(n_iterations): # prepare sample # if indicated, include number of mets (col 42) if num: sample = resample(df.iloc[:, np.r_[:20, 40, 41, 42]], n_samples=n_size) X = sample.iloc[:, np.r_[:20, 42]].copy() else: sample = resample(df.iloc[:, np.r_[:20, 40, 41]], n_samples=n_size) X = sample.iloc[:, :20].copy() X = X.to_numpy().astype('float64') y = sample[['Event', 'Time']].copy() y['Event'] = y['Event'].astype('bool') y['Time'] = y['Time'].astype('float64') y = y.to_records(index=False) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=TESTSIZE, random_state=random_state) rsf = RandomSurvivalForest(n_estimators=NUMESTIMATORS, min_samples_split=15, min_samples_leaf=8, max_features="sqrt", n_jobs=-1, random_state=random_state) rsf.fit(X_train, y_train) score = rsf.score(X_test, y_test) metrics.append(score) # calculate confidence interval alpha = 0.95 p = ((1.0 - alpha) / 2.0) * 100 lower = max(0.0, np.percentile(metrics, p)) p = (alpha + ((1.0 - alpha) / 2.0)) * 100 upper = min(1.0, np.percentile(metrics, p)) med = np.percentile(metrics, 50) # identify aggregation method name if num: name = fp.split('/')[-1].split('_')[0] + ' + NumMets' else: name = fp.split('/')[-1].split('_')[0] return print(name, 'RSF', '%.3f (%.3f-%.3f)' % (med, lower, upper))
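# A small illustration (my own sketch, not from the original script) of the
# percentile bootstrap interval used above: the lower and upper bounds are the
# (1-alpha)/2 and alpha+(1-alpha)/2 percentiles of the bootstrapped scores,
# clipped to [0, 1] because the score is a concordance index.
import numpy as np

scores = np.random.RandomState(1).normal(loc=0.7, scale=0.03, size=100)
alpha = 0.95
lower = max(0.0, np.percentile(scores, ((1.0 - alpha) / 2.0) * 100))
upper = min(1.0, np.percentile(scores, (alpha + (1.0 - alpha) / 2.0) * 100))
median = np.percentile(scores, 50)
print('%.3f (%.3f-%.3f)' % (median, lower, upper))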
def __init__(self, audio_dir, sample_rate, speakers_sub_list=None): self.audio_dir = os.path.expanduser(audio_dir) # for the ~/ self.sample_rate = sample_rate self.metadata = dict() # small cache <SPEAKER_ID -> SENTENCE_ID, filename> self.cache = dict() # big cache <filename, data:audio librosa, blanks.> logger.debug('Initializing AudioReader()') logger.debug('audio_dir = {}'.format(self.audio_dir)) logger.debug('sample_rate = {}'.format(sample_rate)) logger.debug('speakers_sub_list = {}'.format(speakers_sub_list)) st = time() if len(find_files(TMP_DIR, pattern='*.pkl')) == 0: # generate all the pickle files. logger.debug('Nothing found at {}. Generating all the caches now.'.format(TMP_DIR)) files = find_files(self.audio_dir) assert len(files) != 0, 'Generate your cache please.' logger.debug('Found {} files in total in {}.'.format(len(files), self.audio_dir)) if speakers_sub_list is not None: files = list( filter(lambda x: any(word in extract_speaker_id(x) for word in speakers_sub_list), files)) logger.debug('{} files correspond to the speaker list {}.'.format(len(files), speakers_sub_list)) assert len(files) != 0 bar = tqdm(files) for filename in bar: bar.set_description(filename) try: speaker_id = extract_speaker_id(filename) audio, _ = read_audio_from_filename(filename, self.sample_rate) energy = np.abs(audio[:, 0]) silence_threshold = np.percentile(energy, 95) offsets = np.where(energy > silence_threshold)[0] left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate # frame_id to duration (ms) right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate # _, left_blank, right_blank = trim_silence(audio[:, 0], silence_threshold) logger.info('_' * 100) logger.info('left_blank_duration_ms = {}, right_blank_duration_ms = {}, ' 'audio_length = {} frames, silence_threshold = {}'.format(left_blank_duration_ms, right_blank_duration_ms, len(audio), silence_threshold)) obj = {'audio': audio, 'audio_voice_only': audio[offsets[0]:offsets[-1]], 'left_blank_duration_ms': left_blank_duration_ms, 'right_blank_duration_ms': right_blank_duration_ms, FILENAME: filename} cache_filename = filename.split('/')[-1].split('.')[0] + '_cache' tmp_filename = os.path.join(TMP_DIR, cache_filename) + '.pkl' with open(tmp_filename, 'wb') as f: dill.dump(obj, f) logger.debug('[DUMP AUDIO] {}'.format(tmp_filename)) # commit to metadata dictionary when you're sure no errors occurred during processing. if speaker_id not in self.metadata: self.metadata[speaker_id] = {} sentence_id = extract_sentence_id(filename) if sentence_id not in self.metadata[speaker_id]: self.metadata[speaker_id][sentence_id] = [] self.metadata[speaker_id][sentence_id] = {SPEAKER_ID: speaker_id, SENTENCE_ID: sentence_id, FILENAME: filename} except librosa.util.exceptions.ParameterError as e: logger.error(e) logger.error('[DUMP AUDIO ERROR SKIPPING FILENAME] {}'.format(filename)) dill.dump(self.metadata, open(os.path.join(TMP_DIR, 'metadata.pkl'), 'wb')) logger.debug( 'Using the generated files at {}. Using them to load the cache. Be sure to have enough memory.'.format( TMP_DIR)) self.metadata = dill.load(open(os.path.join(TMP_DIR, 'metadata.pkl'), 'rb')) pickle_files = find_files(TMP_DIR, pattern='*.pkl') for pkl_file in tqdm(pickle_files, desc='reading cache'): if 'metadata' not in pkl_file: with open(pkl_file, 'rb') as f: obj = dill.load(f) self.cache[obj[FILENAME]] = obj logger.debug('Cache took {0:.2f} seconds to load. {1:} keys.'.format(time() - st, len(self.cache)))
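# Rough sketch (not part of the class) of the percentile-based trimming idea
# used in the cache builder: treat everything below the 95th percentile of the
# absolute amplitude as silence and keep only the span between the first and
# last sample that exceeds it. Synthetic signal; numbers are illustrative only.
import numpy as np

sample_rate = 16000
audio = np.concatenate([np.zeros(8000),                         # leading blank
                        np.random.RandomState(2).randn(16000),  # speech-like
                        np.zeros(4000)])                        # trailing blank
energy = np.abs(audio)
threshold = np.percentile(energy, 95)
offsets = np.where(energy > threshold)[0]
left_blank_ms = 1000.0 * offsets[0] // sample_rate
right_blank_ms = 1000.0 * (len(audio) - offsets[-1]) // sample_rate
voice_only = audio[offsets[0]:offsets[-1]]
print(left_blank_ms, right_blank_ms, len(voice_only))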
n_inj = []  # assumed: initialised here alongside the other per-trial lists
nsources = []
TS = []
beta = 0.5  # For background TS
TS_beta = []  # Calculated from the total TS median after we get all the TS.
beta_err = []
gamma = []
for file in files:
    for item in range(len(file['n_inj'])):
        n_inj.append(file['n_inj'][item])
        nsources.append(file['nsources'][item])
        TS.append(file['TS'][item])
        gamma.append(file['gamma'][item])
    TSs = file['TS']
    TS_beta = np.percentile(TSs, 100. * (1. - beta))
    m = np.count_nonzero(np.asarray(TSs) > TS_beta)
    i = len(TSs)
    fraction = float(m) / float(i)
    beta_err = (np.sqrt(fraction * (1. - fraction) / float(i))
                if 0 < beta < 1 else 1.)

# Now we have all the pieces of the original dictionary. Time to glue
# bckg_trials back in place, in their proper file type.
bckg_trials = {'n_inj': n_inj,
               'nsources': np.asarray(nsources),
               'TS': np.asarray(TS),
               'beta': beta,
               'beta_err': beta_err,
               'TS_beta': TS_beta,
               'gamma': np.asarray(gamma)}
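# Stand-alone sketch (my addition) of the background-trial summary computed
# above: TS_beta is the (1 - beta) quantile of the test statistics, and
# beta_err is the binomial error on the fraction of trials above it.
import numpy as np

TSs = np.random.RandomState(3).chisquare(df=1, size=10000)
beta = 0.5
TS_beta = np.percentile(TSs, 100. * (1. - beta))
fraction = np.count_nonzero(TSs > TS_beta) / float(len(TSs))
beta_err = np.sqrt(fraction * (1. - fraction) / len(TSs)) if 0 < beta < 1 else 1.
print(TS_beta, fraction, beta_err)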
def creative_boxplot(ax: matplotlib.axes, data: List[np.ndarray or List[int or float]] or np.ndarray, bins: int = 10, whis: float = 1.5, labelset: list or bool = False, showcaps: bool = True, showfliers: bool = True, showmeans: bool = True, showtrend: bool = True, variawidth: bool = True, curfacecolor: str = 'white', curlinecolor: str = 'black', curalpha: int = 1, outlierlinecolor: str = 'white', outliercolor: str = 'steelblue', outlierlinewidth: int = 1, capcolor: str = 'black', capwidth: int or float = 1, whiskercolor: str = 'black', whiskerwidth: int or float = 1, boxfacecolor: str = 'white', boxedgecolor: str = 'black', boxedgewidth: int or float = 1, mediancolor: str = 'orange', medianwidth: int or float = 1, medianlinestyle: str = '-', meancolor: str = 'green', meanwidth: int or float = 1, meanlinestyle: str = '--', trendcolor: str = 'blue', trendwidth: int or float = 1.5, trendlinestyle: str = ':', rotation: int or float = 0) -> matplotlib.axes: """ Make a creative mixed plot with various properties assignable, such as color, width and line style. The box plot is on the left half and the frequency area is on the right side. Make a box and whisker plot for each data set in the data list. The box extends from the lower to upper quartile values of the data, with a line at the median. The whiskers extend from the box to show the range of the data. Outliers are those past the end of the whiskers. It allows users to specify the face color of the box and the outliers, the line color of the box, the whisker, the caps, the outliers, and the median. It also allows users to specify whether to show the caps, outliers, means, and the line among boxes. Users can also set the labels of datasets. There are other properties such as line width and line style that are able to be specified. Besides, users can set the widths of the boxs changeable to make it reflect the size of the samples when comparing grouped data. When used for time series data, dotted line between the boxes can be specified to show the variation trends of the median among the samples. parameters: ax: matplotlib.axes data: List[np.ndarray or List[int or float]] or np.ndarray consists in a list of list and each item of data is a list containing multiple series of numerical values. bins: int, default: 10 whis: float, default: 1.5 The position of the whiskers. If a float, the lower whisker is at the lowest datum above Q1 - whis*(Q3-Q1), and the upper whisker at the highest datum below Q3 + whis*(Q3-Q1), where Q1 and Q3 are the first and third quartiles. The default value of whis = 1.5 corresponds to Tukey's original definition of boxplots. labelset: list, optional, default: [1,2,3,4,...] Labels for each dataset (one per dataset). showcaps: bool, default: True If True, show the caps on the ends of whiskers. showfliers: bool, default: True If True, show the outliers beyond the caps. showmeans: bool, default: True If True, show the arithmetic means. 
showtrend: bool, default: True If True, show the broken line among medians of datasets variawidth: bool, default: True If True, change the widths of boxes according to the sizes of datasets capcolor: color, default: 'black' The color of caps (horizontal lines at the ends of the whiskers) capwidth: float or int, default: 1 The width of caps (horizontal lines at the ends of the whiskers) whiskercolor: color, default: 'black' The color of whiskers (the vertical lines extending to the most extreme, non-outlier data points) whiskerwidth: float or int, default: 1 The width of whiskers (the vertical lines extending to the most extreme, non-outlier data points) boxfacecolor: color, default: 'white' The color of the faces of the boxes boxedgecolor: color, default: 'black' The color of the edges of the boxes boxedgewidth: float or int, default: 1 The width of the edges of the boxes mediancolor: color, default: 'orange' The color of the median lines in the boxes medianwidth: float or int, default: 1 The width of the median lines in the boxes medianlinestyle: str, default:'--' The line style of the median lines in the boxes '-': solid line style '--': dashed line style '-.': dash-dot line style ':': dotted line style meancolor: color, default: 'green' The color of the mean lines in the boxes meanwidth: float or int, default: 1 The width of the mean lines in the boxes meanlinestyle: str, default:'--' The line style of the mean lines in the boxes '-': solid line style '--': dashed line style '-.': dash-dot line style ':': dotted line style trendcolor: color, default: 'blue' The color of the line connecting the medians of the boxes trendwidth: float or int, default: 1.5 The width of the line connecting the medians of the boxes trendlinestyle: str, default:':' The line style of the line connecting the medians of the boxes '-': solid line style '--': dashed line style '-.': dash-dot line style ':': dotted line style curlinecolor: str, default: 'white' The color of edges of the curves curfacecolor: str, default: 'black' The color of faces of the curves curalpha: int, default: 1 The transparency of faces of the curves outliercolor: color, default: 'white' The color of the faces of points represent the outliers outlierlinecolor: color, default: 'black' The color of the edges of points represent the outliers outlierlinewidth: float or int, default: 1 The width of the edges of points represent the outliers Returns ------- matplotlib.axes """ try: bins += 0 except TypeError as err: print("The bins should be integer") raise err if isinstance(data, np.ndarray): assert len(data.shape) == 2, "The input should be 2-D array" assert data.dtype != '<U11', "The element in 2-D array should be numerical values" else: data = input_checking(data) # set x-axis and y-axis labels = [i + 1 for i in range(len(data))] y_min = min(min(data[i]) for i in range(len(data))) y_max = max(max(data[i]) for i in range(len(data))) ax.set_ylim(y_min - 0.1 * (abs(y_max)), y_max + 0.1 * (abs(y_max))) ax.set_xlim(0, len(labels) + 1) ax.set_xticks(labels) if labelset: ax.set_xticklabels(labelset, rotation=rotation) proportion = [] for index in data: proportion.append(len(index)) # set a box for each list of data for index in range(len(data)): # set the width of the box and caps if variawidth: width = 0.5 * (proportion[index] / sum(proportion)) else: width = 0.25 # get the quantiles quantiles = np.percentile(data[index], (25, 50, 75)) iqr = quantiles[2] - quantiles[0] # the lower bound of the box low_bound = quantiles[0] - whis * iqr # the upper 
bound of the box up_bound = quantiles[2] + whis * iqr # define the top of box box_top = min(max(data[index]), up_bound) # define the bottom of box box_bottom = max(min(data[index]), low_bound) height = max(data[index]) - min(data[index]) ax.vlines(labels[index], ymin=min(data[index]), ymax=max(data[index]), linewidth=1) inter = height / bins barwidth = height / bins total = [] low = min(data[index]) yli = [] xli = [] for m in range(bins): count = 0 for n in data[index]: if n >= low and n < low + inter: count += 1 low += inter if m == bins - 2: low += 1 # take the maximum value into consideration total.append(count) total = [(x - min(total)) / (max(total) - min(total)) * 0.5 for x in total] # scaler to(0,0.5) for p in range(len(total)): yli.append(min(data[index]) + p * barwidth + barwidth / 2) xli.append(total[p] + labels[index]) xli.insert(0, labels[index]) xli.append(labels[index]) yli.insert(0, box_bottom) yli.append(box_top) from operator import itemgetter yli, xli = [ list(x) for x in zip(*sorted(zip(yli, xli), key=itemgetter(0))) ] y = np.array(yli) ynew = np.linspace(min(y), max(y), 1000) from scipy.interpolate import make_interp_spline power_smooth = make_interp_spline(yli, xli, bc_type=([(1, 0.0)], [(1, 0.0) ]))(ynew) for t in range(len(power_smooth)): if power_smooth[t] < labels[index]: power_smooth[t] = labels[index] ax.fill_betweenx(ynew, labels[index], power_smooth, facecolor=curfacecolor, edgecolor=curlinecolor, alpha=curalpha) rect = plt.Rectangle((labels[index] - width, quantiles[0]), width, quantiles[2] - quantiles[0], color=boxfacecolor) ax.add_patch(rect) # pick out and draw the outliers outliers = np.concatenate((data[index][low_bound > data[index]], data[index][up_bound < data[index]])) for o in outliers: if showfliers: trans = (ax.figure.dpi_scale_trans + transforms.ScaledTranslation(labels[index], o, ax.transData)) circle = matplotlib.patches.Circle((0, 0), 0.04, edgecolor=outlierlinecolor, facecolor=outliercolor, transform=trans, linewidth=outlierlinewidth) ax.add_patch(circle) # do not consider outliers when drawing the boxplot data[index] = data[index][~np.isin(data[index], o)] # draw the bottom of box ax.hlines(quantiles[0], labels[index] - width, labels[index], linewidth=boxedgewidth, color=boxedgecolor) # draw the median of box ax.hlines( quantiles[1], labels[index] - width, labels[index], color=mediancolor, linewidth=medianwidth, ls=medianlinestyle, ) # draw the top of box ax.hlines(quantiles[2], labels[index] - width, labels[index], linewidth=boxedgewidth, color=boxedgecolor) if showcaps: # draw the high cap ax.hlines(box_top, labels[index] - width / 2, labels[index] + width / 2, linewidth=capwidth, color=capcolor) # draw the low cap ax.hlines(box_bottom, labels[index] - width / 2, labels[index] + width / 2, linewidth=capwidth, color=capcolor) # draw the low whisker ax.vlines(labels[index], ymin=box_bottom, ymax=quantiles[0], linewidth=whiskerwidth, color=whiskercolor) # draw the high whisker ax.vlines(labels[index], ymin=quantiles[2], ymax=box_top, linewidth=whiskerwidth, color=whiskercolor) # draw the left bound of whisker ax.vlines(labels[index] - width, ymin=quantiles[0], ymax=quantiles[2], linewidth=boxedgewidth, color=boxedgecolor) if showtrend: if index > 0: verts = [ (labels[index - 1], lastme), (labels[index], quantiles[1]), ] codes = [ Path.MOVETO, Path.LINETO, ] path = Path(verts, codes) patch = patches.PathPatch(path, color=trendcolor, ls=trendlinestyle, lw=trendwidth) ax.add_patch(patch) lastme = quantiles[1] if showmeans: 
ax.hlines(np.mean(data[index]), labels[index] - width, labels[index], color=meancolor, ls=meanlinestyle, linewidth=meanwidth) return ax
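# Possible usage of creative_boxplot (illustrative only; it assumes the helpers
# the function relies on, e.g. input_checking, are importable from this module).
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(4)
demo_data = [rng.normal(loc=i, scale=1 + 0.3 * i, size=200 + 50 * i)
             for i in range(4)]
fig, ax = plt.subplots(figsize=(8, 5))
creative_boxplot(ax, demo_data,
                 labelset=['Q1', 'Q2', 'Q3', 'Q4'],
                 boxfacecolor='lightgrey', curfacecolor='lightsteelblue',
                 trendcolor='navy', showmeans=True)
plt.show()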
def plot(self, background=False): # capval_percentile = 95% kwargs1 = {} kwargs2 = {} if 'fc' in self.plot_kwargs: kwargs1['fc'] = self.plot_kwargs['fc'] if 'alpha' in self.plot_kwargs: kwargs1['alpha'] = self.plot_kwargs['alpha'] if 'label_fontsize' in self.plot_kwargs: kwargs2['fontsize'] = self.plot_kwargs['label_fontsize'] else: kwargs2['fontsize'] = 9 if 'label' in self.plot_kwargs: label = self.plot_kwargs['label'] else: label = self.df.columns[3] ly = self.layout if ly.chrms_plot is None: vals = [v for k,(c,beg,end,v) in self.df.iterrows()] else: vals = [v for k,(c,beg,end,v) in self.df.iterrows() if c in ly.chrms_plot] minval = np.min(vals) maxval = np.max(vals) if 'capval_percentile' in self.plot_kwargs: capval = np.percentile(vals, self.plot_kwargs['capval_percentile']) else: capval = maxval valrange = float(capval - minval) angle1s = [] anglewids = [] heights = [] for k, (chrm, beg, end, value) in self.df.iterrows(): if chrm not in ly.chrm2angles: continue if value > capval: value = capval angle1 = ly.loc2angle(chrm, beg) anglewid = ly.loc2angle(chrm, end) - angle1 angle1s.append(angle1) anglewids.append(anglewid) heights.append(self.track_height*(value-minval)/valrange*self.maxr) # ha='center', va='center', rotation=normalize_text_angle(text_angle/(np.pi*2)*360,tangent=True), # plotting background bg_start = [] bg_width = [] bg_height = [] if background: for chrm in ly.chrm2angles: angle_beg = ly.loc2angle(chrm,0) angle_end = ly.loc2angle(chrm,ly.chrm2len[chrm]) bg_start.append(angle_beg) bg_width.append(angle_end-angle_beg) bg_height.append(self.track_height*self.maxr) ly.ax.bar(bg_start, bg_height, bg_width, bottom=self.track_bottom, ec='none', fc='grey', align='edge', alpha=0.1) # plot data ly.ax.bar(angle1s, heights, anglewids, bottom=self.track_bottom, ec='none', align='edge', **kwargs1) if 'labelside_text_angle' in self.plot_kwargs: text_angle = self.plot_kwargs['labelside_text_angle'] / 180.0 * np.pi else: text_angle = ly.angle_beg if 'label' in self.plot_kwargs: ly.ax.text(text_angle, self.track_bottom + self.track_height/2.0, label, fontname=ly.fontname, va='center', ha='left', **kwargs2) if 'labelside_circle' in self.plot_kwargs: fc = self.plot_kwargs['fc'] if 'fc' in self.plot_kwargs else 'r' if 'labelside_circ_angle' in self.plot_kwargs: circ_angle = self.plot_kwargs['labelside_circ_angle'] else: circ_angle = ly.angle_beg ly.ax.add_artist(mpatches.Circle( polar2cart(circ_angle, self.track_bottom), self.plot_kwargs['labelside_circle'], edgecolor=fc, facecolor=fc, alpha=0.4, lw=0.1, transform=ly.ax.transData._b))
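# Sketch (not from the original class) of the capping step driven by
# plot_kwargs['capval_percentile'] above: values beyond the chosen percentile
# are clipped to it, so a few extreme bins cannot flatten the rest of the track
# when the heights are rescaled into the track's radial range.
import numpy as np

vals = np.random.RandomState(5).lognormal(mean=0.0, sigma=1.5, size=1000)
minval = vals.min()
capval = np.percentile(vals, 95)          # capval_percentile = 95
capped = np.minimum(vals, capval)
heights = (capped - minval) / float(capval - minval)   # normalised 0..1
print(capval, heights.min(), heights.max())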
def histobox_plot(ax: matplotlib.axes, data: List[np.ndarray or List[int or float]] or np.ndarray, bins: int = 10) -> matplotlib.axes: """ Drawing function for plot which is a mix between a box plot and a histogram Drawing a mixed plot for each data set in the data list. The left half is a traditional box plot, while there is a histogram reflecting the distribution on the right half. Parameters ---------- ax: matplotlib.axes data: List[np.ndarray or List[int or float]] or np.ndarray consists in a list of list and each item of data is a list containing multiple series of numerical values. bins: int, default: 10 Returns ------- matplotlib.axes """ # input checking try: bins += 0 except TypeError as err: print("The bins should be integer") raise err if isinstance(data, np.ndarray): assert len(data.shape) == 2, "The input should be 2-D array" assert data.dtype != '<U11', "The element in 2-D array should be numerical values" else: data = input_checking(data) # set x-axis and y-axis labels = [i + 1 for i in range(len(data))] ax.set_xticks(labels) y_min = min(min(data[i]) for i in range(len(data))) y_max = max(max(data[i]) for i in range(len(data))) ax.set_ylim(y_min - 0.1 * abs(y_max), y_max + 0.1 * (abs(y_max))) ax.set_xlim(0, len(labels) + 1) # set a box for each list of data for index in range(len(data)): # set the width of the box and caps width = 0.2 # get the quantiles quantiles = np.percentile(data[index], (25, 50, 75)) iqr = quantiles[2] - quantiles[0] # the lower bound of the box low_bound = quantiles[0] - 1.5 * iqr # the upper bound of the box up_bound = quantiles[2] + 1.5 * iqr # deal with the bar plot height = max(data[index]) - min(data[index]) ax.vlines(labels[index], ymin=min(data[index]), ymax=max(data[index]), linewidth=1) inter = height / bins barwidth = height / bins total = [] low = min(data[index]) for m in range(bins): count = 0 for n in data[index]: if n >= low and n < low + inter: count += 1 low += inter # take the maximum value into consideration if m == bins: low += 1 total.append(count) # scaler to(0,0.5) total = [(x - min(total)) / (max(total) - min(total)) * 0.5 for x in total] for p in range(len(total)): rect = plt.Rectangle( (labels[index], min(data[index]) + p * barwidth), total[p], barwidth, edgecolor='black', facecolor='silver') ax.add_patch(rect) # pick out and draw the outliers outliers = np.concatenate((data[index][low_bound > data[index]], data[index][up_bound < data[index]])) for o in outliers: trans = ( ax.figure.dpi_scale_trans + transforms.ScaledTranslation(labels[index], o, ax.transData)) circle = matplotlib.patches.Circle((0, 0), 0.04, edgecolor='black', facecolor='white', transform=trans) ax.add_patch(circle) # do not consider outliers when drawing the boxplot data[index] = data[index][~np.isin(data[index], o)] # draw the whisker,caps and box # define the top of box box_top = min(max(data[index]), up_bound) # define the bottom of box box_bottom = max(min(data[index]), low_bound) # draw the bottom of box ax.hlines(quantiles[0], labels[index] - width, labels[index], linewidth=1) # draw the median of box ax.hlines(quantiles[1], labels[index] - width, labels[index], linewidth=1) # draw the top of box ax.hlines(quantiles[2], labels[index] - width, labels[index], linewidth=1) # draw the high cap ax.hlines(box_top, labels[index] - width / 2, labels[index], linewidth=1) # draw the low cap ax.hlines(box_bottom, labels[index] - width / 2, labels[index], linewidth=1) # draw the low whisker ax.vlines(labels[index], ymin=box_bottom, ymax=quantiles[0], linewidth=1) 
# draw the high whisker ax.vlines(labels[index], ymin=quantiles[2], ymax=box_top, linewidth=1) # draw the left bound of whisker ax.vlines(labels[index] - width, ymin=quantiles[0], ymax=quantiles[2], linewidth=1) ax.set_xlim(0, len(labels) + 1) return ax
def find_bright_spots(image, n_clusters=3, blur_radius=21, amount_of_bright_parts=0.8, return_all_pos=False): """ Find the indices location of the top-k brightest spots in an color image. :param image: input image. Must be an mutli-channel RGB color image :type image: numpy.ndarray :param n_clusters: expected number of clusters/brightest spots in the input image :type n_clusters: int :param blur_radius: radius of the Gaussian blur kernel that used to smooth the image :type blur_radius: int :param amount_of_bright_parts: amount of bright parts in an image. Used to find the lower bound for \ distinguishing the bright and non-bright part of the input image. Range of amount_of_bright_parts is \ in [0, 1] (all non-bright -> all bright) :type amount_of_bright_parts: float :return: The location of centers of top-3 bright spots (with irregular shape), percentage of dominance of each spot \ (relative size of the spot) :rtype: (numpy.ndarray, numpy.ndarray) """ assert amount_of_bright_parts >= 0 and amount_of_bright_parts <= 1, "Range of the sample ration is in [0, 1]" assert n_clusters >= 1, "The number of bright spots must be larger or equal to 1" amount_of_bright_parts = amount_of_bright_parts * 100 # Convert BRG to Greyscale grayscale_img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) # Blur the image with radius = blurRadius blurred_img = cv2.GaussianBlur(grayscale_img, (blur_radius, blur_radius), 0) # Compute the lower bound threshold lower_bound = np.percentile(blurred_img, 100 - amount_of_bright_parts) - 1 # Threshold the imgae by setting any pixel with value larger than lower bound to 255 threshed_img = cv2.threshold(blurred_img, lower_bound, 255, cv2.THRESH_BINARY)[1] # Purifiy the edges of the brightest spots. threshed_img = cv2.erode(threshed_img, None, iterations=2) threshed_img = cv2.dilate(threshed_img, None, iterations=4) # Get the location of all white pixel in binary threshold image locs = np.argwhere(threshed_img == 255) try: # convert to np.float32 locs_above_threshold = np.float32(locs) # define criteria and apply kmeans() criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0) ret, label, center = cv2.kmeans(locs_above_threshold, n_clusters, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS) # Compute the percentage of each clusters in the image. percent_of_dominance = compute_percents_of_labels(label) if return_all_pos: return label, locs_above_threshold.astype( "int32"), percent_of_dominance else: return center.astype("int32"), percent_of_dominance except: # Catch exception return negative array. if return_all_pos: return np.array([-1]), np.array([-1]), np.array([-1]) else: return np.ones(shape=(n_clusters, 1)) * -1, np.ones(shape=(n_clusters, 1)) * -1
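# Hypothetical usage of find_bright_spots (my example, not from the source
# project): a synthetic grey gradient with two saturated patches stands in for
# a real photograph, so the two patches should come back as the two clusters.
import cv2
import numpy as np

background = np.tile(np.linspace(0, 100, 200, dtype=np.uint8), (200, 1))
img = cv2.cvtColor(background, cv2.COLOR_GRAY2RGB)
img[40:60, 40:60] = 255       # first bright patch
img[150:170, 120:140] = 255   # second bright patch
centers, dominance = find_bright_spots(img, n_clusters=2,
                                       amount_of_bright_parts=0.05)
print(centers)     # (row, col) centres of the detected bright spots
print(dominance)   # relative size of each spot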
def combine_tracks(self, plot=True, overwrite=False, search_radius=8, min_duration=8, min_duration_in_start_area=3, propagation_speed=-2, propagation_length=10, lat_restriction=[5, 35]): out_file = self._working_dir + str(self._identifier) + '_track_info.nc' if overwrite and os.path.isfile(out_file): os.system('rm ' + out_file) os.system('rm ' + self._working_dir + 'track_path/' + str(self._identifier) + '_*_*.png') elif overwrite == False and os.path.isfile(out_file): self._aews = da.read_nc(out_file) return self._aews def unit_vector(vector): return vector / np.linalg.norm(vector) def angle_between(v1, v2): if sum(v1) == 0 or sum(v2) == 0: return 0 v1_u = unit_vector(v1) v2_u = unit_vector(v2) return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0)) def v_len(v): return sum([zz**2 for zz in v])**0.5 # convert distances from degrees into grid-cells search_radius = self.degree_to_step(search_radius) propagation_length = self.degree_to_step(propagation_length) found_id = 0 found_tracks = {} postions = self._detected.copy().values used_pos = [] print('combining tracks\n10------50-------100') for p_0, progress in zip( postions.tolist(), np.array([['-'] + [''] * (len(postions.tolist()) / 20 + 1)] * 20).flatten()[0:len(postions.tolist())]): sys.stdout.write(progress) sys.stdout.flush() track = [p_0, p_0] running = True #go backwards while True: p = track[0] p__ = track[1] candidates = {} v_1 = ((p[1] - p__[1]), (p[2] - p__[2])) for p_1 in postions[postions[:, 0] == p[0] - 1, :].tolist(): v_2 = ((p_1[1] - p[1]), (p_1[2] - p[2])) if v_len(v_2) < search_radius: candidates[v_len(v_2) * angle_between(v_1, v_2)] = p_1 end = False if len(candidates.keys()) > 0: track = [candidates[np.min(candidates.keys())]] + track else: break #go forewards while True: p = track[-1] p__ = track[-2] candidates = {} v_1 = ((p[1] - p__[1]), (p[2] - p__[2])) for p_1 in postions[postions[:, 0] == p[0] + 1, :].tolist(): v_2 = ((p_1[1] - p[1]), (p_1[2] - p[2])) if v_len(v_2) < search_radius: candidates[v_len(v_2) * angle_between(v_1, v_2)] = p_1 end = False if len(candidates.keys()) > 0: track = track + [candidates[np.min(candidates.keys())]] else: break track.remove(p_0) track = da.DimArray(track, axes=[np.array(track)[:, 0], self._detected.z], dims=['time', 'z']) # conditions: keep = True # duration if len(track.time) < min_duration: keep = False # duration in start domain if min_duration_in_start_area > 1 and keep: start_of_track = track[ self._lons[np.array(track[:, 'y'], int), np.array(track[:, 'x'], int)] > -40] if len(start_of_track.time) < min_duration_in_start_area: keep = False # propagation speed (in starting domain?) if propagation_speed is not None and len(track.time) > 1 and keep: c = np.array([ self.step_to_distance(track[i - 1, 'y'], track[i, 'y'], track[i - 1, 'x'], track[i, 'x'])[0] / (6. * 60. 
* 60) for i in track.time[1:] if i - 1 in track.time ]) if np.percentile(c, 50) > propagation_speed: keep = False # propagation length if propagation_length is not None and len(track.time) > 1 and keep: if abs(self._lons[int(track.ix[-1, 1]), int(track.ix[-1, 2])] - self._lons[int(track.ix[0, 1]), int(track.ix[0, 2])]) < propagation_length: keep = False # lat genesis restriction if lat_restriction is not None and keep: if self._lats[np.array(track.ix[0, 1], int), np.array(track.ix[0, 2], int)] > max( lat_restriction) or self._lats[ np.array(track.ix[0, 1], int), np.array(track.ix[0, 2], int)] < min( lat_restriction): keep = False # delete duplicates if keep: for id__, track__ in found_tracks.items(): if sum([ pp in track__.values.tolist() for pp in track.values.tolist() ]) / float(len(track.values.tolist())) != 0: if track[:, 'members'].sum() > track__[:, 'members'].sum(): found_tracks.pop(id__) break else: keep = False break if keep: found_tracks[found_id] = track found_id += 1 self._aews = {} self._id = 0 for track in found_tracks.values(): track = da.DimArray(track, axes=[np.array(track)[:, 0], self._detected.z], dims=['time', 'z']) self._aews[self._identifier + '_' + str(self._id)] = track if plot: self.plot_track_path(track) self._id += 1 self._aews = da.Dataset(self._aews) self._aews.write_nc(out_file, mode='w') print('\ndone') return self._aews
def info_boxplot_v1( ax: matplotlib.axes, data: List[np.ndarray or List[int or float]] or np.ndarray) -> matplotlib.axes: """ Drawing function for box plots. This is the 1st version of info_boxplot which satisfies the requirement 1. The box plots will be drawn by this function and the quartile 1, median, quartile 3, the lower fence (sometimes called “minimum”), and upper fence will be included. The outliers will be drawn as pointers on the plot. Parameters ---------- ax: matplotlib.axes.Axis data: list(list()), ...) consists in a list of list and each item of data is a list containing multiple series of numerical values Returns ------- matplotlib.axes """ # input checking if isinstance(data, np.ndarray): assert len(data.shape) == 2, "The input should be 2-D array" assert data.dtype != '<U11', "The element in 2-D array should be numerical values" else: data = input_checking(data) # set x-axis and y-axis labels = [i + 1 for i in range(len(data))] y_min = min(min(data[i]) for i in range(len(data))) y_max = max(max(data[i]) for i in range(len(data))) ax.set_ylim(y_min - 0.1 * (abs(y_max)), y_max + 0.1 * (abs(y_max))) ax.set_xlim(0, len(labels) + 1) ax.set_xticks(labels) # set a box for each list of data for index in range(len(data)): # set the width of the box and caps width = 0.2 # get the quantiles quantiles = np.percentile(data[index], (25, 50, 75)) iqr = quantiles[2] - quantiles[0] # the lower bound of the box low_bound = quantiles[0] - 1.5 * iqr # the upper bound of the box up_bound = quantiles[2] + 1.5 * iqr # pick out and draw the outliers outliers = np.concatenate((data[index][low_bound > data[index]], data[index][up_bound < data[index]])) for o in outliers: trans = ( ax.figure.dpi_scale_trans + transforms.ScaledTranslation(labels[index], o, ax.transData)) circle = matplotlib.patches.Circle((0, 0), 0.04, edgecolor='black', facecolor='white', transform=trans) ax.add_patch(circle) # do not consider outliers when drawing the boxplot data[index] = data[index][~np.isin(data[index], o)] # draw the whisker,caps and box # define the top of box box_top = min(max(data[index]), up_bound) # define the bottom of box box_bottom = max(min(data[index]), low_bound) # draw the bottom of box ax.hlines(quantiles[0], labels[index] - width, labels[index] + width, linewidth=1) # draw the median of box ax.hlines(quantiles[1], labels[index] - width, labels[index] + width, color='orange', linewidth=1) # draw the top of box ax.hlines(quantiles[2], labels[index] - width, labels[index] + width, linewidth=1) # draw the high cap ax.hlines(box_top, labels[index] - width / 2, labels[index] + width / 2, linewidth=1) # draw the low cap ax.hlines(box_bottom, labels[index] - width / 2, labels[index] + width / 2, linewidth=1) # draw the low whisker ax.vlines(labels[index], ymin=box_bottom, ymax=quantiles[0], linewidth=1) # draw the high whisker ax.vlines(labels[index], ymin=quantiles[2], ymax=box_top, linewidth=1) # draw the left bound of whisker ax.vlines(labels[index] - width, ymin=quantiles[0], ymax=quantiles[2], linewidth=1) # draw the right bound of whisker ax.vlines(labels[index] + width, ymin=quantiles[0], ymax=quantiles[2], linewidth=1) return ax
def find_bad_by_correlation( self, corr_thresh=0.4, fraction_bad=0.1, corr_window_secs=1.0 ): """Detect channels that do not correlate well with the other channels. Divide the whole signal into windows and compute window wise correlations. If a channel has more than `fraction_bad` windows that have correlate less than `corr_thresh` with the other channels, that channel is considered `bad_by_correlation`. The measure of correlation with other channels is defined as the 98th percentile of the absolute values of the correlations with the other channels in each window. Parameters ---------- corr_thresh : float The minimum correlation threshold that should be attained within a data window. fraction_bad : float If this percentage of all data windows in which the correlation threshold was not surpassed is exceeded, classify a channel as `bad_by_correlation`. corr_window_secs : float Width of the correlation window in seconds. """ # Based on the data, determine how many windows we need # and how large they should be correlation_frames = corr_window_secs * self.sfreq correlation_window = np.arange(0, correlation_frames) n = correlation_window.shape[0] correlation_offsets = np.arange( 0, (self.signal_len - correlation_frames), correlation_frames ) w_correlation = correlation_offsets.shape[0] # preallocate channel_correlations = np.ones((w_correlation, self.n_chans)) # Cut the data indo windows x_bp_window = self.x_bp[: self.n_chans, : n * w_correlation] x_bp_window = x_bp_window.reshape(self.n_chans, n, w_correlation) # Perform Pearson correlations across channels per window # For each channel, take the absolute of the 98th percentile of # correlations with the other channels as a measure of how well # correlated that channel is with the others. for k in range(w_correlation): eeg_portion = x_bp_window[:, :, k] window_correlation = np.corrcoef(eeg_portion) abs_corr = np.abs( (window_correlation - np.diag(np.diag(window_correlation))) ) channel_correlations[k, :] = np.percentile(abs_corr, 98, axis=0) # Perform thresholding to see which channels correlate badly with the # other channels in a certain fraction of windows (bad_time_threshold) thresholded_correlations = channel_correlations < corr_thresh frac_bad_corr_windows = np.mean(thresholded_correlations, axis=0) # find the corresponding channel names and return bad_idxs_bool = frac_bad_corr_windows > fraction_bad bad_idxs = np.argwhere(bad_idxs_bool) bads = self.ch_names[bad_idxs.astype(int)] bads = [i[0] for i in bads] bads.sort() self.bad_by_correlation = bads self._channel_correlations = channel_correlations return None
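# Condensed sketch (my own, outside the class) of the per-window measure
# described above: correlate all channels within one window, zero the diagonal,
# and take the 98th percentile of the absolute correlations per channel as its
# agreement with the rest of the montage; a disconnected channel scores low.
import numpy as np

rng = np.random.RandomState(6)
common = rng.randn(500)                      # shared signal across channels
window = common + 0.3 * rng.randn(8, 500)    # 8 channels x 500 samples
window[7] = rng.randn(500)                   # channel 7: unrelated noise
corr = np.corrcoef(window)
abs_corr = np.abs(corr - np.diag(np.diag(corr)))
per_channel = np.percentile(abs_corr, 98, axis=0)
print(per_channel)                           # channel 7 scores lowest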
def info_boxplot_v3(ax: matplotlib.axes, data: List[np.ndarray or List[int or float]] or np.ndarray, facecolor: str = 'white', outliercolor: str = 'steelblue', boxlinecolor: str = 'black', whiskercolor: str = 'black', outlierlinecolor: str = 'white', capcolor: str = 'black', medianlinecolor: str = 'orange', multiplebox: bool = True) -> matplotlib.axes: """ Drawing function for box plots. This is the 3rd version of info_boxplot which satisfies the requirement 3. Based on the previous `info_boxplot`, the `info_boxplot_v3` can show every 5%-percentile from the 1st quartile (Q1) until the 3rd quartile (Q3). Parameters ---------- ax: matplotlib.axes.Axis data: List[np.ndarray or List[int or float]] or np.ndarray consists in a list of list and each item of data is a list containing multiple series of numerical values. facecolor: str, default: 'white' The color of the faces of boxes. outliercolor: str, default: 'steelblue' The color of points which represent outliers. outlierlinecolor: str, default: 'white' The color of the edges of points which represent outliers. boxlinecolor: str, default: 'black' The color of the edges of the boxes. whiskercolor: str, default: 'black' The color of whiskers (the vertical lines extending to the most extreme, non-outlier data points). capcolor: str, default: 'black' The color of caps (horizontal lines at the ends of the whiskers). medianlinecolor: str, default: 'orange' The color of the median lines in the boxes. multiplebox: bool, default: True If true, lines which represent every 5%-percentile from the 1st quartile (Q1) until the 3rd quartile (Q3) will be drawn. Returns ------- matplotlib.axes """ # input checking if isinstance(data, np.ndarray): assert len(data.shape) == 2, "The input should be 2-D array" assert data.dtype != '<U11', "The element in 2-D array should be numerical values" else: data = input_checking(data) # set x-axis and y-axis labels = [i + 1 for i in range(len(data))] y_min = min(min(data[i]) for i in range(len(data))) y_max = max(max(data[i]) for i in range(len(data))) ax.set_ylim(y_min - 0.1 * abs(y_max), y_max + 0.1 * (abs(y_max))) ax.set_xlim(0, len(labels) + 1) ax.set_xticks(labels) # set a box for each list of data for index in range(len(data)): width = 0.2 # set the width of the box and caps quantiles = np.percentile(data[index], (25, 50, 75)) # get the quantiles iqr = quantiles[2] - quantiles[0] low_bound = quantiles[0] - 1.5 * iqr # the lower bound of the box up_bound = quantiles[2] + 1.5 * iqr # the upper bound of the box # pick out and draw the outliers outliers = np.concatenate((data[index][low_bound > data[index]], data[index][up_bound < data[index]])) for o in outliers: trans = ( ax.figure.dpi_scale_trans + transforms.ScaledTranslation(labels[index], o, ax.transData)) circle = matplotlib.patches.Circle((0, 0), 0.04, edgecolor=outlierlinecolor, facecolor=outliercolor, transform=trans) ax.add_patch(circle) # do not consider outliers when drawing the boxplot data[index] = data[index][~np.isin(data[index], o)] # draw the whisker,caps and box # define the top of box box_top = min(max(data[index]), up_bound) # define the bottom of box box_bottom = max(min(data[index]), low_bound) # draw the bottom of box ax.hlines(quantiles[0], labels[index] - width, labels[index] + width, linewidth=1, color=boxlinecolor) # draw the median of box ax.hlines(quantiles[1], labels[index] - width, labels[index] + width, color=medianlinecolor, linewidth=1) # draw the top of box ax.hlines(quantiles[2], labels[index] - width, labels[index] + width, 
linewidth=1, color=boxlinecolor) # draw the high cap ax.hlines(box_top, labels[index] - width / 2, labels[index] + width / 2, linewidth=1, color=capcolor) # draw the low cap ax.hlines(box_bottom, labels[index] - width / 2, labels[index] + width / 2, linewidth=1, color=capcolor) # draw the low whisker ax.vlines(labels[index], ymin=box_bottom, ymax=quantiles[0], linewidth=1, color=whiskercolor) # draw the high whisker ax.vlines(labels[index], ymin=quantiles[2], ymax=box_top, linewidth=1, color=whiskercolor) # draw the left bound of whisker ax.vlines(labels[index] - width, ymin=quantiles[0], ymax=quantiles[2], linewidth=1, color=boxlinecolor) # draw the right bound of whisker ax.vlines(labels[index] + width, ymin=quantiles[0], ymax=quantiles[2], linewidth=1, color=boxlinecolor) if multiplebox: per5 = np.percentile(data[index], (30, 35, 40, 45, 50, 55, 60, 65, 70), interpolation='midpoint') for k in range(len(per5)): ax.hlines(per5[k], labels[index] - width, labels[index] + width, linewidth=1, color=boxlinecolor) ax.hlines(quantiles[1], labels[index] - width, labels[index] + width, color=medianlinecolor, linewidth=3) # define the color of the box's face rect = plt.Rectangle((labels[index] - width, quantiles[0]), 2 * width, quantiles[2] - quantiles[0], color=facecolor) ax.add_patch(rect) return ax
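# Possible call to info_boxplot_v3 (illustrative; assumes input_checking and
# the other imports used by this module are available).
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(7)
demo = [rng.normal(size=300), rng.exponential(size=300), rng.uniform(-2, 2, 300)]
fig, ax = plt.subplots()
info_boxplot_v3(ax, demo, facecolor='whitesmoke', medianlinecolor='darkorange',
                multiplebox=True)   # draw the extra percentile lines inside the box
plt.show()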
#To only consider the area inside the box for detecting the digit #roi = Region of Interest roi = gray[upper_left[1]:bottom_right[1], upper_left[0]:bottom_right[0]] #convert cv2 to pil format im_pil = Image.fromarray(roi) #convert to grayscale image - 'L' format means each pixel is #represented by a single value from 0 to 255 image_bw = im_pil.convert('L') image_bw_resized = image_bw.resize((28, 28), Image.ANTIALIAS) image_bw_resized_inverted = PIL.ImageOps.invert(image_bw_resized) pixel_filter = 20 min_pixel = np.percentile(image_bw_resized_inverted, pixel_filter) image_bw_resized_inverted_scaled = np.clip( image_bw_resized_inverted - min_pixel, 0, 255) max_pixel = np.max(image_bw_resized_inverted) image_bw_resized_inverted_scaled = np.asarray( image_bw_resized_inverted_scaled) / max_pixel test_sample = np.array(image_bw_resized_inverted_scaled).reshape( 1, 784) test_pred = clf.predict(test_sample) print("Predicted class is :-", test_pred) # Display the resulting frame cv2.imshow('frame', gray) if cv2.waitKey(1) & 0xFF == ord('q'): break except Exception as e:
import os
import sys

import numpy as np

sys.path.append('../')
from word_count_helpers import *

# Load the file names
dataDir = "../../../data/word_count/"
fnames = os.listdir(dataDir)

allcounts = []
for fname in fnames:
    with open(dataDir + fname, 'r', encoding='utf-8') as f:
        text = cleantext(f.readlines())
    allcounts.append(countwords(text))

# Merge the per-file counts into one global dictionary
globalcounts = dict()
for counts in allcounts:
    globalcounts = {k: counts.get(k, 0) + globalcounts.get(k, 0)
                    for k in set(counts) | set(globalcounts)}

# For each file, keep only words above the 98th percentile of its counts,
# normalise them by the global counts and report the top five
for counts in allcounts:
    counts = {k: v for k, v in counts.items()
              if v > np.percentile(list(counts.values()), 98)}
    normcounts = {k: counts.get(k, 0) / globalcounts.get(k, 0)
                  for k in set(counts) & set(globalcounts)}
    top5 = sorted(normcounts, key=normcounts.get, reverse=True)[:5]
    summary = ''
    for k in top5:
        summary = summary + "%s: %s" % (k, normcounts[k]) + "; "
    print(summary[:-2])
# the resolution and S/N cuts. Use these distributions to make cuts. for ind in range(len(fwhm_arr)): fwhm = fwhm_arr[ind] print 'Beginning work for FWHM=%.2f arcsec:' % fwhm # Get the resolution for this FWHM value. res = resolution_arr[:, ind] # Get the subsets that pass / fail the resolution>=1/3 cut. pass_cuts = res >= 1. / 3 fail_cuts = (1 - pass_cuts).astype(bool) # Find the 5th percentile in half-light radius for galaxies that pass the resolution cut. Then # check that if we cut there, what fraction of the galaxies that fail the resolution cut are # eliminated. cut_val = np.percentile(gal_hlr[pass_cuts], 5.) elim_frac = float(np.sum(gal_hlr[fail_cuts] < cut_val)) / len( gal_hlr[fail_cuts]) print ' Radius cut at %.3f arcsec eliminates a fraction %f of res failures' % ( cut_val, elim_frac) if do_plot: fig = plt.figure() ax = fig.add_subplot(111) n, bins, patches = ax.hist(gal_hlr[pass_cuts], np.linspace(0., 1., 21), facecolor='green', alpha=0.75) n, bins, patches = ax.hist(gal_hlr[fail_cuts], np.linspace(0., 1., 21), facecolor='red',
'LW': LW, } fit2 = pystan.stan(model_code=model, data=data, iter=5000, chains=4) la2 = fit2.extract() fit2 plt.figure(figsize=(15, 7)) cmap = matplotlib.cm.get_cmap('tab10') for j, player in enumerate(arr_target_player): samples = la2['mu'][:, j, :] medians = np.median(samples, axis=0) lower, upper = np.percentile(samples, q=[25.0, 75.0], axis=0) c = cmap(j) plt.plot(arr_target_year, medians, marker='o', label=player, color=c) plt.fill_between(arr_target_year, lower, upper, alpha=0.2, color=c) plt.xlabel('year') plt.ylabel('latent strength') plt.legend(loc='lower left', bbox_to_anchor=(1, 0.5)) plt.show() cmap = matplotlib.cm.get_cmap('tab10') for j, player in enumerate(arr_target_player):
def get_sum_metrics(batch_output, batch_target, metrics_type, test=False, printDice=False): if torch.is_tensor(batch_output): batch_output = batch_output.data.cpu().numpy() if torch.is_tensor(batch_target): batch_target = batch_target.data.cpu().numpy() assert batch_output.shape == batch_target.shape assert len(batch_output.shape) == 4 spacing = (1, 1) size = batch_output.shape[0] metrics = dict.fromkeys(metrics_type, 0) dices = [] for i in range(size): output = batch_output[i, 0] target = batch_target[i, 0] labelPred = sitk.GetImageFromArray(output, isVector=False) labelPred.SetSpacing(spacing) labelTrue = sitk.GetImageFromArray(target, isVector=False) labelTrue.SetSpacing(spacing) # spacing order (x, y, z) # voxel_metrics pred = output.astype(int) gdth = target.astype(int) fp_array = copy.deepcopy(pred) # keep pred unchanged fn_array = copy.deepcopy(gdth) gdth_sum = np.sum(gdth) pred_sum = np.sum(pred) intersection = gdth & pred union = gdth | pred intersection_sum = np.count_nonzero(intersection) union_sum = np.count_nonzero(union) tp_array = intersection tmp = pred - gdth fp_array[tmp < 1] = 0 tmp2 = gdth - pred fn_array[tmp2 < 1] = 0 tn_array = np.ones(gdth.shape) - union tp, fp, fn, tn = np.sum(tp_array), np.sum(fp_array), np.sum( fn_array), np.sum(tn_array) smooth = EPSILON precision = (tp) / (pred_sum + smooth) recall = (tp) / (gdth_sum + smooth) false_positive_rate = (fp) / (fp + tn + smooth) false_negtive_rate = (fn) / (fn + tp + smooth) jaccard = (intersection_sum) / (union_sum + smooth) dice = (2 * intersection_sum) / (gdth_sum + pred_sum + smooth) ppv = (intersection_sum) / (pred_sum + smooth) dicecomputer = sitk.LabelOverlapMeasuresImageFilter() dicecomputer.Execute(labelTrue > 0.5, labelPred > 0.5) # distance_metrics signed_distance_map = sitk.SignedMaurerDistanceMap( labelTrue > 0.5, squaredDistance=False, useImageSpacing=True) # It need to be adapted. 
ref_distance_map = sitk.Abs(signed_distance_map) ref_surface = sitk.LabelContour(labelTrue > 0.5, fullyConnected=True) statistics_image_filter = sitk.StatisticsImageFilter() statistics_image_filter.Execute(ref_surface > 0.5) num_ref_surface_pixels = int(statistics_image_filter.GetSum()) signed_distance_map_pred = sitk.SignedMaurerDistanceMap( labelPred > 0.5, squaredDistance=False, useImageSpacing=True) seg_distance_map = sitk.Abs(signed_distance_map_pred) seg_surface = sitk.LabelContour(labelPred > 0.5, fullyConnected=True) seg2ref_distance_map = ref_distance_map * sitk.Cast( seg_surface, sitk.sitkFloat32) ref2seg_distance_map = seg_distance_map * sitk.Cast( ref_surface, sitk.sitkFloat32) statistics_image_filter.Execute(seg_surface > 0.5) num_seg_surface_pixels = int(statistics_image_filter.GetSum()) seg2ref_distance_map_arr = sitk.GetArrayViewFromImage( seg2ref_distance_map) seg2ref_distances = list( seg2ref_distance_map_arr[seg2ref_distance_map_arr != 0]) seg2ref_distances = seg2ref_distances + list( np.zeros(num_seg_surface_pixels - len(seg2ref_distances))) ref2seg_distance_map_arr = sitk.GetArrayViewFromImage( ref2seg_distance_map) ref2seg_distances = list( ref2seg_distance_map_arr[ref2seg_distance_map_arr != 0]) ref2seg_distances = ref2seg_distances + list( np.zeros(num_ref_surface_pixels - len(ref2seg_distances))) all_surface_distances = seg2ref_distances + ref2seg_distances metrics['dice'] += dice metrics['jaccard'] += jaccard metrics['precision'] += precision metrics['recall'] += recall metrics['fpr'] += false_positive_rate metrics['fnr'] += false_negtive_rate metrics['vs'] += dicecomputer.GetVolumeSimilarity() metrics['ppv'] += ppv metrics["msd"] += np.mean(all_surface_distances) metrics["mdsd"] += np.median(all_surface_distances) metrics["stdsd"] += np.std(all_surface_distances) metrics["hd95"] += np.percentile(all_surface_distances, 95) metrics["hd"] += np.max(all_surface_distances) if printDice: dices.append(dice) if printDice: return metrics, dices return metrics
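# Toy sketch (mine, not from the original metrics module) of how the distance
# based scores above are reduced: given the pooled surface distances, the mean,
# median and the 95th percentile (the robust "HD95" Hausdorff distance) are taken.
import numpy as np

all_surface_distances = np.abs(np.random.RandomState(8).normal(0.0, 1.5, 2000))
print('msd  ', np.mean(all_surface_distances))
print('mdsd ', np.median(all_surface_distances))
print('hd95 ', np.percentile(all_surface_distances, 95))
print('hd   ', np.max(all_surface_distances))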
import os
import pickle
import sys
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

output = sys.argv[1]
if not os.path.exists('../' + output):
    os.makedirs('../' + output)

with open('../inputs/mirror_roll/train.pickle', 'rb') as f:
    images = pickle.load(f)

# Ratio of labelled (non-zero) voxels to unlabelled voxels for every sample
list1 = []
for itr in range(len(images['Label'])):
    trla = images['Label'][itr]
    trlaa = trla[0:1, :, :, :]
    label_ratio = (trlaa > 0).sum() / (trlaa.shape[1] * trlaa.shape[2] * trlaa.shape[3]
                                       - (trlaa > 0).sum())
    list1.append(label_ratio)

list2 = np.sort(list1)
print('mean: ', np.mean(list2))
print('max: ', np.max(list2))
print('min: ', np.min(list2))
print('20%: ', np.percentile(list2, 20))
print('80%: ', np.percentile(list2, 80))

plt.hist(list2, bins=100)
plt.title('label ratio distribution')
plt.savefig('../' + output + '/dis.png')

df = pd.DataFrame(list2)
df.to_csv('../' + output + '/dis.csv', index_label=False)
csv_file.write('lat\tlng\tcluster\tname\tiso2\tnum_posts\n') for city, cid in sorted(zip(eligible_cities, cluster_ids)): (lat, lng), iso2 = locations[city] csv_file.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (lat, lng, cid, city, iso2, city_density[city])) elif args.pca: # vectors = np.array([model.docvecs[city] for city in eligible_cities]) # subtract negatives to guarantee positive numbers vectors -= vectors.min() pca = NMF(n_components=3, init='nndsvd', shuffle=True) rgb = pca.fit_transform(vectors) # scale values in RGB to [0-1] for component in range(3): lower_bound = np.percentile(rgb[:, component], 1, axis=0) upper_bound = np.percentile(rgb[:, component], 90, axis=0) rgb[:, component] -= lower_bound rgb[:, component] /= upper_bound - lower_bound color_names = np.clip(rgb, 0.0, 1.0) cMap = 'Reds' # make map from shape file # plot background map fs = (int(area.geometry.bounds.maxx.max() - area.geometry.bounds.minx.min())//1.5, int(area.geometry.bounds.maxy.max() - area.geometry.bounds.miny.min())) fig, ax = plt.subplots(figsize=fs) area.plot(ax=ax, edgecolor='black', facecolor='white', linewidth=1); if args.nuts: area.plot(ax=ax, edgecolor='black', facecolor=color_names, linewidth=1, alpha=0.6);
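# Isolated sketch (not from the original mapping script) of the colour scaling
# used above: each NMF component is shifted and scaled by its 1st and 90th
# percentiles, then clipped to [0, 1] so it can be used directly as an RGB channel.
import numpy as np

rgb = np.random.RandomState(9).gamma(shape=2.0, scale=1.0, size=(50, 3))
for component in range(3):
    lower_bound = np.percentile(rgb[:, component], 1)
    upper_bound = np.percentile(rgb[:, component], 90)
    rgb[:, component] -= lower_bound
    rgb[:, component] /= upper_bound - lower_bound
color_names = np.clip(rgb, 0.0, 1.0)
print(color_names.min(), color_names.max())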
def main(): ################################################################################ ########## READ IN THE RAW PHOTOMETRY #################### ################################################################################# numecl = 0 plnm = 'WASP_101' verbose = 'false' fpath = '/Users/rahuljayaraman/Documents/Miscellany/Research (Tucker Group)/Python (Transits)/' + plnm aorlist = os.listdir(fpath) #aorlist= [item for item in aorlist if not item.startswith('.')] #aorlist=aor_from_list(plnm, 1) #aorlist=[50494976] aorlist = ['62158336', '62159360'] #aorlist=np.delete(aorlist, [0,1, len(aorlist)-1]) for aor in aorlist: print(aor) aor = str(aor) prisec = 'primary' ramp_style = 'none' fpathout = fpath + aor + '/apr_fits/' + ramp_style + '/' directory = os.path.dirname(fpathout) if not os.path.exists(directory): os.makedirs(directory) #dd=np.load('/Users/Brian/Desktop/Tucker_Group/t_1/outputs/'+plnm+'/'+aor) dd = np.load(fpath + '/' + aor + 'extraction.npz') t = dd['time'] all_lc = dd['lc'] #hp=dd['hp'] cp = dd['cp'] exptime = dd['exptime'] framtime = 0.1 orbparams = dd['op'] holdpos = dd['hold_pos'] npix = dd['beta_np'] chnum = dd['ch'] red_all = [] orbparams[6] = 2456164.6934 #only for wasp-101b ################################################################################ pred_ecl_time = get_pred_time(orbparams, t, prisec) print(orbparams) print(pred_ecl_time - t[0]) freeparams = [pred_ecl_time - t[0], orbparams[2]] if prisec == 'secondary': freeparams[1] = 0.0011 ldc = [] else: ldc = find_coeffs( orbparams[10], orbparams[9], orbparams[8], 2, 'quadratic') #(temp, log_g, metallicity, channel, type_limb) for apr in range(0, all_lc.shape[1]): directory = os.path.dirname(fpathout) if not os.path.exists(directory): os.makedirs(directory) lc = np.squeeze(all_lc[:, apr] * 2.35481) time = (t - t[0]) time = np.squeeze(time) norm = np.nanmedian(lc) #print('Photon Noise limit is: ',(np.sqrt(norm*1.002)/(norm*1.002))) err = 1.1 * lc**0.5 lc = lc / norm err = err / norm err = np.ones(len(lc)) * 0.0045 xpos = holdpos[:, 0] ypos = holdpos[:, 1] npix = dd['beta_np'] ################################################################################ ########## NORMALIZE THE PIXEL VALUES #################### ################################################################################ timelength = len(t) #cp1=cp[1:4, 1:4, :] cp1 = cp dep_ind = cp1.shape[0] * cp1.shape[1] cp2 = np.reshape(cp1, (dep_ind, timelength)) cp3 = cp2 #[:,start:end] for p in range(0, len(time)): norm = np.sum(cp3[:, p]) cp3[:, p] /= norm ################################################################################ ########## FILTER THE DATA #################### ################################################################################ #fpathout='/Users/Brian/Desktop/Tucker_Group/Spitzer/mapping_files/outputs/'+plnm+'/'+aor+'/apr_fits/' filt_file = fpathout + 'post_filter_' + str(apr) + '.npz' #print(filt_file) if os.path.isfile(filt_file): if verbose == 'true': print('Found Filter File') ff = np.load(filt_file) lc = ff['lc'] #cp3=ff['cp3'] time = ff['time'] xpos = ff['xpos'] ypos = ff['ypos'] npix = ff['npix'] err = ff['err'] found = 'true' else: found = 'false' if verbose == 'true': print('In Filter') lc, cp3, time, xpos, ypos, npix, err = filter_data( lc, cp3, time, xpos, ypos, npix, dep_ind, err) if verbose == 'true': print('Out of Filter') plt.figure() plt.title(plnm + ' Ch: ' + str(chnum) + '\n' + str(aor) + '_' + str(apr)) plt.axvline(x=pred_ecl_time - t[0]) plt.axvline(x=pred_ecl_time - orbparams[4] * 0.5 - 
t[0], color='r', linestyle='dashed') plt.axvline(x=pred_ecl_time + orbparams[4] * 0.5 - t[0], color='r', linestyle='dashed') plt.scatter(time, lc, s=1) if prisec == 'secondary': plt.ylim(0.95, 1.05) else: plt.ylim(0.95, 1.03) #plt.xlim(time[0], np.amax(time)) plt.savefig(fpathout + 'raw_lc_plot_' + str(apr)) if verbose == 'true': plt.draw() plt.pause(1200) plt.close('all') # time2=np.multiply(time, time) # time=time[np.newaxis] # time2=time2[np.newaxis] # t2hours=time2*24.0**2.0 # thours=time*24.0 ################################################################################ ########## TRIM THE DATA #################### ################################################################################ trim_time = 0. #in minutes if trim_time != 0.: trim_time = trim_time / (60. * 24.0) #convert to days start_index = int(trim_time / (exptime / 86400.0)) end_ind = np.squeeze(lc) end_ind = end_ind.size print(exptime) lc = lc[start_index:end_ind] time = np.squeeze(time[start_index:end_ind]) xpos = xpos[start_index:end_ind] ypos = ypos[start_index:end_ind] npix = npix[start_index:end_ind] err = err[start_index:end_ind] plt.figure() plt.scatter(time, lc, s=1) plt.draw() ################################################################################ ########## FIND NEIGHBORS #################### ################################################################################ if found == 'true': gw = ff['gw'] nbr = ff['nbr'] else: if verbose == 'true': print('In Find NBR') gw, nbr = find_nbr_qhull(xpos, ypos, npix, sm_num=50, a=1.0, b=1.7777, c=1.0, print_space=10000.) if verbose == 'true': print('Out of Find NBR') np.savez(fpathout + 'post_filter_' + str(apr), lc=lc, cp3=cp3, time=time, xpos=xpos, ypos=ypos, npix=npix, err=err, gw=gw, nbr=nbr, orbparams=orbparams, pred_ecl_time=pred_ecl_time) ################################################################################ ########## FIT THE DATA #################### ################################################################################ if prisec == 'secondary': freeparams = [pred_ecl_time - t[0], orbparams[2], 0.005, 0.05] #the last 2 free params are ramp terms else: if ramp_style == 'linear': freeparams = [ pred_ecl_time - t[0], orbparams[2], 0.00001, 1.000001 ] if ramp_style == 'exp': freeparams = [ pred_ecl_time - t[0], orbparams[2], 0.005, 0.05 ] if ramp_style == 'none': freeparams = [pred_ecl_time - t[0], orbparams[2], 1.0, 1.0] params, m = initialize_model(np.squeeze(time), freeparams, orbparams, prisec, ldc) fluxcurve = m.light_curve(params) fit_params, pcov, infodict, flag, sucess = leastsq( nnbr_res, freeparams, args=(time, lc, err, gw, nbr, params, m, prisec, ramp_style), full_output=1) print('apr# ' + str(apr), fit_params) file_name = fpathout + 'apr_fit_' + str(apr) fileObject = open(file_name, 'wb') pickle.dump([lc, time, err, gw, nbr, fit_params], fileObject) fileObject.close() ################################################################################ ########## PLOT THE FIT #################### ################################################################################ if prisec == 'secondary': params.t_secondary = fit_params[0] params.fp = fit_params[1] else: params.t0 = fit_params[0] params.rp = fit_params[1] eclipse_model = m.light_curve(params) ramp = ramp_model([fit_params[2], fit_params[3]], time, ramp_style) lc2 = np.squeeze(lc / eclipse_model / ramp) w1 = lc2[nbr] w2 = np.multiply(w1, gw) w3 = np.sum(w2, 1) w4 = np.divide(lc2, w3) w5 = w4 * eclipse_model resids = (w4 - 1.) 
#/err res2 = (lc / eclipse_model - 1.0) / err pltbins = 64 blc = bin_anything(w5, pltbins) btime = bin_anything(time, pltbins) if prisec == 'secondary': phase = 0.5 + (time + t[0] - pred_ecl_time) / orbparams[5] if prisec == 'primary': phase = 0.0 + (time + t[0] - pred_ecl_time) / orbparams[5] bphase = bin_anything(phase, pltbins) plt.figure() plt.title(plnm + ' Ch: ' + str(chnum) + '\n' + str(aor) + '_' + str(apr)) plt.scatter(bphase, blc, s=10) #plt.scatter(time, lc, alpha=0.1, color='b', s=1) plt.plot(np.squeeze(phase), eclipse_model, color='r') if prisec == 'secondary': plt.ylim(0.9975, 1.0035) plt.text( 0.47, 1.003, 'T_center O-C (s): ' + str( round((fit_params[0] + t[0] - pred_ecl_time) * 86400., 1)) + ' Depth: ' + str(round(fit_params[1] * 1.0e6, 0)) + ' ppm') plt.text(0.49, 1.0025, 'SDNR: ' + str(round(np.std(resids), 6))) else: plt.ylim(0.983, 1.005) plt.text( 0.43, 0.9925, 'T_center O-C (s): ' + str( round((fit_params[0] + t[0] - pred_ecl_time) * 86400., 1))) plt.text( 0.43, 0.990, 'Transit Depth: ' + str(round(fit_params[1]**2. * 100, 4)) + ' %') plt.text(0.43, 0.9875, 'SDNR: ' + str(round(np.std(resids), 6))) plt.xlabel('Phase Units') plt.ylabel('Relative Flux') plt.savefig(fpathout + 'apr_fit_plot_' + str(apr)) if verbose == 'true': plt.draw() plt.pause(1.2) ################################################################################ ########## Get Red Noise #################### ################################################################################ sdnr, beta_red = est_rednoise(resids, framtime, fpathout, aor, apr, plnm, chnum, prisec) if red_all == []: red_all = np.ones(shape=(all_lc.shape[1], 5)) * 1000. red_all[apr, :] = [ sdnr, beta_red * sdnr, beta_red, round(fit_params[1] * 1.e6, 1), fit_params[0] ] best = np.nanargmin(red_all, axis=0) best = best[1] np.save(fpathout + aor + '_summary', red_all) np.savetxt(fpathout + aor + '_summary', red_all) if verbose == 'true': print(best) ################################################################################ ########## Load the best apr results #################### ################################################################################ filename = fpathout + 'apr_fit_' + str(best) fileObject = open(filename, 'rb') lc, time, err, gw, nbr, fit_params = pickle.load(fileObject) err = err * red_all[best, 2] print('Best Beta_red', red_all[best, 2]) params, m = initialize_model(np.squeeze(time), freeparams, orbparams, prisec, ldc) ################################################################################ ########## run_mcmc #################### ################################################################################ theta = fit_params ndim, nwalkers = len(theta), 20 sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(time, lc, err, gw, nbr, params, m, prisec, ramp_style)) pos = [theta + 1.e-4 * np.random.randn(ndim) for i in range(nwalkers)] sampler.run_mcmc(pos, 1500) samples = sampler.chain[:, 50:, :].reshape((-1, ndim)) np.save(fpathout + aor + '_samples', samples) if prisec == 'primary': fig = corner.corner(samples, labels=["t0", "rp", "a1", "a2"]) #, "A/R", "inc"]) else: fig = corner.corner(samples, labels=["t0", "Fp", "a1", "a2"]) #, "A/R", "inc"]) fig.savefig(fpathout + aor + '_corner_' + str(best) + '.png') #plt.show(block=False) #plt.pause(0.5) #Derive error bars t0_mcmc, rp_mcmc, a1_mcmc, a2_mcmc = map( lambda v: (v[1], v[2] - v[1], v[1] - v[0]), zip(*np.percentile(samples, [16, 50, 84], axis=0))) print(rp_mcmc, t0_mcmc) np.savez(fpathout + aor + '_mcmc_results', 
rp_mcmc=rp_mcmc, t0_mcmc=t0_mcmc, a1_mcmc=a1_mcmc, a2_mcmc=a2_mcmc, best=best) phase = 0.0 + (time + t[0] - pred_ecl_time) / orbparams[5] bphase = bin_anything(phase, pltbins) plt.figure() for t0, rp, a1, a2 in samples[np.random.randint(len(samples), size=100)]: params.rp = rp params.t0 = t0 ecl_mod = m.light_curve(params) plt.plot(phase, ecl_mod, color='k', alpha=0.05) ramp = ramp_model([a1, a2], time, ramp_style) lc2 = np.squeeze(lc / ecl_mod / ramp) w1 = lc2[nbr] w2 = np.multiply(w1, gw) w3 = np.sum(w2, 1) w4 = np.divide(lc2, w3) w5 = w4 * ecl_mod resids = (w4 - 1.) #/err res2 = (lc / ecl_mod - 1.0) / err blc = bin_anything(w5, pltbins) btime = bin_anything(time, pltbins) plt.scatter(bphase, blc, s=8, alpha=0.5) plt.xlabel("Phase Units") plt.ylabel("Relative Flux") plt.title(plnm + ' Ch: ' + str(chnum)) plt.show() #plt.savefig('/Users/Brian/Desktop/W79_summary/'+str(chnum)+'_mcmc_fit') return None
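# Small sketch (my addition) of the 16/50/84 percentile reduction used above to
# turn MCMC samples into a best-fit value with asymmetric error bars.
import numpy as np

samples = np.random.RandomState(10).normal(loc=[0.1, 0.01], scale=[0.02, 0.001],
                                           size=(5000, 2))
t0_mcmc, rp_mcmc = map(lambda v: (v[1], v[2] - v[1], v[1] - v[0]),
                       zip(*np.percentile(samples, [16, 50, 84], axis=0)))
print(t0_mcmc)   # (median, +1 sigma, -1 sigma)
print(rp_mcmc)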