def test_memory_leak():
    import resource

    arr = np.arange(1).reshape((1, 1))

    starting = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    for i in range(1000):
        for axis in [None, 0, 1]:
            bn.nansum(arr, axis=axis)
            bn.nanargmax(arr, axis=axis)
            bn.nanargmin(arr, axis=axis)
            bn.nanmedian(arr, axis=axis)
            bn.nansum(arr, axis=axis)
            bn.nanmean(arr, axis=axis)
            bn.nanmin(arr, axis=axis)
            bn.nanmax(arr, axis=axis)
            bn.nanvar(arr, axis=axis)
    ending = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    diff = ending - starting
    diff_bytes = diff * resource.getpagesize()
    print(diff_bytes)
    # For 1.3.0 release, this had value of ~100kB
    assert diff_bytes == 0
def test_nanvar_issue60():
    "nanvar regression test (issue #60)"
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        f = bn.nanvar([1.0], ddof=1)
        with np.errstate(invalid='ignore'):
            s = bn.slow.nanvar([1.0], ddof=1)
        assert_equal(f, s, err_msg="bn.nanvar([1.0], ddof=1) wrong")

        f = bn.nanvar([1], ddof=1)
        with np.errstate(invalid='ignore'):
            s = bn.slow.nanvar([1], ddof=1)
        assert_equal(f, s, err_msg="bn.nanvar([1], ddof=1) wrong")

        f = bn.nanvar([1, np.nan], ddof=1)
        with np.errstate(invalid='ignore'):
            s = bn.slow.nanvar([1, np.nan], ddof=1)
        assert_equal(f, s, err_msg="bn.nanvar([1, nan], ddof=1) wrong")

        f = bn.nanvar([[1, np.nan], [np.nan, 1]], axis=0, ddof=1)
        with np.errstate(invalid='ignore'):
            s = bn.slow.nanvar([[1, np.nan], [np.nan, 1]], axis=0, ddof=1)
        assert_equal(f, s, err_msg="issue #60 regression")
def test_memory_leak() -> None:
    import resource

    arr = np.arange(1).reshape((1, 1))

    n_attempts = 3
    results = []

    for _ in range(n_attempts):
        starting = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        for _ in range(1000):
            for axis in [None, 0, 1]:
                bn.nansum(arr, axis=axis)
                bn.nanargmax(arr, axis=axis)
                bn.nanargmin(arr, axis=axis)
                bn.nanmedian(arr, axis=axis)
                bn.nansum(arr, axis=axis)
                bn.nanmean(arr, axis=axis)
                bn.nanmin(arr, axis=axis)
                bn.nanmax(arr, axis=axis)
                bn.nanvar(arr, axis=axis)
        ending = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

        diff = ending - starting
        diff_bytes = diff * resource.getpagesize()
        # For 1.3.0 release, this had value of ~100kB
        if diff_bytes:
            results.append(diff_bytes)
        else:
            break

    assert len(results) < n_attempts
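# A minimal, hedged sketch (not part of the original tests) of how the same RSS-based
# leak check could be factored into a reusable helper. The unit of `ru_maxrss` is an
# assumption about the platform: it is kilobytes on Linux and bytes on macOS, so adjust
# any conversion accordingly. Assumes `numpy as np` and `bottleneck as bn` as above.
def _rss_growth(func, repeat=1000):
    """Return the growth in peak RSS (ru_maxrss units) caused by calling `func` repeatedly."""
    import resource
    before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    for _ in range(repeat):
        func()
    after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    return after - before

# Example usage (illustrative):
# arr = np.zeros((1, 1))
# assert _rss_growth(lambda: bn.nanvar(arr, axis=None)) == 0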
def get_lugsail_batch_means_est(data_in, steps=None):
    m = len(data_in)
    T_iL = []
    s_i = []
    n_i = []
    for data_chain, burnin_chain in data_in:
        data = data_chain[burnin_chain:steps]
        if data.size < 2:
            return np.inf
        # [chapter 2.2 in Vats and Knudson, 2018]
        n_ii = data.size
        b = int(n_ii**(1 / 2))  # Batch size. Alternative: n ** (1/3)
        n_i.append(n_ii)
        chain_mean = bn.nanmean(data)
        T_iL.append(
            2 * get_tau_lugsail(b, data, chain_mean)
            - get_tau_lugsail(b // 3, data, chain_mean)
        )
        s_i.append(bn.nanvar(data, ddof=1))

    T_L = np.mean(T_iL)
    s = np.mean(s_i)
    n = np.round(np.mean(n_i))
    sigma_L = ((n - 1) * s + T_L) / n
    # [eq. 5 in Vats and Knudson, 2018]
    R_L = np.sqrt(sigma_L / s)
    return R_L
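# `get_tau_lugsail` is called above but not shown here. The sketch below is an
# assumption about its contract, based on the batch-means estimator in
# Vats & Knudson (2018): for batch size b and a = n // b full batches,
# tau(b) = (b / (a - 1)) * sum_k (batch_mean_k - chain_mean)^2. It is illustrative
# only and assumes at least two full batches (a >= 2) and b >= 1.
def get_tau_lugsail(b, data, chain_mean):
    a = data.size // b  # number of full batches
    batch_means = np.array(
        [bn.nanmean(data[i * b:(i + 1) * b]) for i in range(a)]
    )
    return (b / (a - 1)) * np.sum((batch_means - chain_mean) ** 2)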
def proportionality(x, y):
    num = bottleneck.nanvar(np.log1p(y) - np.log1p(x))
    denom = (bottleneck.nanstd(np.log1p(x)) + bottleneck.nanstd(np.log1p(y))) ** 2
    try:
        return num / denom
    except ZeroDivisionError:
        return np.nan
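# Usage sketch (illustrative, not from the original source; assumes `import numpy as np`
# and `import bottleneck` as above): the metric is near 0 when the two inputs are
# roughly proportional on the log1p scale and approaches 1 as they decouple.
x = np.array([1.0, 2.0, 3.0, 4.0])
print(proportionality(x, 2.0 * x))   # small (~0.01): log1p(2x) - log1p(x) is nearly constant
print(proportionality(x, x[::-1]))   # large (~1): inputs move in opposite directions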
def test_nanvar_issue60():
    """nanvar regression test (issue #60)"""
    f = bn.nanvar([1.0], ddof=1)
    with np.errstate(invalid="ignore"):
        s = bn.slow.nanvar([1.0], ddof=1)
    assert_equal(f, s, err_msg="bn.nanvar([1.0], ddof=1) wrong")

    f = bn.nanvar([1], ddof=1)
    with np.errstate(invalid="ignore"):
        s = bn.slow.nanvar([1], ddof=1)
    assert_equal(f, s, err_msg="bn.nanvar([1], ddof=1) wrong")

    f = bn.nanvar([1, np.nan], ddof=1)
    with np.errstate(invalid="ignore"):
        s = bn.slow.nanvar([1, np.nan], ddof=1)
    assert_equal(f, s, err_msg="bn.nanvar([1, nan], ddof=1) wrong")

    f = bn.nanvar([[1, np.nan], [np.nan, 1]], axis=0, ddof=1)
    with np.errstate(invalid="ignore"):
        s = bn.slow.nanvar([[1, np.nan], [np.nan, 1]], axis=0, ddof=1)
    assert_equal(f, s, err_msg="issue #60 regression")
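# Illustration of the edge case the regression test above pins down (not part of the
# original test file; assumes `import bottleneck as bn` and `import numpy as np`):
# with ddof=1 and only one non-NaN value there are zero degrees of freedom, so the
# result should be NaN rather than 0 or an exception.
print(bn.nanvar([1.0], ddof=1))          # expected: nan
print(bn.nanvar([1.0, np.nan], ddof=1))  # expected: nan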
def welch(xs, ys, meanfcn=bn.nanmedian):
    """
    Welch's statistic for equal means
    http://en.wikipedia.org/wiki/Welch%27s_t_test

    Parameters
    ----------
    xs: np.array
    ys: np.array

    Returns
    -------
    float
    """
    xbar, ybar = map(meanfcn, (xs, ys))
    sx2, sy2 = map(lambda zs: bn.nanvar(zs) + np.spacing(1), (xs, ys))
    return np.abs(xbar - ybar) / np.sqrt(sx2 / len(xs) + sy2 / len(ys))
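# Usage sketch (illustrative, not from the original source; assumes `import numpy as np`
# and `import bottleneck as bn`): the statistic grows as the central values of the two
# samples separate relative to their spread and sample sizes.
rng = np.random.default_rng(0)
sample_a = rng.normal(0.0, 1.0, size=200)
sample_b = rng.normal(0.5, 1.0, size=200)
print(welch(sample_a, sample_b))           # around 5: centres differ by ~0.5 sigma with n=200
print(welch(sample_a, sample_a + 1e-9))    # ~0: effectively identical samples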
def fit(self, X, y, mask=None):
    """Fit Gaussian Naive Bayes according to X, y

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.
    mask : array-like, shape = [n_samples, n_features]
        Binary, 1 at unobserved features.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_arrays(X, y, sparse_format='dense')

    n_samples, n_features = X.shape

    if n_samples != y.shape[0]:
        raise ValueError("X and y have incompatible shapes")

    if mask is not None:
        mask = array2d(mask)
        X = X.copy()
        X[mask] = np.nan

    self.classes_ = unique_y = np.unique(y)
    n_classes = unique_y.shape[0]

    self.theta_ = np.zeros((n_classes, n_features))
    self.sigma_ = np.zeros((n_classes, n_features))
    self.class_prior_ = np.zeros(n_classes)
    self._n_ij = []
    epsilon = 1e-9
    for i, y_i in enumerate(unique_y):
        self.theta_[i, :] = bn.nanmean(X[y == y_i, :], axis=0)
        self.sigma_[i, :] = bn.nanvar(X[y == y_i, :], axis=0) + epsilon
        self.class_prior_[i] = float(np.sum(y == y_i)) / n_samples
        self._n_ij.append(-0.5 * np.sum(np.log(np.pi * self.sigma_[i, :])))
    self._logprior = np.log(self.class_prior_)
    return self
m = nanmedian(flux)
flux = 1e6*(flux/m - 1)
flux_err = 1e6*flux_err/m

#fig, ax = plt.subplots()
#ax.plot(data[:,0], data[:,1])
#fig.savefig(os.path.splitext(fpath_save)[0] + '.png', bbox_inches='tight')
#plt.close(fig)

# Save file:
os.makedirs(os.path.dirname(fpath_save), exist_ok=True)
np.savetxt(fpath_save, np.column_stack((time, flux, flux_err)),
    delimiter=' ', fmt=('%.8f', '%.16e', '%.16e'))

# Calculate diagnostics:
lc = LightCurve(time=time, flux=flux, flux_err=flux_err)
variance = nanvar(flux, ddof=1)
rms_hour = rms_timescale(lc, timescale=3600/86400)
ptp = nanmedian(np.abs(np.diff(flux)))

# Add target to TODO-list:
diag.write("{variance:.16e},{rms_hour:.16e},{ptp:.16e}\n".format(
    variance=variance,
    rms_hour=rms_hour,
    ptp=ptp))

diag.write("#-------------------------------------------\n")

print("DONE")
def ndcombine(arr, mask=None, copy=True, blank=np.nan, offsets=None, thresholds=[-np.inf, np.inf], zero=None, scale=None, weight=None, zero_kw={ 'cenfunc': 'median', 'stdfunc': 'std', 'std_ddof': 1 }, scale_kw={ 'cenfunc': 'median', 'stdfunc': 'std', 'std_ddof': 1 }, zero_to_0th=True, scale_to_0th=True, zero_section=None, scale_section=None, reject=None, cenfunc='median', sigma=[3., 3.], maxiters=3, ddof=1, nkeep=1, maxrej=None, n_minmax=[1, 1], rdnoise=0., gain=1., snoise=0., pclip=-0.5, combine='average', dtype='float32', memlimit=2.5e+9, irafmode=True, verbose=False, full=False, return_variance=False): if copy: arr = arr.copy() if np.array(arr).ndim == 1: raise ValueError("1-D array combination is not supported!") _mask = _set_mask(arr, mask) # _mask = propagated through this function. sigma_lower, sigma_upper = _set_sigma(sigma) nkeep, maxrej = _set_keeprej(arr, nkeep, maxrej, axis=0) cenfunc = _set_cenfunc(cenfunc) reject_fullname = _set_reject_name(reject) maxiters = int(maxiters) ddof = int(ddof) combfunc = _set_combfunc(combine, nameonly=False, nan=True) if verbose and reject is not None: print("- Rejection") if thresholds != [-np.inf, np.inf]: print(f"-- thresholds (low, upp) = {thresholds}") print(f"-- {reject=} ({irafmode=})") print(f"-- params: {nkeep=}, {maxrej=}, {maxiters=}, {cenfunc=}") if reject_fullname == "sigclip": print(f" (for sigclip): {sigma=}, {ddof=}") elif reject_fullname == "ccdclip": print(f" (for ccdclip): {gain=}, {rdnoise=}, {snoise=}") # elif reject_fullnme == "pclip": # print(f" (for pclip) : spclip={pclip}") # elif reject_fullname == "minmax": # print(f" (for minmaxclip): n_minmax={n_minmax}") # == 01 - Thresholding + Initial masking ============================================= # # Updating mask: _mask = _mask | mask_thresh mask_thresh = _set_thresh_mask(arr=arr, mask=_mask, thresholds=thresholds, update_mask=True) # if safemode: # # Backup the pixels which are rejected by thresholding and # initial # mask for future restoration (see below) for debugging # purpose. # backup_thresh = arr[mask_thresh] # backup_thresh_inmask = arr[_mask] # TODO: remove this np.nan and instead, let `get_zsw` to accept mask. arr[_mask] = np.nan # ------------------------------------------------------------------------------------ # # == 02 - Calculate zero, scale, weights ============================================= # # This should be done before rejection but after threshold masking.. 
zeros, scales, weights = get_zsw(arr=arr, zero=zero, scale=scale, weight=weight, zero_kw=zero_kw, scale_kw=scale_kw, zero_to_0th=zero_to_0th, scale_to_0th=scale_to_0th, zero_section=zero_section, scale_section=scale_section) arr = do_zs(arr, zeros=zeros, scales=scales) # ------------------------------------------------------------------------------------ # # == 02 - Rejection ================================================================== # if isinstance(reject_fullname, str): if reject_fullname == 'sigclip': _mask_rej = sigclip_mask(arr, mask=_mask, sigma_lower=sigma_lower, sigma_upper=sigma_upper, maxiters=maxiters, ddof=ddof, nkeep=nkeep, maxrej=maxrej, cenfunc=cenfunc, axis=0, irafmode=irafmode, full=full) elif reject_fullname == 'minmax': _mask_rej = minmax_mask(arr, mask=_mask, n_minmax=n_minmax, full=full) elif reject_fullname == 'ccdclip': _mask_rej = ccdclip_mask(arr, mask=_mask, sigma_lower=sigma_lower, sigma_upper=sigma_upper, scale_ref=np.mean(scales), zero_ref=np.mean(zeros), maxiters=maxiters, ddof=ddof, nkeep=nkeep, maxrej=maxrej, cenfunc=cenfunc, axis=0, gain=gain, rdnoise=rdnoise, snoise=snoise, irafmode=irafmode, full=True) elif reject_fullname == 'pclip': pass else: raise ValueError("reject not understood.") if full: _mask_rej, low, upp, nit, rejcode = _mask_rej # _mask is a subset of _mask_rej, so to extract pixels which are # masked PURELY due to the rejection is: mask_rej = _mask_rej ^ _mask elif reject_fullname is None: mask_rej = _set_mask(arr, None) if full: low = bn.nanmin(arr, axis=0) upp = bn.nanmax(arr, axis=0) nit = None rejcode = None else: raise ValueError("reject not understood.") if reject is not None and verbose: print("Done.") _mask |= mask_rej # ------------------------------------------------------------------------------------ # # TODO: add "grow" rejection here? # == 03 - combine ==================================================================== # # Replace rejected / masked pixel to NaN and backup for debugging purpose. This is done to reduce # memory (instead of doing _arr = arr.copy()) # backup_nan = arr[_mask] if verbose: print("- Combining") print(f"-- combine = {combine}") arr[_mask] = np.nan # Combine and calc sigma comb = combfunc(arr, axis=0) if verbose: print("Done.") # Restore NaN-replaced pixels of arr for debugging purpose. # arr[_mask] = backup_nan # arr[mask_thresh] = backup_thresh_inmask if full: if verbose: print("- Error calculation") print("-- to skip this, use `full=False`") print(f"-- return_variance={return_variance}, ddof={ddof}") if return_variance: err = bn.nanvar(arr, ddof=ddof, axis=0) else: err = bn.nanstd(arr, ddof=ddof, axis=0) if verbose: print("Done.") return comb, err, mask_rej, mask_thresh, low, upp, nit, rejcode else: return comb
def load_star(self, task, fname):
    """
    Receive a task from the TaskManager, loads the lightcurve and returns derived features.

    Parameters:
        task (dict): Task dictionary as returned by :func:`TaskManager.get_task`.
        fname (str): Path to lightcurve file associated with task.

    Returns:
        dict: Dictionary with features.

    See Also:
        :py:func:`TaskManager.get_task`

    .. codeauthor:: Rasmus Handberg <*****@*****.**>
    """

    logger = logging.getLogger(__name__)

    # Define variables used below:
    features = {}
    save_to_cache = False

    # The Meta-classifier is only using features from the other classifiers,
    # so there is no reason to load lightcurves and calculate/load any other classifiers:
    if self.classifier_key != 'meta':
        # Load features from cache file, or calculate them
        # and put them into cache file for other classifiers
        # to use later on:
        if self.features_cache:
            features_file = os.path.join(self.features_cache,
                'features-' + str(task['priority']) + '.pickle')
            if os.path.exists(features_file):
                features = loadPickle(features_file)

        # Load lightcurve file and create a TessLightCurve object:
        if 'lightcurve' in features:
            lightcurve = features['lightcurve']
        else:
            lightcurve = load_lightcurve(fname,
                starid=task['starid'],
                truncate_lightcurve=self.truncate_lightcurves)

        # No features found in cache, so calculate them:
        if not features:
            save_to_cache = True
            features = self.calc_features(lightcurve)

        # Add the fields from the task to the list of features:
        for key in ('tmag', 'variance', 'rms_hour', 'ptp', 'other_classifiers'):
            if key in task.keys():
                features[key] = task[key]
            else:
                logger.warning("Key '%s' not found in task.", key)
                features[key] = np.NaN

        # If these features were not provided with the task, i.e. they
        # have not been pre-computed, we should compute them now:
        if features['variance'] is None or not np.isfinite(features['variance']):
            features['variance'] = nanvar(lightcurve.flux, ddof=1)
        if features['rms_hour'] is None or not np.isfinite(features['rms_hour']):
            features['rms_hour'] = rms_timescale(lightcurve)
        if features['ptp'] is None or not np.isfinite(features['ptp']):
            features['ptp'] = ptp(lightcurve)

        # Save features in cache file for later use:
        if save_to_cache and self.features_cache:
            savePickle(features_file, features)

    # Add the fields from the task to the list of features:
    features['priority'] = task['priority']
    features['starid'] = task['starid']

    logger.debug(features)
    return features
def correct(self, task, output_folder=None): """ Run correction. Parameters: task (dict): Dictionary defining a task/lightcurve to process. output_folder (str, optional): Path to directory where lightcurve should be saved. Returns: dict: Result dictionary containing information about the processing. .. codeauthor:: Rasmus Handberg <*****@*****.**> """ logger = logging.getLogger(__name__) t1 = default_timer() error_msg = [] details = {} save_file = None result = task.copy() try: # Load the lightcurve lc = self.load_lightcurve(task) # Run the correction on this lightcurve: lc_corr, status = self.do_correction(lc) except (KeyboardInterrupt, SystemExit): # pragma: no cover status = STATUS.ABORT logger.warning("Correction was aborted (priority=%d)", task['priority']) except: # noqa: E722 pragma: no cover status = STATUS.ERROR logger.exception("Correction failed (priority=%d)", task['priority']) # Check that the status has been changed: if status == STATUS.UNKNOWN: # pragma: no cover raise ValueError("STATUS was not set by do_correction") # Do sanity checks: if status in (STATUS.OK, STATUS.WARNING): # Make sure all NaN fluxes have corresponding NaN errors: lc_corr.flux_err[np.isnan(lc_corr.flux)] = np.NaN # Simple check that entire lightcurve is not NaN: if allnan(lc_corr.flux): logger.error("Final lightcurve is all NaNs") status = STATUS.ERROR if allnan(lc_corr.flux_err): logger.error("Final lightcurve errors are all NaNs") status = STATUS.ERROR if np.any(np.isinf(lc_corr.flux)): logger.error("Final lightcurve contains Inf") status = STATUS.ERROR if np.any(np.isinf(lc_corr.flux_err)): logger.error("Final lightcurve errors contains Inf") status = STATUS.ERROR # Calculate diagnostics: if status in (STATUS.OK, STATUS.WARNING): # Calculate diagnostics: details['variance'] = nanvar(lc_corr.flux, ddof=1) details['rms_hour'] = rms_timescale(lc_corr, timescale=3600 / 86400) details['ptp'] = ptp(lc_corr) # Diagnostics specific to the method: if self.CorrMethod == 'cbv': details['cbv_num'] = lc_corr.meta['additional_headers'][ 'CBV_NUM'] elif self.CorrMethod == 'ensemble': details['ens_num'] = lc_corr.meta['additional_headers'][ 'ENS_NUM'] details['ens_fom'] = lc_corr.meta['FOM'] # Save the lightcurve to file: try: save_file = self.save_lightcurve(lc_corr, output_folder=output_folder) except (KeyboardInterrupt, SystemExit): # pragma: no cover status = STATUS.ABORT logger.warning("Correction was aborted (priority=%d)", task['priority']) except: # noqa: E722 pragma: no cover status = STATUS.ERROR logger.exception( "Could not save lightcurve file (priority=%d)", task['priority']) # Plot the final lightcurve: if self.plot: fig = plt.figure(dpi=200) ax = fig.add_subplot(111) ax.scatter(lc.time, 1e6 * (lc.flux / nanmedian(lc.flux) - 1), s=2, alpha=0.3, marker='o', label="Original") ax.scatter(lc_corr.time, lc_corr.flux, s=2, alpha=0.3, marker='o', label="Corrected") ax.set_xlabel('Time (TBJD)') ax.set_ylabel('Relative flux (ppm)') ax.legend() save_figure(os.path.join(self.plot_folder(lc), self.CorrMethod + '_final'), fig=fig) plt.close(fig) # Unpack any errors or warnings that were sent to the logger during the correction: if self.message_queue: error_msg += self.message_queue self.message_queue.clear() if not error_msg: error_msg = None # Update results: t2 = default_timer() details['errors'] = error_msg result.update({ 'corrector': self.CorrMethod, 'status_corr': status, 'elaptime_corr': t2 - t1, 'lightcurve_corr': save_file, 'details': details }) return result
def stats_area(self, loc, tol=0, lmean=False, lmed=False, lskew=False, lvar=False, lstd=False, lcoefvar=False, lperc=False, p=0.95, save=False): """Calculate some statistics among every realisation, considering a circular (only horizontaly) area of radius `tol` around the point located at `loc`. Parameters ---------- loc : array_like Location of the vertical line [x, y]. tol : number, default 0 Tolerance radius used to search for neighbour nodes. lmean : boolean, default False Calculate the mean. lmed : boolean, default False Calculate the median. lskew : boolean, default False Calculate skewness. lvar : boolean, default False Calculate the variance. lstd : boolean, default False Calculate the standard deviation. lcoefvar : boolean, default False Calculate the coefficient of variation. lperc : boolean, default False Calculate the percentile `100 * (1 - p)`. p : number, default 0.95 Probability value. save : boolean, default False Write the points used to calculate the chosen statistics in PointSet format to a file named 'sim values at (x, y, line).prn'. Returns ------- statspset : PointSet PointSet instance containing the calculated statistics. .. TODO: checkar stats variance com geoms """ if lmean: meanline = np.zeros(self.dz) if lmed: medline = np.zeros(self.dz) if lskew: skewline = np.zeros(self.dz) if lvar: varline = np.zeros(self.dz) if lstd: stdline = np.zeros(self.dz) if lcoefvar: coefvarline = np.zeros(self.dz) if lperc: percline = np.zeros((self.dz, 2)) # convert the coordinates of the first point to grid nodes loc = coord_to_grid(loc, [self.cellx, self.celly, self.cellz], [self.xi, self.yi, self.zi])[:2] # find the nodes coordinates within a circle centred in the first point neighbours_nodes = circle(loc[0], loc[1], tol) # compute the lines numbers for each point in the neighbourhood, across # each grid layer. this yields a N*M matrix, with N equal to the number # of neighbour nodes, and M equal to the number of layers in the grid. 
neighbours_lines = [line_zmirror(node, [self.dx, self.dy, self.dz]) for node in neighbours_nodes] # sort the lines in ascending order neighbours_lines = np.sort(neighbours_lines, axis=0) # create an array to store the neighbour nodes in each grid file nnodes = neighbours_lines.shape[0] arr = np.zeros(self.nfiles * nnodes) skip = True curr_line = np.zeros(self.nfiles) for layer in xrange(neighbours_lines.shape[1]): for i, line in enumerate(neighbours_lines[:, layer]): for j, grid in enumerate(self.files): # skip header lines only once per grid file if skip and self.header: skip_lines(grid, self.header) # advance to the next line with a neighbour node skip_lines(grid, int(line - curr_line[j] - 1)) # read the line and store its value a = grid.readline() arr[i + j * nnodes] = float(a) curr_line[j] = line skip = False # replace no data's with NaN bn.replace(arr, self.nodata, np.nan) # compute the required statistics if lmean: meanline[layer] = bn.nanmean(arr) if lmed: medline[layer] = bn.nanmedian(arr) if lskew: skewline[layer] = pd.Series(arr).skew() if lvar: varline[layer] = bn.nanvar(arr, ddof=1) if lstd: stdline[layer] = bn.nanstd(arr, ddof=1) if lcoefvar: if lstd and lmean: coefvarline[layer] = stdline[layer] / meanline[layer] * 100 else: std = bn.nanstd(arr, ddof=1) mean = bn.nanmean(arr) coefvarline[layer] = std / mean * 100 if lperc: percline[layer] = pd.Series(arr).quantile([(1 - p) / 2, 1 - (1 - p) / 2]) if save and tol == 0: # FIXME: not working with the tolerance feature # need to adjust the arrpset or cherry-pick arr arrpset = PointSet('realisations at location ({0}, {1}, {2})'. format(loc[0], loc[1], layer * self.cellz + self.zi), self.nodata, 3, ['x', 'y', 'value'], values=np.zeros((self.nfiles, 3))) arrout = os.path.join(os.path.dirname(self.files[0].name), 'sim values at ({0}, {1}, {2}).prn'.format( loc[0], loc[1], layer * self.cellz + self.zi)) arrpset.values.iloc[:, 2] = arr arrpset.values.iloc[:, :2] = np.repeat(np.array(loc) [np.newaxis, :], self.nfiles, axis=0) arrpset.save(arrout, header=True) ncols = sum((lmean, lmed, lvar, lstd, lcoefvar, lskew)) if lperc: ncols += 2 statspset = PointSet(name='vertical line stats at (x,y) = ({0},{1})'. format(loc[0], loc[1]), nodata=self.nodata, nvars=3 + ncols, varnames=['x', 'y', 'z'], values=np.zeros((self.dz, 3 + ncols))) statspset.values.iloc[:, :3] = (np.column_stack (((np.repeat(np.array(loc) [np.newaxis, :], self.dz, axis=0)), np.arange(self.zi, self.zi + self.cellz * self.dz)))) j = 3 if lmean: statspset.varnames.append('mean') statspset.values.iloc[:, j] = meanline j += 1 if lmed: statspset.varnames.append('median') statspset.values.iloc[:, j] = medline j += 1 if lskew: statspset.varnames.append('skewness') statspset.values.iloc[:, j] = skewline j += 1 if lvar: statspset.varnames.append('variance') statspset.values.iloc[:, j] = varline j += 1 if lstd: statspset.varnames.append('std') statspset.values.iloc[:, j] = stdline j += 1 if lcoefvar: statspset.varnames.append('coefvar') statspset.values.iloc[:, j] = coefvarline j += 1 if lperc: statspset.varnames.append('lperc') statspset.varnames.append('rperc') statspset.values.iloc[:, -2:] = percline # reset the reading pointer in each grid file self.reset_read() # update varnames statspset.flush_varnames() return statspset
def time_nanvar(self, dtype, shape):
    bn.nanvar(self.arr)
def calculate_beta(returns: pd.DataFrame, in_flag: pd.DataFrame, mkt: pd.DataFrame = None, class_df: pd.DataFrame = None, output_freq="D", window_size=252, k=5, universe=True, target_dates=None, len_beta=None, minimum_coverage=None) -> pd.DataFrame: """ Beta calculation is very fast in nature. No need to utilize parallel computation :param returns: daily stock return DataFrame :param in_flag: daily in flag DataFrame :param mkt: daily market return DataFrame; this return is used as the benchmark (regressor) in a CAPM model; if it is an one row dataframe, then there is only one benchmark (e.g., equal weighted market return); if it has the same shape as returns, then different stocks may correspond to different benchmarks (e.g., industry return) :param class_df: DataFrame, the class each stock corresponds to (e.g., GICS industry); by default, returns, in_flag and class_df should be of the same shape and have their indexes (ID) & columns (dates) aligned :param output_freq: {'M', 'D'}; 'M' for monthly, 'D' for daily :param window_size: used in combination with output_freq; usually 252D (252 days) or 12M (12 months) :param k: a parameter used to determine outlier returns; only effective when mkt is None :param universe: if True, in_flag will be used to tell whether a stock is in the corresponding universe on a specific day :param target_dates: list-like, the specific dates that betas are calculated for :param len_beta: number of cross-sections that betas are calculated for :param minimum_coverage: the minimum percentage of non-missing returns needed to calculate betas :return: DataFrame, beta matrix """ assert output_freq.lower() in ('d', 'm') if minimum_coverage is None or minimum_coverage > 1 or minimum_coverage < 0: minimum_coverage = 0.75 if class_df is None: class_df = returns.values.copy() if universe: class_df[in_flag.values != 1] = np.nan class_df[np.isfinite(class_df)] = 1 class_df = pd.DataFrame(class_df, index=returns.index, columns=returns.columns) diff_class = np.unique(class_df.values[~np.isnan(class_df)]) period_end = frequency_convert(returns.columns, output_freq) period_end_idx = np.array( [np.argwhere(returns.columns == x)[0, 0] for x in period_end]) selected_idx = range( window_size + np.argwhere((period_end_idx + 1) > np.argwhere( np.sum(np.isfinite(returns.values), axis=0) > 1)[0, 0])[0, 0], len(period_end_idx)) if target_dates is None and len_beta is not None and len_beta > 0: selected_idx = range(selected_idx.stop - len_beta, selected_idx.stop) if target_dates is not None: target_idx = [ np.argwhere(period_end == x).flatten()[0] for x in target_dates ] selected_idx_2 = [x for x in target_idx if x in selected_idx] selected_idx = target_idx output = np.full((returns.shape[0], len(selected_idx)), np.nan) for col_id, c_sel_idx in tqdm(enumerate(selected_idx)): if target_dates is not None and c_sel_idx not in selected_idx_2: continue for c_class in diff_class: if universe: c_mask = ( (in_flag.loc[:, period_end[c_sel_idx]] == 1) & (class_df.loc[:, period_end[c_sel_idx]] == c_class)).values else: c_mask = ( class_df.loc[:, period_end[c_sel_idx]] == c_class).values if c_mask.any(): c_idx = in_flag.values[c_mask, ( period_end_idx[c_sel_idx - window_size] + 1):(period_end_idx[c_sel_idx] + 1)] == 1 c_rtn = returns.values[c_mask, ( period_end_idx[c_sel_idx - window_size] + 1):(period_end_idx[c_sel_idx] + 1)] if mkt is None: # if mkt is not provided, calculate the ew-market return as the market return benchmark cc_rtn = c_rtn.copy() cc_rtn[~c_idx] = np.nan if k > 1: cc_rtn = 
remove_outliers(cc_rtn, k=k, set_na=False) else: cc_rtn[cc_rtn > k] = k mkt_rtn = bn.nanmean(cc_rtn, axis=0) else: c_rtn_columns = returns.columns[( period_end_idx[c_sel_idx - window_size] + 1):(period_end_idx[c_sel_idx] + 1)] dates = np.intersect1d(mkt.columns, c_rtn_columns, assume_unique=True) c_rtn = pd.DataFrame(c_rtn, columns=c_rtn_columns) c_rtn = c_rtn.loc[:, dates].values mkt_rtn = mkt.loc[:, returns.columns[period_end_idx[ c_sel_idx - window_size]]:dates[-1]] mkt_rtn_cols = mkt_rtn.columns mkt_rtn = mkt_rtn.values.copy() if mkt_rtn.shape[0] > 1: mkt_rtn = mkt_rtn[c_mask, :] mkt_rtn[np.isnan(mkt_rtn)] = 0 wealth = np.exp(np.log(1 + mkt_rtn).cumsum(axis=1)) mkt_rtn = pd.DataFrame( wealth / shift_2darray(wealth, 1, axis=1) - 1, columns=mkt_rtn_cols) mkt_rtn = mkt_rtn.loc[:, dates].values mask_beta = np.sum(np.isfinite(c_rtn), axis=1) >= ( period_end_idx[c_sel_idx] - period_end_idx[c_sel_idx - window_size]) * minimum_coverage if mask_beta.any(): mkt_var = bn.nanvar(mkt_rtn, ddof=1, axis=1) if mkt_rtn.shape[0] == 1: c_beta = pairwise_covariance(c_rtn[mask_beta], mkt_rtn) / mkt_var else: c_beta = pairwise_covariance( c_rtn[mask_beta], mkt_rtn[mask_beta]) / mkt_var[mask_beta] output[np.argwhere(c_mask).flatten()[mask_beta], col_id] = c_beta return pd.DataFrame(output, index=returns.index, columns=period_end[selected_idx])
def correct(self, task, output_folder=None):
    """
    Run correction.

    Parameters:
        task (dict): Dictionary defining a task/lightcurve to process.
        output_folder (string, optional): Path to directory where lightcurve should be saved.

    Returns:
        dict: Result dictionary containing information about the processing.

    .. codeauthor:: Rasmus Handberg <*****@*****.**>
    """

    logger = logging.getLogger(__name__)

    t1 = default_timer()

    error_msg = None
    save_file = None
    result = task.copy()
    try:
        # Load the lightcurve
        lc = self.load_lightcurve(task)

        # Run the correction on this lightcurve:
        lc_corr, status = self.do_correction(lc)

    except (KeyboardInterrupt, SystemExit):
        status = STATUS.ABORT
        logger.warning("Correction was aborted.")

    except:
        status = STATUS.ERROR
        error_msg = traceback.format_exc().strip()
        logger.exception("Correction failed.")

    # Check that the status has been changed:
    if status == STATUS.UNKNOWN:
        raise Exception("STATUS was not set by do_correction")

    # Calculate diagnostics:
    details = {}

    if status in (STATUS.OK, STATUS.WARNING):
        # Calculate diagnostics:
        details['variance'] = nanvar(lc_corr.flux, ddof=1)
        details['rms_hour'] = rms_timescale(lc_corr, timescale=3600/86400)
        details['ptp'] = nanmedian(np.abs(np.diff(lc_corr.flux)))

        # TODO: set outputs; self._details = self.lightcurve, etc.
        save_file = self.save_lightcurve(lc_corr, output_folder=output_folder)

        # Plot the final lightcurve:
        if self.plot:
            fig = plt.figure(dpi=200)
            ax = fig.add_subplot(111)
            ax.scatter(lc.time, 1e6*(lc.flux/nanmedian(lc.flux)-1), s=2, alpha=0.3, marker='o', label="Original")
            ax.scatter(lc_corr.time, lc_corr.flux, s=2, alpha=0.3, marker='o', label="Corrected")
            ax.set_xlabel('Time (TBJD)')
            ax.set_ylabel('Relative flux (ppm)')
            ax.legend()
            save_figure(os.path.join(self.plot_folder(lc), self.CorrMethod + '_final'), fig=fig)
            plt.close(fig)

        # Construct result dictionary from the original task
        result = lc_corr.meta['task'].copy()

    # Update results:
    t2 = default_timer()
    details['errors'] = error_msg
    result.update({
        'status_corr': status,
        'elaptime_corr': t2-t1,
        'lightcurve_corr': save_file,
        'details': details
    })

    return result
def find_center_row(data):

    # Create interpolator for the median profile
    interp = scipy.interpolate.interp1d(
        x=data[:,0], y=data[:,1], kind='linear',
        bounds_error=False, fill_value=numpy.NaN)

    #
    # Optimization routine
    #
    def fold_profile(p, interp, maxy, count):
        dx = numpy.arange(maxy, dtype=numpy.float)
        x_left = p[0] - dx
        x_right = p[0] + dx

        profile_left = interp(x_left)
        profile_right = interp(x_right)

        diff = profile_left - profile_right
        count[0] += 1

        # print "iteration %d --> %e" % (count[0], p[0])
        # with open("opt_%d.del" % (count[0]), "w") as f:
        #     numpy.savetxt(f, profile_left)
        #     print >>f, "\n"*5,
        #     numpy.savetxt(f, profile_right)
        #     print >>f, "\n"*5,
        #     numpy.savetxt(f, diff)

        return diff[numpy.isfinite(diff)]

    #
    # Get rid of all points that are too noisy
    #
    w = 5
    noise = numpy.array([bottleneck.nanvar(data[i-w:i+w,1])
                         for i in range(w, data.shape[0]-w+1)])
    # numpy.savetxt("median_noise", noise)
    noise[:w] = numpy.NaN
    noise[-w:] = numpy.NaN

    for iteration in range(3):
        valid = numpy.isfinite(noise)
        _perc = numpy.percentile(noise[valid], [16,50,84])
        _med = _perc[1]
        _sigma = 0.5*(_perc[2]-_perc[0])
        outlier = (noise > _med+3*_sigma) | (noise < _med - 3*_sigma)
        noise[outlier] = numpy.NaN

    #numpy.savetxt("median_noise2", noise)

    valid = numpy.isfinite(noise)
    data[:,1][~valid] = numpy.NaN
    #numpy.savetxt("median_noise3", data)

    count = [0]
    fit_all = scipy.optimize.leastsq(
        func=fold_profile,
        x0=[data.shape[0]/5.],
        args=(interp, data.shape[0]/2, count),
        full_output=True,
        epsfcn=1e-1,
    )
    #print fit_all[0]

    return fit_all[0][0]
def correct(self, task):
    """
    Run correction.

    Parameters:
        task (dict): Dictionary defining a task/lightcurve to process.

    Returns:
        dict: Result dictionary containing information about the processing.

    .. codeauthor:: Rasmus Handberg <*****@*****.**>
    """

    logger = logging.getLogger(__name__)

    t1 = default_timer()

    error_msg = None
    save_file = None
    result = task.copy()
    try:
        # Load the lightcurve
        lc = self.load_lightcurve(task)

        # Run the correction on this lightcurve:
        lc, status = self.do_correction(lc)

    except (KeyboardInterrupt, SystemExit):
        status = STATUS.ABORT
        logger.warning("Correction was aborted.")

    except:
        status = STATUS.ERROR
        error_msg = traceback.format_exc().strip()
        logger.exception("Correction failed.")

    # Check that the status has been changed:
    if status == STATUS.UNKNOWN:
        raise Exception("STATUS was not set by do_correction")

    # Calculate diagnostics:
    details = {}

    if status in (STATUS.OK, STATUS.WARNING):
        # Calculate diagnostics:
        details['variance'] = nanvar(lc.flux, ddof=1)
        details['rms_hour'] = rms_timescale(lc, timescale=3600/86400)
        details['ptp'] = nanmedian(np.abs(np.diff(lc.flux)))

        # TODO: set outputs; self._details = self.lightcurve, etc.
        save_file = self.save_lightcurve(lc)

        # Construct result dictionary from the original task
        result = lc.meta['task'].copy()

    # Update results:
    t2 = default_timer()
    details['errors'] = error_msg
    result.update({
        'status_corr': status,
        'elaptime_corr': t2-t1,
        'lightcurve_corr': save_file,
        'details': details
    })

    return result
def time_nanvar(self, dtype, shape, order, axis):
    bn.nanvar(self.arr, axis=axis)
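# The two `time_nanvar` methods above look like asv (airspeed velocity) benchmark hooks;
# the enclosing benchmark class and its `setup` are not shown. Below is a hedged sketch
# of what such a class could look like -- the class name and parameter grid are
# assumptions, not taken from the original suite. Assumes `numpy as np` and
# `bottleneck as bn` as in the snippets above.
class TimeNanvarSuite:
    params = [["float64", "float32"], [(1000,), (1000, 1000)]]
    param_names = ["dtype", "shape"]

    def setup(self, dtype, shape):
        # Allocate the array once per parameter combination so that only the
        # bn.nanvar call itself is timed.
        self.arr = np.zeros(shape, dtype=dtype)

    def time_nanvar(self, dtype, shape):
        bn.nanvar(self.arr)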
def generate_todolist(self):
    """
    Generate todo.sqlite file in training set directory.

    .. codeauthor:: Rasmus Handberg <*****@*****.**>
    """
    logger = logging.getLogger(__name__)

    try:
        with closing(sqlite3.connect(self.todo_file)) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            # Create the basic file structure of a TODO-list:
            todolist_structure(conn)

            logger.info("Step 3: Reading file and extracting information...")
            pri = 0

            diagnostics_file = os.path.join(self.input_folder, 'diagnostics.txt')
            diagnostics = None
            if os.path.isfile(diagnostics_file):
                diagnostics = np.genfromtxt(diagnostics_file,
                    delimiter=',', comments='#', dtype=None, encoding='utf-8')

            for k, star in tqdm(enumerate(self.starlist), total=len(self.starlist)):
                # Get starid:
                starname = star[0]
                starclass = star[1]
                if starname.startswith('constant_'):
                    starid = -10000 - int(starname[9:])
                elif starname.startswith('fakerrlyr_'):
                    starid = -20000 - int(starname[10:])
                else:
                    starid = int(starname)
                    starname = '{0:09d}'.format(starid)

                # Path to lightcurve:
                lightcurve = starclass + '/' + starname + '.txt'

                # Check that the file actually exists:
                if not os.path.exists(os.path.join(self.input_folder, lightcurve)):
                    raise FileNotFoundError(lightcurve)

                # Load diagnostics from file, to speed up the process:
                if diagnostics is not None:
                    variance, rms_hour, ptp = diagnostics[k]
                else:
                    # Try to load the lightcurve using the BaseClassifier method.
                    # This will ensure that the lightcurve can actually be read by the system.
                    lc = io.load_lightcurve(os.path.join(self.input_folder, lightcurve))

                    variance = nanvar(lc.flux, ddof=1)
                    rms_hour = utilities.rms_timescale(lc)
                    ptp = utilities.ptp(lc)

                #if datasource is None:
                #    if (lc.time[1] - lc.time[0])*86400 > 1000:
                #        datasource = 'ffi'
                #    else:
                #        datasource = 'tpf'

                elaptime = np.random.normal(3.14, 0.5)
                pri += 1
                todolist_insert(cursor,
                    priority=pri,
                    starid=starid,
                    lightcurve=lightcurve,
                    datasource='ffi',
                    variance=variance,
                    rms_hour=rms_hour,
                    ptp=ptp,
                    elaptime=elaptime)

            conn.commit()
            todolist_cleanup(conn, cursor)
            cursor.close()

    except:  # noqa: E722, pragma: no cover
        if os.path.exists(self.todo_file):
            os.remove(self.todo_file)
        raise

    logger.info("%s training set successfully built.", self.key)
def test_known_star(SHARED_INPUT_DIR, corrector, starid, cadence, var_goal, rms_goal, ptp_goal): """ Check that the ensemble returns values that are reasonable and within expected bounds """ # All stars we check here come from the same sector and camera. # Define these here for the future where we may test on other combinations of these: sector = 1 camera = 1 __dir__ = os.path.abspath(os.path.dirname(__file__)) logger = logging.getLogger(__name__) logger.info("-------------------------------------------------------------") logger.info("CORRECTOR = %s, SECTOR=%d, CADENCE=%s, STARID=%d", corrector, sector, cadence, starid) # All stars are from the same CCD, find the task for it: with corrections.TaskManager(SHARED_INPUT_DIR) as tm: task = tm.get_task(starid=starid, sector=sector, camera=camera, cadence=cadence) # Check that task was actually found: assert task is not None, "Task could not be found" # Load lightcurve that will also be plotted together with the result: # This lightcurve is of the same objects, at a state where it was deemed that the # corrections were doing a good job. compare_lc_path = os.path.join(__dir__, 'compare', f'compare-{corrector}-s{sector:04d}-c{cadence:04d}-tic{starid:011d}.ecsv.gz') compare_lc = None if os.path.isfile(compare_lc_path): compare_lc = Table.read(compare_lc_path, format='ascii.ecsv') else: warnings.warn("Comparison data does not exist: " + compare_lc_path) # Initiate the class CorrClass = corrections.corrclass(corrector) with tempfile.TemporaryDirectory() as tmpdir: with CorrClass(SHARED_INPUT_DIR, plot=True) as corr: # Check basic parameters of object (from BaseCorrector): assert corr.input_folder == SHARED_INPUT_DIR, "Incorrect input folder" assert corr.plot, "Plot parameter passed appropriately" assert os.path.isdir(corr.data_folder), "DATA_FOLDER doesn't exist" # Load the input lightcurve: inlc = corr.load_lightcurve(task) # Print input lightcurve properties: print( inlc.show_properties() ) assert inlc.sector == sector assert inlc.camera == camera # Run correction: tmplc = inlc.copy() outlc, status = corr.do_correction(tmplc) # Check status assert outlc is not None, "Correction fails" assert isinstance(outlc, TessLightCurve), "Should return TessLightCurve object" assert isinstance(status, corrections.STATUS), "Should return a STATUS object" assert status in (corrections.STATUS.OK, corrections.STATUS.WARNING), "STATUS was not set appropriately" # Print output lightcurve properties: print( outlc.show_properties() ) # Save the lightcurve to FITS file to be tested later on: save_file = corr.save_lightcurve(outlc, output_folder=tmpdir) # Check contents assert len(outlc) == len(inlc), "Input flux ix different length to output flux" assert isinstance(outlc.flux, np.ndarray), "FLUX is not a ndarray" assert isinstance(outlc.flux_err, np.ndarray), "FLUX_ERR is not a ndarray" assert isinstance(outlc.quality, np.ndarray), "QUALITY is not a ndarray" assert outlc.flux.dtype.type is inlc.flux.dtype.type, "FLUX changes dtype" assert outlc.flux_err.dtype.type is inlc.flux_err.dtype.type, "FLUX_ERR changes dtype" assert outlc.quality.dtype.type is inlc.quality.dtype.type, "QUALITY changes dtype" assert outlc.flux.shape == inlc.flux.shape, "FLUX changes shape" assert outlc.flux_err.shape == inlc.flux_err.shape, "FLUX_ERR changes shape" assert outlc.quality.shape == inlc.quality.shape, "QUALITY changes shape" # Plot output lightcurves: fig, (ax1, ax2, ax3) = plt.subplots(3, 1, squeeze=True, figsize=[10, 10]) ax1.plot(inlc.time, inlc.flux, lw=0.5) 
ax1.set_title(f"{corrector} - Sector {sector:d} - {cadence}s - TIC {starid:d}") if compare_lc: ax2.plot(compare_lc['time'], compare_lc['flux'], label='Compare', lw=0.5) ax3.axhline(0, lw=0.5, ls=':', color='0.7') ax3.plot(outlc.time, outlc.flux - compare_lc['flux'], lw=0.5) ax2.plot(outlc.time, outlc.flux, label='New', lw=0.5) ax1.set_ylabel('Flux [e/s]') ax1.minorticks_on() ax2.set_ylabel('Relative Flux [ppm]') ax2.minorticks_on() ax2.legend() ax3.set_ylabel('New - Compare [ppm]') ax3.set_xlabel('Time [TBJD]') ax3.minorticks_on() fig.savefig(os.path.join(__dir__, f'test-{corrector}-s{sector:04d}-c{cadence:04d}-tic{starid:011d}.png'), bbox_inches='tight') plt.close(fig) # Check things that are allowed to change: assert all(outlc.flux != inlc.flux), "Input and output flux are identical." assert not np.any(np.isinf(outlc.flux)), "FLUX contains Infinite" assert not np.any(np.isinf(outlc.flux_err)), "FLUX_ERR contains Infinite" assert np.sum(np.isnan(outlc.flux)) < 0.5*len(outlc), "More than half the lightcurve is NaN" assert allnan(outlc.flux_err[np.isnan(outlc.flux)]), "FLUX_ERR should be NaN where FLUX is" # TODO: Check that quality hasn't changed in ways that are not allowed: # - Only values defined in CorrectorQualityFlags # - No removal of flags already set assert all(outlc.quality >= 0) assert all(outlc.quality <= 128) assert all(outlc.quality >= inlc.quality) # Things that shouldn't chance from the corrections: assert outlc.targetid == inlc.targetid, "TARGETID has changed" assert outlc.label == inlc.label, "LABEL has changed" assert outlc.sector == inlc.sector, "SECTOR has changed" assert outlc.camera == inlc.camera, "CAMERA has changed" assert outlc.ccd == inlc.ccd, "CCD has changed" assert outlc.quality_bitmask == inlc.quality_bitmask, "QUALITY_BITMASK has changed" assert outlc.ra == inlc.ra, "RA has changed" assert outlc.dec == inlc.dec, "DEC has changed" assert outlc.mission == 'TESS', "MISSION has changed" assert outlc.time_format == 'btjd', "TIME_FORMAT has changed" assert outlc.time_scale == 'tdb', "TIME_SCALE has changed" assert_array_equal(outlc.time, inlc.time, "TIME has changed") assert_array_equal(outlc.timecorr, inlc.timecorr, "TIMECORR has changed") assert_array_equal(outlc.cadenceno, inlc.cadenceno, "CADENCENO has changed") assert_array_equal(outlc.pixel_quality, inlc.pixel_quality, "PIXEL_QUALITY has changed") assert_array_equal(outlc.centroid_col, inlc.centroid_col, "CENTROID_COL has changed") assert_array_equal(outlc.centroid_row, inlc.centroid_row, "CENTROID_ROW has changed") # Check metadata assert tmplc.meta == inlc.meta, "Correction changed METADATA in-place" assert outlc.meta['task'] == inlc.meta['task'], "Metadata is incomplete" assert isinstance(outlc.meta['additional_headers'], fits.Header) # Check performance metrics: #logger.warning("VAR: %e", nanvar(outlc.flux)) if var_goal is not None: var_in = nanvar(inlc.flux) var_out = nanvar(outlc.flux) var_diff = np.abs(var_out - var_goal) / var_goal logger.info("VAR: %f - %f - %f", var_in, var_out, var_diff) assert_array_less(var_diff, 0.05, "VARIANCE changed outside interval") #logger.warning("RMS: %e", rms_timescale(outlc)) if rms_goal is not None: rms_in = rms_timescale(inlc) rms_out = rms_timescale(outlc) rms_diff = np.abs(rms_out - rms_goal) / rms_goal logger.info("RMS: %f - %f - %f", rms_in, rms_out, rms_diff) assert_array_less(rms_diff, 0.05, "RMS changed outside interval") #logger.warning("PTP: %e", ptp(outlc)) if ptp_goal is not None: ptp_in = ptp(inlc) ptp_out = ptp(outlc) ptp_diff = np.abs(ptp_out - 
ptp_goal) / ptp_goal logger.info("PTP: %f - %f - %f", ptp_in, ptp_out, ptp_diff) assert_array_less(ptp_diff, 0.05, "PTP changed outside interval") # Check FITS file: with fits.open(os.path.join(tmpdir, save_file), mode='readonly') as hdu: # Lightcurve FITS table: fitslc = hdu['LIGHTCURVE'].data hdr = hdu['LIGHTCURVE'].header # Simple checks of header values: assert hdu[0].header['TICID'] == starid # Checks of things in FITS table that should not have changed at all: assert_array_equal(fitslc['TIME'], inlc.time, "FITS: TIME has changed") assert_array_equal(fitslc['TIMECORR'], inlc.timecorr, "FITS: TIMECORR has changed") assert_array_equal(fitslc['CADENCENO'], inlc.cadenceno, "FITS: CADENCENO has changed") assert_array_equal(fitslc['FLUX_RAW'], inlc.flux, "FITS: FLUX_RAW has changed") assert_array_equal(fitslc['FLUX_RAW_ERR'], inlc.flux_err, "FITS: FLUX_RAW_ERR has changed") assert_array_equal(fitslc['MOM_CENTR1'], inlc.centroid_col, "FITS: CENTROID_COL has changed") assert_array_equal(fitslc['MOM_CENTR2'], inlc.centroid_row, "FITS: CENTROID_ROW has changed") # Some things are allowed to change, but still within some requirements: assert all(fitslc['FLUX_CORR'] != inlc.flux), "FITS: Input and output flux are identical." assert np.sum(np.isnan(fitslc['FLUX_CORR'])) < 0.5*len(fitslc['TIME']), "FITS: More than half the lightcurve is NaN" assert allnan(fitslc['FLUX_CORR_ERR'][np.isnan(fitslc['FLUX_CORR'])]), "FITS: FLUX_ERR should be NaN where FLUX is" if corrector == 'ensemble': # Check special headers: assert np.isfinite(hdr['ENS_MED']) and hdr['ENS_MED'] > 0 assert isinstance(hdr['ENS_NUM'], int) and hdr['ENS_NUM'] > 0 assert hdr['ENS_DLIM'] == 1.0 assert hdr['ENS_DREL'] == 10.0 assert hdr['ENS_RLIM'] == 0.4 # Special extension for ensemble: tic = hdu['ENSEMBLE'].data['TIC'] bzeta = hdu['ENSEMBLE'].data['BZETA'] assert len(tic) == len(bzeta) assert len(np.unique(tic)) == len(tic), "TIC numbers in ENSEMBLE table are not unique" assert len(tic) == hdr['ENS_NUM'], "Not the same number of targets in ENSEMBLE table as specified in header" elif corrector == 'cbv': # Check special headers: assert isinstance(hdr['CBV_NUM'], int) and hdr['CBV_NUM'] > 0 # Check coefficients: for k in range(0, hdr['CBV_NUM']+1): assert np.isfinite(hdr['CBV_C%d' % k]) for k in range(1, hdr['CBV_NUM']+1): assert np.isfinite(hdr['CBVS_C%d' % k]) # Check that no other coefficients are present assert 'CBV_C%d' % (hdr['CBV_NUM']+1) not in hdr assert 'CBVS_C%d' % (hdr['CBV_NUM']+1) not in hdr elif corrector == 'kasoc_filter': # Check special headers: assert hdr['KF_POSS'] == 'None' assert np.isfinite(hdr['KF_LONG']) and hdr['KF_LONG'] > 0 assert np.isfinite(hdr['KF_SHORT']) and hdr['KF_SHORT'] > 0 assert hdr['KF_SCLIP'] == 4.5 assert hdr['KF_TCLIP'] == 5.0 assert hdr['KF_TWDTH'] == 1.0 assert hdr['KF_PSMTH'] == 200 assert isinstance(hdr['NUM_PER'], int) and hdr['NUM_PER'] >= 0 for k in range(1, hdr['NUM_PER']+1): assert np.isfinite(hdr['PER_%d' % k]) and hdr['PER_%d' % k] > 0 # Check that no other periods are present assert 'PER_%d' % (hdr['NUM_PER'] + 1) not in hdr # Test that the Gzip FITS file has the correct uncompressed file name, by simply # decompressing the Gzip file, asking to keep the original file name. 
# This uses the system GZIP utility, since there doesn't seem to be a way to do this # through the Python gzip module: fpath = os.path.join(tmpdir, save_file) fpath_uncompressed = fpath.replace('.fits.gz', '.fits') assert not os.path.exists(fpath_uncompressed), "Uncompressed file already exists" gzip_output = subprocess.check_output(['gzip', '-dkNv', os.path.basename(fpath)], cwd=os.path.dirname(fpath), stderr=subprocess.STDOUT, encoding='utf8') print("Gzip output:") print(gzip_output) assert os.path.isfile(fpath_uncompressed), "Incorrect uncompressed file name" # Just see if we can in fact also open the uncompressed FITS file and get a simple header: with fits.open(fpath_uncompressed, mode='readonly') as hdu: assert hdu[0].header['TICID'] == starid
def stats(self, lmean=False, lmed=False, lskew=False, lvar=False, lstd=False, lcoefvar=False, lperc=False, p=0.95): """Calculate some statistics among every realisation. Each statistic is calculated node-wise along the complete number of realisations. Parameters ---------- lmean : boolean, default False Calculate the mean. lmed : boolean, default False Calculate the median. lskew : boolean, default False Calculate skewness. lvar : boolean, default False Calculate the variance. lstd : boolean, default False Calculate the standard deviation. lcoefvar : boolean, default False Calculate the coefficient of variation. lperc : boolean, default False Calculate the percentile `100 * (1 - p)`. p : number, default 0.95 Probability value. Returns ------- retdict : dict of GridArr Dictionary containing one GridArr for each calculated statistic. See Also -------- stats_area : same but considering a circular (and horizontal) area of a specified radius around a given point. """ # check if the map files are already opened or not if isinstance(self.files[0], file): opened_files = True else: opened_files = False if lmean: meanmap = np.zeros(self.cells) if lmed: medmap = np.zeros(self.cells) if lskew: skewmap = np.zeros(self.cells) if lvar: varmap = np.zeros(self.cells) if lstd: stdmap = np.zeros(self.cells) if lcoefvar: coefvarmap = np.zeros(self.cells) if lperc: percmap = np.zeros((self.cells, 2)) arr = np.zeros(self.nfiles) skip = True offset = os.SEEK_SET for cell in xrange(self.cells - self.header): for i, gridfile in enumerate(self.files): # deal with map files not open yet if opened_files: grid = gridfile else: grid = open(gridfile, 'rb') grid.seek(offset) if skip: skip_lines(grid, self.header) arr[i] = grid.readline() if not opened_files: offset = grid.tell() grid.close() skip = False # replace no data's with NaN bn.replace(arr, self.nodata, np.nan) if lmean: meanmap[cell] = bn.nanmean(arr) if lmed: medmap[cell] = bn.nanmedian(arr) if lskew: skewmap[cell] = pd.Series(arr).skew() if lvar: varmap[cell] = bn.nanvar(arr, ddof=1) if lstd: stdmap[cell] = bn.nanstd(arr, ddof=1) if lcoefvar: if lstd and lmean: coefvarmap[cell] = stdmap[cell] / meanmap[cell] * 100 else: std = bn.nanstd(arr, ddof=1) mean = bn.nanmean(arr) coefvarmap[cell] = std / mean * 100 if lperc: percmap[cell] = pd.Series(arr).quantile([(1 - p) / 2, 1 - (1 - p) / 2]) retdict = dict() if lmean: meangrid = GridArr(name='meanmap', dx=self.dx, dy=self.dy, dz=self.dz, nodata=self.nodata, val=meanmap) retdict['meanmap'] = meangrid if lmed: medgrid = GridArr(name='medianmap', dx=self.dx, dy=self.dy, dz=self.dz, nodata=self.nodata, val=medmap) retdict['medianmap'] = medgrid if lskew: skewgrid = GridArr(name='skewmap', dx=self.dx, dy=self.dy, dz=self.dz, nodata=self.nodata, val=skewmap) retdict['skewmap'] = skewgrid if lvar: vargrid = GridArr(name='varmap', dx=self.dx, dy=self.dy, dz=self.dz, nodata=self.nodata, val=varmap) retdict['varmap'] = vargrid if lstd: stdgrid = GridArr(name='stdmap', dx=self.dx, dy=self.dy, dz=self.dz, nodata=self.nodata, val=stdmap) retdict['stdmap'] = stdgrid if lcoefvar: coefvargrid = GridArr(name='coefvarmap', dx=self.dx, dy=self.dy, dz=self.dz, nodata=self.nodata, val=coefvarmap) retdict['coefvarmap'] = coefvargrid if lperc: percgrid = GridArr(name='percmap', dx=self.dx, dy=self.dy, dz=self.dz, nodata=self.nodata, val=percmap) retdict['percmap'] = percgrid return retdict