def calc_noise_cal_factor(vlt, noise_slc, cal_slc, cal_temp, bandwidth):
    noise = vlt[:, noise_slc]
    cal = vlt[:, cal_slc]
    # assume noise is constant power over time window, robust MAD estimator
    noise_est = median(noise.real**2 + noise.imag**2) * med_pwr_est_factor
    # cal temp is constant, so use all cal measurements to estimate power
    cal_est = median(cal.real**2 + cal.imag**2) * med_pwr_est_factor
    pwr_factor = calc_power_factor(noise_est, cal_est, cal_temp, bandwidth)
    return noise_est, cal_est, pwr_factor

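# calc_noise_cal_factor relies on module-level helpers (`median`, `med_pwr_est_factor`,
# `calc_power_factor`) that are not shown here. A minimal self-contained sketch of the
# same robust power estimate follows, assuming the scale factor simply converts the
# median of |z|**2 for circular complex Gaussian noise into a mean-power estimate
# (the median of an exponential distribution is mean * ln 2).
import numpy as np

def robust_complex_power(z):
    """Median-based estimate of the mean power of complex Gaussian samples."""
    # |z|**2 is exponentially distributed for circular complex Gaussian noise,
    # so dividing its median by ln(2) recovers the mean power robustly.
    return np.median(z.real**2 + z.imag**2) / np.log(2.0)

rng = np.random.default_rng(0)
z = (rng.normal(size=10000) + 1j * rng.normal(size=10000)) / np.sqrt(2.0)
print(robust_complex_power(z))  # close to 1.0 for unit-power noise
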
def theil_sen(x, y, sample="auto", n_samples=1e7):
    assert x.shape[0] == y.shape[0]
    n = x.shape[0]

    if n < 100 or not sample:
        ix = np.argsort(x)
        slopes = np.empty(int(n * (n - 1) * 0.5))
        for c, pair in enumerate(itertools.combinations(range(n), 2)):
            i, j = ix[pair[0]], ix[pair[1]]
            slopes[c] = slope(x[i], x[j], y[i], y[j])
    else:
        i1 = np.random.randint(0, n, int(n_samples))
        i2 = np.random.randint(0, n, int(n_samples))
        slopes = slope(x[i1], x[i2], y[i1], y[i2])

    slope_ = bottleneck.nanmedian(slopes)

    # find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n)
    for c in range(n):
        intercepts[c] = y[c] - slope_ * x[c]
    intercept_ = bottleneck.median(intercepts)

    return np.array([slope_, intercept_])

def moving_nanmedian_cyclic(t, x, w, dt=None):
    """
    Calculate cyclic moving median of input with given window (in t-units),
    taking into account NaNs in the data.
    """
    if len(t) != len(x):
        raise ValueError("t and x must have the same length.")
    if dt is None:
        dt = median(np.diff(t))

    # Calculate width of filter:
    width_points = int(w / dt)
    if width_points <= 1:
        return x
    if width_points % 2 == 0:
        width_points += 1  # Filter is much faster when using an odd number of points!

    wh = width_points // 2
    N = len(x)
    if wh >= N:
        return np.zeros_like(x) + nanmedian(x)

    # Stitch ends onto the array:
    xny = np.concatenate((x[-wh - 1:N - 1], x, x[1:wh + 1]))
    # Run moving median on longer series:
    N = len(xny)
    y = _median_central(xny, width_points)
    # Cut out the central part again:
    y = y[wh:N - wh]
    return y

def theil_sen(x, y, n_samples=1e5):
    """Computes the Theil-Sen estimator for 2d data.

    parameters:
        x: 1-d np array, the control variate
        y: 1-d np.array, the ind variate.
        n_samples: how many point pairs to sample.

    The exact estimator has O(n**2) complexity, which can be poor for large n.
    We therefore sample pairs of data points to get an unbiased, but higher
    variance, estimator: pick two points at random and compute the slope,
    up to n_samples times.
    """
    assert x.shape[0] == y.shape[0], "x and y must be the same shape."
    n = x.shape[0]

    i1 = np.random.randint(0, n, int(n_samples))
    i2 = np.random.randint(0, n, int(n_samples))

    slopes = _slope(x[i1], x[i2], y[i1], y[i2])
    slope_ = nanmedian(slopes)

    # find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n, dtype=float)
    for i in range(n):
        intercepts[i] = y[i] - slope_ * x[i]
    intercept_ = median(intercepts)

    return np.array([slope_, intercept_])

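# The theil_sen variants in this file depend on module-level helpers (`slope`/`_slope`,
# `median`, `nanmedian`) that are not shown. Below is a minimal, self-contained sketch
# of the same randomized Theil-Sen idea using only numpy; all names here are
# illustrative, not taken from the original code.
import numpy as np

def theil_sen_sampled(x, y, n_samples=100_000, seed=0):
    """Randomized Theil-Sen fit: median slope over random point pairs."""
    rng = np.random.default_rng(seed)
    n = len(x)
    i1 = rng.integers(0, n, n_samples)
    i2 = rng.integers(0, n, n_samples)
    dx = x[i2] - x[i1]
    with np.errstate(divide='ignore', invalid='ignore'):
        slopes = (y[i2] - y[i1]) / dx       # NaN/inf where a pair shares an x value
    slopes = slopes[np.isfinite(slopes)]    # drop degenerate pairs
    slope = np.median(slopes)
    intercept = np.median(y - slope * x)    # median residual gives the intercept
    return slope, intercept

# Example: recover slope ~2 and intercept ~1 from noisy data with gross outliers.
rng = np.random.default_rng(1)
x = rng.uniform(0, 10, 1000)
y = 2.0 * x + 1.0 + rng.normal(0, 0.5, 1000)
y[::50] += 20.0                             # outliers barely move the estimate
print(theil_sen_sampled(x, y))
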
def simple_sky(sky):
    skymed = bt.median(sky)
    skymean = bt.nanmean(sky)
    skymod = 3. * skymed - 2. * skymean
    skystd = bt.nanstd(sky)
    return skymod, skystd, len(sky)

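# simple_sky uses the classical mode approximation (mode ~ 3*median - 2*mean) as a
# robust estimate of the sky level in a skewed background distribution. A small
# self-contained check, using numpy in place of bottleneck (`bt` above):
import numpy as np

sky = np.random.default_rng(2).gamma(shape=2.0, scale=50.0, size=100_000)  # skewed "sky"
mode_est = 3.0 * np.median(sky) - 2.0 * np.mean(sky)
print(np.mean(sky), np.median(sky), mode_est)  # mode estimate sits below mean and median
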
def moving_nanmedian(t, x, w, dt=None):
    """Calculate moving median of input with given window (in t-units)."""
    assert len(t) == len(x), "t and x must have the same length."
    if dt is None:
        dt = median(diff(t))

    width_points = int(w / dt)
    if width_points <= 1:
        return x
    if width_points % 2 == 0:
        width_points += 1
    if width_points >= len(x):
        return zeros_like(x) + nanmedian(x)

    return _median_central(x, width_points)

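# moving_nanmedian and moving_nanmedian_cyclic lean on an unshown `_median_central`
# helper (presumably a bottleneck-backed centered moving median). A slow but
# self-contained numpy reference of the same operation, for comparison:
import numpy as np

def moving_nanmedian_ref(x, width_points):
    """Centered moving median that ignores NaNs; width_points should be odd."""
    half = width_points // 2
    out = np.full(len(x), np.nan)
    for i in range(len(x)):
        window = x[max(0, i - half):i + half + 1]
        if np.any(np.isfinite(window)):
            out[i] = np.nanmedian(window)
    return out

x = np.array([1.0, 2.0, np.nan, 100.0, 3.0, 4.0, 5.0])
print(moving_nanmedian_ref(x, 3))  # NaNs inside each window are ignored
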
def theil_sen(x, y, sample="auto", n_samples=1e7):
    """
    Computes the Theil-Sen estimator for 2d data.

    parameters:
        x: 1-d np array, the control variate
        y: 1-d np.array, the ind variate.
        sample: if n > 100, the exact computation can be slow, so we sample
                n_samples pairs instead. Set to False to disable sampling.
        n_samples: how many point pairs to sample.

    The exact estimator has O(n**2) complexity, which can be poor for large n.
    We therefore sample pairs of data points to get an unbiased, but higher
    variance, estimator: pick two points at random and compute the slope,
    up to n_samples times.
    """
    assert x.shape[0] == y.shape[0], "x and y must be the same shape."
    n = x.shape[0]

    if n < 100 or not sample:
        ix = np.argsort(x)
        slopes = np.empty(int(n * (n - 1) * 0.5))
        for c, pair in enumerate(itertools.combinations(range(n), 2)):
            i, j = ix[pair[0]], ix[pair[1]]
            slopes[c] = slope(x[i], x[j], y[i], y[j])
    else:
        i1 = np.random.randint(0, n, int(n_samples))
        i2 = np.random.randint(0, n, int(n_samples))

        print('...checking for unwanted zeros...')
        zero_check = np.where(np.abs(x[i1] - x[i2]) != 0)
        i1 = i1[zero_check]
        i2 = i2[zero_check]

        print('...calculating slopes...')
        slopes = slope(x[i1], x[i2], y[i1], y[i2])
        print('slope min and max are:', np.amin(slopes), np.amax(slopes))
        histogram, bin_limits = np.histogram(slopes, bins=10000, range=(-2, 2))

    slope_ = bottleneck.nanmedian(slopes)
    print('...done! Now finding intercepts...')

    # find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n)
    for c in range(n):
        intercepts[c] = y[c] - slope_ * x[c]
    histogram_i, bin_limits_i = np.histogram(intercepts, bins=10000, range=(-2, 2))
    intercept_ = bottleneck.median(intercepts)

    return np.array([slope_, intercept_])

def gap_fill(t, y, maxgap=np.inf):
    # Declare variables used:
    times_max = 0
    D = np.diff(t)
    time_tot = []
    data_tot = []
    ori_or_not = []

    # Calculate the desired regular step size:
    step = median(D)
    stepcut = 1.5 * step
    if not np.isinf(maxgap):
        times_max = int((maxgap / 2) / step) + 1

    for i in range(len(t) - 1):
        # Add the original point:
        time_tot.append(t[i])
        data_tot.append(y[i])
        ori_or_not.append(1)

        d = D[i]
        if d > maxgap:
            # Insert half the maximum number of points at the beginning of the gap:
            for j in range(1, times_max):
                time_tot.append(t[i] + j * step)
                data_tot.append(np.nan)
                ori_or_not.append(0)
            # ... and half the maximum number of points at the end of the gap:
            for j in range(times_max, 0, -1):
                time_tot.append(t[i + 1] - j * step)
                data_tot.append(np.nan)
                ori_or_not.append(0)
        elif d > stepcut:
            # Calculate the number of points to be inserted and insert them:
            times = int(d / step) - 1
            for j in range(times):
                time_tot.append(t[i] + (j + 1) * step)
                data_tot.append(np.nan)
                ori_or_not.append(0)

    # Special treatment of last point:
    time_tot.append(t[-1])
    data_tot.append(y[-1])
    ori_or_not.append(1)

    return np.array(time_tot), np.array(data_tot), np.array(ori_or_not, dtype=bool)

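# Hypothetical usage of gap_fill (it assumes the module-level `median` resolves to a
# numpy/bottleneck median): fill a gap in an otherwise regular time series with NaN
# placeholders so that the moving-median filters above see evenly sampled data.
import numpy as np

t = np.concatenate((np.arange(0.0, 5.0, 1.0), np.arange(9.0, 14.0, 1.0)))  # 4-step gap
y = np.sin(t)
tg, yg, ori = gap_fill(t, y)
print(len(t), len(tg))      # gap-filled series is longer than the input
print(np.isnan(yg).sum())   # the inserted points are NaN
print(ori.sum() == len(t))  # original samples are flagged True
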
def test_median(size):
    global data, mask, datamasked, list_of_data, list_of_mask

    print('Generate fake data')
    shape = (10, size, size)
    np.random.seed(42)
    data = np.random.normal(size=shape).astype(np.float32)
    mask = np.zeros(shape, dtype=np.uint16)

    data = data.reshape(shape[0], -1)
    mask = mask.reshape(shape[0], -1)
    datamasked = np.ma.array(data, mask=mask.astype(bool))
    list_of_data = list(data)
    list_of_mask = list(mask)

    print('Check results')
    outndcomb, _, _ = ndcombine(list_of_data, list_of_mask,
                                combine_method='median', reject_method='none')
    outnp = np.median(data, axis=0)
    outnpma = np.ma.median(datamasked, axis=0)
    outbn = bn.median(data, axis=0)
    np.testing.assert_array_equal(outndcomb, outnp)
    np.testing.assert_array_equal(outndcomb, outnpma)
    np.testing.assert_array_equal(outndcomb, outbn)

    print('Run perf tests')
    nb = 10
    kwargs = dict(globals=globals(), number=nb, repeat=5)

    def run(label, command):
        res = timeit.repeat(command, **kwargs)
        res = np.array(res) / nb
        print(f'- {label:20s}: {np.mean(res):.3f}s ± {np.std(res):.3f}s')

    run('np.median', 'np.median(data)')
    run('np.ma.median', 'np.ma.median(datamasked, axis=0)')
    run('bn.median', 'bn.median(data)')
    run('ndcombine 1 thread',
        "ndcombine(list_of_data, list_of_mask, combine_method='median', "
        "reject_method='none', num_threads=1)")
    run('ndcombine',
        "ndcombine(list_of_data, list_of_mask, combine_method='median', "
        "reject_method='none')")

def retrieve_phred_non_param(nbins, ratio, data, ratio_hist):
    pvals = []
    n = 10000
    p = 0
    sampled_chromosomes = []

    # pick chromosomes big enough to sample from
    for chromosome in data["chromosomes"]:
        if (len(data[chromosome]["ratio"]) > 10 * nbins
                and "X" not in chromosome and "Y" not in chromosome
                and abs(ratio_hist[chromosome][0] - 1) < 0.1):
            sampled_chromosomes.append(chromosome)

    if not sampled_chromosomes:
        return int(-10 * math.log10(1 / float(n)))

    chromosomes = list(sorted(numpy.random.choice(sampled_chromosomes, size=n)))
    simulated_positions = []

    # simulate
    for chromosome in sorted(sampled_chromosomes):
        simulated_positions += list(numpy.random.randint(
            0, high=len(data[chromosome]["ratio"]) - nbins,
            size=chromosomes.count(chromosome)))

    failed = 0
    for i in range(0, n):
        chromosome = chromosomes[i]
        pos = simulated_positions[i]
        sim_bins = data[chromosome]["ratio"][pos:pos + nbins]
        if list(sim_bins).count(-1) / float(len(sim_bins)) >= 0.6:
            failed += 1
            continue

        sim_ratio = bottleneck.median(sim_bins[numpy.where(sim_bins >= 0)], axis=0)
        if ratio > 1 and sim_ratio >= ratio:
            p += 1
        elif ratio < 1 and sim_ratio <= ratio:
            p += 1

    if failed == n:
        return 1000

    p = p / float(n - failed)
    if not p:
        return int(-10 * math.log10(1 / float(n - failed)))

    # normalise between 1000 and 1
    phred = int(-10 * math.log10(p))
    return phred

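# The empirical p-value above is converted to a Phred-scaled quality, Q = -10*log10(p),
# with the resolution capped by the number of usable simulations when p comes out zero.
# A quick worked check of the conversion with plain math:
import math

for p in (0.5, 0.05, 1e-3, 1e-4):
    print(p, int(-10 * math.log10(p)))  # 3, 13, 30, 40
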
def fit_storage(self, data):
    X, Y, W = data.X, data.Y, data.W if data.W else None
    Y = Y.astype(dtype=int)
    np.random.seed(self.seed)

    evd = {}
    for cls in range(len(self.domain.class_var.values)):
        self.target_class = cls
        print("estimating evd for class", cls)

        # repeat n-times
        max_vals = defaultdict(list)
        for i in range(self.n):
            print("{}/{}".format(i, self.n))
            # randomize class
            Yr = np.array(Y)
            np.random.shuffle(Yr)
            # learn rules
            new_data = Table.from_table(data.domain, data)
            new_data.Y = Yr
            super().fit_storage(new_data)
            for k in range(self.max_rule_length):
                ki = k if k < len(self.inter_rules) else -1
                max_vals[k + 1].extend(
                    [r.quality for r in self.inter_rules[ki]])

        # calculate extreme value distributions
        evd_cls = {0: EVDDist(0, 1, 0)}
        prev_median = 0
        for k in range(1, self.max_rule_length + 1):
            median = max(prev_median, bn.median(max_vals[k]))
            print("med", median)
            prev_median = median
            beta = 2
            mu = median + beta * np.log(np.log(2))
            if mu > 0.1:
                evd_cls[k] = EVDDist(mu, beta, median)
            else:
                evd_cls[k] = EVDDist(0, 1, 0)
        evd[cls] = evd_cls
        print()

    # returns an empty classifier
    return EVDFitterClassifier(evd, self.domain)

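# The location parameter computed above follows from the Gumbel (extreme value)
# distribution, whose median is mu - beta*ln(ln 2); given an observed median and a
# fixed beta, mu = median + beta*ln(ln 2). A quick numerical check using scipy's
# Gumbel implementation (an extra dependency, used here only for the check):
import numpy as np
from scipy.stats import gumbel_r

median, beta = 5.0, 2.0
mu = median + beta * np.log(np.log(2))
print(gumbel_r.median(loc=mu, scale=beta))  # ~5.0, recovering the observed median
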
def _median_with_nan(values, *args, **kwargs):
    """Replacement for "median" when skipna is False.

    numpy's median ignores NaNs as long as they make up less than 50% of the data.
    Modify this behaviour and return NaN, just as any other operation would.
    """
    if _hasbottleneck:
        result = bottleneck.median(values, *args, **kwargs)
    else:
        result = np.median(values, *args, **kwargs)

    if anynan(values):
        if np.size(result) == 1:
            result = np.nan
        else:
            axis = kwargs.pop('axis', None)
            nans = anynan(values, axis=axis)  # determine where the nans should be
            result[nans] = np.nan

    return result

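# A quick self-contained illustration of the intended behaviour (names here are
# illustrative): the median becomes NaN as soon as any NaN is present along the axis.
import numpy as np

values = np.array([[1.0, 2.0, 3.0],
                   [np.nan, 5.0, 6.0]])
result = np.nanmedian(values, axis=1)              # NaN-skipping median: [2.0, 5.5]
result[np.any(np.isnan(values), axis=1)] = np.nan  # propagate NaN instead: [2.0, nan]
print(result)
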
def theil_sen(x, y, sample="auto", n_samples=1e7):
    """
    Computes the Theil-Sen estimator for 2d data.

    parameters:
        x: 1-d np array, the control variate
        y: 1-d np.array, the ind variate.
        sample: if n > 100, the exact computation can be slow, so we sample
                n_samples pairs instead. Set to False to disable sampling.
        n_samples: how many point pairs to sample.

    The exact estimator has O(n**2) complexity, which can be poor for large n.
    We therefore sample pairs of data points to get an unbiased, but higher
    variance, estimator: pick two points at random and compute the slope,
    up to n_samples times.
    """
    assert x.shape[0] == y.shape[0], "x and y must be the same shape."
    n = x.shape[0]

    if n < 100 or not sample:
        ix = np.argsort(x)
        slopes = np.empty(int(n * (n - 1) * 0.5))
        for c, pair in enumerate(itertools.combinations(range(n), 2)):
            i, j = ix[pair[0]], ix[pair[1]]
            slopes[c] = slope(x[i], x[j], y[i], y[j])
    else:
        i1 = np.random.randint(0, n, int(n_samples))
        i2 = np.random.randint(0, n, int(n_samples))
        slopes = slope(x[i1], x[i2], y[i1], y[i2])

    slope_ = bottleneck.nanmedian(slopes)

    # find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n)
    for c in range(n):
        intercepts[c] = y[c] - slope_ * x[c]
    intercept_ = bottleneck.median(intercepts)

    return np.array([slope_, intercept_])

def theil_sen(x, y, sample="auto", n_samples=1e7):
    """
    Computes the Theil-Sen estimator for 2d data.

    parameters:
        x: 1-d np array, the control variate
        y: 1-d np.array, the ind variate.
        sample: if n>100, the performance can be poor, so we sample n_samples.
                Set to False to not sample.
        n_samples: how many points to sample.

    This complexity is O(n**2), which can be poor for large n. We will perform
    a sampling of data points to get an unbiased, but larger variance estimator.
    The sampling will be done by picking two points at random, and computing
    the slope, up to n_samples times.
    """
    assert x.shape[0] == y.shape[0], "x and y must be the same shape."
    n = x.shape[0]

    if n < 100 or not sample:
        ix = np.argsort(x)
        slopes = np.empty(int(n * (n - 1) * 0.5))
        for c, pair in enumerate(itertools.combinations(range(n), 2)):
            i, j = ix[pair[0]], ix[pair[1]]
            slopes[c] = slope(x[i], x[j], y[i], y[j])
    else:
        i1 = np.random.randint(0, n, int(n_samples))
        i2 = np.random.randint(0, n, int(n_samples))
        slopes = slope(x[i1], x[i2], y[i1], y[i2])
    # pdb.set_trace()
    slope_ = bottleneck.nanmedian(slopes)

    # find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n)
    for c in range(n):
        intercepts[c] = y[c] - slope_ * x[c]
    intercept_ = bottleneck.median(intercepts)
    return np.array([slope_, intercept_])

def compute_var_genes(adata, return_vect=True):
    """Compute variable genes for an individual dataset.

    Arguments:
        adata {AnnData} -- AnnData object containing a single dataset

    Keyword Arguments:
        return_vect {bool} -- if True, return the boolean vector of variable-gene
            membership; if False, store it as adata.var['highly_variable']
            (default: {True})

    Returns:
        np.ndarray -- None if saving in adata.var['highly_variable'], array of
        booleans of length ngenes if returning
    """
    if sparse.issparse(adata.X):
        median = csc_median_axis_0(sparse.csc_matrix(adata.X))
        variance = np.var(adata.X.A, axis=0)
    else:
        median = bottleneck.median(adata.X, axis=0)
        variance = np.var(adata.X, axis=0)

    bins = np.quantile(median, q=np.linspace(0, 1, 11), interpolation="midpoint")
    digits = np.digitize(median, bins, right=True)

    selected_genes = np.zeros_like(digits)
    for i in np.unique(digits):
        filt = digits == i
        var_tmp = variance[filt]
        bins_tmp = np.nanquantile(var_tmp, q=np.linspace(0, 1, 5))
        g = np.digitize(var_tmp, bins_tmp)
        selected_genes[filt] = (g >= 4).astype(float)

    if return_vect:
        return selected_genes.astype(bool)
    else:
        adata.var["highly_variable"] = selected_genes.astype(bool)

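# A compact, numpy-only sketch of the selection rule used above (illustrative data,
# no AnnData dependency): within each decile of per-gene median expression, keep the
# genes whose variance falls in the top quartile of that bin.
import numpy as np

rng = np.random.default_rng(3)
median_expr = rng.gamma(2.0, 1.0, size=2000)           # per-gene median expression (toy)
variance = median_expr * rng.uniform(0.5, 2.0, 2000)   # per-gene variance (toy)

deciles = np.digitize(median_expr, np.quantile(median_expr, np.linspace(0, 1, 11)), right=True)
selected = np.zeros(median_expr.size, dtype=bool)
for d in np.unique(deciles):
    in_bin = deciles == d
    cutoff = np.quantile(variance[in_bin], 0.75)        # top variance quartile within the bin
    selected[in_bin] = variance[in_bin] >= cutoff
print(selected.sum(), "of", selected.size, "genes selected")
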
def random_epstein(values, k): S = set(values) #print len(S),len(values) Aleft = weighted_average(S) Aright = float('inf') while len(S) > 1: sampled = sample(S, 1)[0] vi = sampled[0] wi = sampled[1] E = set() X = set() Y = set() Z = set() for (vj, wj) in S: if wj == wi: delta_i_j = vj - vi A_i_j = float('-inf') else: delta_i_j = wi - wj A_i_j = (vi - vj) / (wi - wj) if delta_i_j == 0: E |= {(vj, wj)} elif (A_i_j <= Aleft and delta_i_j > 0) or (A_i_j >= Aright and delta_i_j < 0): X |= {(vj, wj)} elif (A_i_j <= Aleft and delta_i_j < 0) or (A_i_j >= Aright and delta_i_j > 0): Y |= {(vj, wj)} Z = S - X - Y - E n = len(S) # print '-----------' # print 'X = ',X # print 'Y = ',Y # print 'Z = ',Z # print 'E = ',E # print 'S = ',S # print '-----------' #raw_input('***************') while True: if len(Z) > 0: #A=sorted([(vi-vj)/(wi-wj) if wi!=wj else float('-inf') for (vj,wj) in Z])[len(Z)/2] A = median([ (vi - vj) / (wi - wj) if wi != wj else float('-inf') for (vj, wj) in Z ]) l = sorted([f((vj, wj), A) for (vj, wj) in S], reverse=True)[:len(S) - k] #print l #raw_input('...') F_A = sum(l[:len(S) - k]) if F_A == 0: return A elif F_A > 0: Aleft = A else: Aright = A #print [Aleft,Aright] #####################RECOMPUTE X,Y,Z##################### to_remove_from_z = set() for (vj, wj) in Z: delta_i_j = wi - wj A_i_j = (vi - vj) / (wi - wj) if (A_i_j <= Aleft and delta_i_j > 0) or (A_i_j >= Aright and delta_i_j < 0): X |= {(vj, wj)} to_remove_from_z |= {(vj, wj)} elif (A_i_j <= Aleft and delta_i_j < 0) or (A_i_j >= Aright and delta_i_j > 0): Y |= {(vj, wj)} to_remove_from_z |= {(vj, wj)} Z = Z - to_remove_from_z #####################RECOMPUTE X,Y,Z##################### #print 'X = ',len(X),'Y = ',len(Y),'Z = ',len(Z),'E = ',len(E),'S = ',len(S) #raw_input('....') if ((len(X) + len(E)) >= (len(S) - k)) and k > 0: nb_to_remove = min(len(E), len(X) + len(E) - (len(S) - k)) #print nb_to_remove,k,len(E) to_remove_E = set(sample(E, nb_to_remove)) #print len(S) S = S - to_remove_E E = E - to_remove_E S = S - Y #print len(S) k = k - (len(Y) + nb_to_remove) Y = set() # print len(E),len(S) # raw_input('ooooo') # if k==0: # return weighted_average(S) elif (len(Y) + len(E)) >= k: nb_to_collapse = min(len(E), len(Y) + len(E) - k) values_to_collapse_E = set(sample(E, nb_to_collapse)) E = E - values_to_collapse_E values_to_collapse = values_to_collapse_E | X S = S - values_to_collapse collapsed = (sum(x[0] for x in values_to_collapse), sum(x[1] for x in values_to_collapse)) X = {collapsed} S = S | {collapsed} if len(Z) <= len(S) / 32: break spop = S.pop() #print 'hey ! ' return spop[0] / spop[1]
def filter_position_1d(time, flux, star_movement, timescale_position_smooth=None, dt=None): """Filter the lightcurve for correlations in the stars position on the CCD.""" # Check input: assert len(time)==len(flux), "TIME and FLUX should have the same number of elements." if not timescale_position_smooth is None and dt is None: dt = median(diff(time)) # Settings: # num_knots = 15 # min_points_per_knot = 3 # spline_degree = 2 # sigma_clip_spline = 4.0 # Build up xpos chunk by chunk of the timeseries: xpos = np.empty_like(time, dtype='float64') for chk,chunk in enumerate(star_movement['chunks']): # Extract needed information: cl = star_movement['curvelength'][chk] # Sorted in position indx_possort = star_movement['indx_possort'][chk] indx_timesort = star_movement['indx_timesort'][chk] # Create smooth curve as flux as a function of curvelength: # The resulting "xp" will be sorted by position fl = flux[chunk][indx_possort] """indx_finite = isfinite(cl) & isfinite(fl) knots = spline_set_knots(cl[indx_finite], num_knots) # Create the fixed knots for the spline function: knots = np.linspace(nanmin(cl[indx_finite]), nanmax(cl[indx_finite]), num_knots+2)[1:-2] # Remove knots if there is not at least 3 points between them: newknots = array([], dtype='float64') for i in range(len(knots)-1): indx_data_between_knots = (knots[i] < cl[indx_finite]) & (cl[indx_finite] < knots[i+1]) if sum(indx_data_between_knots) > min_points_per_knot: newknots = append(newknots, knots[i]) knots = newknots # Do a spline where all points are given the same weight: spline = LSQUnivariateSpline(cl[indx_finite], fl[indx_finite], knots, w=None, k=spline_degree) # Begin iterating so we can change the weights: for iterations in range(2): # Calculate weight of points based of their distance to # the previously calculated spline: d = np.abs( fl[indx_finite] - spline(cl[indx_finite]) ) s = mad_to_sigma * median(d) w = 0.5*(np.sign(sigma_clip_spline - d/s) + 1) # Heaviside cutoff-function # Recalculate the spline, using the weights: spline = LSQUnivariateSpline(cl[indx_finite], fl[indx_finite], knots, w=w, k=spline_degree) # Evaluate the spline function at the curvelengths of the datapoints: # The spline function will return NaN if passed a NaN xp = spline(cl) """ lowess_frac = 0.1/ (nanmax(cl[np.isfinite(fl)]) - nanmin(cl[np.isfinite(fl)])) xp = lowess(fl, cl, frac=lowess_frac, it=3, is_sorted=True, return_sorted=False) # Sort back into time-sorting and put NaN's back, # then low-pass filter the result: if timescale_position_smooth is None: xpos[chunk] = xp[indx_timesort] else: xpos[chunk] = moving_nanmedian(time[chunk], xp[indx_timesort], timescale_position_smooth, dt=dt) # Return the final time-sorted series: return xpos
def filter(t, x, quality=None, position=None, P=None, jumps=None, timescale_long=3.0, timescale_short=1/24, sigma_clip=4.5, scale_clip=5.0, scale_width=1.0, phase_smooth_factor=1000, transit_model=None, it=3): """Main filter function. Parameters: t (ndarray): Time vector (days). x (ndarray): Flux vector. quality (ndarray, None): Quality vector (bit-flags) from Kepler data; default=None. position (ndarray, None): Centroid positions of star on CCD as two column list; default=None. P (ndarray): Known planetary period (days); default=None. jumps (list): List of known jumps in the flux (timestamp in days); default=None. timescale_long (float): Timescale of long filter in days; default=3. timescale_short (float): Timescale of short filter in days; default=1/24. sigma_clip (float): Sigma-clip threshold; default=4.5. scale_clip (float): Scale at which to switch between long and short filters; default=5. scale_width (float): Width of transition region between filters; default=1. phase_smooth_factor (float): Fraction of period to smooth phase curce with; default=1000. transit_model (ndarray): Full transit model to be used instead of smoothed phase curve; default=None. it (integer): Number of iterations between different filters. Default=3. Returns: tnew - New time vector with the same length as the input vectors. xnew - New flux vector with the same length as the input vectors. sigma - Vector of estmated errors on measurements. flags - Vector of KASOC flags. filt - Vector with the final filter applied (after jump removal). turnover - Turnover function with weights to long and short filter. """ # Basic check of input: N = len(t) assert N==len(x), "TIME and DATA does not have the same length" if not transit_model is None: assert N==len(transit_model), "TRANSIT_MODEL is wrong length" if not quality is None: assert N==len(quality), "QUALITY is wrong length" if not position is None: if not isinstance(position, dict): position = {'pixels': position, 'break': np.array([], dtype='float64')} assert position['pixels'].shape==(N, 2), "POSITION must have the shape (N,2)" assert it > 0, "IT must be at least one." 
# Get the logger to use for printing messages: logger = logging.getLogger(__name__) # Sort the data in ascending order of time (This is needed for median filters to work) indx_sorttime = argsort(t) x = x[indx_sorttime] # data sorted after time t = t[indx_sorttime] # sorted time if not quality is None: quality = quality[indx_sorttime] # sorted quality if not position is None: position['pixels'] = position['pixels'][indx_sorttime, :] # sorted position # If not correcting position and transits, don't iterate: if position is None and transit_model is None and P is None: it = 1 # Find median cadence: dt = median(diff(t)) # Use the quality values to filter out bad values: if not quality is None: x, tmpJumps, flag_removed = filter_flags(t, x, quality, return_flags=True) if len(tmpJumps) > 0: if jumps is None: jumps = tmpJumps else: jumps = append(jumps, tmpJumps) else: flag_removed = ~isfinite(x) # Remove jumps: if not jumps is None: logger.info('Removing jumps...') x, jumps_flag, flag_jumps2 = remove_jumps(t, x, jumps, return_flags=True) # Fill gaps in timeseries with NaN # "ori" is a flag so xg[ori] will retrive the original points logger.info('Filling gaps...') tg, xg, ori = gap_fill(t, x, timescale_long) Ng = len(tg) # Calculate wide median filter and possibly filter out # flux changes correlated with stars position on CCD: if not position is None: logger.info('Extracting position information...') # Remove points that have been flagged as bad from positions: position['pixels'][flag_removed, :] = NaN # Fill the gaps in the position timeseries with NaNs: posg = empty((Ng, 2), dtype='float64'); posg.fill(NaN) posg[ori, :] = position['pixels'] position['pixels'] = posg # Run subroutine which determines xlong and xpos using the positions: flag_bad_pos, star_movement = extract_star_movement_1d(tg, xg, position, dt=dt) # Number of columns to plot on the "decorrelation" plot: # NOTE: Not "+2" as Nchunks is the number of breaks and not the number of chunks ncols = star_movement['Nchunks'] + 1 else: flag_bad_pos = zeros(Ng, dtype='bool') ncols = 1 flux_ylim = np.percentile(x[isfinite(x)], [0.25, 99.75]) # Prepare the "decorrelation" figure: ax1 = ax2 = None figsize = [8*1.7, 6*1.7] figsize[0] = figsize[0] * max(ncols/3, 1) figsize[1] = figsize[1] * max(it/3, 1) fig = plt.figure(figsize=figsize) fig.canvas.set_window_title('Decorrelation') fig.subplots_adjust(hspace=0.05) # Repeat the determination of xlong and xpos to better disentangle them: xpos = zeros(Ng, dtype='float64') xtransit = zeros(Ng, dtype='float64') xpos[flag_bad_pos] = NaN # Set points found to be bad to NaN so they wont contribute in the following for i in range(it): logger.info("Running %d iteration:", i+1) # Create long moving median, by removing previously found xpos and xtransit: logger.info(' Calculating long moving median...') xinp = xg - xpos - xtransit xlong = moving_nanmedian(tg, xinp, timescale_long, dt=dt) xlong[flag_bad_pos] = NaN # Create first column of plot with determination of xlong: ax1 = fig.add_subplot(it, ncols, ncols*i+1, sharex=ax1) ax1.scatter(tg, xinp, color='k', s=1, alpha=0.5) ax1.plot(tg, xlong, 'g-') ax1.set_xlim(tg[0], tg[-1]) ax1.set_ylim(flux_ylim) ax1.set_ylabel(r'Flux (e$^-$/s)') plt.yticks(fontsize=10) ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) if i==0: ax1.set_title(r'$x_\mathrm{long}$') if i == it-1: ax1.set_xlabel('Time (days)', fontsize=10) plt.xticks(fontsize=10) else: plt.setp(ax1.get_xticklabels(), visible=False) # Filter the timeseries for the star 
movement: if not position is None: logger.info(' Filtering star movements...') xinp = xg - xlong - xtransit xpos = filter_position_1d(tg, xinp, star_movement, dt=dt) for kc,chunk in enumerate(star_movement['chunks']): indx_possort = star_movement['indx_possort'][kc] curvelength_chunk = star_movement['curvelength'][kc] ax2 = fig.add_subplot(it, ncols, ncols*i+kc+2) ax2.scatter(curvelength_chunk, xinp[chunk][indx_possort], color='k', s=1, alpha=0.5) ax2.plot(curvelength_chunk, xpos[chunk][indx_possort], 'r-') plt.yticks(fontsize=10) if i==0: ax2.set_title('Position-flux #%d'%(kc+1)) if i==it-1: ax2.set_xlabel('Curve length (pixels)', fontsize=10) plt.xticks(fontsize=10) else: plt.setp(ax2.get_xticklabels(), visible=False) # The next column with xpos as a function of time: ax3 = fig.add_subplot(it, ncols, ncols*(i+1), sharex=ax1) ax3.scatter(tg, xinp, color='k', s=1, alpha=0.5) ax3.plot(tg, xpos, 'r-') ax3.set_xlim(tg[0], tg[-1]) plt.yticks(fontsize=10) ax3.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) if i == 0: ax3.set_title(r'$x_\mathrm{pos}$') if i == it-1: ax3.set_xlabel('Time (days)', fontsize=10) plt.xticks(fontsize=10) else: plt.setp(ax3.get_xticklabels(), visible=False) # Calculate phase-curve, if periods are provided: if not P is None: logger.info(" Calculating phase curve...") xtransit = filter_phase(tg, xg-xlong-xpos, P, smooth_factor=phase_smooth_factor) elif not transit_model is None: # Create filter using transit model # Do it in this way since transit model is relative with respect to 1 # Fill gaps of transit model the same way as the data: xtransit = np.ones(Ng) xtransit[ori] = transit_model filt = (xlong+xpos) * xtransit xtransit = filt - (xlong+xpos) # Save the figure: if _output_folder is not None: fig.savefig(os.path.join(_output_folder, _output_prefix+'decorrelation.'+_output_format), format=_output_format, bbox_inches='tight') if _output_format != 'native': plt.close(fig) # Make sure we have removed the bad datapoints: xg[flag_bad_pos] = NaN # Construct the final filter: filt = xlong + xtransit + xpos # Run the old KASOC filter to remove any potential unknown transits and sharp features: if not timescale_short is None: # Make a switch for long cadence data that puts a lower limit on timescale_short of 7 points (3.5 hours for LC) if timescale_short < 7*dt: logger.warning("WARNING: timescale_short is less than 7 points wide!") # Smooth the data with short moving median: logger.info("Calculating short moving median...") xshort = moving_nanmedian(tg, xg-filt, timescale_short, dt=dt) xshort_tilde = dc(xshort) xshort = filt + xshort # Create timeseries of the long filter, divided by the short filter: w4 = filt/xshort - 1 # Smooth the timeseries using a very short filter to remove any very high frequency noise: w4_smooth_width = int(timescale_short/dt) w4 = smooth(w4, w4_smooth_width) w4 = smooth(w4, w4_smooth_width) w4 = smooth(w4, w4_smooth_width) # Calculate moving standard deviation of timeseries # in units of sigmas: w5 = moving_nanmedian(tg, np.abs(w4), timescale_short) snr = w5/nanmedian(w5) # Create "flag"/weight indicating how much of the short filter and the long filter should # be used at each timestep. Is a number between 0 (long filter) and 1 (short filter). 
if scale_width > 0: turnover = norm.cdf(snr, scale_clip, scale_width) else: # For zero width, use the Heaviside function: turnover = 0.5*(np.sign(snr-scale_clip) + 1) # Create final filter as weighted mean of the long and short filters: filt = (1-turnover)*filt + turnover*xshort # Plot the derived filter compoments: if not _output_folder is None: fig = plt.figure() # num='turnover' fig.canvas.set_window_title('turnover') fig.subplots_adjust(hspace=0.05) ax1 = plt.subplot(211) ax1.axhspan(scale_clip-scale_width, scale_clip+scale_width, facecolor='0.5', edgecolor=None, alpha=0.5) ax1.plot(tg, snr, 'b-') ax1.set_ylabel(r'$\sigma_w$', fontsize=10) ax1.set_title('Filter turnover function', fontsize=12) ax1.set_xlim(t[0], t[-1]) ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) plt.yticks(fontsize=10) plt.setp(ax1.get_xticklabels(), visible=False) # Axes showing the derived weights: ax2 = plt.subplot(212, sharex=ax1) ax2.plot(tg, turnover, 'b-') ax2.set_ylim(0, 1) ax2.set_ylabel('$c$', fontsize=10) ax2.set_xlabel('Time', fontsize=10) ax2.set_xlim(t[0], t[-1]) ax2.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) plt.xticks(fontsize=10) plt.yticks(fontsize=10) if _output_format != 'native': fig.savefig(os.path.join(_output_folder, _output_prefix+'turnover.'+_output_format), format=_output_format, bbox_inches='tight') plt.close(fig) else: xshort = zeros(Ng) xshort_tilde = xshort turnover = zeros(Ng) # Flag with significant sharp and negative features (transits?): flag_transit = (turnover > 0.5) & (xshort < xlong+xpos+xtransit) # Plot the final filter: if not _output_folder is None: mask_long = isfinite(xlong) mask_short = isfinite(xshort) mask_filt = isfinite(filt) fig = plt.figure() fig.canvas.set_window_title('components') ax = fig.add_subplot(111) h1 = plt.scatter(t, x, color='k', s=2) h2, = plt.plot(tg[mask_long], xlong[mask_long], 'b-') h5, = plt.plot(tg, xlong+xpos+xtransit, 'y-') h3, = plt.plot(tg[mask_short], xshort[mask_short], 'g-') h4, = plt.plot(tg[mask_filt], filt[mask_filt], 'r-') ax.plot(tg[flag_transit], xg[flag_transit], 'go', markersize=2) plt.legend([h1, h2, h5, h3, h4], ['Data', r'$x_{\rm long}$', r'$x_{\rm pos}+x_{\rm transit}$', r'$x_{\rm short}$', 'Final filter'], fontsize=8, ncol=2, loc='best') ax.set_xlabel('Time', fontsize=10) ax.set_ylabel('Flux', fontsize=10) ax.set_xlim(t[0], t[-1]) ax.set_ylim(flux_ylim) plt.xticks(fontsize=10) plt.yticks(fontsize=10) ax.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) if _output_format != 'native': fig.savefig(os.path.join(_output_folder, _output_prefix+'components.'+_output_format), format=_output_format, bbox_inches='tight') plt.close(fig) # Apply final filter and convert to ppm: xg = 1e6*(xg/filt - 1) # Remove outliers using sigma-clipping: # The mean is already taken out, so we only # need to calculate the deviation from zero. logger.info("Calculating sigma...") flag_bad = array([False]*Ng, dtype='bool') absx = np.abs(xg) sigma = moving_nanmedian(tg, absx, timescale_long, dt=dt) if not sigma_clip is None: sigma_clip = mad_to_sigma * sigma_clip # less expensive to convert sigma_clip than sigma vector # 9. 
Estimate the point-to-point error from final timeseries: # We need to re-do it because bad data points might have biases # the previously calculculated sigmas flag_bad = (absx > sigma_clip*sigma) ############################# while True: flag_rem = (absx > sigma_clip*sigma) if flag_rem.any(): # Remove bad data points from timeseries: flag_bad[flag_rem] = True absx[flag_rem] = NaN sigma = moving_nanmedian(tg, absx, timescale_long, dt=dt) else: break ############################# # Bad data points should also be NaN: xg[flag_bad] = NaN # Convert to proper sigma indsted of MAD: indx = ~isfinite(xg) sigma[indx] = NaN sigma = mad_to_sigma * smooth(sigma, int(timescale_long/dt)) sigma[indx] = NaN # Return results: # Remove the gap-filled data again: x = xg[ori] sigma = sigma[ori] filt = filt[ori] flag_bad = flag_bad[ori] turnover = turnover[ori] flag_transit = flag_transit[ori] flag_bad_pos = flag_bad_pos[ori] xlong = xlong[ori] xpos = xpos[ori] xtransit = xtransit[ori] # Return this instead of xshort, so the filter is easier to # "disacemble" into the components, since this means that the # filter can be written as: # filter = xlong + xpos + xtransit + xshort xshort = turnover * xshort_tilde[ori] # Create KASOC flag vector: quality_flags = zeros(N, dtype='int64') quality_flags[flag_removed] += 1 if not jumps is None: quality_flags += flag_jumps2 quality_flags[flag_bad] += 8 quality_flags[flag_transit] += 16 if not position is None: quality_flags[flag_bad_pos] += 32 # Find the indicies of points just after position breaks: if len(star_movement['tbreaks']) >= 3: ibreak = searchsorted(t, star_movement['tbreaks'][1:-1]) quality_flags[ibreak] += 64 # Check that the extracted errorbars make sense: indx_invalid_sigma = (sigma < 1e-8) #indx_invalid_sigma = (sigma < 0.01*nanmedian(sigma)) #nms = nanmedian(sigma) #fig = plt.figure() #ax = fig_addsubplot(111) #ax.plot(t, sigma, 'b-') #ax.axhline(0.01*nms, color='k', ls='--') #ax.axhline(0.05*nms, color='k', ls='--') #ax.set_ylabel(r'$\sigma$ (ppm)', fontsize=10) #ax.set_xlabel('Time', fontsize=10) #plt.close(fig) if np.any(indx_invalid_sigma): # Generate a warning message: number_invalid_sigma = np.sum(indx_invalid_sigma) try: logger.warning("Invalid SIGMAs extracted (%d points = %.2f%%). 
Timescales should maybe be adjusted.", number_invalid_sigma, 100*number_invalid_sigma/N) warnings.warn("Invalid SIGMAs extracted", InvalidSigmasWarning) except IOError: print("Something went wrong in the logging of invalid sigmas") # Set the timeseries to NaN where sigmas are invalid, # and add a flag (128) to the quality-flags: x[indx_invalid_sigma] = NaN sigma[indx_invalid_sigma] = NaN quality_flags[indx_invalid_sigma] += 128 # Plot the final filtered timeseries: if not _output_folder is None: fig = plt.figure() fig.canvas.set_window_title('final filter') fig.subplots_adjust(hspace=0.05) ax1 = plt.subplot(211) ax1.plot(t, x, 'b.', markersize=2) ax1.set_xlim(t[0], t[-1]) ax1.set_ylabel('Relative flux (ppm)', fontsize=10) ax1.set_title("Final timeseries", fontsize=12) ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) plt.setp(ax1.get_xticklabels(), visible=False) plt.yticks(fontsize=10) ax2 = plt.subplot(212, sharex=ax1) ax2.plot(t, sigma, 'b-') ax2.set_ylabel(r'$\sigma$ (ppm)', fontsize=10) ax2.set_xlabel('Time', fontsize=10) ax2.set_xlim(t[0], t[-1]) plt.xticks(fontsize=10) plt.yticks(fontsize=10) ax2.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) if _output_format != 'native': fig.savefig(os.path.join(_output_folder, _output_prefix+'final.'+_output_format), format=_output_format, bbox_inches='tight') plt.close(fig) # Return everything needed: return t, x, sigma, quality_flags, filt, turnover, xlong, xpos, xtransit, xshort
def filter_phase(t, x, Plist, smooth_factor=1000): """ Filter out specific periods by smoothing the phase-curve. Parameters: t (ndarray): Time vector (days). x (ndarray): Flux vector. P (list): List of periods to remove. smooth_factor (float, optional): Factor of phase to use as smooth width. Returns: Filter flux vector that can be removed from timeseries. Note: Does not require time to be sorted. Can handle NaN in flux vector. """ # Prepare arrays: Plist = np.atleast_1d(Plist) # Hack to handle 0-dim input Np = len(Plist) Nt = len(t) phase = zeros((Np,Nt), dtype='float64') indx = zeros((Np,Nt), dtype='int') indx_inv = zeros((Np,Nt), dtype='int') phase_tot = zeros(Nt, dtype='float64') phase_smooth_t = zeros((Np,Nt), dtype='float64') dphase = zeros(Np, dtype='float64') # Loop through periods to be removed: for k in range(Np): # Calculate the phase and sort it: phase[k] = mod(t, Plist[k]) indx[k] = argsort(phase[k]) indx_inv[k] = argsort(indx[k]) dphase[k] = median(diff( phase[k,indx[k]] )) # Calculate smooth version of the phase curve: phase_smooth = _filter_single_phase(phase[k,indx[k]], x[indx[k]]-phase_tot[indx[k]], Plist[k]/smooth_factor, dphase[k]) # Un-sort phase_smoooth back to time-sorted order: phase_smooth_t[k] = phase_smooth[indx_inv[k]] # Add to the total phase filter: phase_tot += phase_smooth_t[k,:] # If removing multiple periods perform iterative procedure where # phase curves are added and removed to avoid cross-talk between periods: if k != 0: for j in range(k): # Add the transit back into to the timeseries (by subtracting it from the filter): phase_tot -= phase_smooth_t[j,:] # Re-calculate the phase curve of the transit: phase_smooth = _filter_single_phase(phase[j,indx[j]], x[indx[j]]-phase_tot[indx[j]], Plist[j]/smooth_factor, dphase[j]) phase_smooth_t[j] = phase_smooth[indx_inv[j]] # Remove the transit again: phase_tot += phase_smooth_t[j,:] # Make plots of phase curves: if not _output_folder is None: # Find the point on the smoothed curve that deviates the most from zero: imax = nanargmax(np.abs(phase_smooth_t), axis=1) s = nanstd(x) fig = plt.figure() fig.canvas.set_window_title('phasecurve') fig.subplots_adjust(hspace=0.05) for k,P in enumerate(Plist): # Plot phasecurve for this period: ax = plt.subplot(Np, 1, k+1) ax.plot(phase[k]/P, x, 'k.', markersize=2) # No need to sort if we only plot points ax.plot(phase[k,indx[k]]/P, phase_smooth_t[k,indx[k]], 'r-') ax.axvline(phase[k,imax[k]]/P, color='b', linestyle='--') # Line indicating the (likely) planet transit ax.set_xlim(0, 1) ax.set_ylim(-6*s, 6*s) ax.text(0.02, 0.97, 'P = %f d'%(P), horizontalalignment='left', verticalalignment='top', transform=ax.transAxes, backgroundcolor='w', color='k') if k!=Np-1: plt.setp(ax.get_xticklabels(), visible=False) ax.set_xlabel('Phase') fig.text(0.03, 0.5, u'Flux (counts/s)', ha='center', va='center', rotation='vertical', transform=fig.transFigure) if _output_format != 'native': fig.savefig(os.path.join(_output_folder, _output_prefix+'phasecurve.'+_output_format), format=_output_format, bbox_inches='tight') plt.close(fig) # Return the total time-sorted phase curve: return phase_tot
def extract_star_movement_1d(time, flux, position, dt=None, rapid_movement_sigma_clip=5.0, pixel_off_clip=15.0): """Extract information about star movement on CCD to be used later. Args: time: Vector of timestamps. flux: Vector of flux values. position: Nx2 matrix of (x,y) positions of star on CCD. dt: Mean distance between points in the time vector. Returns: flags: Flags indicating bad data points. star_movement: Object containing info to be passed to @filter_position_1d. """ # Logger for printing messages: logger = logging.getLogger(__name__) # If dictionary is given, split it into the components: if isinstance(position, dict): position_breaks = np.atleast_1d(position['break']) position = np.atleast_2d(position['pixels']) else: position_breaks = np.array([], dtype='float64') # Check input: Ng = len(time) assert len(flux)==Ng, "TIME and FLUX should have the same number of elements." assert position.shape==(Ng,2), "TIME and POSITION should have the same number of elements." if dt is None: dt = median(diff(time)) # Since many of the routines used here can not handle NaN values, # we start by removing all NaN values from the input, but store their # location so they can be inserted again later on: indx_finite = np.all(isfinite(position), axis=1) if not any(indx_finite): raise Exception("No valid positions") # Check that all the chunks defined by the breaks actually contain data: position_breaks = np.sort(position_breaks) # Make sure it is sorted in time Estart = append(append(time[0], position_breaks), time[-1]+dt/2) tbreaks = np.array(time[0], dtype='float64') for chk in range(1, len(Estart)): chunk = (time >= Estart[chk-1]) & (time < Estart[chk]) if any(indx_finite & chunk): tbreaks = append(tbreaks, Estart[chk]) Nchunks = len(np.atleast_1d(tbreaks)) if Nchunks < 2: tbreaks = np.array([time[0], time[-1]+dt/2]) Nchunks = 2 # Plot the position of the star as a function of time: fig = plt.figure() fig.canvas.set_window_title('Pixel positions vs time') fig.subplots_adjust(hspace=0.05) ax1 = fig.add_subplot(211) #ax1.scatter(time, position[:,0], color='b', s=1) plt.yticks(fontsize=10) ax2 = fig.add_subplot(212, sharex=ax1) #ax2.scatter(time, position[:,1], color='r', s=1) for tbreak in tbreaks: ax1.axvline(tbreak, color='k', linestyle='--') ax2.axvline(tbreak, color='k', linestyle='--') ax1.set_ylabel('Row (pixels)', fontsize=10) ax2.set_ylabel('Column (pixels)', fontsize=10) ax2.set_xlabel('Time (days)', fontsize=10) plt.xticks(fontsize=10) plt.yticks(fontsize=10) plt.setp(ax1.get_xticklabels(), visible=False) ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) ax2.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) # Initiate the Principle Component Analysis: pca = PCA(n_components=2) # Initiate the NearestNeighbors algorithm: NN = NearestNeighbors(n_neighbors=2, algorithm='kd_tree') # Prepare the plot which will be filled in the loop below: fig1 = plt.figure(figsize=(1.7*8,6)) fig1.canvas.set_window_title('Pixel positions') fig1ax1 = fig1.add_subplot(121) #fig1ax1.scatter(position[:,0], position[:,1], color='k', s=1) fig1ax1.set_xlabel('$x$ (pixels)', fontsize=10) fig1ax1.set_ylabel('$y$ (pixels)', fontsize=10) plt.xticks(fontsize=10) plt.yticks(fontsize=10) fig1ax1.axis('equal') fig1ax2 = fig1.add_subplot(122) fig1ax2.set_xlabel(r'$x^\prime$ (pixels)', fontsize=10) fig1ax2.set_ylabel(r'$y^\prime$ (pixels)', fontsize=10) plt.xticks(fontsize=10) plt.yticks(fontsize=10) fig1ax2.axis('equal') fig2 = plt.figure() 
fig2.canvas.set_window_title('Position changes') fig2.subplots_adjust(hspace=0.05) fig2ax1 = fig2.add_subplot(211) fig2ax1.set_xlim(time[0], time[-1]) fig2ax1.set_ylabel('$ds/dt$', fontsize=10) fig2ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) plt.yticks(fontsize=10) plt.setp(fig2ax1.get_xticklabels(), visible=False) fig2ax2 = fig2.add_subplot(212) fig2ax2.scatter(time, flux, color='k', s=1, alpha=0.5) fig2ax2.set_xlim(time[0], time[-1]) fig2ax2.set_ylabel('Flux', fontsize=10) fig2ax2.set_xlabel('Time (days)', fontsize=10) fig2ax2.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) plt.xticks(fontsize=10) plt.yticks(fontsize=10) fig3 = plt.figure() fig3.canvas.set_window_title('Position sigma clipping') fig3ax1 = fig3.add_subplot(211) fig3ax1.set_xlabel('Time (days)', fontsize=10) fig3ax1.set_ylabel('Nearest neighbor distance (pixels)', fontsize=10) plt.xticks(fontsize=10) plt.yticks(fontsize=10) fig3ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False)) fig3ax1.set_xlim(time[0], time[-1]) fig3ax2 = fig3.add_subplot(212) fig3ax2.set_xlabel('$x$ (pixels)', fontsize=10) fig3ax2.set_ylabel('$y$ (pixels)', fontsize=10) plt.xticks(fontsize=10) plt.yticks(fontsize=10) fig3ax2.axis('equal') fig4 = plt.figure() fig4.canvas.set_window_title('Position PCA axis') flag_bad_pos = zeros(Ng, dtype='bool') curvelength = [] all_indx_possort = [] all_indx_timesort = [] all_chunks = [] for chk in range(Nchunks-1): # Cut out positions that are in this chunk and are valid: chunk = (time >= tbreaks[chk]) & (time < tbreaks[chk+1]) Nc = int(sum(chunk)) indx_chunk_finite = indx_finite & chunk # indicies that are in chunk and finite pos = position[indx_chunk_finite, :] t = time[indx_chunk_finite] # Detect points where position changes rapidly (position tweek): # Calculate 2D distance between points and the derivative with respect to time: ds = nansum(diff(pos, axis=0)**2, axis=1) dsdt = np.sqrt(ds) / diff(t) dsdt = append(dsdt[0], dsdt) m = median(dsdt) absdsdt = np.abs(dsdt - m) rapid_threshold = rapid_movement_sigma_clip * mad_to_sigma * median(absdsdt) # Find the points where the position is changing rapidly: indx_rapid = absdsdt > rapid_threshold # Find points where x or y pixel position is far from median x_pos = pos[:,0] x_pos_med = np.nanmedian(x_pos) y_pos = pos[:,1] y_pos_med = np.nanmedian(y_pos) indx_pos_off_x = (np.abs(x_pos-x_pos_med)>pixel_off_clip) indx_pos_off_y = (np.abs(y_pos-y_pos_med)>pixel_off_clip) indx_pos_off = indx_pos_off_x + indx_pos_off_y indx_bad = indx_pos_off + indx_rapid flag_bad_pos[indx_chunk_finite] = indx_bad # Add to plot: ax1.scatter(t[~indx_bad], x_pos[~indx_bad], color='b', s=1) ax2.scatter(t[~indx_bad], y_pos[~indx_bad], color='r', s=1) fig2ax1.scatter(t[~indx_bad], dsdt[~indx_bad], color='k', s=1, alpha=0.5) fig2ax1.plot([t[0], t[-1]], [m+rapid_threshold, m+rapid_threshold], 'r--') fig2ax1.plot([t[0], t[-1]], [m-rapid_threshold, m-rapid_threshold], 'r--') # Remove points from the position vector used in the following: #pos_nn_pca = pos[~indx_pos_off, :] #t_nn_pca = t[~indx_pos_off] #print(len(pos_nn_pca), len(t_nn_pca)) # Use nearest neighbour search to find distances between positions that are larger than the norm: distances, indices = NN.fit(pos).kneighbors(pos) #distances, indices = NN.fit(pos_nn_pca).kneighbors(pos_nn_pca) ndist = distances[:, 1] ndist -= median(ndist) distance_threshold = 4*mad_to_sigma*median(np.abs(ndist)) indx_good = ndist < distance_threshold 
indx_good[indx_pos_off] = False fig3ax1.scatter(t[indx_good], ndist[indx_good], color='k', s=1, alpha=0.5) fig3ax1.scatter(t[~indx_good], ndist[~indx_good], color='r', s=2) fig3ax1.plot([t[0], t[-1]], [distance_threshold, distance_threshold], 'r--') fig3ax2.scatter(pos[indx_good,0], pos[indx_good,1], color='k', s=1, alpha=0.5) # Use Principal component Analysis to rotate positions pca.fit(pos[indx_good, :]) # Apply the PCA model to the valid positions: pos2 = pca.transform(pos) # Create smooth curve along the movement: # TODO: This is sorting one time more than should be nessacery. cl_per05, cl_per95 = np.nanpercentile(pos2[:,0], [5, 95]) lowess_frac = 0.1/(cl_per95 - cl_per05) logger.debug("LOWESS Fraction=%f, (%f, %f)", lowess_frac, cl_per05, cl_per95) poscurve1 = lowess(pos2[:,1], pos2[:,0], frac=lowess_frac, it=3, is_sorted=False, return_sorted=False, missing='drop') poscurve1 = np.column_stack((pos2[:,0], poscurve1)) chi2_1 = nansum((pos2[:,1]-poscurve1[:,1])**2) cl_per05, cl_per95 = np.nanpercentile(pos2[:,1], [5, 95]) lowess_frac = 0.1/(cl_per95 - cl_per05) logger.debug("LOWESS Fraction=%f, (%f, %f)", lowess_frac, cl_per05, cl_per95) poscurve2 = lowess(pos2[:,0], pos2[:,1], frac=lowess_frac, it=3, is_sorted=False, return_sorted=False, missing='drop') poscurve2 = np.column_stack((pos2[:,1], poscurve2)) chi2_2 = nansum((pos2[:,0]-poscurve2[:,1])**2) # Plot the fig4ax1 = fig4.add_subplot(Nchunks-1, 2, 2*chk+1) fig4ax1.scatter(pos2[indx_good,0], pos2[indx_good,1], color='k', s=1, alpha=0.3) fig4ax1.scatter(poscurve1[indx_good,0], poscurve1[indx_good,1], color='r', s=2) fig4ax1.set_title(r"$\chi^2 = %f$" % chi2_1, fontsize=12) plt.xticks(fontsize=10) plt.yticks(fontsize=10) fig4ax1.axis('equal') fig4ax2 = fig4.add_subplot(Nchunks-1, 2, 2*chk+2) fig4ax2.scatter(pos2[indx_good,1], pos2[indx_good,0], color='k', s=1, alpha=0.3) fig4ax2.scatter(poscurve2[indx_good,0], poscurve2[indx_good,1], color='g', s=2) fig4ax2.set_title(r"$\chi^2 = %f$" % chi2_2, fontsize=12) plt.xticks(fontsize=10) plt.yticks(fontsize=10) fig4ax2.axis('equal') pos3 = empty(Nc, dtype='float64'); pos3.fill(NaN) if chi2_1 <= chi2_2: # Sort everything along the principle axis: pos3[indx_finite[chunk]] = pos2[:,0] poscurve = poscurve1[argsort(pos2[:,0]), :] indx_good2 = indx_good[argsort(pos2[:,0])] else: # Sort everything along the principle axis: pos3[indx_finite[chunk]] = pos2[:,1] poscurve = poscurve2[argsort(pos2[:,1]), :] indx_good2 = indx_good[argsort(pos2[:,1])] #print('chis', chi2_1, chi2_2) #print('Fin poscurve', poscurve[0:10,:]) # Create version of position curve in original pixel-space: poscurve_pixels = pca.inverse_transform(poscurve) # Make vectors that will sort this chunk accouding to position and time: indx_possort = argsort(pos3) indx_timesort = argsort(indx_possort) # Calculate length along curve for each timestamp: dx = diff(poscurve[:,0]) dy = diff(poscurve[:,1]) ds = np.sqrt(dx**2 + dy**2) # Length of each segment cl = empty(Nc, dtype='float64'); cl.fill(NaN) cl[indx_finite[chunk][indx_possort]] = append(0, np.cumsum(ds)) # length of curve at each knot # Gather vectors that will sort this chunk acording to position and time: all_indx_possort.append(indx_possort) all_indx_timesort.append(indx_timesort) all_chunks.append(chunk) curvelength.append(cl) # Add to plots: fig1ax1.scatter(x_pos[indx_good], y_pos[indx_good], color='k', s=1, alpha=0.5) fig1ax1.plot(poscurve_pixels[indx_good2,0], poscurve_pixels[indx_good2,1], 'r-') fig1ax2.scatter(pos2[indx_good,0]+pca.mean_[0], pos2[indx_good,1]+pca.mean_[1], 
color='k', s=1) fig1ax2.plot(poscurve[indx_good2,0]+pca.mean_[0], poscurve[indx_good2,1]+pca.mean_[1], 'r-') # Add the bad points to figure 2 lower panel: fig2ax2.scatter(time[flag_bad_pos], flux[flag_bad_pos], color='r', s=2) if _output_format != 'native': fig.savefig(os.path.join(_output_folder, _output_prefix+'pixel_time.'+_output_format), format=_output_format, bbox_inches='tight') plt.close(fig) fig1.savefig(os.path.join(_output_folder, _output_prefix+'pixel_positions.'+_output_format), format=_output_format, bbox_inches='tight') plt.close(fig1) fig2.savefig(os.path.join(_output_folder, _output_prefix+'pixel_positions_changes.'+_output_format), format=_output_format, bbox_inches='tight') plt.close(fig2) fig3.savefig(os.path.join(_output_folder, _output_prefix+'pixel_positions_sigma_clipping.'+_output_format), format=_output_format, bbox_inches='tight') plt.close(fig3) fig4.savefig(os.path.join(_output_folder, _output_prefix+'pixel_positions_pca.'+_output_format), format=_output_format, bbox_inches='tight') plt.close(fig4) # Gather the output needed by other functions into dictionary # that will be passed around between functions: star_movement = { 'curvelength': curvelength, 'indx_possort': all_indx_possort, 'indx_timesort': all_indx_timesort, 'tbreaks': tbreaks, 'Nchunks': Nchunks, 'chunks': all_chunks } return flag_bad_pos, star_movement
def getFeaturesFromVectors_pair(vectorData, seg1, seg2, segData):
    X = myX()
    segDilated = segData.segDilated
    segBoundSize = segData.segBoundSize
    segSizes = segData.segSizes
    EdgeMapList = vectorData.EdgeMapList
    vectors = vectorData.segVectors
    clustersL2 = vectorData.segClustersL2
    ratios = vectorData.ratios

    regFeatures = []
    boundLine = np.logical_and(segDilated[seg1], segDilated[seg2])
    for e in EdgeMapList:
        bound = e[boundLine]
        bound = bound[bound > 0]
        regFeatures.append(np.mean(bound))  # Feature: mean boundary distance
        boundOverlap1 = bound.size / segBoundSize[seg1]
        boundOverlap2 = bound.size / segBoundSize[seg2]
        regFeatures.append(max(boundOverlap1, boundOverlap2))  # Feature: max overlap with boundary
        regFeatures.append(bound.size)  # Feature: boundary size

    size1 = segSizes[seg1]
    size2 = segSizes[seg2]
    regFeatures.append(size1 + size2)  # Feature: new segment area
    X.regFeatures = regFeatures

    for ratioIdx, ratio in enumerate(ratios):
        cnnFeatures = []
        for numLayer in range(0, len(vectors[seg1][ratioIdx])):
            layerFeatures = []
            pair_dist = scipy.spatial.distance.cdist(
                vectors[seg1][ratioIdx][numLayer],
                vectors[seg2][ratioIdx][numLayer])
            layerFeatures.append(np.min(pair_dist))     # Feature: L2 min dist in vec rep.
            layerFeatures.append(np.max(pair_dist))     # Feature: L2 max dist in vec rep.
            layerFeatures.append(np.mean(pair_dist))    # Feature: L2 average dist in vec rep.
            layerFeatures.append(bn.median(pair_dist))  # Feature: L2 median dist in vec rep.
            layerFeatures.append(np.sqrt(np.sum(
                (clustersL2[seg1][ratioIdx][numLayer]
                 - clustersL2[seg2][ratioIdx][numLayer])**2)))  # Feature: L2 dist between L2 clusters

            pair_dist = scipy.spatial.distance.cdist(
                vectors[seg1][ratioIdx][numLayer],
                vectors[seg2][ratioIdx][numLayer],
                metric='cosine')
            layerFeatures.append(np.min(pair_dist))     # Feature: cosine min dist in vec rep.
            layerFeatures.append(np.max(pair_dist))     # Feature: cosine max dist in vec rep.
            layerFeatures.append(np.mean(pair_dist))    # Feature: cosine average dist in vec rep.
            layerFeatures.append(bn.median(pair_dist))  # Feature: cosine median dist in vec rep.
            layerFeatures.append(cosine_distances(
                np.array([clustersL2[seg1][ratioIdx][numLayer]]),
                np.array([clustersL2[seg2][ratioIdx][numLayer]]))[0][0])  # Feature: cosine dist between L2 clusters

            cnnFeatures.append(layerFeatures)
        X.cnnFeatures[ratio] = cnnFeatures

    ImageFeatures = []
    ImageFeatures.append(np.sqrt((vectorData.segL[seg1] - vectorData.segL[seg2])**2))  # Feature: L channel dist
    ImageFeatures.append(np.sqrt((vectorData.segA[seg1] - vectorData.segA[seg2])**2))  # Feature: A channel dist
    ImageFeatures.append(np.sqrt((vectorData.segB[seg1] - vectorData.segB[seg2])**2))  # Feature: B channel dist
    X.ImageFeatures = ImageFeatures

    return X

def mad(datin, z=7, deriv=0, nozero=False): """ Median absolute deviation test, either on raw values, 1st or 2nd derivatives. Returns mask with false everywhere except where <(median-MAD*z/0.6745) or >(md+MAD*z/0.6745). Definition ---------- def mad(datin, z=7, deriv=0, nozero=False): Input ----- datin array; mad acts on axis=0 Optional Input -------------- z Input is allowed to deviate maximum z standard deviations from the median (default: 7) deriv 0: Act on raw input; 1: Use first derivatives; 2: Use 2nd derivatives nozero If True: exclude 0. from input Output ------ mask with false everywhere except where input deviates more than z standard deviations from median Restrictions ------------ If input is an array then it mad checks along the zeroth axis for outlier. 1st derivative is d = datin[1:n]-datin[0:n-1] because mean of left and right would give 0 for spikes. If all(d.mask==True) then return d.mask, which is all True Examples -------- >>> import numpy as np >>> y = np.array([-0.25,0.68,0.94,1.15,2.26,2.35,2.37,2.40,2.47,2.54,2.62, ... 2.64,2.90,2.92,2.92,2.93,3.21,3.26,3.30,3.59,3.68,4.30, ... 4.64,5.34,5.42,8.01],dtype=np.float) # Normal MAD >>> print(mad(y)) [False False False False False False False False False False False False False False False False False False False False False False False False False False] >>> print(mad(y,z=4)) [False False False False False False False False False False False False False False False False False False False False False False False False False True] >>> print(mad(y,z=3)) [ True False False False False False False False False False False False False False False False False False False False False False False False True True] # MAD on 2nd derivatives >>> print(mad(y,z=4,deriv=2)) [False False False False False False False False False False False False False False False False False False False False False False False True] # direct usage >>> my = np.ma.array(y, mask=mad(y,z=4)) >>> print(my) [-0.25 0.68 0.94 1.15 2.26 2.35 2.37 2.4 2.47 2.54 2.62 2.64 2.9 2.92 2.92 2.93 3.21 3.26 3.3 3.59 3.68 4.3 4.64 5.34 5.42 --] # MAD on several dimensions >>> yy = np.transpose(np.array([y,y])) >>> print(np.transpose(mad(yy,z=4))) [[False False False False False False False False False False False False False False False False False False False False False False False False False True] [False False False False False False False False False False False False False False False False False False False False False False False False False True]] >>> yyy = np.transpose(np.array([y,y,y])) >>> print(np.transpose(mad(yyy,z=3))) [[ True False False False False False False False False False False False False False False False False False False False False False False False True True] [ True False False False False False False False False False False False False False False False False False False False False False False False True True] [ True False False False False False False False False False False False False False False False False False False False False False False False True True]] # Masked arrays >>> my = np.ma.array(y, mask=np.zeros(y.shape)) >>> my.mask[-1] = True >>> print(mad(my,z=4)) [True False False False False False False False False False False False False False False False False False False False False False False False False --] >>> print(mad(my,z=3)) [True False False False False False False False False False False False False False False False False False False False False False False True True --] # Arrays with NaNs >>> ny = y.copy() >>> ny[-1] 
= np.nan >>> print(mad(ny,z=4)) [ True False False False False False False False False False False False False False False False False False False False False False False False False False] >>> print(mad(ny,z=3)) [ True False False False False False False False False False False False False False False False False False False False False False False True True False] # Exclude zeros >>> zy = y.copy() >>> zy[1] = 0. >>> print(mad(zy,z=3)) [ True True False False False False False False False False False False False False False False False False False False False False False False True True] >>> print(mad(zy,z=3,nozero=True)) [ True False False False False False False False False False False False False False False False False False False False False False False False True True] License ------- This file is part of the JAMS Python package, distributed under the MIT License. The JAMS Python package originates from the former UFZ Python library, Department of Computational Hydrosystems, Helmholtz Centre for Environmental Research - UFZ, Leipzig, Germany. Copyright (c) 2011-2013 Matthias Cuntz - mc (at) macu (dot) de Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
History ------- Written, MC, Nov 2011 Modified, MC, May 2012 - act on axis=0 of array MC, Jun 2012 - axis=0 did not always work: spread md and MAD to input dimensions MC, Jun 2012 - use np.diff, remove spreads MC, Feb 2013 - ported to Python 3 MC & JM, Jul 2013 - loop over second dimension for medians, faster than array calculations :-( but use bottleneck for speed :-) MC, Jul 2013 - (re-)allow masked arrays and NaNs in arrays MC, Oct 2013 - nozero, bug in NaN treatment with dim=1 """ if nozero: idatin = datin.copy() ii = np.where(idatin == 0.)[0] if ii.size > 0: idatin[ii] = np.nan else: idatin = datin sn = list(np.shape(idatin)) n = sn[0] if deriv == 0: m = n d = idatin elif deriv == 1: m = n-1 sm = sn sm[0] = m d = np.diff(idatin, axis=0) elif deriv == 2: m = n-2 sm = sn sm[0] = m d = np.diff(idatin, n=2, axis=0) else: raise ValueError('Unimplemented option.') # Shortcut if all masked ismasked = type(d) == np.ma.core.MaskedArray if not ismasked: ii = np.where(~np.isfinite(d))[0] d = np.ma.array(d) if ii.size > 0: d[ii] = np.ma.masked if np.all(d.mask == True): if ismasked: return d.mask else: return np.ones(d.shape, dtype=np.bool) # Median oldsettings = np.geterr() np.seterr(invalid='ignore') if d.ndim == 1: try: import bottleneck as bn dd = d.compressed() md = bn.median(dd) # Median absolute deviation MAD = bn.median(np.abs(dd-md)) # Range around median thresh = MAD * (z/0.6745) # True where outside z-range res = (d<(md-thresh)) | (d>(md+thresh)) except: dd = d.compressed() md = np.median(dd) # Median absolute deviation MAD = np.median(np.abs(dd-md)) # Range around median thresh = MAD * (z/0.6745) # True where outside z-range res = (d<(md-thresh)) | (d>(md+thresh)) elif d.ndim == 2: try: import bottleneck as bn res = np.empty(d.shape, dtype=np.bool) for i in range(d.shape[1]): di = d[:,i] dd = di.compressed() md = bn.median(dd) # Median absolute deviation MAD = bn.median(np.abs(dd-md)) # Range around median thresh = MAD * (z/0.6745) # True where outside z-range res[:,i] = (d[:,i]<(md-thresh)) | (d[:,i]>(md+thresh)) except: res = np.empty(d.shape, dtype=np.bool) for i in range(d.shape[1]): di = d[:,i] dd = di.compressed() md = np.median(dd) # Median absolute deviation MAD = np.median(np.abs(dd-md)) # Range around median thresh = MAD * (z/0.6745) # True where outside z-range res[:,i] = (d[:,i]<(md-thresh)) | (d[:,i]>(md+thresh)) else: np.seterr(**oldsettings) raise ValueError('datin.ndim must be <= 2') np.seterr(**oldsettings) if ismasked: return res else: resmasked = type(res) == np.ma.core.MaskedArray if resmasked: # got masked because of NaNs return np.where(res.mask, False, res) else: return res
def main(Data,GC_hist,args): #compute the scaled coverage print("finished reading the coverage data") bin_size=Data["bin_size"] args.min_bins=int(args.nbins/2) if not args.min_bins: print("Error: the minimum variant size is smaller than the bin size of the input data!") quit() for chromosome in Data["chromosomes"]: Data[chromosome]["ratio"]=[] for i in range(0,len(Data[chromosome]["coverage"])): if not Data[chromosome]["GC"][i] in GC_hist: Data[chromosome]["ratio"].append(-1) elif GC_hist[Data[chromosome]["GC"][i]][0] > 0 and not Data[chromosome]["GC"][i]== -1: if Data[chromosome]["coverage"][i]/GC_hist[Data[chromosome]["GC"][i]][0] < args.max: Data[chromosome]["ratio"].append(Data[chromosome]["coverage"][i]/GC_hist[Data[chromosome]["GC"][i]][0]) else: Data[chromosome]["ratio"].append(-1) else: Data[chromosome]["ratio"].append(-1) Data[chromosome]["ratio"]=numpy.array(Data[chromosome]["ratio"]) Data=calibrate_sex(Data) #filter the bins print("applying filters") Data=filter(Data,args.nbins*2) print("computing coverage histogram") ratio_hist=chromosome_hist(Data,args.Q) hist=coverage_hist(Data,ratio_hist) percentiles=numpy.percentile(hist,numpy.array(range(0,1001))/10.0) overall_sd=numpy.std(hist[ numpy.where(hist <= 2) ]) print("derivative based segmentation") for chromosome in Data["chromosomes"]: Data[chromosome]["var"]=numpy.repeat( "NEUTRAL",len(Data[chromosome]["ratio"]) ); Data[chromosome]["ratio"]=numpy.array(Data[chromosome]["ratio"]) ratio_indexes=[] ratios=[] for i in range(1,len(Data[chromosome]["ratio"])): if Data[chromosome]["ratio"][i] >= 0: ratio_indexes.append(i) ratios.append(Data[chromosome]["ratio"][i]) differences=[] for i in range(1,args.nbins+1): tmp=[] for j in range(0,len(ratios)-args.nbins): tmp.append( abs(ratios[j]-ratios[i+j])) differences.append(tmp) differences=numpy.array(differences) change_points=[] #print len(ratios) lim=2*overall_sd #lim=0.2 for i in range(0,len(ratios)-args.nbins): changes=differences[:,i] #print "{} {}".format(lim,numpy.min(changes)) if bottleneck.median(changes,axis=0) > lim and numpy.std(changes[1:]) < overall_sd: #print "{} {}".format(lim,numpy.min(changes)) change_points.append(ratio_indexes[i]) segments=[] change_points.append( len( Data[chromosome]["ratio"] ) ) for i in range(0,len(change_points)): if i == 0: segments.append(range(0,change_points[i])) elif i != len(change_points)-1: segments.append(range(change_points[i-1],change_points[i])) else: segments.append(range(change_points[i-1],len(Data[chromosome]["ratio"]))) for segment in segments: segment_intensities= Data[chromosome]["ratio"][segment] non_filt_bins=segment_intensities[numpy.where(segment_intensities >= 0)] TYPE="NEUTRAL" med=bottleneck.median(non_filt_bins,axis=0) if len(non_filt_bins) < args.min_bins: TYPE="FILT" elif med <= 1-0.5/args.plody: TYPE="DEL" elif med >= 1+0.5/args.plody: TYPE="DUP" Data[chromosome]["var"][segment]=TYPE print("raw coverage segmentation") for chromosome in Data["chromosomes"]: for i in range(0,len(Data[chromosome]["ratio"])-10*args.nbins): seg_bins=Data[chromosome]["ratio"][i:i+10*args.nbins] if list(seg_bins).count(-1)/float(len(seg_bins)) >= 0.6: continue seg_bin_median=bottleneck.median(seg_bins[numpy.where(seg_bins >= 0)],axis=0) if seg_bin_median >= 1+overall_sd*2.5 and len(seg_bins[numpy.where(seg_bins >= 0.5/args.plody+1)])/float(len(seg_bins)) >= 0.9: Data[chromosome]["var"][i:i+10*args.nbins]="DUP" elif seg_bin_median <= 1-overall_sd*2.5 and len(seg_bins[numpy.where(seg_bins >= 1-0.5/args.plody)])/float(len(seg_bins)) >= 0.9:
Data[chromosome]["var"][i:i+10*args.nbins]="DEL" print("merging") variants=segmentation(Data,args.min_bins) size_filtered_variants={} for chromosome in variants: for variant in variants[chromosome]: if variant["bins"] >= args.min_bins: if not chromosome in size_filtered_variants: size_filtered_variants[chromosome] = [] size_filtered_variants[chromosome].append(variant) variants=merge(size_filtered_variants,args.min_bins) CNV_filtered={} for chromosome in variants: for variant in variants[chromosome]: if variant["type"] == "DUP" or variant["type"] == "DEL": if not chromosome in CNV_filtered: CNV_filtered[chromosome] = [] CNV_filtered[chromosome].append(variant) #read the bam header args.contigs={} args.contig_order=[] if args.bam: with os.popen("samtools view -H {}".format(args.bam)) as pipe: for line in pipe: if line[0] == "@": if "SN:" in line: content=line.strip().split() chromosome=content[1].split("SN:")[-1] length=content[2].split("LN:")[-1] args.contigs[chromosome]=length args.contig_order.append(chromosome) elif "\tSM:" in line and not args.sample: args.sample=line.split("\tSM:")[-1].split("\t")[0].strip() #print the variants print("computing statistics") vals=[] counts={} n_variants=0 for chromosome in Data["chromosomes"]: if chromosome in variants: for variant in variants[chromosome]: if variant["type"] == "DUP" or variant["type"] == "DEL" or 1 == 2: phred_non_param=retrieve_phred_non_param(variant["bins"],variant["ratio"],Data,ratio_hist) if not phred_non_param in counts: vals.append(phred_non_param) counts[phred_non_param]=0 counts[phred_non_param]+=1 variant["pred_non_param"]=phred_non_param n_variants+=1 if n_variants: args.scoren+=round(10*math.log10(n_variants/1000.0)) else: args.scoren=1 f=open(args.output,"w") f.write("##fileformat=VCFv4.1\n") f.write("##source=AMYCNE\n") f.write("##ALT=<ID=DEL,Description=\"Deletion>\n") f.write("##ALT=<ID=DUP,Description=\"Duplication\">\n") f.write("##INFO=<ID=RDR,Number=1,Type=Float,Description=\"Average coverage/reference ratio\">\n") f.write("##INFO=<ID=END,Number=1,Type=Integer,Description=\"The end position of the variant\">\n") f.write("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"The length of the variant\">\n") f.write("##INFO=<ID=BINS,Number=1,Type=Integer,Description=\"The number of bins used to call the variant\">\n") f.write("##INFO=<ID=SCOREF,Number=1,Type=Integer,Description=\"The variant score produced from Fishers method\">\n") f.write("##INFO=<ID=SCOREN,Number=1,Type=Integer,Description=\"The variant score produced from non-parametric sampling method\">\n") f.write("##INFO=<ID=QUAL,Number=1,Type=Float,Description=\"The fraction of low quality bins\">\n") f.write("##INFO=<ID=FAILED_BINS,Number=1,Type=Float,Description=\"The fraction of filtered bins\">\n") f.write("##INFO=<ID=ratio,Number=1,Type=Float,Description=\"Normalised coverage across the chromosome\">\n") f.write("##INFO=<ID=ratioMAD,Number=1,Type=Float,Description=\"normalised Median absolute deviation across the chromosome\">\n") f.write("##INFO=<ID=coverage,Number=1,Type=Float,Description=\"Median coverage of the chromosome\">\n") f.write("##INFO=<ID=coverageMAD,Number=1,Type=Float,Description=\"Median absolute deviation of the coverage across the chromosome\">\n") if args.contig_order: for contig in args.contig_order: f.write("##contig=<ID={},length={}>\n".format(contig,args.contigs[contig])) f.write("##FILTER=<ID=LowBinQual,Description=\"More than 90% of the bins have less than {} mapping quality\">\n".format(args.Q)) 
f.write("##FILTER=<ID=RegionFilter,Description=\"More than 90% of the bins are flagged extremed GC and/or mapping quality\">\n") f.write("##FILTER=<ID=RatioFilter,Description=\"The RD ratio is less than 2 sd of the RD, or RDR higher than ratiolim\">\n") f.write("##FILTER=<ID=LowScore,Description=\"Low variant score\">\n") f.write("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">\n") f.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n") f.write("##nbins={} RDstdev={} ScoreNLimit={}\n".format(args.nbins,overall_sd,args.scoren)) f.write("##AMYCNEcmd=\"{}\"\n".format(" ".join(sys.argv))) if not args.sample: args.sample=args.coverage.split("/")[-1].split(".")[0] f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n".format(args.sample)) format_column="GT:CN" id_tag=0; for chromosome in Data["chromosomes"]: if chromosome in variants: for variant in variants[chromosome]: if variant["type"] == "DUP" or variant["type"] == "DEL" or 1 == 2: id_tag +=1 filt="PASS" info_field="END={};SVLEN={};RDR={};BINS={}".format(bin_size*variant["end"],(variant["end"]-variant["start"]+1)*bin_size,variant["ratio"],variant["bins"] ) CN=int(round(variant["ratio"]*args.plody)) if "quality" in Data[chromosome]: failed_bins=0 for i in range(variant["start"],variant["end"]): if Data[chromosome]["quality"][i] < args.Q and Data[chromosome]["GC"][i] > 0 and Data[chromosome]["ratio"][i] > 0: failed_bins += 1 if failed_bins/float(variant["end"]-variant["start"]) > 0.9: filt="LowBinQual" info_field +=";QUAL={}".format( failed_bins/float(variant["end"]-variant["start"]) ) phred=retrieve_phred(variant["ratio_list"],variant["ratio"],percentiles) phred_non_param=variant["pred_non_param"] info_field+=";SCOREF={};SCOREN={}".format(phred,phred_non_param) #info_field+=";SCOREF={}".format(phred) if phred < args.scoref or phred_non_param < args.scoren: filt="LowScore" failed_bins=0 for i in range(variant["start"],variant["end"]): if Data[chromosome]["ratio"][i] < 0: failed_bins += 1 if failed_bins/float(variant["end"]-variant["start"]) > 0.9: filt="RegionFilter" if abs(variant["ratio"]-1) <= overall_sd*2 or abs(variant["ratio"]) > args.ratioLim: filt="RatioFilter" info_field +=";FAILED_BINS={}".format( failed_bins/float(variant["end"]-variant["start"]) ) mean=numpy.average(variant["ratio_list"]) SEM=numpy.std(variant["ratio_list"])/numpy.sqrt( len(variant["ratio_list"]) ) ci="({},{})".format(round(mean-SEM*3,2),round(mean+SEM*3,2)) firstrow = "{}\t{}\tAMYCNE_{}\tN\t<{}>\t{}\t{}".format(chromosome,bin_size*variant["start"],id_tag,variant["type"],phred_non_param,filt) info_field+=";ratio={};ratioMAD={};coverage={};coverageMAD={}".format(ratio_hist[chromosome][0],ratio_hist[chromosome][1],ratio_hist[chromosome][2],ratio_hist[chromosome][3]) alt=abs((CN-args.plody)) if alt > args.plody: alt=args.plody ref=args.plody-alt genotype="/".join(["0"]*ref+["1"]*alt) format_field="{}\t{}:{}".format(format_column,genotype,CN) f.write("\t".join([firstrow,info_field,format_field])+"\n") f.close()
def time_median(self, dtype, shape): bn.median(self.arr)
def medestnoise(x): return bn.median(np.abs(x)) * complex_std_est_factor
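A hedged sketch of where a factor like complex_std_est_factor (defined elsewhere in this module) could come from: assuming the samples are circular complex Gaussian noise, |x| is Rayleigh-distributed with median sigma*sqrt(2 ln 2), so the median magnitude can be rescaled into a standard-deviation estimate. Both the exact value and the convention (per-component sigma versus sqrt(E|x|^2)) are assumptions, not taken from the source.

import numpy as np

# median(|x|) = sigma * sqrt(2*ln 2) for Rayleigh-distributed |x|
per_component_factor = 1.0 / np.sqrt(2.0 * np.log(2.0))      # ~0.8493 -> per-component sigma
assumed_complex_std_est_factor = 1.0 / np.sqrt(np.log(2.0))  # ~1.2011 -> sqrt(E|x|^2) (assumed meaning)

rng = np.random.default_rng(0)
x = rng.normal(size=100_000) + 1j * rng.normal(size=100_000)  # per-component sigma = 1
print(np.median(np.abs(x)) * per_component_factor)            # ~1.0
print(np.median(np.abs(x)) * assumed_complex_std_est_factor)  # ~1.414 = sqrt(2)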
def freqextr(lightcurve, n_peaks=6, n_harmonics=0, hifac=1, ofac=4, snrlim=None, snr_width=None, faplim=1 - 0.9973, devlim=0.5, conseclim=10, harmonics_list=None, Noptimize=10, optim_max_diff=10, initps=None): r""" Extract frequencies from a timeseries. The program will perform iterative sine-wave fitting (CLEAN or pre-whitening) using a sum of harmonic functions of the following form: .. math:: \sum_{i=1}^{N_\mathrm{peaks}} A_i \sin(2\pi\nu_i t + \delta_i) = \sum_{i=1}^{N_\mathrm{peaks}} \alpha_i\sin(2\pi\nu_i t) + \beta_i\cos(2\pi\nu_i t) \, , where :math:`\nu_i`, :math:`A_i` and :math:`\delta_i` denote the frequency, amplitude and phase of the oscillation. If ``n_harmonics`` is greater than zero, the routine will additionally, for each extracted peak, extract peaks at the given number of harmonics. The default is :math:`2\nu_i`, :math:`3\nu_i` etc., but this can be controlled by the ``harmonics_list`` input. At each iteration, an optimization loop is entered which will go back and re-optimize the previously found peaks in an attempt to minimize the influence between close frequencies. The details of this optimization can be controlled by the parameters ``Noptimize`` and ``optim_max_diff``. Parameters: lightcurve (:class:`lightkurve.LightCurve`): Lightcurve to extract frequencies for. n_peaks (int, optional): Number of frequencies to extract. n_harmonics (int, optional): Number of harmonics to extract for each frequency. hifac (float, optional): Nyquist factor. ofac (int, optional): Oversampling factor used for initial search for peaks in power spectrum. snrlim (float, optional): Limit on local signal-to-noise ratio above which peaks are considered significant. If set to `None` no limit is enforced. Default is to not enforce a limit. snr_width (float, optional): Width in uHz around each peak to estimate signal-to-noise from. Default is 15 frequency steps on either side of the peak. faplim (float, optional): False alarm probability limit. Peaks with a f.a.p. below this limit are considered significant. If set to `None` no limit is enforced. Default is 1-0.9973=0.0027. devlim (float, optional): Limit on how much the power of an extracted peak may deviate from the power at the same frequency in the original power spectrum; peaks where this ratio falls outside the interval [devlim, 1/devlim] are rejected. If set to `None` no limit is enforced. Default is 0.5 (50%). conseclim (int, optional): Stop after this number of consecutive failed peaks. Default is 10. Noptimize (int, optional): Number of nearby previously extracted peaks to re-optimize at each iteration. If set to -1, all peaks will be optimized at each iteration. Default is 10. optim_max_diff (float, optional): Maximal difference in uHz between frequencies to be optimized. Any frequencies further away than this value from the extracted peak will not be optimized in that iteration. If set to ``None`` no limit is enforced. Default is 10 uHz. Please note that this does not take the spectral window function into account, so this value may have to be increased in cases where the window function has significant side-lobes. initps (:class:`powerspectrum`, optional): Initial powerspectrum. Should be a powerspectrum calculated from the provided lightcurve. This can be provided if the powerspectrum has already been calculated. If not provided, it is calculated from the provided lightcurve. Returns: :class:`astropy.table.Table`: Table of extracted oscillations. Note: If the height of the peak of one of the harmonics is close to being insignificant, the harmonic may not be found as a harmonic, but will be found later as a peak in itself. .. codeauthor:: Kristine Kousholt Mikkelsen <*****@*****.**> ..
codeauthor:: Rasmus Handberg <*****@*****.**> """ logger = logging.getLogger(__name__) # Default value for different parameters # TODO: Add these as inputs estimate_noise = True if initps is not None and not isinstance(initps, powerspectrum): raise ValueError("Initial powerspectrum is invalid") if Noptimize is None: Noptimize = 0 # If no list of harmonics is given, do the simple one: if harmonics_list is None: harmonics_list = np.arange(2, n_harmonics + 2) elif len(harmonics_list) < n_harmonics: raise ValueError("List of harmonics is too short") # Constants: power_median_to_mean = (1 - 1 / 9)**-3 mean_noise = 1 # Store original lightcurve and powerspectrum for later use: original_lightcurve = lightcurve.copy() if initps is None: original_ps = powerspectrum(original_lightcurve) else: original_ps = initps f_max = original_ps.nyquist * hifac * 1e6 df = original_ps.df * 1e6 # Defaults that depend on the power spectrum parameters: if snr_width is None: snr_width = 15 * df # Create lists for frequencies, alpha, beta and deviations: # Create as 2D array, which as main-frequency number for rows, and harmonic number for columns. nu = np.full((n_peaks, n_harmonics + 1), np.nan) alpha = np.full((n_peaks, n_harmonics + 1), np.nan) beta = np.full((n_peaks, n_harmonics + 1), np.nan) deviation = np.full((n_peaks, n_harmonics + 1), np.nan) # The first powerspectrum has already been calculated: ps = original_ps.copy() for i in range(n_peaks): logger.debug("-" * 72) # Calculate the powerspectrum and find the index of the largest power value if i > 0: ps = powerspectrum(lightcurve) frequency, power = ps.powerspectrum(oversampling=ofac, nyquist_factor=hifac, scale='power') # Estimate a frequency-dependent noise-floor by binning the power spectrum. if estimate_noise: # Create bins to estimate noise level in: #bins = np.logspace(np.floor(np.log10(df)), np.ceil(np.log10(f_max)), 20) bins = np.linspace(df, f_max, 20) # Calculate the median in the bins. 
# Make sure we have at least 20 frequencies in each bin, # otherwise combine adjacent bins until this is the case: for _ in range(100): mean_noise, bins, binindx = binned_statistic(frequency, power, bins=bins, statistic=median) redo = False for k, num in enumerate(np.bincount(binindx)): if num < 20: bins = np.delete(bins, k) redo = True break if not redo: break bins = bins[:-1] + 0.5 * (bins[1:] - bins[:-1]) indx = np.isfinite(mean_noise) if np.sum(indx) > 2: mean_noise_func = interp1d(bins[indx], mean_noise[indx], kind='linear', fill_value='extrapolate', assume_sorted=True) mean_noise = power_median_to_mean * mean_noise_func(frequency) mean_noise = np.clip(mean_noise, 0, None) mean_noise += 1 # Add one to avoid DivideByZero errors - only used for finding max else: mean_noise = 1 #plt.figure() #plt.plot(frequency, np.sqrt(power/mean_noise), 'k-', lw=0.5) #plt.plot(frequency, power, 'b') #plt.plot(frequency, mean_noise,'k-') #plt.title(i) #plt.show() # Finds the frequency of the largest peak: pmax_index = np.argmax(power / mean_noise) fsearch = frequency[pmax_index] if pmax_index > 0 and pmax_index < len(power) - 1: fsearch = [ frequency[pmax_index - 1], fsearch, frequency[pmax_index + 1] ] nu[i, 0] = ps.optimize_peak(fsearch) alpha[i, 0], beta[i, 0] = ps.alpha_beta(nu[i, 0]) logger.debug('Fundamental frequency: %f', nu[i, 0]) # Stop if significance becomes too low (lombscargle significance) if faplim is not None: FAP = ps.false_alarm_probability(nu[i, 0]) if FAP > faplim: logger.debug("Stopped from FAP") nu[i, 0] = np.nan alpha[i, 0] = np.nan beta[i, 0] = np.nan break # Stop if significance becomes too low (SNR ratio) if snrlim is not None: # Calculate SNR by estimating noise level locally around peak: # TODO: Subtract peak first? noise = np.sqrt( power_median_to_mean * median(power[(frequency > (nu[i, 0] - snr_width)) & (frequency < (nu[i, 0] + snr_width))])) amp = np.sqrt(alpha[i, 0]**2 + beta[i, 0]**2) snr = amp / noise logger.debug("SNR: %f", snr) #plt.figure() #plt.plot(frequency, np.sqrt(power), 'k-', lw=0.5) #plt.plot(frequency, np.sqrt(mean_noise), 'r-', lw=0.5) #plt.plot(frequency[pmax_index], np.sqrt(power[pmax_index]), 'go') #plt.plot(nu[i,0], ps.powerspectrum(nu[i,0]*1e-6, scale='amplitude')[1], 'ro') #plt.axhline(noise) #plt.axvline(nu[i,0] - snr_width) #plt.axvline(nu[i,0] + snr_width) if snr < snrlim: logger.debug("Stopped from SNR") nu[i, 0] = np.nan alpha[i, 0] = np.nan beta[i, 0] = np.nan break # Check how the extracted peak compares with the original powerspectrum if devlim is not None: atemp, btemp = original_ps.alpha_beta(nu[i, 0]) deviation[i, 0] = (alpha[i, 0]**2 + beta[i, 0]**2) / (atemp**2 + btemp**2) # Stops if there are to many consecutive failed peaks if devlim is not None and conseclim is not None: # Stop numpy from warning us that deviation contains NaN with np.errstate(invalid='ignore'): deviation_large = (deviation > 1 / devlim) | (deviation < devlim) if np.all( deviation_large[max(i - conseclim, 0):(i + 1), 0]): # Only checking main peaks right now! logger.debug( 'Stopped due to too many consecutive failed peaks') break # Removes the largest peak from the data: lightcurve -= model(lightcurve.time, alpha[i, 0], beta[i, 0], nu[i, 0]) # Loop through all harmonics: for h in range(1, n_harmonics + 1): n_harmonic = harmonics_list[h - 1] # Don't find harmonics outside frequency range: if n_harmonic * nu[i, 0] > f_max: break # Updates the flux and optimize to find the correct frequency ps = powerspectrum(lightcurve) # Checks the significance of the harmonics. 
If it is too low NaN is returned in amplitude, frequency and phase for the given harmonic nu[i, h] = ps.optimize_peak(n_harmonic * nu[i, 0]) # Stop if significance becomes too low (lombscargle significance) if faplim is not None: FAP = ps.false_alarm_probability(nu[i, h]) logger.debug('harmonic %d: %f %f', h, nu[i, h], FAP) if FAP > faplim: logger.debug("Harmonic rejected from FAP") nu[i, h] = np.nan alpha[i, h] = np.nan beta[i, h] = np.nan continue # Stop if significance becomes too low (SNR ratio): if snrlim is not None: # Calculate SNR by estimating noise level locally around peak: # TODO: Subtract peak first? noise = np.sqrt( power_median_to_mean * median(power[(frequency > (nu[i, 0] - snr_width)) & (frequency < (nu[i, 0] + snr_width))])) amp = np.sqrt(alpha[i, 0]**2 + beta[i, 0]**2) snr = amp / noise logger.debug("SNR: %f", snr) #plt.figure() #plt.plot(frequency, np.sqrt(power), 'k-', lw=0.5) #plt.plot(frequency, np.sqrt(mean_noise), 'r-', lw=0.5) #plt.plot(frequency[pmax_index], np.sqrt(power[pmax_index]), 'go') #plt.plot(nu[i,0], ps.powerspectrum(nu[i,0]*1e-6, scale='amplitude')[1], 'ro') #plt.axhline(noise) #plt.axvline(nu[i,0] - snr_width) #plt.axvline(nu[i,0] + snr_width) if snr < snrlim: logger.debug("Stopped from SNR") nu[i, 0] = np.nan alpha[i, 0] = np.nan beta[i, 0] = np.nan break # Removes the harmonic peak from the data: alpha[i, h], beta[i, h] = ps.alpha_beta(nu[i, h]) lightcurve -= model(lightcurve.time, alpha[i, h], beta[i, h], nu[i, h]) # Check how the extracted peak compares with the original powerspectrum and stops # if there are to many consecutive failed peaks if devlim is not None: atemp, btemp = original_ps.alpha_beta(nu[i, h]) deviation[i, h] = (alpha[i, h]**2 + beta[i, h]**2) / (atemp**2 + btemp**2) # Optimize the Noptimize nearest peaks if i != 0 and Noptimize != 0: for h in range(n_harmonics + 1): # Sort to find nearest frequencies to optimize Nopt = Noptimize + 1 if (i + 1) * (n_harmonics + 1) < Nopt or Noptimize == -1: Nopt = (i + 1) * (n_harmonics + 1) nusort = np.abs(nu - nu[i, h]) nusort = nusort.ravel() order = np.argsort( nusort) # sort nusort and find the list of indexes # Create an index of which peaks should be optimized: indx_optim = np.zeros_like(order, dtype='bool') indx_optim[1:Nopt] = True # Only optimize a peak if it is closer than the set limit. 
# NOTE: Be careful as this doesn't take the window function into account if optim_max_diff is not None: with np.errstate(invalid='ignore'): indx_optim &= (nusort[order] < optim_max_diff) # Pick out the peaks that should be optimized: order = order[indx_optim] order = list( zip(*np.unravel_index(order, (n_peaks, n_harmonics + 1)))) logger.debug("Optimizing %d peaks", len(order)) for j in order: if np.isfinite( alpha[j] ): # and deviation[j] < 1/devlim and deviation[j] > devlim: # Add the oscillation: lightcurve += model(lightcurve.time, alpha[j], beta[j], nu[j]) ps = powerspectrum(lightcurve) # Find the frequency of maximum power and find alpha and beta again nu[j] = ps.optimize_peak(nu[j]) alpha[j], beta[j] = ps.alpha_beta(nu[j]) # Recalculate the deviation if devlim is not None: atemp, btemp = original_ps.alpha_beta(nu[j]) deviation[j] = (alpha[j]**2 + beta[j]**2) / (atemp**2 + btemp**2) # Remove the oscillation again: lightcurve -= model(lightcurve.time, alpha[j], beta[j], nu[j]) # Remove anything that in the end was marked with a large deviation: if devlim is not None: for i in range(n_peaks): if deviation[i, 0] > 1 / devlim or deviation[i, 0] < devlim: # If main peak is rejected, then also reject all harmonics nu[i, :] = np.nan alpha[i, :] = np.nan beta[i, :] = np.nan else: for j in range(1, n_harmonics + 1): if deviation[i, j] > 1 / devlim or deviation[i, j] < devlim: nu[i, j] = np.nan alpha[i, j] = np.nan beta[i, j] = np.nan # Calculate amplitude and phase from alpha and beta: amp = np.sqrt(alpha**2 + beta**2) phase = np.arctan2(beta, alpha) # Make sure the found peaks are ordered by the amplitude of the main peak: amp[np.isnan(amp)] = -np.inf indx = np.argsort(amp[:, 0])[::-1] nu = nu[indx, :] amp = amp[indx, :] phase = phase[indx, :] alpha = alpha[indx, :] beta = beta[indx, :] deviation = deviation[indx, :] amp[~np.isfinite(amp)] = np.nan # Gather into table: num, harmonic = np.meshgrid(range(1, n_peaks + 1), range(n_harmonics + 1)) tab = Table(data=[ num.flatten(order='F'), harmonic.flatten(order='F'), nu.flatten(), amp.flatten(), phase.flatten(), alpha.flatten(), beta.flatten(), deviation.flatten() ], names=[ 'num', 'harmonic', 'frequency', 'amplitude', 'phase', 'alpha', 'beta', 'deviation' ], dtype=[ 'int32', 'int32', 'float64', 'float64', 'float64', 'float64', 'float64', 'float64' ]) # Add units to columns: tab['frequency'].unit = u.uHz tab['amplitude'].unit = lightcurve.flux_unit tab['phase'].unit = u.rad tab['alpha'].unit = lightcurve.flux_unit tab['beta'].unit = lightcurve.flux_unit # Add index to peak number and harmonic for easy lookup: # TODO: Use table indicies - Problem with Pickle #tab.add_index('num') # Add meta data to table on how the list was created: tab.meta['n_peaks'] = n_peaks tab.meta['n_harmonics'] = n_harmonics tab.meta['harmonics_list'] = harmonics_list tab.meta['hifac'] = hifac tab.meta['ofac'] = ofac tab.meta['snrlim'] = snrlim tab.meta['snr_width'] = snr_width * u.uHz tab.meta['faplim'] = faplim tab.meta['devlim'] = devlim tab.meta['conseclim'] = conseclim return tab
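A hypothetical usage sketch for freqextr: inject two sinusoids into white noise and extract them. The lightkurve.LightCurve constructor and the surrounding module's powerspectrum/model helpers are assumed to be available and compatible with the version this routine was written against; the injected frequencies and amplitudes are made up.

import numpy as np
from lightkurve import LightCurve

time = np.arange(0, 27.4, 2.0 / 1440)          # days, roughly 2-minute sampling
nu1, nu2 = 157.3, 289.1                        # injected frequencies in uHz
day = 86400.0                                  # seconds per day
flux = (1.0
        + 500e-6 * np.sin(2 * np.pi * nu1 * 1e-6 * time * day)
        + 200e-6 * np.sin(2 * np.pi * nu2 * 1e-6 * time * day)
        + np.random.normal(0, 100e-6, time.size))
lc = LightCurve(time=time, flux=flux)

tab = freqextr(lc, n_peaks=5, n_harmonics=1, snrlim=4)
print(tab['num', 'harmonic', 'frequency', 'amplitude'])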
def segmentation(Data,minimum_bin): variants={} for chromosome in Data["chromosomes"]: start_pos=-1; end_pos=-1; variant_type=None past_variant_type=-1 for i in range(0,len(Data[chromosome]["var"])): variant_type=Data[chromosome]["var"][i] if past_variant_type == -1: start_pos=i end_pos = i+1 past_variant_type=variant_type elif past_variant_type == variant_type: end_pos +=1 else: if not chromosome in variants: variants[chromosome] = [] ratio_list=Data[chromosome]["ratio"][start_pos:end_pos+1] ratio_list=ratio_list[numpy.where(ratio_list >= 0)] variants[chromosome].append({"start":start_pos,"end":end_pos,"type":past_variant_type,"ratio":bottleneck.median(ratio_list),"bins":end_pos-start_pos,"ratio_list":list(ratio_list)}) ratio_list=[] past_variant_type=variant_type start_pos=i end_pos=start_pos+1 return(variants)
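Illustrative only: the core of segmentation() is a run-length grouping of the per-bin "var" labels, with the median ratio of each run attached. The same grouping on toy data, using itertools.groupby (the real function also records the bin count and keeps the raw ratio list for each segment):

import itertools
import numpy as np
import bottleneck

labels = ["NEUTRAL"] * 4 + ["DUP"] * 3 + ["NEUTRAL"] * 2
ratio = np.array([1.0, 1.1, 0.9, 1.0, 1.6, 1.5, 1.6, 1.0, 0.95])

start = 0
for var_type, run in itertools.groupby(labels):
    n = len(list(run))
    seg = ratio[start:start + n]
    seg = seg[seg >= 0]                    # drop filtered (-1) bins, as above
    print(var_type, start, start + n, bottleneck.median(seg))
    start += n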
def time_median(self, dtype, shape, order, axis): bn.median(self.arr, axis=axis)
def random_epstein_exact_weird_correction(values_input, nb_to_keep): nb_to_remove = len(values_input) - nb_to_keep k = nb_to_remove values = values_input[:] S = set(range(len(values))) n = len(S) indice_from_which_to_start = len(values) Aleft = weighted_average_weird_correction(values) Aright = float('inf') #max([x[0] for x in values])#float('inf') while len(S) > 1: sampled = sample(S, 1)[0] vi = values[sampled][0] wi = values[sampled][1] X, Y, Z, E = COMPUTE_X_Y_Z_E(S, values, Aleft, Aright, vi, wi) while True: #print 'HeLLo' if len(Z) > 0: A = median([(vi - values[j][0]) / (wi - values[j][1]) for j in Z]) #A=[(vi-values[j][0])/(wi-values[j][1]) for j in Z][len(Z)/2] #print A #print [(vi-values[j][0])/(wi-values[j][1]) for j in Z],A l2 = partsort(([A * values[j][1] - values[j][0] for j in S]), len(S) - k)[:len(S) - k] F_A = -sum(l2) if F_A == 0: return A elif F_A > 0: Aleft = A else: Aright = A X, Y, Z = UPDATE_X_Y_Z(S, values, Aleft, Aright, vi, wi, X, Y, Z) if ((len(X) + len(E)) >= (len(S) - k)) and k > 0: nb_to_remove = min(len(E), len(X) + len(E) - (len(S) - k)) to_remove_E = set(sample(E, nb_to_remove)) S = S - to_remove_E E = E - to_remove_E S = S - Y #k=k-(len(Y)+nb_to_remove) k = k - (len(Y) + nb_to_remove) Y = set() elif (len(Y) + len(E)) >= k: nb_to_collapse = min(len(E), len(Y) + len(E) - k) values_to_collapse_E = set(sample(E, nb_to_collapse)) E = E - values_to_collapse_E values_to_collapse = values_to_collapse_E | X S = S - values_to_collapse collapsed_v = 0. collapsed_w = 1. for x in values_to_collapse: vx, wx = values[x] collapsed_v += vx collapsed_w += wx collapsed = (collapsed_v, collapsed_w) values.append(collapsed) X = {indice_from_which_to_start} S.add(indice_from_which_to_start) indice_from_which_to_start += 1 # if len(Z)<=len(S)/32: # break if len(Z) <= len(S) / 32.: break spop = S.pop() #print values[spop] return values[spop][0] / values[spop][1]
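A heavily hedged sanity check for the routine above: judging from the `collapsed_w = 1.` initialisation, it appears to maximise sum(v_i) / (1 + sum(w_i)) over subsets of size nb_to_keep, but weighted_average_weird_correction is not shown here, so this objective is an assumption. For tiny inputs, a brute force over all subsets can serve as a reference value to compare against.

import itertools

def brute_force_best_keep(values, nb_to_keep):
    # assumed objective: maximise sum(v) / (1 + sum(w)) over subsets of the given size
    best = float('-inf')
    for subset in itertools.combinations(values, nb_to_keep):
        v = sum(x[0] for x in subset)
        w = sum(x[1] for x in subset)
        best = max(best, v / (1.0 + w))
    return best

values = [(3.0, 1.0), (1.0, 2.0), (4.0, 1.0), (0.5, 3.0), (2.0, 1.0)]
print(brute_force_best_keep(values, nb_to_keep=3))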
def analyze_datafile(file, sensitivity=1e-1): with open(file, 'r') as csvfile: reader = csv.reader(csvfile) median_diffs = None median_diffs_idx = [] lineno = 1 allfloats = [] for row in reader: floats = [] for i in range(len(row)): try: f = float(row[i]) floats.append(f) # only on first run if not median_diffs: median_diffs_idx.append(i) except ValueError: pass # most likely a header line if not floats: continue allfloats.append(floats) if not median_diffs: median_diffs = [[] for _ in range(len(floats))] logging.debug("Possible price indices: {}".format(median_diffs_idx)) if len(floats) != len(median_diffs): logging.error("Line {} in file {} is exceptional, skipped.".format(lineno, file)) continue fmedian = bn.median(floats) # logging.debug("[{}] median: {}".format(lineno, fmedian)) for i in range(0, len(floats)): diffnow = (floats[i] - fmedian) / fmedian median_diffs[i].append(diffnow) # logging.debug("[{}][{}] diff: {}".format(lineno, i, diffnow)) lineno += 1 # we don't need more data than this (faster runtime) if lineno == 10001: break mean_median_diffs = np.abs(np.mean(median_diffs, axis=1)) logging.debug("Mean median diffs: {}".format(mean_median_diffs)) median_diffs_idx = np.array(median_diffs_idx) mean_median_diffs = np.array(mean_median_diffs) pricecols = median_diffs_idx[mean_median_diffs < sensitivity] numdecimals = 0 allfloats = np.asarray(allfloats) for f in allfloats.flat: splitted = str(f).split(".") if len(splitted) == 2: numdecimals = max(numdecimals, len(splitted[1])) # maximum number of decimals is limited numdecimals = min(numdecimals, 10) return pricecols, numdecimals
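A hypothetical usage example for analyze_datafile: columns whose values cluster around each row's median (here open/high/low/close) are reported as price columns, while index- and volume-like columns fall outside the sensitivity threshold. The CSV content is made up, and the module-level imports (csv, logging, numpy as np, bottleneck as bn) are assumed as in the function above.

import csv
import os
import tempfile

rows = [
    ["time", "open", "high", "low", "close", "volume"],
    ["1", "10.01", "10.05", "9.98", "10.02", "15000"],
    ["2", "10.02", "10.08", "10.00", "10.06", "18000"],
]
tmp = tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False, newline="")
csv.writer(tmp).writerows(rows)
tmp.close()

pricecols, numdecimals = analyze_datafile(tmp.name)
print(pricecols, numdecimals)   # expected: the open/high/low/close column indices, 2 decimals
os.unlink(tmp.name)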
import bottleneck as bn import numpy as np import timeit setup = ''' import numpy as np import bottleneck as bn from scipy.stats import rankdata np.random.seed(42) a = np.random.randn(30) ''' def time(code, setup, n): return timeit.Timer(code, setup=setup).repeat(3, n) if __name__ == '__main__': n = 10**3 print(n, "pass", max(time("pass", "", n))) print(n, "min np.median", min(time('np.median(a)', setup, n))) print(n, "min bn.median", min(time('bn.median(a)', setup, n))) a = np.arange(7) print("Median diff", np.median(a) - bn.median(a)) func, _ = bn.func.median_selector(a, axis=0) print("Bottleneck median func name", func) print(n, "min scipy.stats.rankdata", min(time('rankdata(a)', setup, n))) print(n, "min bn.rankdata", min(time('bn.rankdata(a)', setup, n))) func, _ = bn.func.rankdata_selector(a, axis=0) print("Bottleneck rankdata func name", func)
def sequence_multi_probability_to_represent_element_mutil_probability(sequence_mutil_probability): represent_element_mutil_probability = bn.median(sequence_mutil_probability, axis=0) return represent_element_mutil_probability
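A small illustration of the one-liner above: the median is taken along axis 0, collapsing a stack of per-element probability vectors into one representative vector.

import numpy as np
import bottleneck as bn

sequence_mutil_probability = np.array([[0.1, 0.9],
                                       [0.3, 0.7],
                                       [0.2, 0.8]])
print(bn.median(sequence_mutil_probability, axis=0))   # -> [0.2 0.8]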
def mad(arr): #copied from:https://stackoverflow.com/questions/8930370/where-can-i-find-mad-mean-absolute-deviation-in-scipy arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays. med = bottleneck.median(arr) return bottleneck.median(numpy.abs(arr - med))
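Example use of the helper just defined: the raw MAD can be rescaled by roughly 1.4826 (the reciprocal of the 0.6745 used in the mad() outlier test elsewhere in this file) to give a robust estimate of the Gaussian standard deviation that is barely affected by gross outliers.

import numpy
import bottleneck

rng = numpy.random.default_rng(0)
x = numpy.concatenate([rng.normal(0.0, 1.0, 10000), [50.0, -80.0]])  # two gross outliers
raw_mad = mad(x)                 # the helper defined above
print(raw_mad * 1.4826)          # ~1.0, robust estimate of the standard deviation
print(numpy.std(x))              # inflated by the outliers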
def mad(datin, z=7, deriv=0, nozero=False): """ Median absolute deviation test, either on raw values, or on 1st or 2nd derivatives. Returns mask with False everywhere except where `<(median-MAD\*z/0.6745)` or `>(md+MAD\*z/0.6745)`. Parameters ---------- datin : array or masked array `mad` acts on `axis=0`. z : float, optional Input is allowed to deviate maximum `z` standard deviations from the median (default: 7) deriv : int, optional 0: Act on raw input (default). 1: Use first derivatives. 2: Use 2nd derivatives. nozero : bool, optional True: exclude zeros (0.) from input `datin`. Returns ------- array of bool False everywhere except where input deviates more than `z` standard deviations from median Notes ----- If input is an array then mad is checked along the zeroth axis for outlier. 1st derivative is calculated as `d = datin[1:n]-datin[0:n-1]` because mean of left and right would give 0 for spikes. If `all(d.mask==True)` then return `d.mask`, which is all True. Examples -------- >>> import numpy as np >>> y = np.array([-0.25,0.68,0.94,1.15,2.26,2.35,2.37,2.40,2.47,2.54,2.62, ... 2.64,2.90,2.92,2.92,2.93,3.21,3.26,3.30,3.59,3.68,4.30, ... 4.64,5.34,5.42,8.01],dtype=np.float) >>> # Normal MAD >>> print(mad(y)) [False False False False False False False False False False False False False False False False False False False False False False False False False False] >>> print(mad(y,z=4)) [False False False False False False False False False False False False False False False False False False False False False False False False False True] >>> print(mad(y,z=3)) [ True False False False False False False False False False False False False False False False False False False False False False False False True True] >>> # MAD on 2nd derivatives >>> print(mad(y,z=4,deriv=2)) [False False False False False False False False False False False False False False False False False False False False False False False True] >>> # direct usage >>> my = np.ma.array(y, mask=mad(y,z=4)) >>> print(my) [-0.25 0.68 0.94 1.15 2.26 2.35 2.37 2.4 2.47 2.54 2.62 2.64 2.9 2.92 2.92 2.93 3.21 3.26 3.3 3.59 3.68 4.3 4.64 5.34 5.42 --] >>> # MAD on several dimensions >>> yy = np.transpose(np.array([y,y])) >>> print(np.transpose(mad(yy,z=4))) [[False False False False False False False False False False False False False False False False False False False False False False False False False True] [False False False False False False False False False False False False False False False False False False False False False False False False False True]] >>> yyy = np.transpose(np.array([y,y,y])) >>> print(np.transpose(mad(yyy,z=3))) [[ True False False False False False False False False False False False False False False False False False False False False False False False True True] [ True False False False False False False False False False False False False False False False False False False False False False False False True True] [ True False False False False False False False False False False False False False False False False False False False False False False False True True]] >>> # Masked arrays >>> my = np.ma.array(y, mask=np.zeros(y.shape)) >>> my.mask[-1] = True >>> print(mad(my,z=4)) [True False False False False False False False False False False False False False False False False False False False False False False False False --] >>> print(mad(my,z=3)) [True False False False False False False False False False False False False False False False False False False False False False False 
True True --] >>> # Arrays with NaNs >>> ny = y.copy() >>> ny[-1] = np.nan >>> print(mad(ny,z=4)) [ True False False False False False False False False False False False False False False False False False False False False False False False False False] >>> print(mad(ny,z=3)) [ True False False False False False False False False False False False False False False False False False False False False False False True True False] >>> # Exclude zeros >>> zy = y.copy() >>> zy[1] = 0. >>> print(mad(zy,z=3)) [ True True False False False False False False False False False False False False False False False False False False False False False False True True] >>> print(mad(zy,z=3,nozero=True)) [ True False False False False False False False False False False False False False False False False False False False False False False False True True] History ------- Written, Matthias Cuntz, Nov 2011 Modified, Matthias Cuntz, May 2012 - act on axis=0 of array Matthias Cuntz, Jun 2012 - axis=0 did not always work: spread md and MAD to input dimensions Matthias Cuntz, Jun 2012 - use np.diff, remove spreads Matthias Cuntz, Feb 2013 - ported to Python 3 Matthias Cuntz & Juliane Mai Jul 2013 - loop over second dimension for medians, faster than array calculations :-( but use bottleneck for speed :-) Matthias Cuntz, Jul 2013 - (re-)allow masked arrays and NaNs in arrays Matthias Cuntz, Oct 2013 - nozero, bug in NaN treatment with dim=1 Matthias Cuntz, May 2020 - numpy docstring format """ if nozero: idatin = datin.copy() ii = np.where(idatin == 0.)[0] if ii.size > 0: idatin[ii] = np.nan else: idatin = datin sn = list(np.shape(idatin)) n = sn[0] if deriv == 0: m = n d = idatin elif deriv == 1: m = n - 1 sm = sn sm[0] = m d = np.diff(idatin, axis=0) elif deriv == 2: m = n - 2 sm = sn sm[0] = m d = np.diff(idatin, n=2, axis=0) else: raise ValueError('Unimplemented option.') # Shortcut if all masked ismasked = isinstance(d, np.ma.core.MaskedArray) if not ismasked: ii = np.where(~np.isfinite(d))[0] d = np.ma.array(d) if ii.size > 0: d[ii] = np.ma.masked if np.all(d.mask == True): if ismasked: return d.mask else: return np.ones(d.shape, dtype=np.bool) # Median oldsettings = np.geterr() np.seterr(invalid='ignore') if d.ndim == 1: try: import bottleneck as bn dd = d.compressed() md = bn.median(dd) # Median absolute deviation MAD = bn.median(np.abs(dd - md)) # Range around median thresh = MAD * (z / 0.6745) # True where outside z-range res = (d < (md - thresh)) | (d > (md + thresh)) except: dd = d.compressed() md = np.median(dd) # Median absolute deviation MAD = np.median(np.abs(dd - md)) # Range around median thresh = MAD * (z / 0.6745) # True where outside z-range res = (d < (md - thresh)) | (d > (md + thresh)) elif d.ndim == 2: try: import bottleneck as bn res = np.empty(d.shape, dtype=np.bool) for i in range(d.shape[1]): di = d[:, i] dd = di.compressed() md = bn.median(dd) # Median absolute deviation MAD = bn.median(np.abs(dd - md)) # Range around median thresh = MAD * (z / 0.6745) # True where outside z-range res[:, i] = (d[:, i] < (md - thresh)) | (d[:, i] > (md + thresh)) except: res = np.empty(d.shape, dtype=np.bool) for i in range(d.shape[1]): di = d[:, i] dd = di.compressed() md = np.median(dd) # Median absolute deviation MAD = np.median(np.abs(dd - md)) # Range around median thresh = MAD * (z / 0.6745) # True where outside z-range res[:, i] = (d[:, i] < (md - thresh)) | (d[:, i] > (md + thresh)) else: np.seterr(**oldsettings) raise ValueError('datin.ndim must be <= 2') np.seterr(**oldsettings) if 
ismasked: return res else: if isinstance(res, np.ma.core.MaskedArray): # got masked because of NaNs return np.where(res.mask, False, res) else: return res
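The 0.6745 constant used throughout mad() is the 75th percentile of the standard normal distribution, so MAD/0.6745 is a consistent estimator of the Gaussian standard deviation; a quick numerical check (scipy is only needed for this illustration):

import numpy as np
from scipy.stats import norm

print(norm.ppf(0.75))                       # ~0.67449
x = np.random.normal(0.0, 2.5, 200_000)
mad_val = np.median(np.abs(x - np.median(x)))
print(mad_val / 0.6745)                     # ~2.5, recovers the true sigma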
def create_model_ima(model_dir, output_name, z, mu0, FOV, RA0, DEC0): # We read only a subset of particle properties (positions, to save memory) part = gizmo_read.read.Read.read_snapshot(species=('star', 'gas'), properties=['position'], directory=model_dir) # And save them to the stars and gas pandas dataframes # For indexing in pandas : stars.loc[x,y] stars = pd.DataFrame(np.array(part["star"]["position"])) gas = pd.DataFrame(np.array(part["gas"]["position"])) # We calculate the transformations for kpc to pix as a function of z (Distance) kpc_arcsec = cosmo.kpc_proper_per_arcmin(z) / 60. arcsec_pix = 0.1 # Euclid VIS pixscale axis_obs = np.array([0, 1]) # Transform from kpc to pix print('Transforming coordinates to pixel space') stars.loc[:, axis_obs[0]] = stars.loc[:, axis_obs[0]] / (kpc_arcsec * arcsec_pix) stars.loc[:, axis_obs[1]] = stars.loc[:, axis_obs[1]] / (kpc_arcsec * arcsec_pix) gas.loc[:, axis_obs[0]] = gas.loc[:, axis_obs[0]] / (kpc_arcsec * arcsec_pix) gas.loc[:, axis_obs[1]] = gas.loc[:, axis_obs[1]] / (kpc_arcsec * arcsec_pix) # We create the image print('Creating stellar image') image_stars = np.histogram2d(x=stars.loc[:, axis_obs[0]], y=stars.loc[:, axis_obs[1]], bins=FOV, range=[[-FOV / 2, FOV / 2], [-FOV / 2, FOV / 2]], normed=None, weights=None, density=None) print('Creating gas image') image_gas = np.histogram2d(x=gas.loc[:, axis_obs[0]], y=gas.loc[:, axis_obs[1]], bins=FOV, range=[[-FOV / 2, FOV / 2], [-FOV / 2, FOV / 2]], normed=None, weights=None, density=None) data = image_stars[0] + image_gas[0] # Convolve by the Euclid PSF psf = fits.open('/localdata/Borlaff/EMDB/kernel.fits') print('Convolving image with LARGE PSF') data_low = convolve_fft(data, psf[1].data, allow_huge=True) psf = fits.open('/localdata/Borlaff/EMDB/psf_VIS_centred.fits') print('Convolving image with Euclid VIS PSF') data[np.where(data == 1)] = 0 data_high = convolve_fft(data, psf[1].data, allow_huge=True) data = (data_low + data_high) / 2. # Photometry # What is the mean particle density on the central pixels? print('Calibrating photometry') skybg = bn.median(data[0:int(FOV / 10), 0:int(FOV / 10)]) data = data - skybg central_density = bn.median(data[int(FOV / 2 - 5):int(FOV / 2 + 5), int(FOV / 2 - 5):int(FOV / 2 + 5)]) int0 = (arcsec_pix**2) * 10**( (24.445 - mu0) / 2.5) # Central intensity for the mu0 set by the user photometry_correction = int0 / central_density data = data * photometry_correction # We add a fake centred WCS and save the fits file hdu = fits.PrimaryHDU(data=data) print('Saving fake WCS') hdu.header['WCSAXES'] = 2 hdu.header['CRPIX1'] = FOV / 2. + 0.5 hdu.header['CRPIX2'] = FOV / 2. + 0.5 hdu.header['CRVAL1'] = RA0 hdu.header['CRVAL2'] = DEC0 hdu.header['CTYPE1'] = 'RA---TAN' hdu.header['CTYPE2'] = 'DEC--TAN' hdu.header['RA'] = RA0 hdu.header['DEC'] = DEC0 hdu.header['CD1_1'] = 2.521185192875E-05 hdu.header['CD1_2'] = 1.173845066278E-05 hdu.header['CD2_1'] = 1.162545338166E-05 hdu.header['CD2_2'] = -2.537923352533E-05 print('Saving file: ' + output_name) if os.path.exists(output_name): os.remove(output_name) hdu.verify("silentfix") hdu.writeto(output_name)
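A worked sketch of the photometric calibration step in create_model_ima: a target central surface brightness mu0 (mag arcsec^-2) is converted into counts per pixel for the hard-coded 24.445 constant (presumably the photometric zero point) and the 0.1 arcsec VIS pixel scale, and the particle image is then rescaled so its central density matches that value. The numbers below are illustrative.

zeropoint = 24.445           # constant hard-coded above, assumed to be the zero point
arcsec_pix = 0.1             # Euclid VIS pixel scale used above
mu0 = 20.0                   # example target central surface brightness [mag/arcsec^2]

# mu = zeropoint - 2.5*log10(counts / pixel_area)  =>  counts = pixel_area * 10**((zeropoint - mu)/2.5)
int0 = (arcsec_pix ** 2) * 10 ** ((zeropoint - mu0) / 2.5)
print(int0)                  # counts/pixel corresponding to mu0

central_density = 3500.0     # hypothetical median of the central pixels of the model image
photometry_correction = int0 / central_density
print(photometry_correction) # factor applied to the whole frame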