Example #1
def calc_noise_cal_factor(vlt, noise_slc, cal_slc, cal_temp, bandwidth):
    noise = vlt[:, noise_slc]
    cal = vlt[:, cal_slc]

    # assume the noise power is constant over the time window; use a robust median-based power estimate
    noise_est = median(noise.real**2 + noise.imag**2)*med_pwr_est_factor
    # cal temp is constant, so use all cal measurements to estimate power
    cal_est = median(cal.real**2 + cal.imag**2)*med_pwr_est_factor
    pwr_factor = calc_power_factor(noise_est, cal_est, cal_temp, bandwidth)

    return noise_est, cal_est, pwr_factor
Example #2
def theil_sen(x, y, sample="auto", n_samples=1e7):
    
    assert x.shape[0] == y.shape[0]
    
    n = x.shape[0]
    
    if n < 100 or not sample:
        ix = np.argsort(x)
        slopes = np.empty(n * (n - 1) // 2)
        for c, pair in enumerate(itertools.combinations(range(n), 2)):
            i, j = ix[pair[0]], ix[pair[1]]
            slopes[c] = slope(x[i], x[j], y[i], y[j])
    else:
        i1 = np.random.randint(0, n, int(n_samples))
        i2 = np.random.randint(0, n, int(n_samples))
        slopes = slope(x[i1], x[i2], y[i1], y[i2])

    slope_ = bottleneck.nanmedian(slopes)
    #find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n)
    for c in range(n):
        intercepts[c] = y[c] - slope_*x[c]
    intercept_ = bottleneck.median(intercepts)

    return np.array([slope_, intercept_])
Example #3
def moving_nanmedian_cyclic(t, x, w, dt=None):
    """
	Calculate cyclic moving average of input with given window (in t-units)
	taking into account NaNs in the data.
	"""
    if len(t) != len(x):
        raise ValueError("t and x must have the same length.")
    if dt is None:
        dt = median(np.diff(t))
    # Calculate width of filter:
    width_points = int(w / dt)
    if width_points <= 1:
        return x
    if width_points % 2 == 0:
        width_points += 1  # Filter is much faster when using an odd number of points!
    wh = width_points // 2
    N = len(x)
    if wh >= N:
        return np.zeros_like(x) + nanmedian(x)
    # Stitch the ends onto the array:
    xny = np.concatenate((x[-wh - 1:N - 1], x, x[1:wh + 1]))
    # Run moving median on longer series:
    N = len(xny)
    y = _median_central(xny, width_points)
    # Cut out the central part again:
    y = y[wh:N - wh]
    return y
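
A minimal usage sketch (not part of the original example), assuming moving_nanmedian_cyclic and its helpers (_median_central, median, nanmedian) are available from the surrounding module:

import numpy as np

t = np.linspace(0, 10, 500)                       # evenly sampled time vector
x = np.sin(2 * np.pi * t / 10) + 0.05 * np.random.randn(500)
x[::50] = np.nan                                  # sprinkle in some NaNs
smoothed = moving_nanmedian_cyclic(t, x, w=1.0)   # 1 t-unit window, cyclic boundaries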
Example #4
def theil_sen(x, y, n_samples=1e5):
    """Computes the Theil-Sen estimator for 2d data

    parameters:
        x: 1-d np.array, the independent (control) variate
        y: 1-d np.array, the dependent variate.
        n_samples: how many pairs of points to sample.

    The exact estimator has complexity O(n**2), which can be poor for large n. We therefore
    perform a sampling of data points to get an unbiased, but larger-variance, estimator.
    The sampling will be done by picking two points at random, and computing the slope,
    up to n_samples times.
    """

    assert x.shape[0] == y.shape[0], "x and y must be the same shape."
    n = x.shape[0]

    i1 = np.random.randint(0, n, int(n_samples))
    i2 = np.random.randint(0, n, int(n_samples))
    slopes = _slope(x[i1], x[i2], y[i1], y[i2])

    slope_ = nanmedian(slopes)
    #find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n, dtype=float)
    for i in range(n):
        intercepts[i] = y[i] - slope_ * x[i]
    intercept_ = median(intercepts)

    return np.array([slope_, intercept_])
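
A hypothetical usage sketch for the sampled estimator above; it assumes _slope computes element-wise slopes (y2 - y1) / (x2 - x1) and that nanmedian/median resolve to bottleneck or numpy functions in the surrounding module:

import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 1000)
y = 2.5 * x + 1.0 + rng.normal(0, 0.3, 1000)
y[::100] += 50                                   # gross outliers barely move the estimate
slope_, intercept_ = theil_sen(x, y, n_samples=int(1e5))
print(slope_, intercept_)                        # expected near 2.5 and 1.0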
Example #5
def simple_sky(sky):

    skymed = bt.median(sky)
    skymean = bt.nanmean(sky)
    skymod = 3. * skymed - 2. * skymean
    skystd = bt.nanstd(sky)

    return skymod, skystd, len(sky)
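
The 3*median - 2*mean expression is the usual robust estimate of the mode of a skewed sky distribution (as in DAOPHOT-style sky estimation). A small usage sketch, assuming bt is the bottleneck module:

import numpy as np
import bottleneck as bt

sky = np.random.default_rng(1).normal(100.0, 5.0, 10000)   # synthetic sky pixels
skymod, skystd, npix = simple_sky(sky)
# For a symmetric distribution the mode estimate approaches the mean (~100 here);
# for a crowded, positively skewed sky it sits below the mean.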
Example #6
def moving_nanmedian(t, x, w, dt=None):
    """Calculate moving median of input with given window (in t-units)"""
    assert len(t) == len(x), "t and x must have the same length."
    if dt is None: dt = median(diff(t))
    width_points = int(w / dt)
    if width_points <= 1: return x
    if width_points % 2 == 0: width_points += 1
    if width_points >= len(x): return zeros_like(x) + nanmedian(x)
    return _median_central(x, width_points)
Example #7
def theil_sen(x, y, sample="auto", n_samples=1e7):
    """
    Computes the Theil-Sen estimator for 2d data.
    parameters:
        x: 1-d np.array, the independent (control) variate
        y: 1-d np.array, the dependent variate.
        sample: if n>100, the performance can be worse, so we sample n_samples.
                Set to False to not sample.
        n_samples: how many pairs of points to sample.

    The exact estimator has complexity O(n**2), which can be poor for large n. We therefore
    perform a sampling of data points to get an unbiased, but larger-variance, estimator.
    The sampling will be done by picking two points at random, and computing the slope,
    up to n_samples times.
    
    """
    assert x.shape[0] == y.shape[0], "x and y must be the same shape."
    n = x.shape[0]
    
    if n < 100 or not sample:
        ix = np.argsort(x)
        slopes = np.empty(n * (n - 1) // 2)
        for c, pair in enumerate(itertools.combinations(range(n), 2)):  # it creates range(n) =(
            i, j = ix[pair[0]], ix[pair[1]]
            slopes[c] = slope(x[i], x[j], y[i], y[j])
    else:
        i1 = np.random.randint(0, n, int(n_samples))
        i2 = np.random.randint(0, n, int(n_samples))
        print('...checking for unwanted zeros...')
        zero_check = np.where(np.abs(x[i1] - x[i2]) != 0)
        i1 = i1[zero_check]
        i2 = i2[zero_check]
        print('...calculating slopes...')
        slopes = slope(x[i1], x[i2], y[i1], y[i2])
        print('slope min and max are:', np.amin(slopes), np.amax(slopes))

        histogram, bin_limits = np.histogram(slopes, bins=10000, range=(-2, 2))
        #print histogram
        #c95=np.percentile(slopes,(5,95))
        #pdb.set_trace()
    
    slope_ = bottleneck.nanmedian(slopes)
    print('...done! Now finding intercepts...')
    # find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n)
    for c in range(n):
        intercepts[c] = y[c] - slope_ * x[c]

    histogram_i, bin_limits_i = np.histogram(intercepts, bins=10000, range=(-2, 2))
    #print histogram_i
    #c95i=np.percentile(intercepts,(5,95))
    #print cumul_i

    intercept_ = bottleneck.median(intercepts)

    return np.array([slope_, intercept_])  # c95[0], c95[1], c95i[0], c95i[1]
Example #8
def gap_fill(t, y, maxgap=np.inf):
    # Declare variables used:
    times_max = 0
    D = np.diff(t)
    time_tot = list([])
    data_tot = list([])
    ori_or_not = list([])

    # Calculate the desired regular step size:
    step = median(D)
    stepcut = 1.5 * step

    # If a maximum gap size is given, compute how many points to insert on each side of a large gap:
    if not np.isinf(maxgap):
        times_max = int((maxgap / 2) / step) + 1

    for i in range(len(t) - 1):
        # Add the original point:
        time_tot.append(t[i])
        data_tot.append(y[i])
        ori_or_not.append(1)

        d = D[i]

        if d > maxgap:
            # Insert half the maximum number of points at the beginning of the gap:
            for j in range(1, times_max):
                time_tot.append(t[i] + j * step)
                data_tot.append(np.nan)
                ori_or_not.append(0)
            # ... and half the maximum number of points at the end of the gap:
            for j in range(times_max, 0, -1):
                time_tot.append(t[i + 1] - j * step)
                data_tot.append(np.nan)
                ori_or_not.append(0)
        elif d > stepcut:
            # Calculate the number of points to be inserted and insert them:
            times = int(d / step) - 1
            for j in range(times):
                time_tot.append(t[i] + (j + 1) * step)
                data_tot.append(np.nan)
                ori_or_not.append(0)

    # Special treatment of last point:
    time_tot.append(t[-1])
    data_tot.append(y[-1])
    ori_or_not.append(1)

    return np.array(time_tot), np.array(data_tot), np.array(ori_or_not,
                                                            dtype=bool)
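
A small usage sketch (assuming median resolves to numpy's or bottleneck's median in the surrounding module): the gap is padded with NaN samples at the median cadence, and the boolean ori mask recovers the original points.

import numpy as np

t = np.concatenate((np.arange(0, 5, 0.5), np.arange(8, 12, 0.5)))   # series with a gap
y = np.sin(t)
tg, yg, ori = gap_fill(t, y)
assert np.array_equal(yg[ori], y)        # original samples are preserved
print(np.isnan(yg).sum(), 'NaN points inserted in the gap')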
Example #9
def test_median(size):
    global data, mask, datamasked, list_of_data, list_of_mask

    print('Generate fake data')
    shape = (10, size, size)
    np.random.seed(42)
    data = np.random.normal(size=shape).astype(np.float32)
    mask = np.zeros(shape, dtype=np.uint16)
    data = data.reshape(shape[0], -1)
    mask = mask.reshape(shape[0], -1)
    datamasked = np.ma.array(data, mask=mask.astype(bool))
    list_of_data = list(data)
    list_of_mask = list(mask)

    print('Check results')
    outndcomb, _, _ = ndcombine(list_of_data,
                                list_of_mask,
                                combine_method='median',
                                reject_method='none')
    outnp = np.median(data, axis=0)
    outnpma = np.ma.median(datamasked, axis=0)
    outbn = bn.median(data, axis=0)

    np.testing.assert_array_equal(outndcomb, outnp)
    np.testing.assert_array_equal(outndcomb, outnpma)
    np.testing.assert_array_equal(outndcomb, outbn)

    print('Run perf tests')
    nb = 10
    kwargs = dict(globals=globals(), number=nb, repeat=5)

    def run(label, command):
        res = timeit.repeat(command, **kwargs)
        res = np.array(res) / nb
        print(f'- {label:20s}: {np.mean(res):.3f}s ± {np.std(res):.3f}s')

    run('np.median', 'np.median(data, axis=0)')
    run('np.ma.median', 'np.ma.median(datamasked, axis=0)')
    run('bn.median', 'bn.median(data, axis=0)')
    run(
        "ndcombine 1 thread",
        "ndcombine(list_of_data, list_of_mask, combine_method='median', "
        "reject_method='none', num_threads=1)")
    run(
        "ndcombine",
        "ndcombine(list_of_data, list_of_mask, combine_method='median', "
        "reject_method='none')")
Example #10
def retrieve_phred_non_param(nbins, ratio, data, ratio_hist):
    pvals = []
    n = 10000
    p = 0
    sampled_chromosomes = []
    # pick chromosomes big enough to sample from
    for chromosome in data["chromosomes"]:
        if (len(data[chromosome]["ratio"]) > 10 * nbins
                and "X" not in chromosome and "Y" not in chromosome
                and abs(ratio_hist[chromosome][0] - 1) < 0.1):
            sampled_chromosomes.append(chromosome)

    if not sampled_chromosomes:
        return int(-10 * math.log10(1 / float(n)))

    chromosomes = list(sorted(numpy.random.choice(sampled_chromosomes, size=n)))
    simulated_positions = []
    # simulate random regions of the same length as the query region
    for chromosome in sorted(sampled_chromosomes):
        simulated_positions += list(numpy.random.randint(
            0, high=len(data[chromosome]["ratio"]) - nbins,
            size=chromosomes.count(chromosome)))
    failed = 0
    for i in range(0, n):
        chromosome = chromosomes[i]
        pos = simulated_positions[i]
        sim_bins = data[chromosome]["ratio"][pos:pos + nbins]

        if list(sim_bins).count(-1) / float(len(sim_bins)) >= 0.6:
            failed += 1
            continue

        sim_ratio = bottleneck.median(sim_bins[numpy.where(sim_bins >= 0)], axis=0)
        if ratio > 1 and sim_ratio >= ratio:
            p += 1
        elif ratio < 1 and sim_ratio <= ratio:
            p += 1

    if failed == n:
        return 1000

    p = p / float(n - failed)
    if not p:
        return int(-10 * math.log10(1 / float(n - failed)))
    # normalise between 1000 and 1
    phred = int(-10 * math.log10(p))
    return phred
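
The score above is a Phred-scaled p-value, phred = -10*log10(p), capped by the number of usable simulations; a tiny worked example of the conversion:

import math

p = 12 / 10000.0                           # 12 of 10,000 simulated regions were as extreme
print(int(-10 * math.log10(p)))            # -> 29
print(int(-10 * math.log10(1 / 10000.0)))  # cap when no simulation is as extreme -> 40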
Example #11
    def fit_storage(self, data):
        X, Y = data.X, data.Y
        W = data.W if data.W.size else None  # avoid the ambiguous truth value of an array
        Y = Y.astype(dtype=int)
        np.random.seed(self.seed)
        evd = {}

        for cls in range(len(self.domain.class_var.values)):
            self.target_class = cls
            print("estimating evd for class", cls)
            # repeat n-times
            max_vals = defaultdict(list)
            for i in range(self.n):
                print("{}/{}".format(i, self.n))
                # randomize class
                Yr = np.array(Y)
                np.random.shuffle(Yr)
                # learn rules
                new_data = Table.from_table(data.domain, data)
                new_data.Y = Yr
                super().fit_storage(new_data)
                for k in range(self.max_rule_length):
                    ki = k if k < len(self.inter_rules) else -1
                    max_vals[k + 1].extend(
                        [r.quality for r in self.inter_rules[ki]])
            # calculate extreme value distributions
            evd_cls = {0: EVDDist(0, 1, 0)}
            prev_median = 0
            for k in range(1, self.max_rule_length + 1):
                median = max(prev_median, bn.median(max_vals[k]))
                print("med", median)
                prev_median = median
                beta = 2
                mu = median + beta * np.log(np.log(2))
                if mu > 0.1:
                    evd_cls[k] = EVDDist(mu, beta, median)
                else:
                    evd_cls[k] = EVDDist(0, 1, 0)
            evd[cls] = evd_cls
            print()

        # returns an empty classifier
        return EVDFitterClassifier(evd, self.domain)
Example #12
def _median_with_nan(values, *args, **kwargs):
    """ replace "median" if skipna is False
    
    numpy's median ignore NaNs as long as less than 50% 
    modify this behaviour and return NaN just as any other operation would
    """
    if _hasbottleneck:
        result = bottleneck.median(values, *args, **kwargs)
    else:
        result = np.median(values, *args, **kwargs)

    if anynan(values):
        if np.size(result) == 1: 
            result = np.nan
        else:
            axis = kwargs.pop('axis', None)
            nans = anynan(values, axis=axis) # determine where the nans should be
            result[nans] = np.nan

    return result
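
A quick demonstration of the skipna=False behaviour (assumes the surrounding module defines _hasbottleneck and imports anynan, e.g. from bottleneck):

import numpy as np

a = np.array([[1.0, 2.0, np.nan],
              [4.0, 5.0, 6.0]])
print(_median_with_nan(a, axis=1))   # -> [nan 5.]   (any NaN poisons the result)
print(np.nanmedian(a, axis=1))       # -> [1.5 5.]   (skipna behaviour, for contrast)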
Example #13
def theil_sen(x, y, sample="auto", n_samples=1e7):
    """
    Computes the Theil-Sen estimator for 2d data.
    parameters:
        x: 1-d np.array, the independent (control) variate
        y: 1-d np.array, the dependent variate.
        sample: if n>100, the performance can be worse, so we sample n_samples.
                Set to False to not sample.
        n_samples: how many pairs of points to sample.

    The exact estimator has complexity O(n**2), which can be poor for large n. We therefore
    perform a sampling of data points to get an unbiased, but larger-variance, estimator.
    The sampling will be done by picking two points at random, and computing the slope,
    up to n_samples times.
    
    """
    assert x.shape[0] == y.shape[0], "x and y must be the same shape."
    n = x.shape[0]
    
    if n < 100 or not sample:
        ix = np.argsort(x)
        slopes = np.empty(n * (n - 1) // 2)
        for c, pair in enumerate(itertools.combinations(range(n), 2)):  # it creates range(n) =(
            i, j = ix[pair[0]], ix[pair[1]]
            slopes[c] = slope(x[i], x[j], y[i], y[j])
    else:
        i1 = np.random.randint(0, n, int(n_samples))
        i2 = np.random.randint(0, n, int(n_samples))
        slopes = slope(x[i1], x[i2], y[i1], y[i2])
        #pdb.set_trace()
    
    slope_ = bottleneck.nanmedian(slopes)
    # find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n)
    for c in range(n):
        intercepts[c] = y[c] - slope_ * x[c]
    intercept_ = bottleneck.median(intercepts)

    return np.array([slope_, intercept_])
Example #14
def theil_sen(x, y, sample="auto", n_samples=1e7):
    """
    Computes the Theil-Sen estimator for 2d data.
    parameters:
        x: 1-d np.array, the independent (control) variate
        y: 1-d np.array, the dependent variate.
        sample: if n>100, the performance can be worse, so we sample n_samples.
                Set to False to not sample.
        n_samples: how many pairs of points to sample.

    The exact estimator has complexity O(n**2), which can be poor for large n. We therefore
    perform a sampling of data points to get an unbiased, but larger-variance, estimator.
    The sampling will be done by picking two points at random, and computing the slope,
    up to n_samples times.
    
    """
    assert x.shape[0] == y.shape[0], "x and y must be the same shape."
    n = x.shape[0]

    if n < 100 or not sample:
        ix = np.argsort(x)
        slopes = np.empty(n * (n - 1) // 2)
        for c, pair in enumerate(itertools.combinations(range(n), 2)):  # it creates range(n) =(
            i, j = ix[pair[0]], ix[pair[1]]
            slopes[c] = slope(x[i], x[j], y[i], y[j])
    else:
        i1 = np.random.randint(0, n, int(n_samples))
        i2 = np.random.randint(0, n, int(n_samples))
        slopes = slope(x[i1], x[i2], y[i1], y[i2])
        # pdb.set_trace()

    slope_ = bottleneck.nanmedian(slopes)
    # find the optimal b as the median of y_i - slope*x_i
    intercepts = np.empty(n)
    for c in range(n):
        intercepts[c] = y[c] - slope_ * x[c]
    intercept_ = bottleneck.median(intercepts)

    return np.array([slope_, intercept_])
Example #15
def compute_var_genes(adata, return_vect=True):
    """Compute variable genes for an indiviudal dataset


    Arguments:
        adata {[type]} -- AnnData object containing a signle dataset

    Keyword Arguments:
        return_vect {bool} -- Boolean to store as adata.var['higly_variance']
            or return vector of booleans for varianble gene membership (default: {False})

    Returns:
        np.ndarray -- None if saving in adata.var['highly_variable'], array of booleans if returning of length ngenes
    """

    if sparse.issparse(adata.X):
        median = csc_median_axis_0(sparse.csc_matrix(adata.X))
        variance = np.var(adata.X.A, axis=0)
    else:
        median = bottleneck.median(adata.X, axis=0)
        variance = np.var(adata.X, axis=0)
    bins = np.quantile(median,
                       q=np.linspace(0, 1, 11),
                       interpolation="midpoint")
    digits = np.digitize(median, bins, right=True)

    selected_genes = np.zeros_like(digits)
    for i in np.unique(digits):
        filt = digits == i
        var_tmp = variance[filt]
        bins_tmp = np.nanquantile(var_tmp, q=np.linspace(0, 1, 5))
        g = np.digitize(var_tmp, bins_tmp)
        selected_genes[filt] = (g >= 4).astype(float)

    if return_vect:
        return selected_genes.astype(bool)
    else:
        adata.var["highly_variable"] = selected_genes.astype(bool)
Example #16
def random_epstein(values, k):

    S = set(values)
    #print len(S),len(values)
    Aleft = weighted_average(S)
    Aright = float('inf')

    while len(S) > 1:
        sampled = sample(list(S), 1)[0]  # random.sample needs a sequence, not a set, in Python 3.11+
        vi = sampled[0]
        wi = sampled[1]
        E = set()
        X = set()
        Y = set()
        Z = set()

        for (vj, wj) in S:
            if wj == wi:
                delta_i_j = vj - vi
                A_i_j = float('-inf')
            else:
                delta_i_j = wi - wj
                A_i_j = (vi - vj) / (wi - wj)

            if delta_i_j == 0:
                E |= {(vj, wj)}
            elif (A_i_j <= Aleft and delta_i_j > 0) or (A_i_j >= Aright
                                                        and delta_i_j < 0):
                X |= {(vj, wj)}
            elif (A_i_j <= Aleft and delta_i_j < 0) or (A_i_j >= Aright
                                                        and delta_i_j > 0):
                Y |= {(vj, wj)}
        Z = S - X - Y - E
        n = len(S)
        # print '-----------'
        # print 'X = ',X
        # print 'Y = ',Y
        # print 'Z = ',Z
        # print 'E = ',E
        # print 'S = ',S
        # print '-----------'
        #raw_input('***************')
        while True:
            if len(Z) > 0:
                #A=sorted([(vi-vj)/(wi-wj) if wi!=wj else float('-inf') for (vj,wj) in Z])[len(Z)/2]
                A = median([
                    (vi - vj) / (wi - wj) if wi != wj else float('-inf')
                    for (vj, wj) in Z
                ])
                l = sorted([f((vj, wj), A) for (vj, wj) in S],
                           reverse=True)[:len(S) - k]

                #print l
                #raw_input('...')
                F_A = sum(l[:len(S) - k])

                if F_A == 0:
                    return A
                elif F_A > 0:
                    Aleft = A
                else:
                    Aright = A
                #print [Aleft,Aright]

                #####################RECOMPUTE X,Y,Z#####################
                to_remove_from_z = set()
                for (vj, wj) in Z:
                    delta_i_j = wi - wj
                    A_i_j = (vi - vj) / (wi - wj)
                    if (A_i_j <= Aleft
                            and delta_i_j > 0) or (A_i_j >= Aright
                                                   and delta_i_j < 0):
                        X |= {(vj, wj)}
                        to_remove_from_z |= {(vj, wj)}
                    elif (A_i_j <= Aleft
                          and delta_i_j < 0) or (A_i_j >= Aright
                                                 and delta_i_j > 0):
                        Y |= {(vj, wj)}
                        to_remove_from_z |= {(vj, wj)}
                Z = Z - to_remove_from_z
                #####################RECOMPUTE X,Y,Z#####################

                #print 'X = ',len(X),'Y = ',len(Y),'Z = ',len(Z),'E = ',len(E),'S = ',len(S)
                #raw_input('....')

            if ((len(X) + len(E)) >= (len(S) - k)) and k > 0:
                nb_to_remove = min(len(E), len(X) + len(E) - (len(S) - k))
                #print nb_to_remove,k,len(E)
                to_remove_E = set(sample(list(E), nb_to_remove))
                #print len(S)
                S = S - to_remove_E
                E = E - to_remove_E
                S = S - Y

                #print len(S)
                k = k - (len(Y) + nb_to_remove)
                Y = set()
                # print len(E),len(S)
                # raw_input('ooooo')
                # if k==0:
                # 	return weighted_average(S)

            elif (len(Y) + len(E)) >= k:
                nb_to_collapse = min(len(E), len(Y) + len(E) - k)
                values_to_collapse_E = set(sample(list(E), nb_to_collapse))
                E = E - values_to_collapse_E
                values_to_collapse = values_to_collapse_E | X
                S = S - values_to_collapse
                collapsed = (sum(x[0] for x in values_to_collapse),
                             sum(x[1] for x in values_to_collapse))
                X = {collapsed}
                S = S | {collapsed}

            if len(Z) <= len(S) / 32:
                break

    spop = S.pop()
    #print 'hey ! '
    return spop[0] / spop[1]
Example #17
def filter_position_1d(time, flux, star_movement, timescale_position_smooth=None, dt=None):
	"""Filter the lightcurve for correlations in the stars position on the CCD."""

	# Check input:
	assert len(time)==len(flux), "TIME and FLUX should have the same number of elements."
	if not timescale_position_smooth is None and dt is None: dt = median(diff(time))

	# Settings:
	# num_knots = 15
	# min_points_per_knot = 3
	# spline_degree = 2
	# sigma_clip_spline = 4.0

	# Build up xpos chunk by chunk of the timeseries:
	xpos = np.empty_like(time, dtype='float64')
	for chk,chunk in enumerate(star_movement['chunks']):
		# Extract needed information:
		cl = star_movement['curvelength'][chk] # Sorted in position
		indx_possort = star_movement['indx_possort'][chk]
		indx_timesort = star_movement['indx_timesort'][chk]

		# Create smooth curve as flux as a function of curvelength:
		# The resulting "xp" will be sorted by position
		fl = flux[chunk][indx_possort]

		"""indx_finite = isfinite(cl) & isfinite(fl)
		knots = spline_set_knots(cl[indx_finite], num_knots)

		# Create the fixed knots for the spline function:

		knots = np.linspace(nanmin(cl[indx_finite]), nanmax(cl[indx_finite]), num_knots+2)[1:-2]

		# Remove knots if there is not at least 3 points between them:
		newknots = array([], dtype='float64')
		for i in range(len(knots)-1):
			indx_data_between_knots = (knots[i] < cl[indx_finite]) & (cl[indx_finite] < knots[i+1])
			if sum(indx_data_between_knots) > min_points_per_knot:
				newknots = append(newknots, knots[i])

		knots = newknots

		# Do a spline where all points are given the same weight:
		spline = LSQUnivariateSpline(cl[indx_finite], fl[indx_finite], knots, w=None, k=spline_degree)

		# Begin iterating so we can change the weights:
		for iterations in range(2):
			# Calculate weight of points based of their distance to
			# the previously calculated spline:
			d = np.abs( fl[indx_finite] - spline(cl[indx_finite]) )
			s = mad_to_sigma * median(d)
			w = 0.5*(np.sign(sigma_clip_spline - d/s) + 1) # Heaviside cutoff-function

			# Recalculate the spline, using the weights:
			spline = LSQUnivariateSpline(cl[indx_finite], fl[indx_finite], knots, w=w, k=spline_degree)

		# Evaluate the spline function at the curvelengths of the datapoints:
		# The spline function will return NaN if passed a NaN
		xp = spline(cl)
		"""

		lowess_frac = 0.1/ (nanmax(cl[np.isfinite(fl)]) - nanmin(cl[np.isfinite(fl)]))
		xp = lowess(fl, cl, frac=lowess_frac, it=3, is_sorted=True, return_sorted=False)

		# Sort back into time-sorting and put NaN's back,
		# then low-pass filter the result:
		if timescale_position_smooth is None:
			xpos[chunk] = xp[indx_timesort]
		else:
			xpos[chunk] = moving_nanmedian(time[chunk], xp[indx_timesort], timescale_position_smooth, dt=dt)

	# Return the final time-sorted series:
	return xpos
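
The lowess call above is presumably statsmodels' LOWESS smoother; a standalone sketch of the same call pattern, with the window fraction derived from the data range as in the function:

import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess

cl = np.sort(np.random.default_rng(2).uniform(0, 5, 300))    # "curve length", position-sorted
fl = np.sin(cl) + 0.1 * np.random.default_rng(3).normal(size=300)
frac = 0.1 / (np.nanmax(cl) - np.nanmin(cl))                  # same heuristic as above
frac = min(max(frac, 0.05), 1.0)                              # keep the fraction valid
xp = lowess(fl, cl, frac=frac, it=3, is_sorted=True, return_sorted=False)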
Example #18
def filter(t, x, quality=None, position=None, P=None, jumps=None, timescale_long=3.0, timescale_short=1/24, sigma_clip=4.5, scale_clip=5.0, scale_width=1.0, phase_smooth_factor=1000, transit_model=None, it=3):
	"""Main filter function.

	Parameters:
		t (ndarray): Time vector (days).
		x (ndarray): Flux vector.
		quality (ndarray, None): Quality vector (bit-flags) from Kepler data; default=None.
		position (ndarray, None): Centroid positions of star on CCD as two column list; default=None.
		P (ndarray): Known planetary period (days); default=None.
		jumps (list): List of known jumps in the flux (timestamp in days); default=None.
		timescale_long (float): Timescale of long filter in days; default=3.
		timescale_short (float): Timescale of short filter in days; default=1/24.
		sigma_clip (float): Sigma-clip threshold; default=4.5.
		scale_clip (float): Scale at which to switch between long and short filters; default=5.
		scale_width (float): Width of transition region between filters; default=1.
		phase_smooth_factor (float): Fraction of period to smooth phase curve with; default=1000.
		transit_model (ndarray): Full transit model to be used instead of smoothed phase curve; default=None.
		it (integer): Number of iterations between different filters. Default=3.

	Returns:
		tnew	   - New time vector with the same length as the input vectors.
		xnew	   - New flux vector with the same length as the input vectors.
		sigma	   - Vector of estimated errors on measurements.
		flags	  - Vector of KASOC flags.
		filt	   - Vector with the final filter applied (after jump removal).
		turnover   - Turnover function with weights to long and short filter.
	"""

	# Basic check of input:
	N = len(t)
	assert N==len(x), "TIME and DATA do not have the same length"
	if not transit_model is None:
		assert N==len(transit_model), "TRANSIT_MODEL is wrong length"
	if not quality is None:
		assert N==len(quality), "QUALITY is wrong length"
	if not position is None:
		if not isinstance(position, dict): position = {'pixels': position, 'break': np.array([], dtype='float64')}
		assert position['pixels'].shape==(N, 2), "POSITION must have the shape (N,2)"
	assert it > 0, "IT must be at least one."

	# Get the logger to use for printing messages:
	logger = logging.getLogger(__name__)

	# Sort the data in ascending order of time (This is needed for median filters to work)
	indx_sorttime = argsort(t)
	x = x[indx_sorttime] # data sorted after time
	t = t[indx_sorttime] # sorted time
	if not quality is None: quality = quality[indx_sorttime] # sorted quality
	if not position is None: position['pixels'] = position['pixels'][indx_sorttime, :] # sorted position

	# If not correcting position and transits, don't iterate:
	if position is None and transit_model is None and P is None:
		it = 1

	# Find median cadence:
	dt = median(diff(t))

	# Use the quality values to filter out bad values:
	if not quality is None:
		x, tmpJumps, flag_removed = filter_flags(t, x, quality, return_flags=True)
		if len(tmpJumps) > 0:
			if jumps is None:
				jumps = tmpJumps
			else:
				jumps = append(jumps, tmpJumps)
	else:
		flag_removed = ~isfinite(x)

	# Remove jumps:
	if not jumps is None:
		logger.info('Removing jumps...')
		x, jumps_flag, flag_jumps2 = remove_jumps(t, x, jumps, return_flags=True)

	# Fill gaps in timeseries with NaN
	# "ori" is a flag so xg[ori] will retrive the original points
	logger.info('Filling gaps...')
	tg, xg, ori = gap_fill(t, x, timescale_long)
	Ng = len(tg)

	# Calculate wide median filter and possibly filter out
	# flux changes correlated with stars position on CCD:
	if not position is None:
		logger.info('Extracting position information...')
		# Remove points that have been flagged as bad from positions:
		position['pixels'][flag_removed, :] = NaN
		# Fill the gaps in the position timeseries with NaNs:
		posg = empty((Ng, 2), dtype='float64'); posg.fill(NaN)
		posg[ori, :] = position['pixels']
		position['pixels'] = posg
		# Run subroutine which determines xlong and xpos using the positions:
		flag_bad_pos, star_movement = extract_star_movement_1d(tg, xg, position, dt=dt)

		# Number of columns to plot on the "decorrelation" plot:
		# NOTE: Not "+2" as Nchunks is the number of breaks and not the number of chunks
		ncols = star_movement['Nchunks'] + 1
	else:
		flag_bad_pos = zeros(Ng, dtype='bool')
		ncols = 1

	flux_ylim = np.percentile(x[isfinite(x)], [0.25, 99.75])

	# Prepare the "decorrelation" figure:
	ax1 = ax2 = None
	figsize = [8*1.7, 6*1.7]
	figsize[0] = figsize[0] * max(ncols/3, 1)
	figsize[1] = figsize[1] * max(it/3, 1)
	fig = plt.figure(figsize=figsize)
	fig.canvas.set_window_title('Decorrelation')
	fig.subplots_adjust(hspace=0.05)

	# Repeat the determination of xlong and xpos to better disentangle them:
	xpos = zeros(Ng, dtype='float64')
	xtransit = zeros(Ng, dtype='float64')
	xpos[flag_bad_pos] = NaN # Set points found to be bad to NaN so they wont contribute in the following
	for i in range(it):
		logger.info("Running %d iteration:", i+1)

		# Create long moving median, by removing previously found xpos and xtransit:
		logger.info('  Calculating long moving median...')
		xinp = xg - xpos - xtransit
		xlong = moving_nanmedian(tg, xinp, timescale_long, dt=dt)
		xlong[flag_bad_pos] = NaN

		# Create first column of plot with determination of xlong:
		ax1 = fig.add_subplot(it, ncols, ncols*i+1, sharex=ax1)
		ax1.scatter(tg, xinp, color='k', s=1, alpha=0.5)
		ax1.plot(tg, xlong, 'g-')
		ax1.set_xlim(tg[0], tg[-1])
		ax1.set_ylim(flux_ylim)
		ax1.set_ylabel(r'Flux (e$^-$/s)')
		plt.yticks(fontsize=10)
		ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
		if i==0: ax1.set_title(r'$x_\mathrm{long}$')
		if i == it-1:
			ax1.set_xlabel('Time (days)', fontsize=10)
			plt.xticks(fontsize=10)
		else:
			plt.setp(ax1.get_xticklabels(), visible=False)

		# Filter the timeseries for the star movement:
		if not position is None:
			logger.info('  Filtering star movements...')
			xinp = xg - xlong - xtransit
			xpos = filter_position_1d(tg, xinp, star_movement, dt=dt)

			for kc,chunk in enumerate(star_movement['chunks']):
				indx_possort = star_movement['indx_possort'][kc]
				curvelength_chunk = star_movement['curvelength'][kc]

				ax2 = fig.add_subplot(it, ncols, ncols*i+kc+2)

				ax2.scatter(curvelength_chunk, xinp[chunk][indx_possort], color='k', s=1, alpha=0.5)
				ax2.plot(curvelength_chunk, xpos[chunk][indx_possort], 'r-')

				plt.yticks(fontsize=10)
				if i==0: ax2.set_title('Position-flux #%d'%(kc+1))
				if i==it-1:
					ax2.set_xlabel('Curve length (pixels)', fontsize=10)
					plt.xticks(fontsize=10)
				else:
					plt.setp(ax2.get_xticklabels(), visible=False)

			# The next column with xpos as a function of time:
			ax3 = fig.add_subplot(it, ncols, ncols*(i+1), sharex=ax1)
			ax3.scatter(tg, xinp, color='k', s=1, alpha=0.5)
			ax3.plot(tg, xpos, 'r-')
			ax3.set_xlim(tg[0], tg[-1])
			plt.yticks(fontsize=10)
			ax3.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
			if i == 0: ax3.set_title(r'$x_\mathrm{pos}$')
			if i == it-1:
				ax3.set_xlabel('Time (days)', fontsize=10)
				plt.xticks(fontsize=10)
			else:
				plt.setp(ax3.get_xticklabels(), visible=False)

		# Calculate phase-curve, if periods are provided:
		if not P is None:
			logger.info("  Calculating phase curve...")
			xtransit = filter_phase(tg, xg-xlong-xpos, P, smooth_factor=phase_smooth_factor)
		elif not transit_model is None:
			# Create filter using transit model
			# Do it in this way since transit model is relative with respect to 1
			# Fill gaps of transit model the same way as the data:
			xtransit = np.ones(Ng)
			xtransit[ori] = transit_model
			filt = (xlong+xpos) * xtransit
			xtransit = filt - (xlong+xpos)

	# Save the figure:
	if _output_folder is not None:
		fig.savefig(os.path.join(_output_folder, _output_prefix+'decorrelation.'+_output_format), format=_output_format, bbox_inches='tight')
	if _output_format != 'native':
		plt.close(fig)

	# Make sure we have removed the bad datapoints:
	xg[flag_bad_pos] = NaN

	# Construct the final filter:
	filt = xlong + xtransit + xpos

	# Run the old KASOC filter to remove any potential unknown transits and sharp features:
	if not timescale_short is None:
		# Make a switch for long cadence data that puts a lower limit on timescale_short of 7 points (3.5 hours for LC)
		if timescale_short < 7*dt:
			logger.warning("WARNING: timescale_short is less than 7 points wide!")

		# Smooth the data with short moving median:
		logger.info("Calculating short moving median...")
		xshort = moving_nanmedian(tg, xg-filt, timescale_short, dt=dt)
		xshort_tilde = dc(xshort)
		xshort = filt + xshort

		# Create timeseries of the long filter, divided by the short filter:
		w4 = filt/xshort - 1

		# Smooth the timeseries using a very short filter to remove any very high frequency noise:
		w4_smooth_width = int(timescale_short/dt)
		w4 = smooth(w4, w4_smooth_width)
		w4 = smooth(w4, w4_smooth_width)
		w4 = smooth(w4, w4_smooth_width)

		# Calculate moving standard deviation of timeseries
		# in units of sigmas:
		w5 = moving_nanmedian(tg, np.abs(w4), timescale_short)
		snr = w5/nanmedian(w5)

		# Create "flag"/weight indicating how much of the short filter and the long filter should
		# be used at each timestep. Is a number between 0 (long filter) and 1 (short filter).
		if scale_width > 0:
			turnover = norm.cdf(snr, scale_clip, scale_width)
		else:
			# For zero width, use the Heaviside function:
			turnover = 0.5*(np.sign(snr-scale_clip) + 1)

		# Create final filter as weighted mean of the long and short filters:
		filt = (1-turnover)*filt + turnover*xshort

		# Plot the derived filter compoments:
		if not _output_folder is None:
			fig = plt.figure() # num='turnover'
			fig.canvas.set_window_title('turnover')
			fig.subplots_adjust(hspace=0.05)
			ax1 = plt.subplot(211)
			ax1.axhspan(scale_clip-scale_width, scale_clip+scale_width, facecolor='0.5', edgecolor=None, alpha=0.5)
			ax1.plot(tg, snr, 'b-')
			ax1.set_ylabel(r'$\sigma_w$', fontsize=10)
			ax1.set_title('Filter turnover function', fontsize=12)
			ax1.set_xlim(t[0], t[-1])
			ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
			plt.yticks(fontsize=10)
			plt.setp(ax1.get_xticklabels(), visible=False)
			# Axes showing the derived weights:
			ax2 = plt.subplot(212, sharex=ax1)
			ax2.plot(tg, turnover, 'b-')
			ax2.set_ylim(0, 1)
			ax2.set_ylabel('$c$', fontsize=10)
			ax2.set_xlabel('Time', fontsize=10)
			ax2.set_xlim(t[0], t[-1])
			ax2.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
			plt.xticks(fontsize=10)
			plt.yticks(fontsize=10)
			if _output_format != 'native':
				fig.savefig(os.path.join(_output_folder, _output_prefix+'turnover.'+_output_format), format=_output_format, bbox_inches='tight')
				plt.close(fig)
	else:
		xshort = zeros(Ng)
		xshort_tilde = xshort
		turnover = zeros(Ng)

	# Flag with significant sharp and negative features (transits?):
	flag_transit = (turnover > 0.5) & (xshort < xlong+xpos+xtransit)

	# Plot the final filter:
	if not _output_folder is None:
		mask_long = isfinite(xlong)
		mask_short = isfinite(xshort)
		mask_filt = isfinite(filt)

		fig = plt.figure()
		fig.canvas.set_window_title('components')
		ax = fig.add_subplot(111)
		h1 = plt.scatter(t, x, color='k', s=2)
		h2, = plt.plot(tg[mask_long], xlong[mask_long], 'b-')
		h5, = plt.plot(tg, xlong+xpos+xtransit, 'y-')
		h3, = plt.plot(tg[mask_short], xshort[mask_short], 'g-')
		h4, = plt.plot(tg[mask_filt], filt[mask_filt], 'r-')
		ax.plot(tg[flag_transit], xg[flag_transit], 'go', markersize=2)
		plt.legend([h1, h2, h5, h3, h4], ['Data', r'$x_{\rm long}$', r'$x_{\rm pos}+x_{\rm transit}$', r'$x_{\rm short}$', 'Final filter'], fontsize=8, ncol=2, loc='best')
		ax.set_xlabel('Time', fontsize=10)
		ax.set_ylabel('Flux', fontsize=10)
		ax.set_xlim(t[0], t[-1])
		ax.set_ylim(flux_ylim)
		plt.xticks(fontsize=10)
		plt.yticks(fontsize=10)
		ax.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
		if _output_format != 'native':
			fig.savefig(os.path.join(_output_folder, _output_prefix+'components.'+_output_format), format=_output_format, bbox_inches='tight')
			plt.close(fig)

	# Apply final filter and convert to ppm:
	xg = 1e6*(xg/filt - 1)

	# Remove outliers using sigma-clipping:
	# The mean is already taken out, so we only
	# need to calculate the deviation from zero.
	logger.info("Calculating sigma...")
	flag_bad = array([False]*Ng, dtype='bool')
	absx = np.abs(xg)
	sigma = moving_nanmedian(tg, absx, timescale_long, dt=dt)
	if not sigma_clip is None:
		sigma_clip = mad_to_sigma * sigma_clip # less expensive to convert sigma_clip than sigma vector
		# 9. Estimate the point-to-point error from final timeseries:
		# We need to re-do it because bad data points might have biased
		# the previously calculated sigmas
		flag_bad = (absx > sigma_clip*sigma)
		#############################
		while True:
			flag_rem = (absx > sigma_clip*sigma)
			if flag_rem.any():
				# Remove bad data points from timeseries:
				flag_bad[flag_rem] = True
				absx[flag_rem] = NaN
				sigma = moving_nanmedian(tg, absx, timescale_long, dt=dt)
			else:
				break
		#############################
		# Bad data points should also be NaN:
		xg[flag_bad] = NaN

	# Convert to proper sigma instead of MAD:
	indx = ~isfinite(xg)
	sigma[indx] = NaN
	sigma = mad_to_sigma * smooth(sigma, int(timescale_long/dt))
	sigma[indx] = NaN

	# Return results:
	# Remove the gap-filled data again:
	x = xg[ori]
	sigma = sigma[ori]
	filt = filt[ori]
	flag_bad = flag_bad[ori]
	turnover = turnover[ori]
	flag_transit = flag_transit[ori]
	flag_bad_pos = flag_bad_pos[ori]
	xlong = xlong[ori]
	xpos = xpos[ori]
	xtransit = xtransit[ori]

	# Return this instead of xshort, so the filter is easier to
	# "disacemble" into the components, since this means that the
	# filter can be written as:
	#   filter = xlong + xpos + xtransit + xshort
	xshort = turnover * xshort_tilde[ori]

	# Create KASOC flag vector:
	quality_flags = zeros(N, dtype='int64')
	quality_flags[flag_removed] += 1
	if not jumps is None: quality_flags += flag_jumps2
	quality_flags[flag_bad] += 8
	quality_flags[flag_transit] += 16
	if not position is None:
		quality_flags[flag_bad_pos] += 32
		# Find the indices of points just after position breaks:
		if len(star_movement['tbreaks']) >= 3:
			ibreak = searchsorted(t, star_movement['tbreaks'][1:-1])
			quality_flags[ibreak] += 64

	# Check that the extracted errorbars make sense:
	indx_invalid_sigma = (sigma < 1e-8)
	#indx_invalid_sigma = (sigma < 0.01*nanmedian(sigma))
	#nms = nanmedian(sigma)
	#fig = plt.figure()
	#ax = fig_addsubplot(111)
	#ax.plot(t, sigma, 'b-')
	#ax.axhline(0.01*nms, color='k', ls='--')
	#ax.axhline(0.05*nms, color='k', ls='--')
	#ax.set_ylabel(r'$\sigma$ (ppm)', fontsize=10)
	#ax.set_xlabel('Time', fontsize=10)
	#plt.close(fig)
	if np.any(indx_invalid_sigma):
		# Generate a warning message:
		number_invalid_sigma = np.sum(indx_invalid_sigma)
		try:
			logger.warning("Invalid SIGMAs extracted (%d points = %.2f%%). Timescales should maybe be adjusted.", number_invalid_sigma, 100*number_invalid_sigma/N)
			warnings.warn("Invalid SIGMAs extracted", InvalidSigmasWarning)
		except IOError:
			print("Something went wrong in the logging of invalid sigmas")
		# Set the timeseries to NaN where sigmas are invalid,
		# and add a flag (128) to the quality-flags:
		x[indx_invalid_sigma] = NaN
		sigma[indx_invalid_sigma] = NaN
		quality_flags[indx_invalid_sigma] += 128

	# Plot the final filtered timeseries:
	if not _output_folder is None:
		fig = plt.figure()
		fig.canvas.set_window_title('final filter')
		fig.subplots_adjust(hspace=0.05)
		ax1 = plt.subplot(211)
		ax1.plot(t, x, 'b.', markersize=2)
		ax1.set_xlim(t[0], t[-1])
		ax1.set_ylabel('Relative flux (ppm)', fontsize=10)
		ax1.set_title("Final timeseries", fontsize=12)
		ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
		plt.setp(ax1.get_xticklabels(), visible=False)
		plt.yticks(fontsize=10)
		ax2 = plt.subplot(212, sharex=ax1)
		ax2.plot(t, sigma, 'b-')
		ax2.set_ylabel(r'$\sigma$ (ppm)', fontsize=10)
		ax2.set_xlabel('Time', fontsize=10)
		ax2.set_xlim(t[0], t[-1])
		plt.xticks(fontsize=10)
		plt.yticks(fontsize=10)
		ax2.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
		if _output_format != 'native':
			fig.savefig(os.path.join(_output_folder, _output_prefix+'final.'+_output_format), format=_output_format, bbox_inches='tight')
			plt.close(fig)

	# Return everything needed:
	return t, x, sigma, quality_flags, filt, turnover, xlong, xpos, xtransit, xshort
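
The long/short switching above is driven by a normal-CDF "turnover" weight of the local signal-to-noise; a minimal sketch of that weighting (scipy assumed):

import numpy as np
from scipy.stats import norm

snr = np.linspace(0, 10, 11)                        # local S/N of the residual
scale_clip, scale_width = 5.0, 1.0
turnover = norm.cdf(snr, scale_clip, scale_width)   # 0 -> long filter, 1 -> short filter
xlong_val, xshort_val = 100.0, 90.0                 # hypothetical filter values at one cadence
filt_val = (1 - turnover) * xlong_val + turnover * xshort_val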
Example #19
def filter_phase(t, x, Plist, smooth_factor=1000):
	"""
	Filter out specific periods by smoothing the phase-curve.

	Parameters:
		t (ndarray): Time vector (days).
		x (ndarray): Flux vector.
		Plist (list): List of periods to remove.
		smooth_factor (float, optional): Factor of phase to use as smooth width.

	Returns:
		Filter flux vector that can be removed from timeseries.

	Note:
		Does not require time to be sorted.
		Can handle NaN in flux vector.
	"""

	# Prepare arrays:
	Plist = np.atleast_1d(Plist) # Hack to handle 0-dim input
	Np = len(Plist)
	Nt = len(t)
	phase = zeros((Np,Nt), dtype='float64')
	indx = zeros((Np,Nt), dtype='int')
	indx_inv = zeros((Np,Nt), dtype='int')
	phase_tot = zeros(Nt, dtype='float64')
	phase_smooth_t = zeros((Np,Nt), dtype='float64')
	dphase = zeros(Np, dtype='float64')

	# Loop through periods to be removed:
	for k in range(Np):
		# Calculate the phase and sort it:
		phase[k] = mod(t, Plist[k])
		indx[k] = argsort(phase[k])
		indx_inv[k] = argsort(indx[k])
		dphase[k] = median(diff( phase[k,indx[k]] ))

		# Calculate smooth version of the phase curve:
		phase_smooth = _filter_single_phase(phase[k,indx[k]], x[indx[k]]-phase_tot[indx[k]], Plist[k]/smooth_factor, dphase[k])
		# Un-sort phase_smooth back to time-sorted order:
		phase_smooth_t[k] = phase_smooth[indx_inv[k]]
		# Add to the total phase filter:
		phase_tot += phase_smooth_t[k,:]

		# If removing multiple periods perform iterative procedure where
		# phase curves are added and removed to avoid cross-talk between periods:
		if k != 0:
			for j in range(k):
				# Add the transit back into to the timeseries (by subtracting it from the filter):
				phase_tot -= phase_smooth_t[j,:]
				# Re-calculate the phase curve of the transit:
				phase_smooth = _filter_single_phase(phase[j,indx[j]], x[indx[j]]-phase_tot[indx[j]], Plist[j]/smooth_factor, dphase[j])
				phase_smooth_t[j] = phase_smooth[indx_inv[j]]
				# Remove the transit again:
				phase_tot += phase_smooth_t[j,:]

	# Make plots of phase curves:
	if not _output_folder is None:
		# Find the point on the smoothed curve that deviates the most from zero:
		imax = nanargmax(np.abs(phase_smooth_t), axis=1)

		s = nanstd(x)
		fig = plt.figure()
		fig.canvas.set_window_title('phasecurve')
		fig.subplots_adjust(hspace=0.05)
		for k,P in enumerate(Plist):
			# Plot phasecurve for this period:
			ax = plt.subplot(Np, 1, k+1)
			ax.plot(phase[k]/P, x, 'k.', markersize=2) # No need to sort if we only plot points
			ax.plot(phase[k,indx[k]]/P, phase_smooth_t[k,indx[k]], 'r-')
			ax.axvline(phase[k,imax[k]]/P, color='b', linestyle='--') # Line indicating the (likely) planet transit
			ax.set_xlim(0, 1)
			ax.set_ylim(-6*s, 6*s)
			ax.text(0.02, 0.97, 'P = %f d'%(P), horizontalalignment='left', verticalalignment='top', transform=ax.transAxes, backgroundcolor='w', color='k')
			if k!=Np-1: plt.setp(ax.get_xticklabels(), visible=False)
		ax.set_xlabel('Phase')
		fig.text(0.03, 0.5, u'Flux (counts/s)', ha='center', va='center', rotation='vertical', transform=fig.transFigure)
		if _output_format != 'native':
			fig.savefig(os.path.join(_output_folder, _output_prefix+'phasecurve.'+_output_format), format=_output_format, bbox_inches='tight')
			plt.close(fig)

	# Return the total time-sorted phase curve:
	return phase_tot
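
At its core the phase filter folds the time stamps on each period and smooths flux as a function of phase; a minimal sketch of the folding and sorting bookkeeping used above (the _filter_single_phase smoother itself is assumed):

import numpy as np

P = 3.2                                       # hypothetical period (days)
t = np.arange(0, 90, 0.02)
x = np.where(np.mod(t, P) < 0.1, -50.0, 0.0) + np.random.default_rng(4).normal(0, 5, t.size)

phase = np.mod(t, P)                          # fold on the period
indx = np.argsort(phase)                      # sort by phase ...
indx_inv = np.argsort(indx)                   # ... and remember how to undo the sort
dphase = np.median(np.diff(phase[indx]))      # typical phase spacing, as used above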
Example #20
def extract_star_movement_1d(time, flux, position, dt=None, rapid_movement_sigma_clip=5.0, pixel_off_clip=15.0):
	"""Extract information about star movement on CCD to be used later.

	Args:
		time:      Vector of timestamps.
		flux:      Vector of flux values.
		position:  Nx2 matrix of (x,y) positions of star on CCD.
		dt:        Mean distance between points in the time vector.

	Returns:
		flags: Flags indicating bad data points.
		star_movement: Object containing info to be passed to @filter_position_1d.
	"""

	# Logger for printing messages:
	logger = logging.getLogger(__name__)

	# If dictionary is given, split it into the components:
	if isinstance(position, dict):
		position_breaks = np.atleast_1d(position['break'])
		position = np.atleast_2d(position['pixels'])
	else:
		position_breaks = np.array([], dtype='float64')

	# Check input:
	Ng = len(time)
	assert len(flux)==Ng, "TIME and FLUX should have the same number of elements."
	assert position.shape==(Ng,2), "TIME and POSITION should have the same number of elements."
	if dt is None: dt = median(diff(time))

	# Since many of the routines used here can not handle NaN values,
	# we start by removing all NaN values from the input, but store their
	# location so they can be inserted again later on:
	indx_finite = np.all(isfinite(position), axis=1)
	if not any(indx_finite): raise Exception("No valid positions")

	# Check that all the chunks defined by the breaks actually contain data:
	position_breaks = np.sort(position_breaks) # Make sure it is sorted in time
	Estart = append(append(time[0], position_breaks), time[-1]+dt/2)
	tbreaks = np.array(time[0], dtype='float64')
	for chk in range(1, len(Estart)):
		chunk = (time >= Estart[chk-1]) & (time < Estart[chk])
		if any(indx_finite & chunk):
			tbreaks = append(tbreaks, Estart[chk])
	Nchunks = len(np.atleast_1d(tbreaks))
	if Nchunks < 2:
		tbreaks = np.array([time[0], time[-1]+dt/2])
		Nchunks = 2

	# Plot the position of the star as a function of time:
	fig = plt.figure()
	fig.canvas.set_window_title('Pixel positions vs time')
	fig.subplots_adjust(hspace=0.05)
	ax1 = fig.add_subplot(211)
	#ax1.scatter(time, position[:,0], color='b', s=1)
	plt.yticks(fontsize=10)
	ax2 = fig.add_subplot(212, sharex=ax1)
	#ax2.scatter(time, position[:,1], color='r', s=1)
	for tbreak in tbreaks:
		ax1.axvline(tbreak, color='k', linestyle='--')
		ax2.axvline(tbreak, color='k', linestyle='--')
	ax1.set_ylabel('Row (pixels)', fontsize=10)
	ax2.set_ylabel('Column (pixels)', fontsize=10)
	ax2.set_xlabel('Time (days)', fontsize=10)
	plt.xticks(fontsize=10)
	plt.yticks(fontsize=10)
	plt.setp(ax1.get_xticklabels(), visible=False)
	ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
	ax2.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))

	# Initiate the Principal Component Analysis:
	pca = PCA(n_components=2)

	# Initiate the NearestNeighbors algorithm:
	NN = NearestNeighbors(n_neighbors=2, algorithm='kd_tree')

	# Prepare the plot which will be filled in the loop below:
	fig1 = plt.figure(figsize=(1.7*8,6))
	fig1.canvas.set_window_title('Pixel positions')
	fig1ax1 = fig1.add_subplot(121)
	#fig1ax1.scatter(position[:,0], position[:,1], color='k', s=1)
	fig1ax1.set_xlabel('$x$ (pixels)', fontsize=10)
	fig1ax1.set_ylabel('$y$ (pixels)', fontsize=10)
	plt.xticks(fontsize=10)
	plt.yticks(fontsize=10)
	fig1ax1.axis('equal')
	fig1ax2 = fig1.add_subplot(122)
	fig1ax2.set_xlabel(r'$x^\prime$ (pixels)', fontsize=10)
	fig1ax2.set_ylabel(r'$y^\prime$ (pixels)', fontsize=10)
	plt.xticks(fontsize=10)
	plt.yticks(fontsize=10)
	fig1ax2.axis('equal')

	fig2 = plt.figure()
	fig2.canvas.set_window_title('Position changes')
	fig2.subplots_adjust(hspace=0.05)
	fig2ax1 = fig2.add_subplot(211)
	fig2ax1.set_xlim(time[0], time[-1])
	fig2ax1.set_ylabel('$ds/dt$', fontsize=10)
	fig2ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
	plt.yticks(fontsize=10)
	plt.setp(fig2ax1.get_xticklabels(), visible=False)
	fig2ax2 = fig2.add_subplot(212)
	fig2ax2.scatter(time, flux, color='k', s=1, alpha=0.5)
	fig2ax2.set_xlim(time[0], time[-1])
	fig2ax2.set_ylabel('Flux', fontsize=10)
	fig2ax2.set_xlabel('Time (days)', fontsize=10)
	fig2ax2.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
	plt.xticks(fontsize=10)
	plt.yticks(fontsize=10)

	fig3 = plt.figure()
	fig3.canvas.set_window_title('Position sigma clipping')
	fig3ax1 = fig3.add_subplot(211)
	fig3ax1.set_xlabel('Time (days)', fontsize=10)
	fig3ax1.set_ylabel('Nearest neighbor distance (pixels)', fontsize=10)
	plt.xticks(fontsize=10)
	plt.yticks(fontsize=10)
	fig3ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter(useOffset=False))
	fig3ax1.set_xlim(time[0], time[-1])
	fig3ax2 = fig3.add_subplot(212)
	fig3ax2.set_xlabel('$x$ (pixels)', fontsize=10)
	fig3ax2.set_ylabel('$y$ (pixels)', fontsize=10)
	plt.xticks(fontsize=10)
	plt.yticks(fontsize=10)
	fig3ax2.axis('equal')

	fig4 = plt.figure()
	fig4.canvas.set_window_title('Position PCA axis')

	flag_bad_pos = zeros(Ng, dtype='bool')
	curvelength = []
	all_indx_possort = []
	all_indx_timesort = []
	all_chunks = []
	for chk in range(Nchunks-1):
		# Cut out positions that are in this chunk and are valid:
		chunk = (time >= tbreaks[chk]) & (time < tbreaks[chk+1])
		Nc = int(sum(chunk))
		indx_chunk_finite = indx_finite & chunk # indicies that are in chunk and finite
		pos = position[indx_chunk_finite, :]
		t = time[indx_chunk_finite]

		# Detect points where position changes rapidly (position tweek):
		# Calculate 2D distance between points and the derivative with respect to time:
		ds = nansum(diff(pos, axis=0)**2, axis=1)
		dsdt = np.sqrt(ds) / diff(t)
		dsdt = append(dsdt[0], dsdt)
		m = median(dsdt)
		absdsdt = np.abs(dsdt - m)
		rapid_threshold = rapid_movement_sigma_clip * mad_to_sigma * median(absdsdt)

		# Find the points where the position is changing rapidly:
		indx_rapid = absdsdt > rapid_threshold

		# Find points where x or y pixel position is far from median
		x_pos = pos[:,0]
		x_pos_med = np.nanmedian(x_pos)
		y_pos = pos[:,1]
		y_pos_med = np.nanmedian(y_pos)
		indx_pos_off_x = (np.abs(x_pos-x_pos_med)>pixel_off_clip)
		indx_pos_off_y = (np.abs(y_pos-y_pos_med)>pixel_off_clip)

		indx_pos_off = indx_pos_off_x + indx_pos_off_y
		indx_bad = indx_pos_off + indx_rapid
		flag_bad_pos[indx_chunk_finite] = indx_bad



		# Add to plot:
		ax1.scatter(t[~indx_bad], x_pos[~indx_bad], color='b', s=1)
		ax2.scatter(t[~indx_bad], y_pos[~indx_bad], color='r', s=1)

		fig2ax1.scatter(t[~indx_bad], dsdt[~indx_bad], color='k', s=1, alpha=0.5)
		fig2ax1.plot([t[0], t[-1]], [m+rapid_threshold, m+rapid_threshold], 'r--')
		fig2ax1.plot([t[0], t[-1]], [m-rapid_threshold, m-rapid_threshold], 'r--')

		# Remove points from the position vector used in the following:
		#pos_nn_pca = pos[~indx_pos_off, :]
		#t_nn_pca = t[~indx_pos_off]
		#print(len(pos_nn_pca), len(t_nn_pca))


		# Use nearest neighbour search to find distances between positions that are larger than the norm:
		distances, indices = NN.fit(pos).kneighbors(pos)
		#distances, indices = NN.fit(pos_nn_pca).kneighbors(pos_nn_pca)
		ndist = distances[:, 1]
		ndist -= median(ndist)
		distance_threshold = 4*mad_to_sigma*median(np.abs(ndist))
		indx_good = ndist < distance_threshold

		indx_good[indx_pos_off] = False


		fig3ax1.scatter(t[indx_good], ndist[indx_good], color='k', s=1, alpha=0.5)
		fig3ax1.scatter(t[~indx_good], ndist[~indx_good], color='r', s=2)
		fig3ax1.plot([t[0], t[-1]], [distance_threshold, distance_threshold], 'r--')
		fig3ax2.scatter(pos[indx_good,0], pos[indx_good,1], color='k', s=1, alpha=0.5)


		# Use Principal component Analysis to rotate positions
		pca.fit(pos[indx_good, :])

		# Apply the PCA model to the valid positions:
		pos2 = pca.transform(pos)

		# Create smooth curve along the movement:
		# TODO: This is sorting one time more than should be necessary.
		cl_per05, cl_per95 = np.nanpercentile(pos2[:,0], [5, 95])
		lowess_frac = 0.1/(cl_per95 - cl_per05)
		logger.debug("LOWESS Fraction=%f, (%f, %f)", lowess_frac, cl_per05, cl_per95)
		poscurve1 = lowess(pos2[:,1], pos2[:,0], frac=lowess_frac, it=3, is_sorted=False, return_sorted=False, missing='drop')
		poscurve1 = np.column_stack((pos2[:,0], poscurve1))
		chi2_1 = nansum((pos2[:,1]-poscurve1[:,1])**2)

		cl_per05, cl_per95 = np.nanpercentile(pos2[:,1], [5, 95])
		lowess_frac = 0.1/(cl_per95 - cl_per05)
		logger.debug("LOWESS Fraction=%f, (%f, %f)", lowess_frac, cl_per05, cl_per95)
		poscurve2 = lowess(pos2[:,0], pos2[:,1], frac=lowess_frac, it=3, is_sorted=False, return_sorted=False, missing='drop')
		poscurve2 = np.column_stack((pos2[:,1], poscurve2))
		chi2_2 = nansum((pos2[:,0]-poscurve2[:,1])**2)

		# Plot the PCA-rotated positions and the two candidate position curves:
		fig4ax1 = fig4.add_subplot(Nchunks-1, 2, 2*chk+1)
		fig4ax1.scatter(pos2[indx_good,0], pos2[indx_good,1], color='k', s=1, alpha=0.3)
		fig4ax1.scatter(poscurve1[indx_good,0], poscurve1[indx_good,1], color='r', s=2)
		fig4ax1.set_title(r"$\chi^2 = %f$" % chi2_1, fontsize=12)
		plt.xticks(fontsize=10)
		plt.yticks(fontsize=10)
		fig4ax1.axis('equal')
		fig4ax2 = fig4.add_subplot(Nchunks-1, 2, 2*chk+2)
		fig4ax2.scatter(pos2[indx_good,1], pos2[indx_good,0], color='k', s=1, alpha=0.3)
		fig4ax2.scatter(poscurve2[indx_good,0], poscurve2[indx_good,1], color='g', s=2)
		fig4ax2.set_title(r"$\chi^2 = %f$" % chi2_2, fontsize=12)
		plt.xticks(fontsize=10)
		plt.yticks(fontsize=10)
		fig4ax2.axis('equal')

		pos3 = empty(Nc, dtype='float64'); pos3.fill(NaN)
		if chi2_1 <= chi2_2:
			# Sort everything along the principal axis:
			pos3[indx_finite[chunk]] = pos2[:,0]
			poscurve = poscurve1[argsort(pos2[:,0]), :]
			indx_good2 = indx_good[argsort(pos2[:,0])]
		else:
			# Sort everything along the principal axis:
			pos3[indx_finite[chunk]] = pos2[:,1]
			poscurve = poscurve2[argsort(pos2[:,1]), :]
			indx_good2 = indx_good[argsort(pos2[:,1])]

				#print('chis', chi2_1, chi2_2)
				#print('Fin poscurve', poscurve[0:10,:])

		# Create version of position curve in original pixel-space:
		poscurve_pixels = pca.inverse_transform(poscurve)

		# Make vectors that will sort this chunk according to position and time:
		indx_possort = argsort(pos3)
		indx_timesort = argsort(indx_possort)

		# Calculate length along curve for each timestamp:
		dx = diff(poscurve[:,0])
		dy = diff(poscurve[:,1])
		ds = np.sqrt(dx**2 + dy**2) # Length of each segment
		cl = empty(Nc, dtype='float64'); cl.fill(NaN)
		cl[indx_finite[chunk][indx_possort]] = append(0, np.cumsum(ds)) # length of curve at each knot


		# Gather vectors that will sort this chunk according to position and time:
		all_indx_possort.append(indx_possort)
		all_indx_timesort.append(indx_timesort)
		all_chunks.append(chunk)
		curvelength.append(cl)

		# Add to plots:
		fig1ax1.scatter(x_pos[indx_good], y_pos[indx_good], color='k', s=1, alpha=0.5)
		fig1ax1.plot(poscurve_pixels[indx_good2,0], poscurve_pixels[indx_good2,1], 'r-')
		fig1ax2.scatter(pos2[indx_good,0]+pca.mean_[0], pos2[indx_good,1]+pca.mean_[1], color='k', s=1)
		fig1ax2.plot(poscurve[indx_good2,0]+pca.mean_[0], poscurve[indx_good2,1]+pca.mean_[1], 'r-')

	# Add the bad points to figure 2 lower panel:
	fig2ax2.scatter(time[flag_bad_pos], flux[flag_bad_pos], color='r', s=2)

	if _output_format != 'native':
		fig.savefig(os.path.join(_output_folder, _output_prefix+'pixel_time.'+_output_format), format=_output_format, bbox_inches='tight')
		plt.close(fig)
		fig1.savefig(os.path.join(_output_folder, _output_prefix+'pixel_positions.'+_output_format), format=_output_format, bbox_inches='tight')
		plt.close(fig1)
		fig2.savefig(os.path.join(_output_folder, _output_prefix+'pixel_positions_changes.'+_output_format), format=_output_format, bbox_inches='tight')
		plt.close(fig2)
		fig3.savefig(os.path.join(_output_folder, _output_prefix+'pixel_positions_sigma_clipping.'+_output_format), format=_output_format, bbox_inches='tight')
		plt.close(fig3)
		fig4.savefig(os.path.join(_output_folder, _output_prefix+'pixel_positions_pca.'+_output_format), format=_output_format, bbox_inches='tight')
		plt.close(fig4)

	# Gather the output needed by other functions into a dictionary
	# that will be passed around between functions:
	star_movement = {
		'curvelength': curvelength,
		'indx_possort': all_indx_possort,
		'indx_timesort': all_indx_timesort,
		'tbreaks': tbreaks,
		'Nchunks': Nchunks,
		'chunks': all_chunks
	}

	return flag_bad_pos, star_movement
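
# A minimal, self-contained sketch (not part of the original module) of the
# curve-length calculation used above: the length along a 2D curve is the
# cumulative sum of the segment lengths ds = sqrt(dx^2 + dy^2). The toy curve
# below is an assumption for illustration only.
import numpy as np

def curve_length(px, py):
    """Return the cumulative arc length at every knot of a 2D curve."""
    ds = np.sqrt(np.diff(px)**2 + np.diff(py)**2)  # length of each segment
    return np.append(0, np.cumsum(ds))             # length of curve at each knot

# Example: a quarter of a unit circle has arc length pi/2 ~ 1.5708.
theta = np.linspace(0, np.pi/2, 1000)
print(curve_length(np.cos(theta), np.sin(theta))[-1])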
Ejemplo n.º 21
0
def getFeaturesFromVectors_pair(vectorData, seg1, seg2, segData):
    X = myX()
    segDilated = segData.segDilated
    segBoundSize = segData.segBoundSize
    segSizes = segData.segSizes
    EdgeMapList = vectorData.EdgeMapList
    vectors = vectorData.segVectors
    clustersL2 = vectorData.segClustersL2
    ratios = vectorData.ratios
    regFeatures = []
    boundLine = np.logical_and(segDilated[seg1], segDilated[seg2])
    for e in EdgeMapList:
        bound = e[boundLine]
        bound = bound[bound > 0]
        regFeatures.append(np.mean(bound))  #Feature mean bound dist.

    boundOverlap1 = bound.size / segBoundSize[seg1]
    boundOverlap2 = bound.size / segBoundSize[seg2]
    regFeatures.append(max(boundOverlap1,
                           boundOverlap2))  #Feature max overlap with bound
    regFeatures.append(bound.size)  #Feature bound size
    size1 = segSizes[seg1]
    size2 = segSizes[seg2]
    regFeatures.append(size1 + size2)  #Feature New seg area

    X.regFeatures = regFeatures
    for ratioIdx, ratio in enumerate(ratios):
        cnnFeatures = []
        for numLayer in range(0, len(vectors[seg1][ratioIdx])):
            layerFeatures = []
            pair_dist = scipy.spatial.distance.cdist(
                vectors[seg1][ratioIdx][numLayer],
                vectors[seg2][ratioIdx][numLayer])
            layerFeatures.append(
                np.min(pair_dist))  #Feature L2 min dist in vec rep.
            layerFeatures.append(
                np.max(pair_dist))  #Feature L2 max dist in vec rep.
            layerFeatures.append(
                np.mean(pair_dist))  #Feature L2 average dist in vec rep.
            layerFeatures.append(
                bn.median(pair_dist))  #Feature L2 median dist in vec rep.
            layerFeatures.append(
                np.sqrt(
                    np.sum((clustersL2[seg1][ratioIdx][numLayer] -
                            clustersL2[seg2][ratioIdx][numLayer]
                            )**2)))  #Feature L2 dist between L2 clusters
            pair_dist = scipy.spatial.distance.cdist(
                vectors[seg1][ratioIdx][numLayer],
                vectors[seg2][ratioIdx][numLayer],
                metric='cosine')
            layerFeatures.append(
                np.min(pair_dist))  # Feature cosine min dist in vec rep.
            layerFeatures.append(
                np.max(pair_dist))  # Feature cosine max dist in vec rep.
            layerFeatures.append(
                np.mean(pair_dist))  # Feature cosine average dist in vec rep.
            layerFeatures.append(
                bn.median(pair_dist))  # Feature cosine median dist in vec rep.
            layerFeatures.append(
                cosine_distances(
                    np.array([clustersL2[seg1][ratioIdx][numLayer]]),
                    np.array([
                        clustersL2[seg2][ratioIdx][numLayer]
                    ]))[0][0])  #Feature cosine dist between L2 clusters

            cnnFeatures.append(layerFeatures)
        X.cnnFeatures[ratio] = cnnFeatures
    ImageFeatures = []
    ImageFeatures.append(
        np.sqrt((vectorData.segL[seg1] -
                 vectorData.segL[seg2])**2))  #Feature L channel dist
    ImageFeatures.append(
        np.sqrt((vectorData.segA[seg1] -
                 vectorData.segA[seg2])**2))  #Feature A channel dist
    ImageFeatures.append(
        np.sqrt((vectorData.segB[seg1] -
                 vectorData.segB[seg2])**2))  #Feature B channel dist
    X.ImageFeatures = ImageFeatures
    return X
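
# A small illustrative sketch (not from the original module) of the per-layer
# distance features computed above: the min/max/mean/median of the pairwise L2
# and cosine distances between the descriptor vectors of two segments. The toy
# arrays below are assumptions for demonstration only.
import numpy as np
import scipy.spatial.distance
import bottleneck as bn

vecs1 = np.random.rand(5, 8)   # hypothetical descriptors of segment 1
vecs2 = np.random.rand(7, 8)   # hypothetical descriptors of segment 2

d_l2 = scipy.spatial.distance.cdist(vecs1, vecs2)                    # 5x7 L2 distances
d_cos = scipy.spatial.distance.cdist(vecs1, vecs2, metric='cosine')  # 5x7 cosine distances
features = [np.min(d_l2), np.max(d_l2), np.mean(d_l2), bn.median(d_l2),
            np.min(d_cos), np.max(d_cos), np.mean(d_cos), bn.median(d_cos)]
print(features)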
Ejemplo n.º 22
0
def mad(datin, z=7, deriv=0, nozero=False):
    """
        Median absolute deviation test, either on raw values, 1st or 2nd derivatives.
        Returns mask with False everywhere except where <(md-MAD*z/0.6745) or >(md+MAD*z/0.6745).


        Definition
        ----------
        def mad(datin, z=7, deriv=0, nozero=False):


        Input
        -----
        datin      array; mad acts on axis=0


        Optional Input
        --------------
        z          Input is allowed to deviate maximum z standard deviations from the median (default: 7)
        deriv      0: Act on raw input; 1: Use first derivatives; 2: Use 2nd derivatives
        nozero     If True: exclude 0. from input


        Output
        ------
        mask with false everywhere except where input deviates more than z standard deviations from median


        Restrictions
        ------------
        If input is an array, mad checks along the zeroth axis for outliers.

        1st derivative is
            d = datin[1:n]-datin[0:n-1]
        because mean of left and right would give 0 for spikes.

        If all(d.mask==True) then return d.mask, which is all True


        Examples
        --------
        >>> import numpy as np
        >>> y = np.array([-0.25,0.68,0.94,1.15,2.26,2.35,2.37,2.40,2.47,2.54,2.62,
        ...               2.64,2.90,2.92,2.92,2.93,3.21,3.26,3.30,3.59,3.68,4.30,
        ...               4.64,5.34,5.42,8.01],dtype=np.float)

        # Normal MAD
        >>> print(mad(y))
        [False False False False False False False False False False False False
         False False False False False False False False False False False False
         False False]

        >>> print(mad(y,z=4))
        [False False False False False False False False False False False False
         False False False False False False False False False False False False
         False  True]

        >>> print(mad(y,z=3))
        [ True False False False False False False False False False False False
         False False False False False False False False False False False False
          True  True]

        # MAD on 2nd derivatives
        >>> print(mad(y,z=4,deriv=2))
        [False False False False False False False False False False False False
         False False False False False False False False False False False  True]

        # direct usage
        >>> my = np.ma.array(y, mask=mad(y,z=4))
        >>> print(my)
        [-0.25 0.68 0.94 1.15 2.26 2.35 2.37 2.4 2.47 2.54 2.62 2.64 2.9 2.92 2.92
         2.93 3.21 3.26 3.3 3.59 3.68 4.3 4.64 5.34 5.42 --]

        # MAD on several dimensions
        >>> yy = np.transpose(np.array([y,y]))
        >>> print(np.transpose(mad(yy,z=4)))
        [[False False False False False False False False False False False False
          False False False False False False False False False False False False
          False  True]
         [False False False False False False False False False False False False
          False False False False False False False False False False False False
          False  True]]

        >>> yyy = np.transpose(np.array([y,y,y]))
        >>> print(np.transpose(mad(yyy,z=3)))
        [[ True False False False False False False False False False False False
          False False False False False False False False False False False False
           True  True]
         [ True False False False False False False False False False False False
          False False False False False False False False False False False False
           True  True]
         [ True False False False False False False False False False False False
          False False False False False False False False False False False False
           True  True]]

        # Masked arrays
        >>> my = np.ma.array(y, mask=np.zeros(y.shape))
        >>> my.mask[-1] = True
        >>> print(mad(my,z=4))
        [True False False False False False False False False False False False
         False False False False False False False False False False False False
         False --]

        >>> print(mad(my,z=3))
        [True False False False False False False False False False False False
         False False False False False False False False False False False True
         True --]

        # Arrays with NaNs
        >>> ny = y.copy()
        >>> ny[-1] = np.nan
        >>> print(mad(ny,z=4))
        [ True False False False False False False False False False False False
         False False False False False False False False False False False False
         False False]

        >>> print(mad(ny,z=3))
        [ True False False False False False False False False False False False
         False False False False False False False False False False False  True
          True False]

        # Exclude zeros
        >>> zy = y.copy()
        >>> zy[1] = 0.
        >>> print(mad(zy,z=3))
        [ True  True False False False False False False False False False False
         False False False False False False False False False False False False
          True  True]

        >>> print(mad(zy,z=3,nozero=True))
        [ True False False False False False False False False False False False
         False False False False False False False False False False False False
          True  True]


        License
        -------
        This file is part of the JAMS Python package, distributed under the MIT
        License. The JAMS Python package originates from the former UFZ Python library,
        Department of Computational Hydrosystems, Helmholtz Centre for Environmental
        Research - UFZ, Leipzig, Germany.

        Copyright (c) 2011-2013 Matthias Cuntz - mc (at) macu (dot) de

        Permission is hereby granted, free of charge, to any person obtaining a copy
        of this software and associated documentation files (the "Software"), to deal
        in the Software without restriction, including without limitation the rights
        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
        copies of the Software, and to permit persons to whom the Software is
        furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be included in all
        copies or substantial portions of the Software.

        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
        SOFTWARE.


        History
        -------
        Written,  MC, Nov 2011
        Modified, MC, May 2012 - act on axis=0 of array
                  MC, Jun 2012 - axis=0 did not always work: spread md and MAD to input dimensions
                  MC, Jun 2012 - use np.diff, remove spreads
                  MC, Feb 2013 - ported to Python 3
                  MC & JM, Jul 2013 - loop over second dimension for medians, faster than array calculations :-(
                                      but use bottleneck for speed :-)
                  MC, Jul 2013 - (re-)allow masked arrays and NaNs in arrays
                  MC, Oct 2013 - nozero, bug in NaN treatment with dim=1
    """

    if nozero:
        idatin = datin.copy()
        ii = np.where(idatin == 0.)[0]
        if ii.size > 0: idatin[ii] = np.nan
    else:
        idatin = datin
    sn = list(np.shape(idatin))
    n  = sn[0]
    if deriv == 0:
        m      = n
        d      = idatin
    elif deriv == 1:
        m      = n-1
        sm     = sn
        sm[0]  = m
        d      = np.diff(idatin, axis=0)
    elif deriv == 2:
        m      = n-2
        sm     = sn
        sm[0]  = m
        d      = np.diff(idatin, n=2, axis=0)
    else:
        raise ValueError('Unimplemented option.')


    # Shortcut if all masked
    ismasked = type(d) == np.ma.core.MaskedArray
    if not ismasked:
        ii = np.where(~np.isfinite(d))[0]
        d  = np.ma.array(d)
        if ii.size > 0: d[ii] = np.ma.masked

    if np.all(d.mask == True):
        if ismasked:
            return d.mask
        else:
            return np.ones(d.shape, dtype=bool)

    # Median
    oldsettings = np.geterr()
    np.seterr(invalid='ignore')
    if d.ndim == 1:
        try:
            import bottleneck as bn
            dd = d.compressed()
            md = bn.median(dd)
            # Median absolute deviation
            MAD = bn.median(np.abs(dd-md))
            # Range around median
            thresh = MAD * (z/0.6745)
            # True where outside z-range
            res = (d<(md-thresh)) | (d>(md+thresh))
        except:
            dd = d.compressed()
            md = np.median(dd)
            # Median absolute deviation
            MAD = np.median(np.abs(dd-md))
            # Range around median
            thresh = MAD * (z/0.6745)
            # True where outside z-range
            res = (d<(md-thresh)) | (d>(md+thresh))
    elif d.ndim == 2:
        try:
            import bottleneck as bn
            res = np.empty(d.shape, dtype=bool)
            for i in range(d.shape[1]):
                di = d[:,i]
                dd = di.compressed()
                md = bn.median(dd)
                # Median absolute deviation
                MAD = bn.median(np.abs(dd-md))
                # Range around median
                thresh = MAD * (z/0.6745)
                # True where outside z-range
                res[:,i] = (d[:,i]<(md-thresh)) | (d[:,i]>(md+thresh))
        except:
            res = np.empty(d.shape, dtype=bool)
            for i in range(d.shape[1]):
                di = d[:,i]
                dd = di.compressed()
                md = np.median(dd)
                # Median absolute deviation
                MAD = np.median(np.abs(dd-md))
                # Range around median
                thresh = MAD * (z/0.6745)
                # True where outside z-range
                res[:,i] = (d[:,i]<(md-thresh)) | (d[:,i]>(md+thresh))
    else:
        np.seterr(**oldsettings)
        raise ValueError('datin.ndim must be <= 2')

    np.seterr(**oldsettings)
    if ismasked:
        return res
    else:
        resmasked = type(res) == np.ma.core.MaskedArray
        if resmasked: # got masked because of NaNs
            return np.where(res.mask, False, res)
        else:
            return res
Ejemplo n.º 23
0
import timeit

import numpy as np
import bottleneck as bn

setup = '''
import numpy as np
import bottleneck as bn
from scipy.stats import rankdata

np.random.seed(42)
a = np.random.randn(30)
'''


def time(code, setup, n):
    return timeit.Timer(code, setup=setup).repeat(3, n)


if __name__ == '__main__':
    n = 10**3
    print n, "pass", max(time("pass", "", n))
    print n, "min np.median", min(time('np.median(a)', setup, n))
    print n, "min bn.median", min(time('bn.median(a)', setup, n))
    a = np.arange(7)
    print "Median diff", np.median(a) - bn.median(a)
    func, _ = bn.func.median_selector(a, axis=0)
    print "Bottleneck median func name", func

    print n, "min scipy.stats.rankdata", min(time('rankdata(a)', setup, n))
    print n, "min bn.rankdata", min(time('bn.rankdata(a)', setup, n))
    func, _ = bn.func.rankdata_selector(a, axis=0)
    print "Bottleneck rankdata func name", func
Ejemplo n.º 24
0
def main(Data,GC_hist,args):
    #compute the scaled coverage
    print("finished reading the coverage data")
    bin_size=Data["bin_size"]
    args.min_bins=int(args.nbins/2)


    if not args.min_bins:
        print("Error: the minimum variant size is smaller than the bin size of the input data!")
        quit()

    for chromosome in Data["chromosomes"]:
        Data[chromosome]["ratio"]=[]
        for i in range(0,len(Data[chromosome]["coverage"])):

            if not Data[chromosome]["GC"][i] in GC_hist:
                Data[chromosome]["ratio"].append(-1)
            elif GC_hist[Data[chromosome]["GC"][i]][0] > 0 and not Data[chromosome]["GC"][i] == -1:
                if Data[chromosome]["coverage"][i]/GC_hist[Data[chromosome]["GC"][i]][0] < args.max:
                    Data[chromosome]["ratio"].append(Data[chromosome]["coverage"][i]/GC_hist[Data[chromosome]["GC"][i]][0])
                else:
                    Data[chromosome]["ratio"].append(-1)
            else:
                Data[chromosome]["ratio"].append(-1)
        Data[chromosome]["ratio"]=numpy.array(Data[chromosome]["ratio"])
                
    Data=calibrate_sex(Data)
    #filter the bins
    print("applying filters")
    Data=filter(Data,args.nbins*2)
    print("computing coverage histogram")
    ratio_hist=chromosome_hist(Data,args.Q)

    hist=coverage_hist(Data,ratio_hist)
    percentiles=numpy.percentile(hist,numpy.array(range(0,1001))/10.0)
    overall_sd=numpy.std(hist[ numpy.where(hist <= 2) ])
    print("derivative based segmentation")
    for chromosome in Data["chromosomes"]:
        Data[chromosome]["var"]=numpy.repeat( "NEUTRAL",len(Data[chromosome]["ratio"]) );
        Data[chromosome]["ratio"]=numpy.array(Data[chromosome]["ratio"])

        ratio_indexes=[]
        ratios=[]
        for i in range(1,len(Data[chromosome]["ratio"])):
            if Data[chromosome]["ratio"][i] >= 0:
                ratio_indexes.append(i)
                ratios.append(Data[chromosome]["ratio"][i])
        differences=[]
        for i in range(1,args.nbins+1):
            tmp=[]
            for j in range(0,len(ratios)-args.nbins):
                tmp.append( abs(ratios[j]-ratios[i+j]))
            differences.append(tmp)
        differences=numpy.array(differences)

        change_points=[]
        #print len(ratios)

        lim=2*overall_sd
        #lim=0.2
        for i in range(0,len(ratios)-args.nbins):
            changes=differences[:,i]
            #print "{} {}".format(lim,numpy.min(changes))
            if bottleneck.median(changes,axis=0) > lim and numpy.std(changes[1:]) < overall_sd:
                #print "{} {}".format(lim,numpy.min(changes))
                change_points.append(ratio_indexes[i])

        segments=[]
        change_points.append( len( Data[chromosome]["ratio"] ) )        
        for i in range(0,len(change_points)):
            if i == 0:
                segments.append(range(0,change_points[i]))
            elif i != len(change_points)-1:
                segments.append(range(change_points[i-1],change_points[i]))
            else:
                segments.append(range(change_points[i-1],len(Data[chromosome]["ratio"])))

        for segment in segments:
            segment_intensities= Data[chromosome]["ratio"][segment]
            non_filt_bins=segment_intensities[numpy.where(segment_intensities >= 0)]
            TYPE="NEUTRAL"
            med=bottleneck.median(non_filt_bins,axis=0)

            if len(non_filt_bins) < args.min_bins: 
                TYPE="FILT"
            elif med <= 1-0.5/args.plody:
                TYPE="DEL"
            elif med >= 1+0.5/args.plody:
                TYPE="DUP"             
            Data[chromosome]["var"][segment]=TYPE


    print("raw coverage segmentation")
    for chromosome in Data["chromosomes"]:
        for i in range(0,len(Data[chromosome]["ratio"])-10*args.nbins):
            seg_bins=Data[chromosome]["ratio"][i:i+10*args.nbins]

            if list(seg_bins).count(-1)/float(len(seg_bins)) >= 0.6:
                continue

            seg_bin_median=bottleneck.median(seg_bins[numpy.where(seg_bins >= 0)],axis=0)

            if seg_bin_median >= 1+overall_sd*2.5 and len(seg_bins[numpy.where(seg_bins >= 0.5/args.plody+1)])/float(len(seg_bins)) >= 0.9:
                Data[chromosome]["var"][i:i+10*args.nbins]="DUP"
            elif seg_bin_median <= 1-overall_sd*2.5 and len(seg_bins[numpy.where(seg_bins >= 1-0.5/args.plody)])/float(len(seg_bins)) >= 0.9:
                Data[chromosome]["var"][i:i+10*args.nbins]="DEL"

    print("merging")
    variants=segmentation(Data,args.min_bins)
    
    size_filtered_variants={}   
    for chromosome in variants:
        for variant in variants[chromosome]:
            if variant["bins"] >= args.min_bins:
                if not chromosome in size_filtered_variants:
                    size_filtered_variants[chromosome] = []
                size_filtered_variants[chromosome].append(variant)
  
    variants=merge(size_filtered_variants,args.min_bins)
    
    CNV_filtered={}
    for chromosome in variants:
        for variant in variants[chromosome]:
            if variant["type"] == "DUP" or variant["type"] == "DEL":
                if not chromosome in CNV_filtered:
                    CNV_filtered[chromosome] = []
                CNV_filtered[chromosome].append(variant)    
    
    #read the bam header
    args.contigs={}
    args.contig_order=[]
    if args.bam:
        with os.popen("samtools view -H {}".format(args.bam)) as pipe:
            for line in pipe:
                if line[0] == "@":
                    if "SN:" in line:
                        content=line.strip().split()
                        chromosome=content[1].split("SN:")[-1]
                        length=content[2].split("LN:")[-1]
                        args.contigs[chromosome]=length
                        args.contig_order.append(chromosome)
                    elif "\tSM:" in line and not args.sample:
                        args.sample=line.split("\tSM:")[-1].split("\t")[0].strip()

    #print the variants
    print("computing statistics")

    vals=[]
    counts={}
    n_variants=0
    for chromosome in Data["chromosomes"]:
        if chromosome in variants:
          for variant in variants[chromosome]:
            if variant["type"] == "DUP" or variant["type"] == "DEL" or 1 == 2:
                phred_non_param=retrieve_phred_non_param(variant["bins"],variant["ratio"],Data,ratio_hist)
                if not phred_non_param in counts:
                    vals.append(phred_non_param)
                    counts[phred_non_param]=0
                counts[phred_non_param]+=1
                variant["pred_non_param"]=phred_non_param
                n_variants+=1
    if n_variants:       
        args.scoren+=round(10*math.log10(n_variants/1000.0))
    else:
        args.scoren=1
    f=open(args.output,"w")

    f.write("##fileformat=VCFv4.1\n")
    f.write("##source=AMYCNE\n")
    f.write("##ALT=<ID=DEL,Description=\"Deletion>\n")
    f.write("##ALT=<ID=DUP,Description=\"Duplication\">\n")
    f.write("##INFO=<ID=RDR,Number=1,Type=Float,Description=\"Average coverage/reference ratio\">\n")
    f.write("##INFO=<ID=END,Number=1,Type=Integer,Description=\"The end position of the variant\">\n")
    f.write("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"The length of the variant\">\n")
    f.write("##INFO=<ID=BINS,Number=1,Type=Integer,Description=\"The number of bins used to call the variant\">\n")
    f.write("##INFO=<ID=SCOREF,Number=1,Type=Integer,Description=\"The variant score produced from Fishers method\">\n")
    f.write("##INFO=<ID=SCOREN,Number=1,Type=Integer,Description=\"The variant score produced from non-parametric sampling method\">\n")
    f.write("##INFO=<ID=QUAL,Number=1,Type=Float,Description=\"The fraction of low quality bins\">\n")
    f.write("##INFO=<ID=FAILED_BINS,Number=1,Type=Float,Description=\"The fraction of filtered bins\">\n")
    f.write("##INFO=<ID=ratio,Number=1,Type=Float,Description=\"Normalised coverage across the chromosome\">\n")
    f.write("##INFO=<ID=ratioMAD,Number=1,Type=Float,Description=\"normalised Median absolute deviation across the chromosome\">\n")
    f.write("##INFO=<ID=coverage,Number=1,Type=Float,Description=\"Median coverage of the chromosome\">\n")
    f.write("##INFO=<ID=coverageMAD,Number=1,Type=Float,Description=\"Median absolute deviation of the coverage across the chromosome\">\n")
    if args.contig_order:
        for contig in args.contig_order:
            f.write("##contig=<ID={},length={}>\n".format(contig,args.contigs[contig]))
    f.write("##FILTER=<ID=LowBinQual,Description=\"More than 90% of the bins have less than {} mapping quality\">\n".format(args.Q))
    f.write("##FILTER=<ID=RegionFilter,Description=\"More than 90% of the bins are flagged extremed GC and/or mapping quality\">\n")
    f.write("##FILTER=<ID=RatioFilter,Description=\"The RD ratio is less than 2 sd of the RD, or RDR higher than ratiolim\">\n")
    f.write("##FILTER=<ID=LowScore,Description=\"Low variant score\">\n")
    f.write("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">\n")
    f.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
    f.write("##nbins={} RDstdev={} ScoreNLimit={}\n".format(args.nbins,overall_sd,args.scoren))
    f.write("##AMYCNEcmd=\"{}\"\n".format(" ".join(sys.argv)))
    if not args.sample:
        args.sample=args.coverage.split("/")[-1].split(".")[0]
    f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n".format(args.sample))
    format_column="GT:CN"
    id_tag=0;
    for chromosome in Data["chromosomes"]:
        if chromosome in variants:
          for variant in variants[chromosome]:
            if variant["type"] == "DUP" or variant["type"] == "DEL" or 1 == 2:
                id_tag +=1
                filt="PASS"
                info_field="END={};SVLEN={};RDR={};BINS={}".format(bin_size*variant["end"],(variant["end"]-variant["start"]+1)*bin_size,variant["ratio"],variant["bins"] )
                CN=int(round(variant["ratio"]*args.plody))
                if "quality" in Data[chromosome]:
                    failed_bins=0
                    for i in range(variant["start"],variant["end"]):
                        if Data[chromosome]["quality"][i] < args.Q and Data[chromosome]["GC"][i] > 0 and Data[chromosome]["ratio"][i] > 0:
                            failed_bins += 1
                    if failed_bins/float(variant["end"]-variant["start"]) > 0.9:
                        filt="LowBinQual"
                    info_field +=";QUAL={}".format( failed_bins/float(variant["end"]-variant["start"]) )

                phred=retrieve_phred(variant["ratio_list"],variant["ratio"],percentiles)
                phred_non_param=variant["pred_non_param"]
                info_field+=";SCOREF={};SCOREN={}".format(phred,phred_non_param)
                #info_field+=";SCOREF={}".format(phred)

                if phred < args.scoref or phred_non_param < args.scoren:
                    filt="LowScore"                      
                
                failed_bins=0
                for i in range(variant["start"],variant["end"]):
                    if Data[chromosome]["ratio"][i] < 0:
                        failed_bins += 1
                if failed_bins/float(variant["end"]-variant["start"]) > 0.9:
                    filt="RegionFilter"
                if abs(variant["ratio"]-1) <= overall_sd*2 or abs(variant["ratio"]) > args.ratioLim:
                    filt="RatioFilter"

                info_field +=";FAILED_BINS={}".format( failed_bins/float(variant["end"]-variant["start"]) )                
                
                mean=numpy.average(variant["ratio_list"])
                SEM=numpy.std(variant["ratio_list"])/numpy.sqrt( len(variant["ratio_list"]) )
                ci="({},{})".format(round(mean-SEM*3,2),round(mean+SEM*3,2))

                firstrow = "{}\t{}\tAMYCNE_{}\tN\t<{}>\t{}\t{}".format(chromosome,bin_size*variant["start"],id_tag,variant["type"],phred_non_param,filt)
                info_field+=";ratio={};ratioMAD={};coverage={};coverageMAD={}".format(ratio_hist[chromosome][0],ratio_hist[chromosome][1],ratio_hist[chromosome][2],ratio_hist[chromosome][3])
                alt=abs((CN-args.plody))
                if alt > args.plody:
                    alt=args.plody
                ref=args.plody-alt
                genotype="/".join(["0"]*ref+["1"]*alt)
                format_field="{}\t{}:{}".format(format_column,genotype,CN)
                f.write("\t".join([firstrow,info_field,format_field])+"\n")
                
    f.close()
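
# A simplified, standalone sketch (not part of the original module) of the
# derivative-based change-point test used above: for every position, compare
# bins that lie 1..nbins apart and flag a change point where the median jump
# exceeds a limit (here 2*sd) while the spread of the jumps stays small.
# The data and parameters below are assumptions for illustration only.
import numpy
import bottleneck

def find_change_points(ratios, nbins, sd):
    differences = numpy.array([[abs(ratios[j] - ratios[i + j])
                                for j in range(len(ratios) - nbins)]
                               for i in range(1, nbins + 1)])
    lim = 2 * sd
    change_points = []
    for i in range(len(ratios) - nbins):
        changes = differences[:, i]
        if bottleneck.median(changes, axis=0) > lim and numpy.std(changes[1:]) < sd:
            change_points.append(i)
    return change_points

ratios = numpy.concatenate([numpy.ones(50), numpy.full(50, 1.5)])  # one copy-number step
print(find_change_points(ratios, nbins=10, sd=0.05))  # flags positions near the step at bin 50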
Ejemplo n.º 25
0
 def time_median(self, dtype, shape):
     bn.median(self.arr)
Ejemplo n.º 26
0
def medestnoise(x):
    return bn.median(np.abs(x)) * complex_std_est_factor
Ejemplo n.º 27
0
def medestnoise(x):
    return bn.median(np.abs(x))*complex_std_est_factor
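
# The constant complex_std_est_factor is not defined in this snippet. A minimal
# sketch, assuming it converts the median magnitude of zero-mean circular complex
# Gaussian noise into the per-component standard deviation: |x| is Rayleigh
# distributed with median sigma*sqrt(ln 4), so sigma ~= median(|x|)/sqrt(ln 4).
import numpy as np
import bottleneck as bn

complex_std_est_factor = 1.0 / np.sqrt(np.log(4.0))  # assumed scaling, ~0.8493

def medestnoise_sketch(x):
    return bn.median(np.abs(x)) * complex_std_est_factor

rng = np.random.default_rng(0)
x = rng.normal(0, 1.0, 100000) + 1j*rng.normal(0, 1.0, 100000)
print(medestnoise_sketch(x))  # close to the per-component sigma of 1.0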
Ejemplo n.º 28
0
def freqextr(lightcurve,
             n_peaks=6,
             n_harmonics=0,
             hifac=1,
             ofac=4,
             snrlim=None,
             snr_width=None,
             faplim=1 - 0.9973,
             devlim=0.5,
             conseclim=10,
             harmonics_list=None,
             Noptimize=10,
             optim_max_diff=10,
             initps=None):
    r"""
	Extract frequencies from timeseries.

	The program will perform iterative sine-wave fitting (CLEAN or pre-whitening) using
	a sum of harmonic functions of the following form:

	.. math::
		\sum_{i=1}^{N_\mathrm{peaks}} A_i \sin(2\pi\nu_i t + \delta_i)
		= \sum_{i=1}^{N_\mathrm{peaks}} \alpha_i\sin(2\pi\nu_i t) + \beta_i\cos(2\pi\nu_i t) \, ,

	where :math:`\nu_i`, :math:`A_i` and :math:`\delta_i` denoted the frequency, amplitude
	and phase of the oscillation.

	If ``n_harmonics`` is greater than zero, the routine will additionally extract peaks at the
	given number of harmonics for each extracted peak. The default is :math:`2\nu_i`,
	:math:`3\nu_i` etc., but this can be controlled by the ``harmonics_list`` input.

	At each iteration, an optimization loop is entered which will go back and re-optimize the
	previously found peaks in an attempt to minimize the mutual influence of close frequencies.
	The details of this optimization can be controlled by the parameters ``Noptimize`` and
	``optim_max_diff``.

	Parameters:
		lightcurve (:class:`lightkurve.LightCurve`): Lightcurve to extract frequencies for.
		n_peaks (int, optional): Number of frequencies to extract.
		n_harmonics (int, optional): Number of harmonics to extract for each frequency.
		hifac (float, optional): Nyquist factor.
		ofac (int, optional): Oversampling factor used for initial search for peaks
			in power spectrum.
		snrlim (float, optional): Limit on local signal-to-noise ratio above which peaks are
			considered significant. If set to `None` no limit is enforced. Default is to not
			enforce a limit.
		snr_width (float, optional): Width in uHz around each peak to estimate signal-to-noise from.
			Default is 15 frequency steps on either side of the peak.
		faplim (float, optional): False alarm probability limit. Peaks with a f.a.p. below this
			limit are considered significant. If set to `None` no limit is enforced.
			Default is 1-0.9973=0.0027.
		devlim (float, optional): Limit on the allowed deviation between the power of an extracted
			peak and the corresponding power in the original powerspectrum; peaks whose power ratio
			falls outside ``[devlim, 1/devlim]`` are rejected. If set to `None` no limit is
			enforced. Default is 0.5.
		conseclim (int, optional): Stop after this number of consecutive failed peaks.
			Default is 10.
		Noptimize (int, optional): Number of nearby, previously extracted peaks to re-optimize at
			each iteration. If set to -1, all peaks will be optimized at each iteration. Default is 10.
		optim_max_diff (float, optional): Maximal difference in uHz between frequencies to be
			optimized. Any frequencies further away than this value from the extracted peak
			will not be optimized in that iteration. If set to ``None`` no limit is enforced.
			Default is 10 uHz. Please note that this does not take the spectral window function
			into account, so this value may have to be increased in cases where the window function
			has significant side-lobes.
		initps (:class:`powerspectrum`, optional): Initial powerspectrum. Should be a powerspectrum
			calculated from the provided lightcurve. This can be provided if the powerspectrum
			has already been calculated. If not provided, it is calculated from the provided
			lightcurve.

	Returns:
		:class:`astropy.table.Table`: Table of extracted oscillations.

	Note:
		If the height of the peak of one of the harmonics is close to being insignificant,
		the harmonic may not be found as a harmonic, but may instead be found later as a peak in its own right.

	.. codeauthor:: Kristine Kousholt Mikkelsen <*****@*****.**>
	.. codeauthor:: Rasmus Handberg <*****@*****.**>
	"""

    logger = logging.getLogger(__name__)

    # Default value for different parameters
    # TODO: Add these as inputs
    estimate_noise = True

    if initps is not None and not isinstance(initps, powerspectrum):
        raise ValueError("Initial powerspectrum is invalid")

    if Noptimize is None:
        Noptimize = 0

    # If no list of harmonics is given, do the simple one:
    if harmonics_list is None:
        harmonics_list = np.arange(2, n_harmonics + 2)
    elif len(harmonics_list) < n_harmonics:
        raise ValueError("List of harmonics is too short")

    # Constants:
    power_median_to_mean = (1 - 1 / 9)**-3
    mean_noise = 1

    # Store original lightcurve and powerspectrum for later use:
    original_lightcurve = lightcurve.copy()
    if initps is None:
        original_ps = powerspectrum(original_lightcurve)
    else:
        original_ps = initps
    f_max = original_ps.nyquist * hifac * 1e6
    df = original_ps.df * 1e6

    # Defaults that depend on the power spectrum parameters:
    if snr_width is None:
        snr_width = 15 * df

    # Create lists for frequencies, alpha, beta and deviations:
    # Create as 2D array, which as main-frequency number for rows, and harmonic number for columns.
    nu = np.full((n_peaks, n_harmonics + 1), np.nan)
    alpha = np.full((n_peaks, n_harmonics + 1), np.nan)
    beta = np.full((n_peaks, n_harmonics + 1), np.nan)
    deviation = np.full((n_peaks, n_harmonics + 1), np.nan)

    # The first powerspectrum has already been calculated:
    ps = original_ps.copy()

    for i in range(n_peaks):
        logger.debug("-" * 72)

        # Calculate the powerspectrum and find the index of the largest power value
        if i > 0:
            ps = powerspectrum(lightcurve)
        frequency, power = ps.powerspectrum(oversampling=ofac,
                                            nyquist_factor=hifac,
                                            scale='power')

        # Estimate a frequency-dependent noise-floor by binning the power spectrum.
        if estimate_noise:
            # Create bins to estimate noise level in:
            #bins = np.logspace(np.floor(np.log10(df)), np.ceil(np.log10(f_max)), 20)
            bins = np.linspace(df, f_max, 20)

            # Calculate the median in the bins.
            # Make sure we have at least 20 frequencies in each bin,
            # otherwise combine adjacent bins until this is the case:
            for _ in range(100):
                mean_noise, bins, binindx = binned_statistic(frequency,
                                                             power,
                                                             bins=bins,
                                                             statistic=median)

                redo = False
                for k, num in enumerate(np.bincount(binindx)):
                    if num < 20:
                        bins = np.delete(bins, k)
                        redo = True
                        break

                if not redo:
                    break

            bins = bins[:-1] + 0.5 * (bins[1:] - bins[:-1])

            indx = np.isfinite(mean_noise)
            if np.sum(indx) > 2:
                mean_noise_func = interp1d(bins[indx],
                                           mean_noise[indx],
                                           kind='linear',
                                           fill_value='extrapolate',
                                           assume_sorted=True)
                mean_noise = power_median_to_mean * mean_noise_func(frequency)
                mean_noise = np.clip(mean_noise, 0, None)
                mean_noise += 1  # Add one to avoid DivideByZero errors - only used for finding max
            else:
                mean_noise = 1

            #plt.figure()
            #plt.plot(frequency, np.sqrt(power/mean_noise), 'k-', lw=0.5)
            #plt.plot(frequency, power, 'b')
            #plt.plot(frequency, mean_noise,'k-')
            #plt.title(i)
            #plt.show()

        # Finds the frequency of the largest peak:
        pmax_index = np.argmax(power / mean_noise)
        fsearch = frequency[pmax_index]
        if pmax_index > 0 and pmax_index < len(power) - 1:
            fsearch = [
                frequency[pmax_index - 1], fsearch, frequency[pmax_index + 1]
            ]
        nu[i, 0] = ps.optimize_peak(fsearch)
        alpha[i, 0], beta[i, 0] = ps.alpha_beta(nu[i, 0])
        logger.debug('Fundamental frequency: %f', nu[i, 0])

        # Stop if significance becomes too low (lombscargle significance)
        if faplim is not None:
            FAP = ps.false_alarm_probability(nu[i, 0])
            if FAP > faplim:
                logger.debug("Stopped from FAP")
                nu[i, 0] = np.nan
                alpha[i, 0] = np.nan
                beta[i, 0] = np.nan
                break

        # Stop if significance becomes too low (SNR ratio)
        if snrlim is not None:
            # Calculate SNR by estimating noise level locally around peak:
            # TODO: Subtract peak first?
            noise = np.sqrt(
                power_median_to_mean *
                median(power[(frequency > (nu[i, 0] - snr_width))
                             & (frequency < (nu[i, 0] + snr_width))]))
            amp = np.sqrt(alpha[i, 0]**2 + beta[i, 0]**2)
            snr = amp / noise
            logger.debug("SNR: %f", snr)

            #plt.figure()
            #plt.plot(frequency, np.sqrt(power), 'k-', lw=0.5)
            #plt.plot(frequency, np.sqrt(mean_noise), 'r-', lw=0.5)
            #plt.plot(frequency[pmax_index], np.sqrt(power[pmax_index]), 'go')
            #plt.plot(nu[i,0], ps.powerspectrum(nu[i,0]*1e-6, scale='amplitude')[1], 'ro')
            #plt.axhline(noise)
            #plt.axvline(nu[i,0] - snr_width)
            #plt.axvline(nu[i,0] + snr_width)

            if snr < snrlim:
                logger.debug("Stopped from SNR")
                nu[i, 0] = np.nan
                alpha[i, 0] = np.nan
                beta[i, 0] = np.nan
                break

        # Check how the extracted peak compares with the original powerspectrum
        if devlim is not None:
            atemp, btemp = original_ps.alpha_beta(nu[i, 0])
            deviation[i, 0] = (alpha[i, 0]**2 + beta[i, 0]**2) / (atemp**2 +
                                                                  btemp**2)

        # Stop if there are too many consecutive failed peaks
        if devlim is not None and conseclim is not None:
            # Stop numpy from warning us that deviation contains NaN
            with np.errstate(invalid='ignore'):
                deviation_large = (deviation > 1 / devlim) | (deviation <
                                                              devlim)
            if np.all(
                    deviation_large[max(i - conseclim, 0):(i + 1),
                                    0]):  # Only checking main peaks right now!
                logger.debug(
                    'Stopped due to too many consecutive failed peaks')
                break

        # Removes the largest peak from the data:
        lightcurve -= model(lightcurve.time, alpha[i, 0], beta[i, 0], nu[i, 0])

        # Loop through all harmonics:
        for h in range(1, n_harmonics + 1):
            n_harmonic = harmonics_list[h - 1]
            # Don't find harmonics outside frequency range:
            if n_harmonic * nu[i, 0] > f_max:
                break

            # Updates the flux and optimize to find the correct frequency
            ps = powerspectrum(lightcurve)

            # Checks the significance of the harmonics. If it is too low NaN is returned in amplitude, frequency and phase for the given harmonic
            nu[i, h] = ps.optimize_peak(n_harmonic * nu[i, 0])

            # Stop if significance becomes too low (lombscargle significance)
            if faplim is not None:
                FAP = ps.false_alarm_probability(nu[i, h])
                logger.debug('harmonic %d: %f %f', h, nu[i, h], FAP)
                if FAP > faplim:
                    logger.debug("Harmonic rejected from FAP")
                    nu[i, h] = np.nan
                    alpha[i, h] = np.nan
                    beta[i, h] = np.nan
                    continue

            # Stop if significance becomes too low (SNR ratio):
            if snrlim is not None:
                # Calculate SNR by estimating noise level locally around peak:
                # TODO: Subtract peak first?
                noise = np.sqrt(
                    power_median_to_mean *
                    median(power[(frequency > (nu[i, 0] - snr_width))
                                 & (frequency < (nu[i, 0] + snr_width))]))
                amp = np.sqrt(alpha[i, 0]**2 + beta[i, 0]**2)
                snr = amp / noise
                logger.debug("SNR: %f", snr)

                #plt.figure()
                #plt.plot(frequency, np.sqrt(power), 'k-', lw=0.5)
                #plt.plot(frequency, np.sqrt(mean_noise), 'r-', lw=0.5)
                #plt.plot(frequency[pmax_index], np.sqrt(power[pmax_index]), 'go')
                #plt.plot(nu[i,0], ps.powerspectrum(nu[i,0]*1e-6, scale='amplitude')[1], 'ro')
                #plt.axhline(noise)
                #plt.axvline(nu[i,0] - snr_width)
                #plt.axvline(nu[i,0] + snr_width)

                if snr < snrlim:
                    logger.debug("Stopped from SNR")
                    nu[i, 0] = np.nan
                    alpha[i, 0] = np.nan
                    beta[i, 0] = np.nan
                    break

            # Removes the harmonic peak from the data:
            alpha[i, h], beta[i, h] = ps.alpha_beta(nu[i, h])
            lightcurve -= model(lightcurve.time, alpha[i, h], beta[i, h],
                                nu[i, h])

            # Check how the extracted peak compares with the original powerspectrum and stop
            # if there are too many consecutive failed peaks
            if devlim is not None:
                atemp, btemp = original_ps.alpha_beta(nu[i, h])
                deviation[i, h] = (alpha[i, h]**2 +
                                   beta[i, h]**2) / (atemp**2 + btemp**2)

        # Optimize the Noptimize nearest peaks
        if i != 0 and Noptimize != 0:
            for h in range(n_harmonics + 1):

                # Sort to find nearest frequencies to optimize
                Nopt = Noptimize + 1
                if (i + 1) * (n_harmonics + 1) < Nopt or Noptimize == -1:
                    Nopt = (i + 1) * (n_harmonics + 1)

                nusort = np.abs(nu - nu[i, h])
                nusort = nusort.ravel()
                order = np.argsort(
                    nusort)  # sort nusort and find the list of indexes

                # Create an index of which peaks should be optimized:
                indx_optim = np.zeros_like(order, dtype='bool')
                indx_optim[1:Nopt] = True

                # Only optimize a peak if it is closer than the set limit.
                # NOTE: Be careful as this doesn't take the window function into account
                if optim_max_diff is not None:
                    with np.errstate(invalid='ignore'):
                        indx_optim &= (nusort[order] < optim_max_diff)

                # Pick out the peaks that should be optimized:
                order = order[indx_optim]
                order = list(
                    zip(*np.unravel_index(order, (n_peaks, n_harmonics + 1))))
                logger.debug("Optimizing %d peaks", len(order))

                for j in order:
                    if np.isfinite(
                            alpha[j]
                    ):  # and deviation[j] < 1/devlim and deviation[j] > devlim:
                        # Add the oscillation:
                        lightcurve += model(lightcurve.time, alpha[j], beta[j],
                                            nu[j])
                        ps = powerspectrum(lightcurve)

                        # Find the frequency of maximum power and find alpha and beta again
                        nu[j] = ps.optimize_peak(nu[j])
                        alpha[j], beta[j] = ps.alpha_beta(nu[j])

                        # Recalculate the deviation
                        if devlim is not None:
                            atemp, btemp = original_ps.alpha_beta(nu[j])
                            deviation[j] = (alpha[j]**2 +
                                            beta[j]**2) / (atemp**2 + btemp**2)

                        # Remove the oscillation again:
                        lightcurve -= model(lightcurve.time, alpha[j], beta[j],
                                            nu[j])

    # Remove anything that in the end was marked with a large deviation:
    if devlim is not None:
        for i in range(n_peaks):
            if deviation[i, 0] > 1 / devlim or deviation[i, 0] < devlim:
                # If main peak is rejected, then also reject all harmonics
                nu[i, :] = np.nan
                alpha[i, :] = np.nan
                beta[i, :] = np.nan
            else:
                for j in range(1, n_harmonics + 1):
                    if deviation[i, j] > 1 / devlim or deviation[i,
                                                                 j] < devlim:
                        nu[i, j] = np.nan
                        alpha[i, j] = np.nan
                        beta[i, j] = np.nan

    # Calculate amplitude and phase from alpha and beta:
    amp = np.sqrt(alpha**2 + beta**2)
    phase = np.arctan2(beta, alpha)

    # Make sure the found peaks are ordered by the amplitude of the main peak:
    amp[np.isnan(amp)] = -np.inf
    indx = np.argsort(amp[:, 0])[::-1]
    nu = nu[indx, :]
    amp = amp[indx, :]
    phase = phase[indx, :]
    alpha = alpha[indx, :]
    beta = beta[indx, :]
    deviation = deviation[indx, :]
    amp[~np.isfinite(amp)] = np.nan

    # Gather into table:
    num, harmonic = np.meshgrid(range(1, n_peaks + 1), range(n_harmonics + 1))
    tab = Table(data=[
        num.flatten(order='F'),
        harmonic.flatten(order='F'),
        nu.flatten(),
        amp.flatten(),
        phase.flatten(),
        alpha.flatten(),
        beta.flatten(),
        deviation.flatten()
    ],
                names=[
                    'num', 'harmonic', 'frequency', 'amplitude', 'phase',
                    'alpha', 'beta', 'deviation'
                ],
                dtype=[
                    'int32', 'int32', 'float64', 'float64', 'float64',
                    'float64', 'float64', 'float64'
                ])

    # Add units to columns:
    tab['frequency'].unit = u.uHz
    tab['amplitude'].unit = lightcurve.flux_unit
    tab['phase'].unit = u.rad
    tab['alpha'].unit = lightcurve.flux_unit
    tab['beta'].unit = lightcurve.flux_unit

    # Add index to peak number and harmonic for easy lookup:
    # TODO: Use table indicies - Problem with Pickle
    #tab.add_index('num')

    # Add meta data to table on how the list was created:
    tab.meta['n_peaks'] = n_peaks
    tab.meta['n_harmonics'] = n_harmonics
    tab.meta['harmonics_list'] = harmonics_list
    tab.meta['hifac'] = hifac
    tab.meta['ofac'] = ofac
    tab.meta['snrlim'] = snrlim
    tab.meta['snr_width'] = snr_width * u.uHz
    tab.meta['faplim'] = faplim
    tab.meta['devlim'] = devlim
    tab.meta['conseclim'] = conseclim

    return tab
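
# A minimal sketch (not part of the original module) of a single pre-whitening
# step of the kind freqextr iterates: fit alpha and beta of
# alpha*sin(2*pi*nu*t) + beta*cos(2*pi*nu*t) at a fixed frequency by linear
# least squares and subtract the fitted oscillation from the time series.
# The synthetic signal and frequency below are assumptions for illustration.
import numpy as np

def prewhiten_step(time, flux, nu):
    """Fit and remove a single sinusoid of frequency nu (same units as 1/time)."""
    A = np.column_stack((np.sin(2*np.pi*nu*time), np.cos(2*np.pi*nu*time)))
    (alpha, beta), *_ = np.linalg.lstsq(A, flux, rcond=None)
    model = alpha*np.sin(2*np.pi*nu*time) + beta*np.cos(2*np.pi*nu*time)
    return flux - model, alpha, beta

time = np.linspace(0, 27.4, 2000)                  # days, arbitrary baseline
flux = 50*np.sin(2*np.pi*2.5*time + 0.3) + np.random.normal(0, 5, time.size)
residual, alpha, beta = prewhiten_step(time, flux, 2.5)
print(np.sqrt(alpha**2 + beta**2))                 # recovered amplitude, ~50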
Ejemplo n.º 29
0
def segmentation(Data,minimum_bin):
    variants={}
    
    for chromosome in Data["chromosomes"]:
        start_pos=-1;
        end_pos=-1;
        variant_type=None
        past_variant_type=-1
        for i in range(0,len(Data[chromosome]["var"])):
            variant_type=Data[chromosome]["var"][i]
            
            if past_variant_type == -1:
                start_pos=i
                end_pos = i+1
                past_variant_type=variant_type
            elif past_variant_type == variant_type:
                end_pos +=1
            else:
                if not chromosome in variants:
                    variants[chromosome] = []
                ratio_list=Data[chromosome]["ratio"][start_pos:end_pos+1]
                ratio_list=ratio_list[numpy.where(ratio_list >= 0)]

                variants[chromosome].append({"start":start_pos,"end":end_pos,"type":past_variant_type,"ratio":bottleneck.median(ratio_list),"bins":end_pos-start_pos,"ratio_list":list(ratio_list)})
                
                ratio_list=[]
                past_variant_type=variant_type       
                start_pos=i
                end_pos=start_pos+1
                
    return(variants)
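
# A hypothetical usage sketch of segmentation(): Data maps each chromosome to a
# per-bin variant label ("var") and a per-bin coverage ratio ("ratio"); consecutive
# bins with the same label are collapsed into one segment carrying its median ratio.
# The toy data below are an assumption for illustration only.
import numpy
import bottleneck

Data = {
    "chromosomes": ["chr1"],
    "chr1": {
        "var":   numpy.array(["NEUTRAL"]*5 + ["DUP"]*4 + ["NEUTRAL"]*5),
        "ratio": numpy.array([1.0]*5 + [1.5]*4 + [1.0]*5),
    },
}
print(segmentation(Data, minimum_bin=2))  # one NEUTRAL segment and one DUP segment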
Ejemplo n.º 30
0
 def time_median(self, dtype, shape, order, axis):
     bn.median(self.arr, axis=axis)
Ejemplo n.º 31
0
def random_epstein_exact_weird_correction(values_input, nb_to_keep):
    nb_to_remove = len(values_input) - nb_to_keep
    k = nb_to_remove

    values = values_input[:]
    S = set(range(len(values)))
    n = len(S)
    indice_from_which_to_start = len(values)
    Aleft = weighted_average_weird_correction(values)

    Aright = float('inf')  #max([x[0] for x in values])#float('inf')

    while len(S) > 1:
        sampled = sample(S, 1)[0]
        vi = values[sampled][0]
        wi = values[sampled][1]

        X, Y, Z, E = COMPUTE_X_Y_Z_E(S, values, Aleft, Aright, vi, wi)

        while True:
            #print 'HeLLo'
            if len(Z) > 0:

                A = median([(vi - values[j][0]) / (wi - values[j][1])
                            for j in Z])
                #A=[(vi-values[j][0])/(wi-values[j][1]) for j in Z][len(Z)/2]
                #print A

                #print [(vi-values[j][0])/(wi-values[j][1]) for j in Z],A
                l2 = partsort(([A * values[j][1] - values[j][0] for j in S]),
                              len(S) - k)[:len(S) - k]
                F_A = -sum(l2)

                if F_A == 0:

                    return A

                elif F_A > 0:
                    Aleft = A
                else:
                    Aright = A
                X, Y, Z = UPDATE_X_Y_Z(S, values, Aleft, Aright, vi, wi, X, Y,
                                       Z)

            if ((len(X) + len(E)) >= (len(S) - k)) and k > 0:
                nb_to_remove = min(len(E), len(X) + len(E) - (len(S) - k))
                to_remove_E = set(sample(E, nb_to_remove))
                S = S - to_remove_E
                E = E - to_remove_E
                S = S - Y
                #k=k-(len(Y)+nb_to_remove)
                k = k - (len(Y) + nb_to_remove)
                Y = set()

            elif (len(Y) + len(E)) >= k:
                nb_to_collapse = min(len(E), len(Y) + len(E) - k)
                values_to_collapse_E = set(sample(E, nb_to_collapse))
                E = E - values_to_collapse_E
                values_to_collapse = values_to_collapse_E | X
                S = S - values_to_collapse

                collapsed_v = 0.
                collapsed_w = 1.

                for x in values_to_collapse:
                    vx, wx = values[x]
                    collapsed_v += vx
                    collapsed_w += wx

                collapsed = (collapsed_v, collapsed_w)
                values.append(collapsed)
                X = {indice_from_which_to_start}
                S.add(indice_from_which_to_start)
                indice_from_which_to_start += 1

            # if len(Z)<=len(S)/32:
            # 	break
            if len(Z) <= len(S) / 32.:
                break

    spop = S.pop()
    #print values[spop]

    return values[spop][0] / values[spop][1]
Ejemplo n.º 32
0
def analyze_datafile(file, sensitivity=1e-1):

    with open(file, 'r') as csvfile:
        reader = csv.reader(csvfile)
        median_diffs = None
        median_diffs_idx = []
        lineno = 1
        allfloats = []
        for row in reader:
            floats = []
            for i in range(len(row)):
                try:
                    f = float(row[i])
                    floats.append(f)
                    # only on first run
                    if not median_diffs:
                        median_diffs_idx.append(i)
                except ValueError:
                    pass
            # most likely a header line
            if not floats:
                continue
            allfloats.append(floats)
            if not median_diffs:
                median_diffs = [[] for _ in range(len(floats))]
                logging.debug("Possible price indices: {}".format(median_diffs_idx))
            if len(floats) != len(median_diffs):
                logging.error("Line {} in file {} is exceptional, skipped.".format(lineno, file))
                continue
            fmedian = bn.median(floats)
            # logging.debug("[{}] median: {}".format(lineno, fmedian))
            for i in range(0, len(floats)):
                diffnow = (floats[i] - fmedian) / fmedian
                median_diffs[i].append(diffnow)
                # logging.debug("[{}][{}] diff: {}".format(lineno, i, diffnow))
            lineno += 1

            # we don't need more data than this (faster runtime)
            if lineno == 10001:
                break

    mean_median_diffs = np.abs(np.mean(median_diffs, axis=1))
    logging.debug("Mean median diffs: {}".format(mean_median_diffs))

    median_diffs_idx = np.array(median_diffs_idx)
    mean_median_diffs = np.array(mean_median_diffs)

    pricecols = median_diffs_idx[mean_median_diffs < sensitivity]

    numdecimals = 0

    allfloats = np.asarray(allfloats)
    for f in allfloats.flat:
        splitted = str(f).split(".")
        if len(splitted) == 2:
            numdecimals = max(numdecimals, len(splitted[1]))

    # maximum number of decimals is limited
    numdecimals = min(numdecimals, 10)

    return pricecols, numdecimals
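
# A hypothetical usage sketch: write a tiny CSV of quotes where three columns carry
# prices close to each other and one column (volume) is far from the row median,
# then let analyze_datafile() pick out the price-like columns and the number of
# decimals. The file name and column layout are assumptions for illustration only.
import csv

with open("quotes.csv", "w", newline="") as out:
    writer = csv.writer(out)
    writer.writerow(["bid", "ask", "last", "volume"])
    writer.writerow([100.12, 100.15, 100.13, 5000])
    writer.writerow([100.22, 100.25, 100.23, 7000])

pricecols, numdecimals = analyze_datafile("quotes.csv")
print(pricecols, numdecimals)  # expected: indices of the three price columns, 2 decimals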
Ejemplo n.º 33
0
import bottleneck as bn
import numpy as np
import timeit


setup = '''
import numpy as np
import bottleneck as bn
from scipy.stats import rankdata

np.random.seed(42)
a = np.random.randn(30)
'''
def time(code, setup, n):
    return timeit.Timer(code, setup=setup).repeat(3, n)

if __name__ == '__main__':
    n = 10**3
    print n, "pass", max(time("pass", "", n))
    print n, "min np.median", min(time('np.median(a)', setup, n))
    print n, "min bn.median", min(time('bn.median(a)', setup, n))
    a = np.arange(7)
    print "Median diff", np.median(a) - bn.median(a)
    func, _ = bn.func.median_selector(a, axis=0)
    print "Bottleneck median func name", func

    print n, "min scipy.stats.rankdata", min(time('rankdata(a)', setup, n))
    print n, "min bn.rankdata", min(time('bn.rankdata(a)', setup, n))
    func, _ = bn.func.rankdata_selector(a, axis=0)
    print "Bottleneck rankdata func name", func
Ejemplo n.º 34
0
def sequence_multi_probability_to_represent_element_mutil_probability(sequence_mutil_probability):
    represent_element_mutil_probability = bn.median(sequence_mutil_probability, axis=0)
    return represent_element_mutil_probability
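
# A hypothetical usage sketch: given several probability vectors predicted for the
# same element (one row per sequence), bn.median(..., axis=0) collapses them into a
# single representative probability vector by taking the element-wise median.
import numpy as np
import bottleneck as bn

sequence_mutil_probability = np.array([[0.1, 0.7, 0.2],
                                       [0.2, 0.6, 0.2],
                                       [0.1, 0.8, 0.1]])
print(sequence_multi_probability_to_represent_element_mutil_probability(sequence_mutil_probability))
# -> [0.1 0.7 0.2]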
Ejemplo n.º 35
0
def mad(arr):
    #copied from:https://stackoverflow.com/questions/8930370/where-can-i-find-mad-mean-absolute-deviation-in-scipy
    arr = numpy.ma.array(arr).compressed() # should be faster to not use masked arrays.
    med = bottleneck.median(arr)
    return bottleneck.median(numpy.abs(arr - med))
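
# A small illustrative follow-up (not from the original module): for normally
# distributed data the MAD relates to the standard deviation via
# sigma ~= 1.4826 * MAD, which makes mad() a robust scale estimate in the presence
# of outliers. The data below are an assumption for demonstration only.
import numpy
import bottleneck

rng = numpy.random.default_rng(1)
x = rng.normal(0.0, 2.0, 10000)
x[:50] = 1e6                      # inject gross outliers
print(numpy.std(x))               # blown up by the outliers
print(1.4826 * mad(x))            # stays close to the true sigma of 2.0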
Ejemplo n.º 36
0
def mad(datin, z=7, deriv=0, nozero=False):
    """
    Median absolute deviation test, either on raw values,
    or on 1st or 2nd derivatives.

    Returns mask with False everywhere except where `<(md-MAD\*z/0.6745)`
    or `>(md+MAD\*z/0.6745)`.

    Parameters
    ----------
    datin : array or masked array
        `mad` acts on `axis=0`.
    z : float, optional
        Input is allowed to deviate maximum `z` standard deviations from the median (default: 7)
    deriv : int, optional
        0: Act on raw input (default).

        1: Use first derivatives.

        2: Use 2nd derivatives.
    nozero : bool, optional
        True: exclude zeros (0.) from input `datin`.

    Returns
    -------
    array of bool
        False everywhere except where input deviates more than `z` standard deviations from median

    Notes
    -----
    If the input is an array, `mad` checks for outliers along the zeroth axis.

    1st derivative is calculated as `d = datin[1:n]-datin[0:n-1]`
    because mean of left and right would give 0 for spikes.

    If `all(d.mask==True)` then return `d.mask`, which is all True.

    Examples
    --------
    >>> import numpy as np
    >>> y = np.array([-0.25,0.68,0.94,1.15,2.26,2.35,2.37,2.40,2.47,2.54,2.62,
    ...               2.64,2.90,2.92,2.92,2.93,3.21,3.26,3.30,3.59,3.68,4.30,
    ...               4.64,5.34,5.42,8.01],dtype=float)

    >>> # Normal MAD
    >>> print(mad(y))
    [False False False False False False False False False False False False
     False False False False False False False False False False False False
     False False]

    >>> print(mad(y,z=4))
    [False False False False False False False False False False False False
     False False False False False False False False False False False False
     False  True]

    >>> print(mad(y,z=3))
    [ True False False False False False False False False False False False
     False False False False False False False False False False False False
      True  True]

    >>> # MAD on 2nd derivatives
    >>> print(mad(y,z=4,deriv=2))
    [False False False False False False False False False False False False
     False False False False False False False False False False False  True]

    >>> # direct usage
    >>> my = np.ma.array(y, mask=mad(y,z=4))
    >>> print(my)
    [-0.25 0.68 0.94 1.15 2.26 2.35 2.37 2.4 2.47 2.54 2.62 2.64 2.9 2.92 2.92
     2.93 3.21 3.26 3.3 3.59 3.68 4.3 4.64 5.34 5.42 --]

    >>> # MAD on several dimensions
    >>> yy = np.transpose(np.array([y,y]))
    >>> print(np.transpose(mad(yy,z=4)))
    [[False False False False False False False False False False False False
      False False False False False False False False False False False False
      False  True]
     [False False False False False False False False False False False False
      False False False False False False False False False False False False
      False  True]]

    >>> yyy = np.transpose(np.array([y,y,y]))
    >>> print(np.transpose(mad(yyy,z=3)))
    [[ True False False False False False False False False False False False
      False False False False False False False False False False False False
       True  True]
     [ True False False False False False False False False False False False
      False False False False False False False False False False False False
       True  True]
     [ True False False False False False False False False False False False
      False False False False False False False False False False False False
       True  True]]

    >>> # Masked arrays
    >>> my = np.ma.array(y, mask=np.zeros(y.shape))
    >>> my.mask[-1] = True
    >>> print(mad(my,z=4))
    [True False False False False False False False False False False False
     False False False False False False False False False False False False
     False --]

    >>> print(mad(my,z=3))
    [True False False False False False False False False False False False
     False False False False False False False False False False False True
     True --]

    >>> # Arrays with NaNs
    >>> ny = y.copy()
    >>> ny[-1] = np.nan
    >>> print(mad(ny,z=4))
    [ True False False False False False False False False False False False
     False False False False False False False False False False False False
     False False]

    >>> print(mad(ny,z=3))
    [ True False False False False False False False False False False False
     False False False False False False False False False False False  True
      True False]

    >>> # Exclude zeros
    >>> zy = y.copy()
    >>> zy[1] = 0.
    >>> print(mad(zy,z=3))
    [ True  True False False False False False False False False False False
     False False False False False False False False False False False False
      True  True]

    >>> print(mad(zy,z=3,nozero=True))
    [ True False False False False False False False False False False False
     False False False False False False False False False False False False
      True  True]

    History
    -------
    Written,  Matthias Cuntz, Nov 2011
    Modified, Matthias Cuntz, May 2012 - act on axis=0 of array
              Matthias Cuntz, Jun 2012 - axis=0 did not always work: spread md and MAD to input dimensions
              Matthias Cuntz, Jun 2012 - use np.diff, remove spreads
              Matthias Cuntz, Feb 2013 - ported to Python 3
              Matthias Cuntz & Juliane Mai
                              Jul 2013 - loop over second dimension for medians, faster than array calculations :-(
                                         but use bottleneck for speed :-)
              Matthias Cuntz, Jul 2013 - (re-)allow masked arrays and NaNs in arrays
              Matthias Cuntz, Oct 2013 - nozero, bug in NaN treatment with dim=1
              Matthias Cuntz, May 2020 - numpy docstring format
    """
    if nozero:
        idatin = datin.copy()
        # index with the full tuple so only the zero entries (not whole leading-axis slices) are replaced
        ii = np.where(idatin == 0.)
        if ii[0].size > 0: idatin[ii] = np.nan
    else:
        idatin = datin
    sn = list(np.shape(idatin))
    n = sn[0]
    if deriv == 0:
        m = n
        d = idatin
    elif deriv == 1:
        m = n - 1
        sm = sn
        sm[0] = m
        d = np.diff(idatin, axis=0)
    elif deriv == 2:
        m = n - 2
        sm = sn
        sm[0] = m
        d = np.diff(idatin, n=2, axis=0)
    else:
        raise ValueError('Unimplemented option.')

    # Shortcut if all masked
    ismasked = isinstance(d, np.ma.core.MaskedArray)
    if not ismasked:
        # use the full index tuple so only the non-finite entries (not whole leading-axis slices) get masked
        ii = np.where(~np.isfinite(d))
        d = np.ma.array(d)
        if ii[0].size > 0: d[ii] = np.ma.masked

    if np.all(d.mask == True):
        if ismasked:
            return d.mask
        else:
            return np.ones(d.shape, dtype=bool)

    # Median
    oldsettings = np.geterr()
    np.seterr(invalid='ignore')
    if d.ndim == 1:
        try:
            import bottleneck as bn
            dd = d.compressed()
            md = bn.median(dd)
            # Median absolute deviation
            MAD = bn.median(np.abs(dd - md))
            # Range around median
            thresh = MAD * (z / 0.6745)
            # True where outside z-range
            res = (d < (md - thresh)) | (d > (md + thresh))
        except ImportError:
            dd = d.compressed()
            md = np.median(dd)
            # Median absolute deviation
            MAD = np.median(np.abs(dd - md))
            # Range around median
            thresh = MAD * (z / 0.6745)
            # True where outside z-range
            res = (d < (md - thresh)) | (d > (md + thresh))
    elif d.ndim == 2:
        try:
            import bottleneck as bn
            res = np.empty(d.shape, dtype=bool)
            for i in range(d.shape[1]):
                di = d[:, i]
                dd = di.compressed()
                md = bn.median(dd)
                # Median absolute deviation
                MAD = bn.median(np.abs(dd - md))
                # Range around median
                thresh = MAD * (z / 0.6745)
                # True where outside z-range
                res[:, i] = (d[:, i] < (md - thresh)) | (d[:, i] >
                                                         (md + thresh))
        except ImportError:
            res = np.empty(d.shape, dtype=bool)
            for i in range(d.shape[1]):
                di = d[:, i]
                dd = di.compressed()
                md = np.median(dd)
                # Median absolute deviation
                MAD = np.median(np.abs(dd - md))
                # Range around median
                thresh = MAD * (z / 0.6745)
                # True where outside z-range
                res[:, i] = (d[:, i] < (md - thresh)) | (d[:, i] >
                                                         (md + thresh))
    else:
        np.seterr(**oldsettings)
        raise ValueError('datin.ndim must be <= 2')

    np.seterr(**oldsettings)
    if ismasked:
        return res
    else:
        if isinstance(res,
                      np.ma.core.MaskedArray):  # got masked because of NaNs
            return np.where(res.mask, False, res)
        else:
            return res
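
A small despiking sketch using deriv=1, which the docstring describes but does not demonstrate; the signal and the z value are invented for illustration:

import numpy as np

t = np.linspace(0.0, 1.0, 200)
y = np.sin(2.0 * np.pi * t)
y[50] += 5.0                    # inject a single spike
flag = mad(y, z=10, deriv=1)    # boolean mask over the 199 first differences
# a spike at index i makes both differences i-1 and i stand out
print(np.where(flag)[0])        # expected to bracket the injected spike: [49 50]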
Ejemplo n.º 37
0
import os

import numpy as np
import pandas as pd
import bottleneck as bn
import gizmo_read
from astropy.io import fits
from astropy.convolution import convolve_fft
from astropy.cosmology import Planck15 as cosmo  # assumption: the cosmology used by the original module is not shown

def create_model_ima(model_dir, output_name, z, mu0, FOV, RA0, DEC0):
    # We read only a subset of particle properties (positions, to save memory)
    part = gizmo_read.read.Read.read_snapshot(species=('star', 'gas'),
                                              properties=['position'],
                                              directory=model_dir)

    # And save them to the stars and gas pandas dataframes
    # For indexing in pandas : stars.loc[x,y]
    stars = pd.DataFrame(np.array(part["star"]["position"]))
    gas = pd.DataFrame(np.array(part["gas"]["position"]))

    # We calculate the transformations for kpc to pix as a function of z (Distance)
    kpc_arcsec = cosmo.kpc_proper_per_arcmin(z) / 60.
    arcsec_pix = 0.1  # Euclid VIS pixscale

    axis_obs = np.array([0, 1])
    # Transform from kpc to pix
    print('Transforming coordinates to pixel space')
    stars.loc[:, axis_obs[0]] = stars.loc[:, axis_obs[0]] / (kpc_arcsec * arcsec_pix)
    stars.loc[:, axis_obs[1]] = stars.loc[:, axis_obs[1]] / (kpc_arcsec * arcsec_pix)
    gas.loc[:, axis_obs[0]] = gas.loc[:, axis_obs[0]] / (kpc_arcsec * arcsec_pix)
    gas.loc[:, axis_obs[1]] = gas.loc[:, axis_obs[1]] / (kpc_arcsec * arcsec_pix)

    # We create the image
    print('Creating stellar image')
    image_stars = np.histogram2d(x=stars.loc[:, axis_obs[0]],
                                 y=stars.loc[:, axis_obs[1]],
                                 bins=FOV,
                                 range=[[-FOV / 2, FOV / 2],
                                        [-FOV / 2, FOV / 2]],
                                 weights=None,
                                 density=None)
    print('Creating gas image')
    image_gas = np.histogram2d(x=gas.loc[:, axis_obs[0]],
                               y=gas.loc[:, axis_obs[1]],
                               bins=FOV,
                               range=[[-FOV / 2, FOV / 2],
                                      [-FOV / 2, FOV / 2]],
                               weights=None,
                               density=None)
    data = image_stars[0] + image_gas[0]

    # Convolve by the Euclid PSF
    psf = fits.open('/localdata/Borlaff/EMDB/kernel.fits')
    print('Convolving image with LARGE PSF')
    data_low = convolve_fft(data, psf[1].data, allow_huge=True)
    psf = fits.open('/localdata/Borlaff/EMDB/psf_VIS_centred.fits')
    print('Convolving image with Euclid VIS PSF')
    data[np.where(data == 1)] = 0  # zero out pixels that contain a single particle before the VIS PSF convolution
    data_high = convolve_fft(data, psf[1].data, allow_huge=True)
    data = (data_low + data_high) / 2.

    # Photometry
    # What is the mean particle density on the central pixels?
    print('Calibrating photometry')
    skybg = bn.median(data[0:int(FOV / 10), 0:int(FOV / 10)])
    data = data - skybg
    central_density = bn.median(data[int(FOV / 2 - 5):int(FOV / 2 + 5),
                                     int(FOV / 2 - 5):int(FOV / 2 + 5)])
    int0 = (arcsec_pix**2) * 10**(
        (24.445 - mu0) / 2.5)  # Central intensity for the mu0 set by the user
    photometry_correction = int0 / central_density
    data = data * photometry_correction

    # We add a fake centred WCS and save the fits file
    hdu = fits.PrimaryHDU(data=data)

    print('Saving fake WCS')
    hdu.header['WCSAXES'] = 2
    hdu.header['CRPIX1'] = FOV / 2. + 0.5
    hdu.header['CRPIX2'] = FOV / 2. + 0.5
    hdu.header['CRVAL1'] = RA0
    hdu.header['CRVAL2'] = DEC0
    hdu.header['CTYPE1'] = 'RA---TAN'
    hdu.header['CTYPE2'] = 'DEC--TAN'
    hdu.header['RA'] = RA0
    hdu.header['DEC'] = DEC0
    hdu.header['CD1_1'] = 2.521185192875E-05
    hdu.header['CD1_2'] = 1.173845066278E-05
    hdu.header['CD2_1'] = 1.162545338166E-05
    hdu.header['CD2_2'] = -2.537923352533E-05

    print('Saving file: ' + output_name)
    if os.path.exists(output_name):
        os.remove(output_name)
    hdu.verify("silentfix")
    hdu.writeto(output_name)
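
A hypothetical invocation of the function above; the snapshot path, redshift, surface brightness, and field of view are placeholders, not values from the original source:

create_model_ima(model_dir='/data/sims/m12i/output',
                 output_name='m12i_euclid_vis.fits',
                 z=0.05,        # mock redshift used for the kpc-to-arcsec scale
                 mu0=20.5,      # target central surface brightness [mag arcsec^-2]
                 FOV=2000,      # image size in pixels (0.1 arcsec/pix)
                 RA0=150.0, DEC0=2.2)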