Example #1
import numpy as np
import fitsio as fi
import weightedstats as ws


def identify_centrals(data, mask=None, filename='subhalo_central_flags-v2.fits', rank=0, size=1):
    
    groups = np.unique(data['groupid'])
    Ngrp = len(groups)
    print('Will process data for %d groups' % Ngrp)
    i0=0

    if mask is None:
        mask = np.isfinite(data['pos'].T[0]) & np.isfinite(data['pos'].T[1]) & np.isfinite(data['pos'].T[2])
    flags = np.zeros(data.size, dtype=[('subhalo_id',int),('central1',int),('central2',int),('central3',int)])
    ident = np.arange(0, len(data), 1)
    flags['subhalo_id'] = ident  # row index of each subhalo in the input table


    for i, group in enumerate(groups):

        # round-robin split of the groups across parallel ranks
        if i % size != rank:
            continue

        select = (data['groupid'][mask]==group)
        N = len(data['groupid'][mask][select])
        print(i, group, N)

        if N<2:
            continue

        M = data['mass'][mask][select]
        x = data['pos'].T[0][mask][select]
        y = data['pos'].T[1][mask][select]
        z = data['pos'].T[2][mask][select]

        # pick a random member as a reference point and drop outliers more
        # than 0.1e5 away from it along any axis
        xrand = np.random.choice(x)
        yrand = np.random.choice(y)
        zrand = np.random.choice(z)
        sane = (abs(x - xrand) < 0.1e5) & (abs(y - yrand) < 0.1e5) & (abs(z - zrand) < 0.1e5)

        # mass-weighted median position of the group
        x0 = ws.numpy_weighted_median(x[np.isfinite(x) & sane], weights=M[sane & np.isfinite(x)])
        y0 = ws.numpy_weighted_median(y[np.isfinite(y) & sane], weights=M[sane & np.isfinite(y)])
        z0 = ws.numpy_weighted_median(z[np.isfinite(z) & sane], weights=M[sane & np.isfinite(z)])

        #x0 = np.sum(M*x)/np.sum(M)
        #y0 = np.sum(M*y)/np.sum(M)
        #z0 = np.sum(M*z)/np.sum(M)

        R = np.sqrt((x - x0)**2 + (y - y0)**2 + (z - z0)**2)
        # central1: subhalo closest to the weighted median centre
        select_cent1 = R == R[np.isfinite(R)].min()
        icent1 = ident[mask][select][select_cent1][0]
        flags['central1'][icent1] = 1

        # central2: most massive subhalo
        select_cent2 = M == M[np.isfinite(M)].max()
        icent2 = ident[mask][select][select_cent2][0]
        flags['central2'][icent2] = 1

        # central3: subhalo with the largest mass in massbytype column 4
        Mb = data['massbytype'].T[4][mask][select]
        select_cent3 = Mb == Mb[np.isfinite(Mb)].max()
        icent3 = ident[mask][select][select_cent3][0]
        flags['central3'][icent3] = 1

        i0+=N

    print('Saving flags', filename)
    outfits = fi.FITS(filename.replace('.fits','-%d.fits'%rank), 'rw')
    outfits.write(flags)
    outfits.close()

    return flags
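
A minimal, self-contained sketch of the central1 definition above, with toy positions and masses standing in for the real catalogue:

import numpy as np
import weightedstats as ws

pos = np.random.uniform(0.0, 100.0, size=(6, 3))  # toy subhalo positions
mass = np.random.uniform(1.0, 10.0, size=6)       # toy subhalo masses

# mass-weighted median position of the group, one axis at a time
centre = np.array([ws.numpy_weighted_median(pos[:, k], weights=mass)
                   for k in range(3)])

# the subhalo closest to that centre is flagged as the central
R = np.linalg.norm(pos - centre, axis=1)
icentral = np.argmin(R)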
Example #2
    ### Unpack values ###
    ccf = out[:, 0]
    ccf_err = out[:, 1]

    #%% Fit a parabola for those points around the centre of the ccf function ###
    sub_tau = np.arange(-10, 10)
    test_ccf = ccf[np.isin(tau_arr, sub_tau)]
    fit_params, pcov = scipy.optimize.curve_fit(parabola, sub_tau, test_ccf)
    plot_tau = np.linspace(-5, 6, 30)
    ccf_fit = parabola(plot_tau, *fit_params)
    max_lag_fit[n] = plot_tau[np.argmax(ccf_fit)]

    #%% Find weighted mean, median and peak lag of ccf ###
    mean_lag[n] = ws.numpy_weighted_mean(sub_tau, weights=test_ccf)
    median_lag[n] = ws.numpy_weighted_median(sub_tau, weights=test_ccf)
    max_lag[n] = sub_tau[np.argmax(test_ccf)]

    #%% Make plots ###
    plt.figure(2, figsize=[10, 10])
    #plt.subplot(211)
    #plt.plot(tau_arr, ccf,'o')
    plt.errorbar(tau_arr,
                 ccf,
                 yerr=ccf_err,
                 fmt='o',
                 color='C' + str(n),
                 label=label)
    #    plt.vlines(mean_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dashed')
    #    plt.vlines(median_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dotted')
    #    plt.plot(plot_tau, ccf_fit, 'C'+str(n))
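
For reference, a hedged sketch of the parabola-peak refinement above; "parabola" is assumed to be a plain quadratic model, and the toy CCF stands in for real data:

import numpy as np
import scipy.optimize


def parabola(x, a, b, c):
    return a * x**2 + b * x + c


sub_tau = np.arange(-10, 10)
test_ccf = 0.5 - 0.02 * (sub_tau - 1.0)**2  # toy CCF peaked near lag 1
fit_params, pcov = scipy.optimize.curve_fit(parabola, sub_tau, test_ccf)
plot_tau = np.linspace(-5, 6, 30)
max_lag_fit = plot_tau[np.argmax(parabola(plot_tau, *fit_params))]
print(max_lag_fit)  # close to 1, limited by the plot_tau grid spacing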
Example #3
    def run(self, scaffold_stats):
        """Calculate statistics for genomes.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        """

        self.logger.info(
            "Calculating statistics for {:,} genomes over {:,} scaffolds.".
            format(scaffold_stats.num_genomes(),
                   scaffold_stats.num_scaffolds()))

        self.coverage_headers = scaffold_stats.coverage_headers
        self.signature_headers = scaffold_stats.signature_headers

        genome_size = defaultdict(int)
        scaffold_length = defaultdict(list)
        gc = defaultdict(list)
        coverage = defaultdict(list)
        signature = defaultdict(list)
        for _scaffold_id, stats in scaffold_stats.stats.items():
            if stats.genome_id == scaffold_stats.unbinned:
                continue

            genome_size[stats.genome_id] += stats.length
            scaffold_length[stats.genome_id].append(stats.length)
            gc[stats.genome_id].append(stats.gc)
            coverage[stats.genome_id].append(stats.coverage)
            signature[stats.genome_id].append(stats.signature)

        # record statistics for each genome
        genomic_signature = GenomicSignature(0)

        self.genome_stats = {}
        for genome_id in genome_size:
            # calculate weighted mean and median statistics
            weights = np_array(scaffold_length[genome_id])

            len_array = np_array(scaffold_length[genome_id])
            mean_len = ws.numpy_weighted_mean(len_array, weights)
            median_len = ws.numpy_weighted_median(len_array, weights)

            gc_array = np_array(gc[genome_id])
            mean_gc = ws.numpy_weighted_mean(gc_array, weights)
            median_gc = ws.numpy_weighted_median(gc_array, weights)

            cov_array = np_array(coverage[genome_id]).T
            mean_cov = ws.numpy_weighted_mean(cov_array, weights)
            median_cov = []
            for i in range(cov_array.shape[0]):
                median_cov.append(
                    ws.numpy_weighted_median(cov_array[i, :], weights))

            signature_array = np_array(signature[genome_id]).T
            mean_signature = ws.numpy_weighted_mean(signature_array, weights)

            # calculate mean and median tetranucleotide distance
            td = []
            for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
                stats = scaffold_stats.stats[scaffold_id]
                td.append(
                    genomic_signature.manhattan(stats.signature,
                                                mean_signature))

            self.genome_stats[genome_id] = self.GenomeStats(
                genome_size[genome_id], mean_len, median_len, mean_gc,
                median_gc, mean_cov, median_cov, mean_signature, np_mean(td),
                np_median(td))

        return self.genome_stats
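
A minimal sketch of the scaffold-length weighting used above, with toy lengths and GC fractions:

import numpy as np
import weightedstats as ws

lengths = np.array([1200.0, 5300.0, 800.0])  # toy scaffold lengths (bp)
gc = np.array([0.41, 0.47, 0.39])            # toy per-scaffold GC fractions

# longer scaffolds dominate the genome-level statistic
mean_gc = ws.numpy_weighted_mean(gc, weights=lengths)
median_gc = ws.numpy_weighted_median(gc, weights=lengths)
print(mean_gc, median_gc)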
Example #4
    def fix_px_med(self, px_val, data, weights):
        # Weighted median. Masked pixels have weight zero; if the median
        # cannot be computed, fall back to the raw pixel value.
        try:
            return px_val - ws.numpy_weighted_median(data, weights=weights)
        except Exception:
            return px_val
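
A hedged usage sketch with toy values chosen so the result is easy to check: a weight of zero drops the masked pixel from the background estimate.

import numpy as np
import weightedstats as ws

data = np.array([3.0, 4.0, 100.0, 5.0])   # 100.0 is a masked/bad pixel
weights = np.array([1.0, 1.0, 0.0, 1.0])  # zero weight excludes it
px_val = 10.0
print(px_val - ws.numpy_weighted_median(data, weights=weights))  # 10 - 4 = 6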
Example #5
    def eval_deviation(self):
        """
        Calculate the weighted median weighted median deviation from piano key frequencies. :)
        :return:
        """

        # Initialize windowing variables
        hopsize = self.deviation_param_dict["hopsize"]
        windowsize = self.deviation_param_dict["windowsize"]
        windowfunc = self.deviation_param_dict["windowfunc"]
        fftsize = self.deviation_param_dict["fftsize"]
        windowsize_p1 = windowsize + 1
        overlap = windowsize_p1 - hopsize
        window = windowfunc(windowsize)
        energy_constant = windowsize * np.linalg.norm(window)**2.0

        # Read the harmonic signal (TODO: Shouldn't have to do this, should already have harmonic spectrogram available)
        sig = AudioSignal(self.intermediate_harmonic_filename)
        samplerate = sig.samplerate
        max_piano_key_frequency_normalized = self.MAX_PIANO_KEY_FREQUENCY / samplerate

        # Set bounds on reassignment frequency
        rf_upper_bound = min([0.5, max_piano_key_frequency_normalized
                              ])  # 0.5 is Nyquist
        lower_cutoff_freq = self.deviation_param_dict["lower_cutoff_freq"]
        lower_bound_freq = max([lower_cutoff_freq, self.A0_FREQUENCY])
        rf_lower_bound = lower_bound_freq / samplerate

        # Initialize per-frame lists of log-energies and of median deviations from piano key frequencies
        logenergies_per_stft_frame = []
        medians_per_stft_frame = []

        # Set cutoffs for thresholding the log magnitudes and log energies, and epsilon for calculating log
        log_cutoff_freqbin = self.deviation_param_dict[
            "log_cutoff_dB_freqbin"] / 20
        log_cutoff_stft_frame = self.deviation_param_dict[
            "log_cutoff_dB_stft_frame"] / 20
        eps_logmag = self.deviation_param_dict["eps"]

        # Get the number of audio frames, and seek to the first audio frame (no boundary treatment TODO???)
        frame0 = 0
        num_audio_frames = sig.get_num_frames_from_and_seek_start(
            start_frame=frame0)

        # Now calculate the max number of FULL non-boundary frames you need to compute RF,
        # considering hop size and window size.
        num_full_rf_frames = 1 + (
            (num_audio_frames - windowsize_p1) // hopsize)

        # Convert that to the number of audio frames that you'll analyze for non-boundary RF. (TODO???)
        num_audio_frames_full_rf = (num_full_rf_frames -
                                    1) * hopsize + windowsize_p1

        # Feed blocks to create the non-boundary RF frames  (TODO???)
        blockreader = sig.blocks(blocksize=windowsize_p1,
                                 overlap=overlap,
                                 frames=num_audio_frames_full_rf,
                                 always_2d=True)

        np.set_printoptions(threshold=np.inf)

        for block in blockreader:
            block = block.T  # First transpose to get each channel as a row
            try:
                wft = self._wft(block[:, :windowsize], window,
                                fftsize)  # Calculate windowed fft of signal
            except ValueError:
                print("Current frame at which there is an error: {}".format(
                    frame0))
                raise
            wft_plus = self._wft(
                block[:, 1:], window,
                fftsize)  # Calculate windowed fft of shifted signal
            logabswft = np.log10(np.abs(wft) + eps_logmag)

            # Calculate reassignment frequencies (unit: normalized frequency) and deal with edge cases
            # Threshold the logabswft
            rf = self._calculate_rf(wft, wft_plus)
            in_bounds = np.where((rf >= rf_lower_bound)
                                 & (rf <= rf_upper_bound)
                                 & (logabswft >= log_cutoff_freqbin))
            logabswft = logabswft[in_bounds]
            rf = rf[in_bounds]
            magwftsq = np.power(10., 2 * logabswft)

            # Now calculate the deviations from the nearest piano key and get weights, then append weighted median
            # Note that I do multiply by samplerate/self.A0_FREQUENCY,
            # instead of division by rf_lower_bound, just in case rf_lower_bound gets changed.
            rf_logarithmic = 12 * np.log2(rf * samplerate / self.A0_FREQUENCY)
            nearest_piano_keys = np.round(rf_logarithmic).astype(int)
            deviations = rf_logarithmic - nearest_piano_keys

            if np.size(deviations):
                median = ws.numpy_weighted_median(
                    deviations,
                    weights=magwftsq)  # Mag-squared works, log-mag doesn't
                medians_per_stft_frame.append(median)
                # Now calculate the frame's log energy and append
                logenergy = np.log10((np.linalg.norm(wft)**2.0 /
                                      energy_constant) + eps_logmag)
                logenergies_per_stft_frame.append(logenergy)
            frame0 += hopsize

        # After looping through all blocks, soft threshold log energies and calculate the final weighted median
        # deviation
        logenergies_per_stft_frame = np.asarray(logenergies_per_stft_frame)
        out_of_bounds = np.where(
            logenergies_per_stft_frame < log_cutoff_stft_frame)
        logenergies_per_stft_frame[out_of_bounds] = log_cutoff_stft_frame
        logenergies_per_stft_frame -= log_cutoff_stft_frame  # Necessary to make weights positive
        return ws.numpy_weighted_median(np.asarray(medians_per_stft_frame),
                                        logenergies_per_stft_frame)
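
A minimal sketch of the deviation measure computed above: distance in semitones from the nearest piano key, with the magnitude-squared spectrum as weights (A0 = 27.5 Hz is an assumed stand-in for self.A0_FREQUENCY):

import numpy as np
import weightedstats as ws

A0 = 27.5                                # Hz; assumed value of A0_FREQUENCY
freqs = np.array([440.0, 452.0, 261.0])  # toy reassigned frequencies (Hz)
power = np.array([1.0, 0.5, 0.8])        # toy magnitude-squared weights

semitones = 12 * np.log2(freqs / A0)     # semitone distance above A0
deviations = semitones - np.round(semitones)
print(ws.numpy_weighted_median(deviations, weights=power))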
Example #6
import weightedstats as ws

my_data = [1, 2, 3, 4, 5]
my_weights = [10, 1, 1, 1, 9]

# Ordinary (unweighted) mean and median
print(ws.mean(my_data))  # equivalent to ws.weighted_mean(my_data)
ws.median(my_data)  # equivalent to ws.weighted_median(my_data)

# Weighted mean and median
ws.weighted_mean(my_data, weights=my_weights)
ws.weighted_median(my_data, weights=my_weights)

# Special weighted mean and median functions for use with numpy arrays
ws.numpy_weighted_mean(my_data, weights=my_weights)
ws.numpy_weighted_median(my_data, weights=my_weights)
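
A hedged by-hand check of what the weighted median does with these numbers: it is the data value at which the cumulative sorted weight first reaches half the total weight; when the cumulative weight lands exactly on that midpoint, as here, the library averages the two straddling values (giving 2.5):

import numpy as np

data = np.array([1, 2, 3, 4, 5])
weights = np.array([10, 1, 1, 1, 9])

order = np.argsort(data)
cum = np.cumsum(weights[order])     # [10, 11, 12, 13, 22]
half = 0.5 * weights.sum()          # 11.0
first = np.searchsorted(cum, half)  # first index with cum >= half -> 1
print(data[order][first])           # 2, the lower of the straddling pair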
Example #7
    out = np.array([
        vari_funcs.cross_correlation.cross_correlate(corr_test_k_flux,
                                                     corr_test_j_flux,
                                                     tau,
                                                     type='dcf')
        for tau in tau_arr
    ])

    ### Unpack values ###
    ccf = out[:, 0]
    ccf_err = out[:, 1]

    #%% Find weighted mean, median and skew of ccf ###
    mean_lag[n] = ws.numpy_weighted_mean(tau_arr, weights=ccf)
    median_lag[n] = ws.numpy_weighted_median(tau_arr, weights=ccf)
    ccf_skew[n] = skew(ccf)

    #%% Make plots ###
    plt.figure(2, figsize=[10, 10])
    #plt.subplot(211)
    #plt.plot(tau_arr, ccf,'o')
    plt.errorbar(tau_arr, ccf, yerr=ccf_err, fmt='o', label=label)
    #    plt.vlines(mean_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dashed')
    #    plt.vlines(median_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dotted')
    plt.xlabel('Lag (months)')
    plt.ylabel('Discrete Cross-Correlation Function')
    plt.ylim(-0.5, 0.9)
    plt.grid(True)
    plt.legend(loc='lower center')
    plt.tight_layout()
Example #8
if dez == 'Y' or dez == 'y':
    ### do ccf with deredshifted light curves ####
    z = varydata['z_use']    
    out = np.array([vari_funcs.cross_correlation.cross_correlate_de_z(
            corr_test_k_flux, corr_test_j_flux, tau, z, type='dcf') for tau in tau_arr])
else:
    out = np.array([vari_funcs.cross_correlation.cross_correlate(
            corr_test_k_flux, corr_test_j_flux, tau, type='dcf') for tau in tau_arr])

### Unpack values ###
ccf = out[:,0]
ccf_err = out[:,1]

#%% Find weighted mean, median, skew and peak lag of ccf ###
mean_lag, mean_lag_err = weighted_mean_and_err(tau_arr, ccf)
median_lag = ws.numpy_weighted_median(tau_arr, weights=ccf)
ccf_skew = skew(ccf)
max_lag = tau_arr[np.argmax(ccf)]

#%% Fit a parabola for those points around the centre of the ccf function ###
sub_tau = np.arange(-5,6)
test_ccf = ccf[np.isin(tau_arr, sub_tau)]
fit_params, pcov = scipy.optimize.curve_fit(parabola, sub_tau, test_ccf)
plot_tau = np.linspace(-5,6, 30)
ccf_fit = parabola(plot_tau, *fit_params)
max_lag_fit = plot_tau[np.argmax(ccf_fit)]
    
#%% Make plots ###
plt.figure(2,figsize=[10,10])
#plt.subplot(211)
#plt.plot(tau_arr, ccf,'o')
Example #9
    indices = [
        np.where(humanreadable == sample[i])[0][0]
        for i in np.arange(numSamples)
    ]
    sample_table = (np.hstack(
        (np.reshape(humanreadable[indices],
                    (numSamples, 1)), np.reshape(ev[indices], (numSamples, 1)),
         np.reshape(box[indices], (numSamples, 1)),
         np.reshape(weight[indices] / 100, (numSamples, 1)))))
    return sample_table


print("Mean EV: {}".format(np.mean(ev)))
print("Median EV: {}".format(np.median(ev)))
print("Weighted Average EV: {}".format(np.average(ev, weights=weight)))
print("Weighted Median EV: {}".format(
    ws.numpy_weighted_median(ev, weights=weight)))
joined = sortByEV(ev, weight)
print("Percentage of boxes under the median EV: {}".format(
    np.sum(joined[:9, 1]) / 100))
print("Percentage of boxes with EV over cost: {}".format(
    np.sum(joined[-12:, 1]) / 100))

print("Mean Box: {}".format(np.mean(box)))
print("Median Box: {}".format(np.median(box)))
print("Weighted Average Box: {}".format(np.average(box, weights=weight)))
print("Weighted Median Box: {}".format(
    ws.numpy_weighted_median(box, weights=weight)))
joinedBox = sortByEV(box, weight)
print("Percentage of boxes under the median Cost: {}".format(
    np.sum(joinedBox[:8, 1]) / 100))
print("Percentage of boxes with Cost >= 109.99: {}".format(