Example #1
0
def weighted_mean_and_err(tau, ccf, cut_off=0.5):
    """Return the CCF-weighted mean lag and its standard error.

    Only the points whose CCF value is strictly greater than ``cut_off``
    times the maximum CCF value contribute. Within that selection the CCF
    values themselves are used as weights.

    Parameters
    ----------
    tau : array_like
        Lag values, one per CCF point.
    ccf : array_like
        Cross-correlation values, same length as ``tau``.
    cut_off : float, optional
        Fraction of the maximum CCF value used as the selection threshold.

    Returns
    -------
    tau_mean : float
        Weighted mean of the selected lags.
    tau_SE : float
        Weighted standard deviation of the selected lags divided by the
        square root of the number of selected points.

    Notes
    -----
    If no point passes the threshold (which happens when the maximum CCF
    value is zero, negative or NaN) ``(nan, nan)`` is returned, because a
    weighted mean over an empty selection is undefined.
    """
    tau = np.asarray(tau)
    ccf = np.asarray(ccf)

    ### Limit to range of ccf function ##
    # Build the mask once so tau and ccf are guaranteed to be filtered alike.
    keep = ccf > cut_off * np.max(ccf)
    tau = tau[keep]
    ccf = ccf[keep]

    if tau.size == 0:
        return np.nan, np.nan

    ### find weighted mean and standard error on the mean ###
    tau_mean = np.average(tau, weights=ccf)
    tau_var = np.average((tau - tau_mean)**2, weights=ccf)
    tau_std = np.sqrt(tau_var)
    # The standard error of a mean scales as 1/sqrt(N), not 1/N.
    tau_SE = tau_std / np.sqrt(len(tau))

    return tau_mean, tau_SE
    ])

    ### Unpack values ###
    # Column 0 of `out` is used as the CCF value and column 1 as its error.
    # `out` is built before this fragment (not visible here).
    ccf = out[:, 0]
    ccf_err = out[:, 1]

    #%% Fit a parabola for those points around the centre of the ccf function ###
    # np.arange excludes the stop value, so the window is the lags -10 .. 9.
    sub_tau = np.arange(-10, 10)
    # Keep the CCF points whose lag appears in the window.
    # NOTE(review): the fit below pairs sub_tau with test_ccf element by
    # element, so this assumes tau_arr contains every value of sub_tau exactly
    # once and in ascending order -- confirm against where tau_arr is built.
    test_ccf = ccf[np.isin(tau_arr, sub_tau)]
    fit_params, pcov = scipy.optimize.curve_fit(parabola, sub_tau, test_ccf)
    # The fitted curve is sampled at 30 points between -5 and 6; the lag
    # stored in max_lag_fit is therefore limited to that grid.
    plot_tau = np.linspace(-5, 6, 30)
    ccf_fit = parabola(plot_tau, *fit_params)
    max_lag_fit[n] = plot_tau[np.argmax(ccf_fit)]

    #%% Find weighted mean, weighted median and peak lag of the windowed ccf ###
    # The windowed CCF values are used as the weights for the lags.
    mean_lag[n] = ws.numpy_weighted_mean(sub_tau, weights=test_ccf)
    median_lag[n] = ws.numpy_weighted_median(sub_tau, weights=test_ccf)
    # Lag of the largest windowed CCF value (no fitting involved).
    max_lag[n] = sub_tau[np.argmax(test_ccf)]

    #%% Make plots ###
    # Figure number 2 is requested explicitly on every pass.
    # NOTE(review): presumably so successive values of n are drawn on the same
    # axes -- confirm with the enclosing loop.
    plt.figure(2, figsize=[10, 10])
    #plt.subplot(211)
    #plt.plot(tau_arr, ccf,'o')
    # The colour is taken from matplotlib's 'C<n>' colour-cycle names.
    plt.errorbar(tau_arr,
                 ccf,
                 yerr=ccf_err,
                 fmt='o',
                 color='C' + str(n),
                 label=label)
    #    plt.vlines(mean_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dashed')
    #    plt.vlines(median_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dotted')
Example #3
0
    def run(self, scaffold_stats):
        """Aggregate per-scaffold statistics into per-genome statistics.

        Scaffolds whose genome id equals ``scaffold_stats.unbinned`` are
        ignored. For every remaining genome the scaffold lengths are used
        as weights when computing the mean and median of length, GC,
        coverage and the mean genomic signature. The Manhattan distance
        between each scaffold signature and the genome's mean signature is
        summarised by its (unweighted) mean and median.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.

        Returns
        -------
        dict
            Maps each genome id to a ``self.GenomeStats`` record; the same
            dictionary is stored in ``self.genome_stats``.
        """

        genome_count = scaffold_stats.num_genomes()
        scaffold_count = scaffold_stats.num_scaffolds()
        self.logger.info(
            "Calculating statistics for {:,} genomes over {:,} scaffolds.".
            format(genome_count, scaffold_count))

        self.coverage_headers = scaffold_stats.coverage_headers
        self.signature_headers = scaffold_stats.signature_headers

        # gather the per-scaffold values of every binned genome
        total_bp = defaultdict(int)
        lengths_by_genome = defaultdict(list)
        gc_by_genome = defaultdict(list)
        cov_by_genome = defaultdict(list)
        sig_by_genome = defaultdict(list)
        for _scaffold_id, scaffold in scaffold_stats.stats.items():
            gid = scaffold.genome_id
            if gid == scaffold_stats.unbinned:
                continue

            total_bp[gid] += scaffold.length
            lengths_by_genome[gid].append(scaffold.length)
            gc_by_genome[gid].append(scaffold.gc)
            cov_by_genome[gid].append(scaffold.coverage)
            sig_by_genome[gid].append(scaffold.signature)

        # record statistics for each genome
        genomic_signature = GenomicSignature(0)

        self.genome_stats = {}
        for genome_id in total_bp:
            # every weighted statistic below is weighted by scaffold length
            length_weights = np_array(lengths_by_genome[genome_id])

            lengths = np_array(lengths_by_genome[genome_id])
            mean_len = ws.numpy_weighted_mean(lengths, length_weights)
            median_len = ws.numpy_weighted_median(lengths, length_weights)

            gc_values = np_array(gc_by_genome[genome_id])
            mean_gc = ws.numpy_weighted_mean(gc_values, length_weights)
            median_gc = ws.numpy_weighted_median(gc_values, length_weights)

            # transpose so that each row holds one coverage profile
            cov_rows = np_array(cov_by_genome[genome_id]).T
            mean_cov = ws.numpy_weighted_mean(cov_rows, length_weights)
            median_cov = [
                ws.numpy_weighted_median(cov_rows[row, :], length_weights)
                for row in range(cov_rows.shape[0])
            ]

            sig_rows = np_array(sig_by_genome[genome_id]).T
            mean_signature = ws.numpy_weighted_mean(sig_rows, length_weights)

            # distance of every scaffold signature to the genome mean
            tetra_dist = [
                genomic_signature.manhattan(
                    scaffold_stats.stats[member_id].signature,
                    mean_signature)
                for member_id in scaffold_stats.scaffolds_in_genome[genome_id]
            ]

            self.genome_stats[genome_id] = self.GenomeStats(
                total_bp[genome_id], mean_len, median_len, mean_gc,
                median_gc, mean_cov, median_cov, mean_signature,
                np_mean(tetra_dist), np_median(tetra_dist))

        return self.genome_stats
Example #4
0
    #%% Fit a parabola for those points around the centre of the ccf function ###
    # np.arange excludes the stop value, so the window is the lags -7 .. 7.
    sub_tau = np.arange(-7, 8)
    # Select the CCF values and their errors whose lag lies in the window.
    # NOTE(review): the fit pairs sub_tau with test_ccf element by element,
    # so this assumes tau_arr contains every value of sub_tau exactly once and
    # in ascending order -- confirm against where tau_arr is built.
    test_ccf = ccf[np.isin(tau_arr, sub_tau)]
    test_ccf_err = ccf_err[np.isin(tau_arr, sub_tau)]
    # The CCF errors are passed as `sigma`, so they weight the fit.
    fit_params, pcov = scipy.optimize.curve_fit(parabola,
                                                sub_tau,
                                                test_ccf,
                                                sigma=test_ccf_err)
    # The fitted curve is sampled at 100 points across the window; the lag
    # stored in max_lag_fit is the grid point where that curve is largest.
    plot_tau = np.linspace(-7, 7, 100)
    ccf_fit = parabola(plot_tau, *fit_params)
    max_lag_fit[n] = plot_tau[np.argmax(ccf_fit)]
    # bootstrapping_max is defined elsewhere; it receives the sampled fit, not
    # the measured points.
    max_lag_fit_err[n] = bootstrapping_max(plot_tau, ccf_fit)

    #%% Find mean lag (centroid) in the centre ###
    # mean_lag_test uses only the windowed points; mean_lag/mean_lag_err are
    # computed from the full tau_arr/ccf by weighted_mean_and_err.
    mean_lag_test[n] = ws.numpy_weighted_mean(sub_tau, weights=test_ccf)
    mean_lag[n], mean_lag_err[n] = weighted_mean_and_err(tau_arr, ccf)
    median_lag[n] = ws.numpy_weighted_median(sub_tau, weights=test_ccf)

    #%% Find lag where ccf is max in centre ###
    # NOTE(review): np.argmax does not skip NaN while np.nanmax does, so if
    # test_ccf can contain NaN the two lines below may refer to different
    # points -- confirm whether NaN is possible here.
    max_lag[n] = sub_tau[np.argmax(test_ccf)]
    max_ccf[n] = np.nanmax(test_ccf)

    #%% Create and save ccfs if prompted ###
    if save_ccf == True:
        plt.figure(figsize=[8, 8])
        plt.title('DR11 ID: ' + str(bindata['ID']) + ' $\chi^{2}$ = ' +
                  str(bindata['Chi_K']))
        plt.errorbar(tau_arr,
                     ccf,
                     yerr=ccf_err,
Example #5
0
import weightedstats as ws

# Minimal usage demo for the weightedstats API: one sample data set and a
# matching list of weights, one weight per data point.
my_data = [1, 2, 3, 4, 5]
my_weights = [10, 1, 1, 1, 9]

# Ordinary (unweighted) mean and median
# Only the first result is printed; the return values of the remaining calls
# in this script are discarded.
print(ws.mean(my_data))  # equivalent to ws.weighted_mean(my_data)
ws.median(my_data)  # equivalent to ws.weighted_median(my_data)

# Weighted mean and median
ws.weighted_mean(my_data, weights=my_weights)
ws.weighted_median(my_data, weights=my_weights)

# Special weighted mean and median functions for use with numpy arrays
# (called here with the same plain Python lists as above)
ws.numpy_weighted_mean(my_data, weights=my_weights)
ws.numpy_weighted_median(my_data, weights=my_weights)
    #%% Calculate the CCF at various tau values ###

    # One cross_correlate call per lag in tau_arr, requesting type 'dcf'; the
    # results are stacked so that each row of `out` belongs to one lag.
    out = np.array([
        vari_funcs.cross_correlation.cross_correlate(corr_test_k_flux,
                                                     corr_test_j_flux,
                                                     tau,
                                                     type='dcf')
        for tau in tau_arr
    ])

    ### Unpack values ###
    # Column 0 is used as the correlation value and column 1 as its error.
    # NOTE(review): this relies on cross_correlate returning (value, error) in
    # that order -- confirm against vari_funcs.
    ccf = out[:, 0]
    ccf_err = out[:, 1]

    #%% Find weighted mean and skew of ccf ###
    # The lags are weighted by the raw CCF values over the whole tau_arr range.
    # NOTE(review): the y-limits used for the plot below start at -0.5, so
    # negative CCF values seem to be expected; they would enter here as
    # negative weights -- confirm this is intended.
    mean_lag[n] = ws.numpy_weighted_mean(tau_arr, weights=ccf)
    median_lag[n] = ws.numpy_weighted_median(tau_arr, weights=ccf)
    ccf_skew[n] = skew(ccf)

    #%% Make plots ###
    # Figure number 2 is requested explicitly on every pass.
    # NOTE(review): presumably so successive values of n are drawn on the same
    # axes -- confirm with the enclosing loop.
    plt.figure(2, figsize=[10, 10])
    #plt.subplot(211)
    #plt.plot(tau_arr, ccf,'o')
    plt.errorbar(tau_arr, ccf, yerr=ccf_err, fmt='o', label=label)
    #    plt.vlines(mean_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dashed')
    #    plt.vlines(median_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dotted')
    plt.xlabel('Lag (months)')
    plt.ylabel('Discrete Cross-Correlation Function')
    plt.ylim(-0.5, 0.9)
    plt.grid(True)
    plt.legend(loc='lower center')