import numpy as np
import weightedstats as ws

def weighted_mean_and_err(tau, ccf, cut_off=0.5):
    ### Limit to range of ccf function ###
    max_ccf = np.max(ccf)
    lim = cut_off * max_ccf
    tau = tau[ccf > lim]
    ccf = ccf[ccf > lim]

    ### find weighted mean and standard error on the mean ###
    tau_mean = ws.numpy_weighted_mean(tau, weights=ccf)
    tau_var = ws.numpy_weighted_mean((tau - tau_mean)**2, weights=ccf)
    tau_std = np.sqrt(tau_var)
    tau_SE = tau_std / np.sqrt(len(tau))  # standard error on the mean is std/sqrt(N)

    return tau_mean, tau_SE
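A minimal usage sketch (not part of the original code): it feeds the function a synthetic Gaussian-shaped CCF peaked at a lag of +3, so the returned centroid should land close to 3.

tau = np.arange(-50, 51)
ccf_toy = np.exp(-0.5 * ((tau - 3) / 10.0)**2)     # toy CCF peaking at lag = +3
lag, lag_err = weighted_mean_and_err(tau, ccf_toy) # centroid of the ccf > 0.5*max region
print(lag, lag_err)                                # lag is approximately 3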
])

### Unpack values ###
ccf = out[:, 0]
ccf_err = out[:, 1]

#%% Fit a parabola for those points around the centre of the ccf function ###
sub_tau = np.arange(-10, 10)
test_ccf = ccf[np.isin(tau_arr, sub_tau)]
fit_params, pcov = scipy.optimize.curve_fit(parabola, sub_tau, test_ccf)
plot_tau = np.linspace(-5, 6, 30)
ccf_fit = parabola(plot_tau, *fit_params)
max_lag_fit[n] = plot_tau[np.argmax(ccf_fit)]

#%% Find weighted mean and skew of ccf ###
mean_lag[n] = ws.numpy_weighted_mean(sub_tau, weights=test_ccf)
median_lag[n] = ws.numpy_weighted_median(sub_tau, weights=test_ccf)
max_lag[n] = sub_tau[np.argmax(test_ccf)]

#%% Make plots ###
plt.figure(2, figsize=[10, 10])
#plt.subplot(211)
#plt.plot(tau_arr, ccf, 'o')
plt.errorbar(tau_arr, ccf, yerr=ccf_err, fmt='o',
             color='C' + str(n), label=label)
# plt.vlines(mean_lag[n], -0.015, 0.02, color='C' + str(n), linestyle='dashed')
# plt.vlines(median_lag[n], -0.015, 0.02, color='C' + str(n), linestyle='dotted')
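The `parabola` model handed to `scipy.optimize.curve_fit` is not shown in this snippet; it is presumably a simple quadratic along the lines of the sketch below (the signature and coefficient order are an assumption).

def parabola(x, a, b, c):
    # quadratic model for the CCF peak; coefficients are determined by curve_fit
    return a * x**2 + b * x + c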
def run(self, scaffold_stats):
    """Calculate statistics for genomes.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    """

    self.logger.info(
        "Calculating statistics for {:,} genomes over {:,} scaffolds.".format(
            scaffold_stats.num_genomes(), scaffold_stats.num_scaffolds()))

    self.coverage_headers = scaffold_stats.coverage_headers
    self.signature_headers = scaffold_stats.signature_headers

    genome_size = defaultdict(int)
    scaffold_length = defaultdict(list)
    gc = defaultdict(list)
    coverage = defaultdict(list)
    signature = defaultdict(list)
    for _scaffold_id, stats in scaffold_stats.stats.items():
        if stats.genome_id == scaffold_stats.unbinned:
            continue

        genome_size[stats.genome_id] += stats.length
        scaffold_length[stats.genome_id].append(stats.length)
        gc[stats.genome_id].append(stats.gc)
        coverage[stats.genome_id].append(stats.coverage)
        signature[stats.genome_id].append(stats.signature)

    # record statistics for each genome
    genomic_signature = GenomicSignature(0)

    self.genome_stats = {}
    for genome_id in genome_size:
        # calculate weighted mean and median statistics
        weights = np_array(scaffold_length[genome_id])

        len_array = np_array(scaffold_length[genome_id])
        mean_len = ws.numpy_weighted_mean(len_array, weights)
        median_len = ws.numpy_weighted_median(len_array, weights)

        gc_array = np_array(gc[genome_id])
        mean_gc = ws.numpy_weighted_mean(gc_array, weights)
        median_gc = ws.numpy_weighted_median(gc_array, weights)

        cov_array = np_array(coverage[genome_id]).T
        mean_cov = ws.numpy_weighted_mean(cov_array, weights)
        median_cov = []
        for i in range(cov_array.shape[0]):
            median_cov.append(
                ws.numpy_weighted_median(cov_array[i, :], weights))

        signature_array = np_array(signature[genome_id]).T
        mean_signature = ws.numpy_weighted_mean(signature_array, weights)

        # calculate mean and median tetranucleotide distance
        td = []
        for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
            stats = scaffold_stats.stats[scaffold_id]
            td.append(
                genomic_signature.manhattan(stats.signature, mean_signature))

        self.genome_stats[genome_id] = self.GenomeStats(
            genome_size[genome_id],
            mean_len, median_len,
            mean_gc, median_gc,
            mean_cov, median_cov,
            mean_signature,
            np_mean(td), np_median(td))

    return self.genome_stats
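As a standalone illustration of the length-weighting used above (the scaffold lengths and GC values below are made up, not taken from any real assembly): longer scaffolds pull the weighted mean and median towards their own GC content.

import numpy as np
import weightedstats as ws

scaffold_length = np.array([150000, 20000, 5000])  # scaffold lengths in bp, used as weights
scaffold_gc = np.array([0.61, 0.55, 0.40])         # GC fraction per scaffold

mean_gc = ws.numpy_weighted_mean(scaffold_gc, weights=scaffold_length)
median_gc = ws.numpy_weighted_median(scaffold_gc, weights=scaffold_length)
print(mean_gc, median_gc)  # mean is approximately 0.597, median is 0.61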
#%% Fit a parabola for those points around the centre of the ccf function ###
sub_tau = np.arange(-7, 8)
test_ccf = ccf[np.isin(tau_arr, sub_tau)]
test_ccf_err = ccf_err[np.isin(tau_arr, sub_tau)]
fit_params, pcov = scipy.optimize.curve_fit(parabola, sub_tau, test_ccf,
                                            sigma=test_ccf_err)
plot_tau = np.linspace(-7, 7, 100)
ccf_fit = parabola(plot_tau, *fit_params)
max_lag_fit[n] = plot_tau[np.argmax(ccf_fit)]
max_lag_fit_err[n] = bootstrapping_max(plot_tau, ccf_fit)

#%% Find mean lag (centroid) in the centre ###
mean_lag_test[n] = ws.numpy_weighted_mean(sub_tau, weights=test_ccf)
mean_lag[n], mean_lag_err[n] = weighted_mean_and_err(tau_arr, ccf)
median_lag[n] = ws.numpy_weighted_median(sub_tau, weights=test_ccf)

#%% Find lag where ccf is max in centre ###
max_lag[n] = sub_tau[np.argmax(test_ccf)]
max_ccf[n] = np.nanmax(test_ccf)

#%% Create and save ccfs if prompted ###
if save_ccf == True:
    plt.figure(figsize=[8, 8])
    plt.title('DR11 ID: ' + str(bindata['ID']) + r' $\chi^{2}$ = '
              + str(bindata['Chi_K']))
    plt.errorbar(tau_arr, ccf, yerr=ccf_err,
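An alternative worth noting (not in the original code): rather than taking the argmax over a sampled grid, the peak of a fitted quadratic a*x**2 + b*x + c can be read off analytically, assuming `fit_params` is ordered (a, b, c) as in the `parabola` sketch above.

a, b, c = fit_params
if a < 0:                      # a < 0 means the fitted parabola actually has a maximum
    vertex_lag = -b / (2 * a)  # analytic location of the parabola's peak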
import weightedstats as ws

my_data = [1, 2, 3, 4, 5]
my_weights = [10, 1, 1, 1, 9]

# Ordinary (unweighted) mean and median
ws.mean(my_data)    # equivalent to ws.weighted_mean(my_data)
ws.median(my_data)  # equivalent to ws.weighted_median(my_data)

# Weighted mean and median
ws.weighted_mean(my_data, weights=my_weights)
ws.weighted_median(my_data, weights=my_weights)

# Special weighted mean and median functions for use with numpy arrays
ws.numpy_weighted_mean(my_data, weights=my_weights)
ws.numpy_weighted_median(my_data, weights=my_weights)
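For reference, the weighted mean above is just the arithmetic weighted average, while the weighted median is the data value at which the cumulative weight first reaches half of the total weight.

# (1*10 + 2*1 + 3*1 + 4*1 + 5*9) / (10 + 1 + 1 + 1 + 9) = 64 / 22
ws.weighted_mean(my_data, weights=my_weights)  # approximately 2.909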
#%% Calculate the CCF at various tau values ###
out = np.array([
    vari_funcs.cross_correlation.cross_correlate(corr_test_k_flux,
                                                 corr_test_j_flux,
                                                 tau, type='dcf')
    for tau in tau_arr
])

### Unpack values ###
ccf = out[:, 0]
ccf_err = out[:, 1]

#%% Find weighted mean and skew of ccf ###
mean_lag[n] = ws.numpy_weighted_mean(tau_arr, weights=ccf)
median_lag[n] = ws.numpy_weighted_median(tau_arr, weights=ccf)
ccf_skew[n] = skew(ccf)

#%% Make plots ###
plt.figure(2, figsize=[10, 10])
#plt.subplot(211)
#plt.plot(tau_arr, ccf, 'o')
plt.errorbar(tau_arr, ccf, yerr=ccf_err, fmt='o', label=label)
# plt.vlines(mean_lag[n], -0.015, 0.02, color='C' + str(n), linestyle='dashed')
# plt.vlines(median_lag[n], -0.015, 0.02, color='C' + str(n), linestyle='dotted')
plt.xlabel('Lag (months)')
plt.ylabel('Discrete Cross-Correlation Function')
plt.ylim(-0.5, 0.9)
plt.grid(True)
plt.legend(loc='lower center')