def identify_centrals(data, mask=None, filename='subhalo_central_flags-v2.fits', rank=0, size=1):
    """Flag one "central" member per group using three different criteria.

    For every group id handled by this worker, three members are flagged:

    * ``central1`` -- the member closest to the mass-weighted median position
      of the group,
    * ``central2`` -- the member with the largest ``mass``,
    * ``central3`` -- the member with the largest value in column 4 of
      ``massbytype``.

    Groups with fewer than two selected members are skipped and get no flags.

    Parameters
    ----------
    data : numpy structured array
        Must provide the fields ``groupid``, ``pos`` (three columns are read),
        ``mass`` and ``massbytype`` (column 4 is read).
    mask : array, optional
        Row selector applied to every per-row column before grouping.
        Defaults to the rows whose three position components are all finite.
    filename : str
        Output file name; ``'.fits'`` is replaced by ``'-<rank>.fits'``.
    rank, size : int
        Group number ``i`` (position in the sorted unique group ids) is
        processed only when ``i % size == rank``.

    Returns
    -------
    flags : numpy structured array
        One row per row of ``data`` with integer fields ``subhalo_id``,
        ``central1``, ``central2`` and ``central3``. ``subhalo_id`` is
        allocated but never filled here, so it stays zero.
    """
    import weightedstats as ws

    groups = np.unique(data['groupid'])
    # Single-argument parenthesised prints parse under both Python 2 and 3.
    print('Will process data for %d groups' % len(groups))

    pos = data['pos']
    if mask is None:
        mask = np.isfinite(pos.T[0]) & np.isfinite(pos.T[1]) & np.isfinite(pos.T[2])

    flags = np.zeros(data.size, dtype=[('subhalo_id', int), ('central1', int),
                                       ('central2', int), ('central3', int)])

    # Apply the mask once instead of re-indexing the full columns per group.
    ident_masked = np.arange(0, len(data), 1)[mask]
    groupid_masked = data['groupid'][mask]
    mass_masked = data['mass'][mask]
    mass_type4_masked = data['massbytype'].T[4][mask]
    x_masked = pos.T[0][mask]
    y_masked = pos.T[1][mask]
    z_masked = pos.T[2][mask]

    # Members further than this from a randomly drawn member (per axis) are
    # left out of the weighted-median centre estimate.
    max_offset = 0.1e5

    for i, group in enumerate(groups):
        if i % size != rank:
            continue

        select = (groupid_masked == group)
        N = int(np.count_nonzero(select))
        print('%s %s %s' % (i, group, N))
        if N < 2:
            continue

        M = mass_masked[select]
        x = x_masked[select]
        y = y_masked[select]
        z = z_masked[select]
        member_ids = ident_masked[select]

        # Each coordinate of the reference point is drawn independently, so
        # the three values may come from different members.
        xrand = np.random.choice(x)
        yrand = np.random.choice(y)
        zrand = np.random.choice(z)
        sane = ((np.abs(x - xrand) < max_offset)
                & (np.abs(y - yrand) < max_offset)
                & (np.abs(z - zrand) < max_offset))

        # The same selector is used for the values and for their weights on
        # every axis. The y weights were previously selected from the unmasked
        # column, which mismatches in length as soon as the mask drops a row.
        use_x = sane & np.isfinite(x)
        use_y = sane & np.isfinite(y)
        use_z = sane & np.isfinite(z)
        x0 = ws.numpy_weighted_median(x[use_x], weights=M[use_x])
        y0 = ws.numpy_weighted_median(y[use_y], weights=M[use_y])
        z0 = ws.numpy_weighted_median(z[use_z], weights=M[use_z])

        # central1: member closest to the weighted-median centre.
        R = np.sqrt((x - x0)**2 + (y - y0)**2 + (z - z0)**2)
        select_cent1 = (R == R[np.isfinite(R)].min())
        flags['central1'][member_ids[select_cent1][0]] = 1

        # central2: most massive member.
        select_cent2 = (M == M[np.isfinite(M)].max())
        flags['central2'][member_ids[select_cent2][0]] = 1

        # central3: member with the largest massbytype[:, 4]. This must use
        # select_cent3; using select_cent2 here just repeats central2.
        Mb = mass_type4_masked[select]
        select_cent3 = (Mb == Mb[np.isfinite(Mb)].max())
        flags['central3'][member_ids[select_cent3][0]] = 1

    print('Saving flags %s' % filename)
    outfits = fi.FITS(filename.replace('.fits', '-%d.fits' % rank), 'rw')
    try:
        outfits.write(flags)
    finally:
        # Release the file handle even if the write fails.
        outfits.close()

    return flags
### Unpack values ### ccf = out[:, 0] ccf_err = out[:, 1] #%% Fit a parabola for those points around the centre of the ccf function ### sub_tau = np.arange(-10, 10) test_ccf = ccf[np.isin(tau_arr, sub_tau)] fit_params, pcov = scipy.optimize.curve_fit(parabola, sub_tau, test_ccf) plot_tau = np.linspace(-5, 6, 30) ccf_fit = parabola(plot_tau, *fit_params) max_lag_fit[n] = plot_tau[np.argmax(ccf_fit)] #%% Find weighted mean and skew of ccf ### mean_lag[n] = ws.numpy_weighted_mean(sub_tau, weights=test_ccf) median_lag[n] = ws.numpy_weighted_median(sub_tau, weights=test_ccf) max_lag[n] = sub_tau[np.argmax(test_ccf)] #%% Make plots ### plt.figure(2, figsize=[10, 10]) #plt.subplot(211) #plt.plot(tau_arr, ccf,'o') plt.errorbar(tau_arr, ccf, yerr=ccf_err, fmt='o', color='C' + str(n), label=label) # plt.vlines(mean_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dashed') # plt.vlines(median_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dotted') # plt.plot(plot_tau, ccf_fit, 'C'+str(n))
def run(self, scaffold_stats):
    """Summarise scaffold statistics on a per-genome basis.

    Scaffolds assigned to ``scaffold_stats.unbinned`` are ignored. For every
    remaining genome the scaffold lengths are used as weights to compute
    weighted means and medians of length, GC and coverage, a weighted mean
    signature, and the mean/median Manhattan distance between each
    scaffold's signature and that mean signature.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.

    Returns
    -------
    dict
        ``self.genome_stats``: genome id -> ``self.GenomeStats`` record.
    """

    self.logger.info(
        "Calculating statistics for {:,} genomes over {:,} scaffolds.".format(
            scaffold_stats.num_genomes(), scaffold_stats.num_scaffolds()))

    self.coverage_headers = scaffold_stats.coverage_headers
    self.signature_headers = scaffold_stats.signature_headers

    # gather the per-scaffold values of every binned genome
    total_bp = defaultdict(int)
    lengths = defaultdict(list)
    gc_values = defaultdict(list)
    cov_profiles = defaultdict(list)
    signatures = defaultdict(list)
    for entry in scaffold_stats.stats.values():
        gid = entry.genome_id
        if gid == scaffold_stats.unbinned:
            continue

        total_bp[gid] += entry.length
        lengths[gid].append(entry.length)
        gc_values[gid].append(entry.gc)
        cov_profiles[gid].append(entry.coverage)
        signatures[gid].append(entry.signature)

    # reduce the gathered values to one record per genome
    sig_calculator = GenomicSignature(0)
    self.genome_stats = {}
    for gid, size_bp in total_bp.items():
        # scaffold lengths double as the weights of every statistic
        length_weights = np_array(lengths[gid])

        length_values = np_array(lengths[gid])
        mean_len = ws.numpy_weighted_mean(length_values, length_weights)
        median_len = ws.numpy_weighted_median(length_values, length_weights)

        gc_arr = np_array(gc_values[gid])
        mean_gc = ws.numpy_weighted_mean(gc_arr, length_weights)
        median_gc = ws.numpy_weighted_median(gc_arr, length_weights)

        # transposed so that each row holds one coverage profile
        cov_arr = np_array(cov_profiles[gid]).T
        mean_cov = ws.numpy_weighted_mean(cov_arr, length_weights)
        median_cov = [
            ws.numpy_weighted_median(cov_arr[row, :], length_weights)
            for row in range(cov_arr.shape[0])
        ]

        sig_arr = np_array(signatures[gid]).T
        mean_signature = ws.numpy_weighted_mean(sig_arr, length_weights)

        # distance of every scaffold signature to the genome's mean signature
        tetra_dist = [
            sig_calculator.manhattan(scaffold_stats.stats[sid].signature,
                                     mean_signature)
            for sid in scaffold_stats.scaffolds_in_genome[gid]
        ]

        self.genome_stats[gid] = self.GenomeStats(
            size_bp, mean_len, median_len, mean_gc, median_gc, mean_cov,
            median_cov, mean_signature, np_mean(tetra_dist),
            np_median(tetra_dist))

    return self.genome_stats
def fix_px_med(self, px_val, data, weights):
    """Return ``px_val`` minus the weighted median of ``data``.

    Masked pixels are expected to carry weight zero. This is a best-effort
    correction: whenever the weighted median cannot be computed or
    subtracted, ``px_val`` is returned unchanged.

    :param px_val: pixel value to correct.
    :param data: values the weighted median is taken over.
    :param weights: one weight per entry of ``data``.
    :return: the corrected pixel value, or ``px_val`` itself on failure.
    """
    try:
        median = ws.numpy_weighted_median(data, weights=weights)
        # NOTE(review): weightedstats appears to return None when no weight
        # is positive (e.g. every pixel masked) -- confirm. Handle it
        # explicitly rather than relying on the subtraction failing.
        if median is None:
            return px_val
        return px_val - median
    except Exception:
        # Deliberately broad to keep the best-effort behaviour, but no longer
        # a bare ``except:`` so KeyboardInterrupt/SystemExit still propagate.
        return px_val
def eval_deviation(self):
    """
    Calculate a weighted median of per-frame weighted median deviations from
    piano key frequencies.

    For every analysis frame of the harmonic signal, the frequencies returned
    by ``self._calculate_rf`` are thresholded, converted to a logarithmic
    scale of 12 steps per octave relative to ``self.A0_FREQUENCY`` and
    compared with the nearest integer step. The per-frame deviation is the
    median of those differences weighted by squared magnitude. The result is
    the median of the per-frame values weighted by the thresholded frame
    log-energy.

    :return: the value of ``ws.numpy_weighted_median`` over the per-frame
        median deviations, weighted by the shifted frame log-energies.
    """
    # Initialize windowing variables
    hopsize = self.deviation_param_dict["hopsize"]
    windowsize = self.deviation_param_dict["windowsize"]
    windowfunc = self.deviation_param_dict["windowfunc"]
    fftsize = self.deviation_param_dict["fftsize"]
    # Each block holds one extra sample so that both the signal and its
    # one-sample-shifted copy can be windowed from the same block.
    windowsize_p1 = windowsize + 1
    overlap = windowsize_p1 - hopsize
    window = windowfunc(windowsize)
    # Normalisation applied to the frame energy further down
    energy_constant = windowsize * np.linalg.norm(window)**2.0

    # Read the harmonic signal (TODO: Shouldn't have to do this, should already have harmonic spectrogram available)
    sig = AudioSignal(self.intermediate_harmonic_filename)
    samplerate = sig.samplerate
    max_piano_key_frequency_normalized = self.MAX_PIANO_KEY_FREQUENCY / samplerate

    # Set bounds on reassignment frequency
    rf_upper_bound = min([0.5, max_piano_key_frequency_normalized
                          ])  # 0.5 is Nyquist
    lower_cutoff_freq = self.deviation_param_dict["lower_cutoff_freq"]
    lower_bound_freq = max([lower_cutoff_freq, self.A0_FREQUENCY])
    rf_lower_bound = lower_bound_freq / samplerate

    # Initialize per-frame lists of log-energies and of median deviations from piano key frequencies
    logenergies_per_stft_frame = []
    medians_per_stft_frame = []

    # Set cutoffs for thresholding the log magnitudes and log energies, and epsilon for calculating log
    # (the configured dB values are divided by 20 to compare against log10 quantities)
    log_cutoff_freqbin = self.deviation_param_dict[
        "log_cutoff_dB_freqbin"] / 20
    log_cutoff_stft_frame = self.deviation_param_dict[
        "log_cutoff_dB_stft_frame"] / 20
    eps_logmag = self.deviation_param_dict["eps"]

    # Get the number of audio frames, and seek to the first audio frame (no boundary treatment TODO???)
    frame0 = 0
    num_audio_frames = sig.get_num_frames_from_and_seek_start(
        start_frame=frame0)

    # Now calculate the max number of FULL non-boundary frames you need to compute RF,
    # considering hop size and window size.
    num_full_rf_frames = 1 + (
        (num_audio_frames - windowsize_p1) // hopsize)

    # Convert that to the number of audio frames that you'll analyze for non-boundary RF. (TODO???)
    num_audio_frames_full_rf = (num_full_rf_frames -
                                1) * hopsize + windowsize_p1

    # Feed blocks to create the non-boundary RF frames (TODO???)
    blockreader = sig.blocks(blocksize=windowsize_p1,
                             overlap=overlap,
                             frames=num_audio_frames_full_rf,
                             always_2d=True)
    # NOTE(review): this changes numpy's print options process-wide and looks
    # like a debugging leftover -- confirm whether anything depends on it.
    np.set_printoptions(threshold=np.inf)
    for block in blockreader:
        block = block.T  # First transpose to get each channel as a row
        try:
            wft = self._wft(block[:, :windowsize], window,
                            fftsize)  # Calculate windowed fft of signal
        except ValueError:
            # Report where the failure happened, then let the error propagate
            print("Current frame at which there is an error: {}".format(
                frame0))
            raise
        wft_plus = self._wft(
            block[:, 1:], window,
            fftsize)  # Calculate windowed fft of shifted signal
        logabswft = np.log10(np.abs(wft) + eps_logmag)

        # Calculate reassignment frequencies (unit: normalized frequency) and deal with edge cases
        # Threshold the logabswft
        rf = self._calculate_rf(wft, wft_plus)
        # Keep only bins inside the frequency bounds whose log-magnitude reaches the cutoff
        in_bounds = np.where((rf >= rf_lower_bound)
                             & (rf <= rf_upper_bound)
                             & (logabswft >= log_cutoff_freqbin))
        logabswft = logabswft[in_bounds]
        rf = rf[in_bounds]
        # Undo the log10: this is (|wft| + eps) squared for the kept bins
        magwftsq = np.power(10., 2 * logabswft)

        # Now calculate the deviations from the nearest piano key and get weights, then append weighted median
        # Note that I do multiply by samplerate/self.A0_FREQUENCY,
        # instead of division by rf_lower_bound, just in case rf_lower_bound gets changed.
        rf_logarithmic = 12 * np.log2(rf * samplerate / self.A0_FREQUENCY)
        nearest_piano_keys = np.round(rf_logarithmic).astype(int)
        deviations = rf_logarithmic - nearest_piano_keys
        # Frames in which no bin survived the thresholds contribute nothing.
        # NOTE(review): the log-energy append is kept inside this branch so the
        # two per-frame lists stay the same length -- confirm against the
        # original layout.
        if np.size(deviations):
            median = ws.numpy_weighted_median(
                deviations,
                weights=magwftsq)  # Mag-squared works, log-mag doesn't
            medians_per_stft_frame.append(median)

            # Now calculate the frame's log energy and append
            logenergy = np.log10((np.linalg.norm(wft)**2.0 /
                                  energy_constant) + eps_logmag)
            logenergies_per_stft_frame.append(logenergy)
        # Track the start position of the next block (only used in the error message above)
        frame0 += hopsize

    # After looping through all blocks, soft threshold log energies and calculate the final weighted median
    # deviation
    logenergies_per_stft_frame = np.asarray(logenergies_per_stft_frame)
    out_of_bounds = np.where(
        logenergies_per_stft_frame < log_cutoff_stft_frame)
    logenergies_per_stft_frame[out_of_bounds] = log_cutoff_stft_frame
    # Shift so the weights are non-negative; frames at or below the cutoff end up with weight zero
    logenergies_per_stft_frame -= log_cutoff_stft_frame
    return ws.numpy_weighted_median(np.asarray(medians_per_stft_frame),
                                    logenergies_per_stft_frame)
# Minimal usage example for the weightedstats package.
# Only the first result is printed; the remaining calls are bare expressions,
# so their values are shown in an interactive session but discarded when this
# is run as a script.
import weightedstats as ws

# Sample values and one weight per value.
my_data = [1, 2, 3, 4, 5]
my_weights = [10, 1, 1, 1, 9]

# Ordinary (unweighted) mean and median
print(ws.mean(my_data))  # equivalent to ws.weighted_mean(my_data)
ws.median(my_data)  # equivalent to ws.weighted_median(my_data)

# Weighted mean and median
ws.weighted_mean(my_data, weights=my_weights)
ws.weighted_median(my_data, weights=my_weights)

# Special weighted mean and median functions for use with numpy arrays
ws.numpy_weighted_mean(my_data, weights=my_weights)
ws.numpy_weighted_median(my_data, weights=my_weights)
out = np.array([ vari_funcs.cross_correlation.cross_correlate(corr_test_k_flux, corr_test_j_flux, tau, type='dcf') for tau in tau_arr ]) ### Unpack values ### ccf = out[:, 0] ccf_err = out[:, 1] #%% Find weighted mean and skew of ccf ### mean_lag[n] = ws.numpy_weighted_mean(tau_arr, weights=ccf) median_lag[n] = ws.numpy_weighted_median(tau_arr, weights=ccf) ccf_skew[n] = skew(ccf) #%% Make plots ### plt.figure(2, figsize=[10, 10]) #plt.subplot(211) #plt.plot(tau_arr, ccf,'o') plt.errorbar(tau_arr, ccf, yerr=ccf_err, fmt='o', label=label) # plt.vlines(mean_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dashed') # plt.vlines(median_lag[n], -0.015,0.02, color='C'+str(n), linestyle='dotted') plt.xlabel('Lag (months)') plt.ylabel('Discrete Cross-Correlation Function') plt.ylim(-0.5, 0.9) plt.grid(True) plt.legend(loc='lower center') plt.tight_layout()
# Evaluate the cross-correlation once per trial lag; `dez` ('Y'/'y') selects
# the variant that is also given the redshift values.
if dez == 'Y' or dez == 'y':
    ### do ccf with deredshifted light curves ####
    z = varydata['z_use']
    out = np.array([vari_funcs.cross_correlation.cross_correlate_de_z(
            corr_test_k_flux, corr_test_j_flux, tau, z, type='dcf') for tau in tau_arr])
else:
    out = np.array([vari_funcs.cross_correlation.cross_correlate(
            corr_test_k_flux, corr_test_j_flux, tau, type='dcf') for tau in tau_arr])

### Unpack values ###
# Column 0 of `out` is read as the correlation value per lag, column 1 as its error.
ccf = out[:,0]
ccf_err = out[:,1]

#%% Find weighted mean and skew of ccf ###
# The ccf values are used as weights on the lags.
# NOTE(review): if the ccf can go negative these are negative weights -- confirm
# that the weighted mean/median are still meaningful in that case.
mean_lag, mean_lag_err = weighted_mean_and_err(tau_arr, ccf)
median_lag = ws.numpy_weighted_median(tau_arr, weights=ccf)
ccf_skew = skew(ccf)
# Lag of the largest raw ccf value.
max_lag = tau_arr[np.argmax(ccf)]

#%% Fit a parabola for those points around the centre of the ccf function ###
# Fit window: integer lags -5..5 (np.arange excludes the stop value).
sub_tau = np.arange(-5,6)
# NOTE(review): curve_fit needs test_ccf and sub_tau to have the same length and
# ordering, so this assumes every lag in sub_tau occurs exactly once in tau_arr
# and that tau_arr is sorted ascending -- confirm.
test_ccf = ccf[np.isin(tau_arr, sub_tau)]
fit_params, pcov = scipy.optimize.curve_fit(parabola, sub_tau, test_ccf)
# The fitted curve is evaluated on a finer grid that runs to 6, one past the
# last fitted lag.
plot_tau = np.linspace(-5,6, 30)
ccf_fit = parabola(plot_tau, *fit_params)
# Lag at which the fitted parabola is largest on the plot_tau grid.
max_lag_fit = plot_tau[np.argmax(ccf_fit)]

#%% Make plots ###
plt.figure(2,figsize=[10,10])
#plt.subplot(211)
#plt.plot(tau_arr, ccf,'o')
np.where(humanreadable == sample[i])[0][0] for i in np.arange(numSamples) ] sample_table = (np.hstack( (np.reshape(humanreadable[indices], (numSamples, 1)), np.reshape(ev[indices], (numSamples, 1)), np.reshape(box[indices], (numSamples, 1)), np.reshape(weight[indices] / 100, (numSamples, 1))))) return sample_table print("Mean EV: {}".format(np.mean(ev))) print("Median EV: {}".format(np.median(ev))) print("Weighted Average EV: {}".format(np.average(ev, weights=weight))) print("Weighted Median EV: {}".format( ws.numpy_weighted_median(ev, weights=weight))) joined = sortByEV(ev, weight) print("Percentage of boxes under the median EV: {}".format( np.sum(joined[:9, 1]) / 100)) print("Percentage of boxes with EV over cost: {}".format( np.sum(joined[-12:, 1]) / 100)) print("Mean Box: {}".format(np.mean(box))) print("Median Box: {}".format(np.median(box))) print("Weighted Average Box: {}".format(np.average(box, weights=weight))) print("Weighted Median Box: {}".format( ws.numpy_weighted_median(box, weights=weight))) joinedBox = sortByEV(box, weight) print("Percentage of boxes under the median Cost: {}".format( np.sum(joinedBox[:8, 1]) / 100)) print("Percentage of boxes with Cost >= 109.99: {}".format(