def learn_falseamp(data, earthmodel, false_dets):
  (start_time, end_time, detections, leb_events, leb_evlist, site_up,
   sites, phasenames, phasetimedef, phaseprop, sitenames, ttime_prefix,
   ddrange_file, qfvc_file, hydro_dir, infra_dir) = data

  numsites = earthmodel.NumSites()
  truecnt = [0 for s in xrange(numsites)]
  site_raw = [[] for s in xrange(numsites)]

  # collect the log-amplitude of each false detection, per site
  for detnum in false_dets:
    siteid = int(detections[detnum, DET_SITE_COL])
    amp = detections[detnum, DET_AMP_COL]
    if amp > 0:
      site_raw[siteid].append(np.log(amp))

  # count true detections (phase 0) of shallow seismic events, per site
  phaseid = 0
  for evnum, event in enumerate(leb_events):
    if event[EV_MEDIUM_COL] != MEDIUM_SEISMIC or event[EV_DEPTH_COL] > 10 \
       or event[EV_MB_COL] < 2:
      continue
    for ph, detnum in leb_evlist[evnum]:
      if ph == phaseid:
        siteid = int(detections[detnum, DET_SITE_COL])
        truecnt[siteid] += 1

  print "False Amp"
  all_loc, all_scale = [], []
  for siteid, raw in enumerate(site_raw):
    if truecnt[siteid] > 1000:
      loc, scale = cauchy.fit(raw)
      print "siteid", siteid, "Cauchy", loc, scale
      all_loc.append(loc)
      all_scale.append(scale)

  print "Gaussian of loc:", norm.fit(all_loc)
  print "InvGamma of scale:", invgamma_fit(all_scale)
def fit_cauchy(self, sample=131072):
    """Fit target-specific Cauchy distributions, and save to HDF5."""
    # sample SNPs
    sample_snps = random.sample(list(self.snp_indexes.values()), sample)

    # sort by chromosome
    chr_sample_snps = {}
    for ci, si in sample_snps:
        chr_sample_snps.setdefault(ci, []).append(si)
    for ci in chr_sample_snps:
        chr_sample_snps[ci] = sorted(chr_sample_snps[ci])

    # read SNPs
    sad = []
    for ci, csnps in chr_sample_snps.items():
        print("Reading %s" % ci, flush=True)
        sad.append(self.chr_sad5[ci].sad_matrix[csnps])
    sad = np.concatenate(sad).astype("float32")

    # initialize fit parameters
    self.target_cauchy_fit_loc = np.zeros(self.num_targets)
    self.target_cauchy_fit_scale = np.zeros(self.num_targets)

    # fit parameters
    for ti in range(self.num_targets):
        print(" Fitting t%d" % ti, flush=True)
        cp = cauchy.fit(sad[:, ti])
        self.target_cauchy_fit_loc[ti] = cp[0]
        self.target_cauchy_fit_scale[ti] = cp[1]

    # write to HDF5
    for chrm, sad5 in self.chr_sad5.items():
        # reopen read/write
        sad5.sad_h5_open.close()
        sad5.sad_h5_open = h5py.File(sad5.sad_h5_file, "r+")

        # clobber any previous fit
        if "target_cauchy_fit_loc" in sad5.sad_h5_open:
            del sad5.sad_h5_open["target_cauchy_fit_loc"]
            del sad5.sad_h5_open["target_cauchy_fit_scale"]

        sad5.sad_h5_open.create_dataset("target_cauchy_fit_loc",
                                        data=self.target_cauchy_fit_loc)
        sad5.sad_h5_open.create_dataset("target_cauchy_fit_scale",
                                        data=self.target_cauchy_fit_scale)

        # reopen read-only
        sad5.sad_h5_open.close()
        sad5.sad_h5_open = h5py.File(sad5.sad_h5_file, "r")
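# A minimal sketch (not part of the class above) of one plausible downstream use
# of the fitted per-target Cauchy parameters: push raw scores through the fitted
# Cauchy CDF, then through the standard normal PPF to quantile-normalize them.
# The names `scores`, `loc`, and `scale` here are hypothetical stand-ins.
import numpy as np
from scipy.stats import cauchy, norm

def cauchy_to_normal(scores, loc, scale):
    u = cauchy.cdf(scores, loc=loc, scale=scale)  # uniform quantiles under the fit
    u = np.clip(u, 1e-12, 1 - 1e-12)              # keep norm.ppf finite
    return norm.ppf(u)

scores = cauchy.rvs(loc=0.02, scale=0.3, size=10000, random_state=0)
loc, scale = cauchy.fit(scores)
z = cauchy_to_normal(scores, loc, scale)
print(z.mean(), z.std())  # close to 0 and 1 when the Cauchy fit is good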
                    cv=5, verbose=1, n_jobs=-2)
grid.fit(dist_filt[:, None])

### KDE representation
kde = KernelDensity(bandwidth=grid.best_params_['bandwidth'], kernel='gaussian')
kde.fit(dist_filt[:, None])
logprob_kde = kde.score_samples(x_d[:, None])
pdfkde = np.exp(logprob_kde)

### Fit a Cauchy distribution
loc, scale = cauchy.fit(dist_filt)
ncauchy = cauchy.pdf(x_d, loc=loc, scale=scale)

### Print info and plot
print(idx, dmin, dmax, np.abs(np.mean(dist)),
      grid.best_params_['bandwidth'], data['metric'][idx])
p = ax.plot(x_d, pdfkde)
axins.plot(wl_vec, f_eps(wl_vec, 1))

if plot_cauchy:
    ax.plot(x_d, ncauchy, linestyle='dashed', color=p[-1].get_color())

idxM = np.argmax(pdfkde)
ax.text(x_d[idxM], pdfkde[idxM], data['metric'][idx])

### Maximum of all of the PDFs
maxpdf = max(maxpdf, np.max(pdfkde))
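# The fragment above begins mid-way through a cross-validated bandwidth search.
# A self-contained sketch of the presumed setup; the synthetic `dist_filt` and
# the bandwidth grid are assumptions for illustration only.
import numpy as np
from scipy.stats import cauchy
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

dist_filt = cauchy.rvs(loc=0.0, scale=0.05, size=2000, random_state=1)
dist_filt = dist_filt[np.abs(dist_filt) < 1.0]  # crude tail filter

grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': 10 ** np.linspace(-3, 0, 20)},
                    cv=5, verbose=1, n_jobs=-2)
grid.fit(dist_filt[:, None])

x_d = np.linspace(-1, 1, 1000)
kde = KernelDensity(bandwidth=grid.best_params_['bandwidth'], kernel='gaussian')
kde.fit(dist_filt[:, None])
pdfkde = np.exp(kde.score_samples(x_d[:, None]))

loc, scale = cauchy.fit(dist_filt)
ncauchy = cauchy.pdf(x_d, loc=loc, scale=scale)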
sigarr_div = np.abs(sigarr_div)
ijarr = np.array(ijarr)

### Sample the nwl * (nwl-1)/2 normal distributions
Zarr = np.zeros(ncomb)
for m in range(ncomb):
    sigrand = sigarr[m]
    # sigrand = 10
    # sigrand = sigarr_expon[m]
    # sigrand = sigarr_div[m]
    # sigrand = np.random.choice(sigarr_expon)
    Xrand = norm.rvs(loc=0, scale=sigrand, size=1)
    Zarr[m] = Xrand[0]

### Fit a Cauchy distribution
loc, sca = cauchy.fit(Zarr)
locnorm, scanorm = norm.fit(Zarr)
dft, loct, scat = t.fit(Zarr)

### Compound distribution
#sigarr[:] = sigrand
#weights = 1/sigarr_expon
#weights = weights / np.sum(weights)
weights = np.ones_like(sigarr)
pdf_cmb = lambda x: np.sum(weights * 1/sigarr * 1/np.sqrt(2*np.pi) * np.exp(-1/2*x**2/sigarr**2))
#pdf_cmb = lambda x: np.sum(weights * 1/sigarr_expon * 1/np.sqrt(2*np.pi) * np.exp(-1/2*x**2/sigarr_expon**2))
#pdf_cmb = lambda x: np.sum(weights * 1/sigarr_div * 1/np.sqrt(2*np.pi) * np.exp(-1/2*x**2/sigarr_div**2))

### Buhlmann
#v2 = np.var(sigarr)
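# Why a Cauchy tends to fit Zarr better than a normal: each Zarr entry is a
# normal draw whose scale itself varies, i.e. a scale mixture of normals, which
# is heavy-tailed. A small self-contained check (all names local to this sketch):
import numpy as np
from scipy.stats import cauchy, norm

rng = np.random.default_rng(0)
sig = rng.exponential(scale=1.0, size=20000)  # random per-sample scales
z = rng.normal(loc=0.0, scale=sig)            # the scale mixture

loc_c, sca_c = cauchy.fit(z)
mu_n, sd_n = norm.fit(z)

# tail mass beyond |z| > 5: the normal fit badly underestimates it
print(np.mean(np.abs(z) > 5),
      2 * cauchy.sf(5, loc=loc_c, scale=sca_c),
      2 * norm.sf(5, loc=mu_n, scale=sd_n))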
            bins=np.linspace(-0.5, 0.5, 100), density=True)
iqrs = np.percentile(strided['dspeed_rel'], [25, 75])
axs[0].axvline(iqrs[0], color='k')
axs[0].axvline(iqrs[1], color='k')

axs[1].hist(strided['dhdg_rel'], bins=np.linspace(-3.0, 3.0, 100), density=True)
iqrs = np.percentile(strided['dhdg_rel'], [25, 75])
axs[1].axvline(iqrs[0], color='k')
axs[1].axvline(iqrs[1], color='k')

axs[0].set_title(f'Stride: {stride}')

# fit and overlay a Cauchy on the relative speed deltas
res = cauchy.fit(strided['dspeed_rel'])
loc, scale = res
samp = np.linspace(-0.5, 0.5, 100)
pdf = cauchy.pdf(samp, *res)
axs[0].plot(samp, pdf, color='tab:orange')
axs[0].text(0.05, 0.85,
            f"Cauchy(loc={loc:.3f}, scale={scale:.3f})",
            transform=axs[0].transAxes)

# fit and overlay a Cauchy on the relative heading deltas
res = cauchy.fit(strided['dhdg_rel'])
loc, scale = res
samp = np.linspace(-np.pi, np.pi, 100)
pdf = cauchy.pdf(samp, *res)
axs[1].plot(samp, pdf, color='tab:orange')
axs[1].text(0.05,
def fit_cauchy(sad, ti):
    print('Fitting t%d' % ti)
    return cauchy.fit(sad[:, ti])
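# One way the helper above could be fanned out over targets; a sketch using
# multiprocessing (the pool arrangement is an assumption, not taken from the
# surrounding code):
import multiprocessing
from functools import partial

def fit_all_targets(sad, num_targets, processes=4):
    with multiprocessing.Pool(processes) as pool:
        # each task fits one target column via fit_cauchy(sad, ti)
        return pool.map(partial(fit_cauchy, sad), range(num_targets))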
wl_vec = np.linspace(lambda_0, lambda_N, 3000)
pix_vec = np.linspace(0, 2999, 3000)
pix_vec = np.array(pix_vec, dtype=np.int64)

ncomb = len(chosen_pix)
ncomb = int(nwl * (nwl - 1) / 2)

_, dToT = generate_Taverage_distribution(T0, wl_vec, pix_vec, nwl)
muTbar, sigTbar, ratio, muThat, sigThat = compute_high_order_variance(
    T0, sigma_I, w)

# _ = plt.hist(dToT[(dToT<0.1)&(dToT>-0.1)],bins=100,normed=True,histtype='step')
# dToT = dToT[(dToT<0.1)&(dToT>-0.1)]

loc, sca = cauchy.fit(dToT)
x_d = np.linspace(-0.1, 0.1, 1000)
plt.plot(x_d, cauchy.pdf(x_d, loc=loc, scale=sca), 'k-')

mu = np.average(dToT)
sig = np.std(dToT)
skw = skew(dToT)
krt = kurtosis(dToT, fisher=False)

res.append([mu, sig, skw, krt])

res = np.array(res)

#plt.xlim([-0.1,0.1])
#avec = np.arange(20,200,20)
#for alpha in avec:
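# Caveat for the moment statistics above: if dToT is close to Cauchy, its
# variance and higher moments do not exist, so np.std, skew, and kurtosis will
# not stabilize as the sample grows, while the fitted Cauchy scale does.
# A quick self-contained check:
import numpy as np
from scipy.stats import cauchy

for n in (10**3, 10**4, 10**5):
    z = cauchy.rvs(loc=0.0, scale=0.01, size=n, random_state=42)
    loc_n, sca_n = cauchy.fit(z)
    print(n, np.std(z), sca_n)  # the std wanders with n; sca_n stays near 0.01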
from scipy.stats import cauchy, norm  # noqa

residuals = gandalfs.zenith - primaries.zenith
cut = (gandalfs["lambda"] < l) & (np.abs(residuals) < 2 * np.pi)
residuals = residuals[cut]
event_info = event_info[cut]  # keep the event info aligned with the cut

# convert rad -> deg
residuals = residuals * 180 / np.pi
pi = 180

# x axis for plotting
x = np.linspace(-pi, pi, 1000)

c_loc, c_gamma = cauchy.fit(residuals)
fwhm = 2 * c_gamma

g_mu_bad, g_sigma_bad = norm.fit(residuals)
g_mu, g_sigma = norm.fit(residuals[np.abs(residuals) < 10])

plt.hist(residuals, bins="auto", label="Histogram", density=True, alpha=0.7)
plt.plot(
    x,
    cauchy(c_loc, c_gamma).pdf(x),
    label="Lorentz: FWHM $=${:.3f}".format(fwhm),
    linewidth=2,
)
plt.plot(
    x,
    norm(g_mu_bad, g_sigma_bad).pdf(x),
def main():
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-o", dest="out_dir", default="sad_norm")
    parser.add_option(
        "-s",
        dest="sample",
        default=100000,
        type="int",
        help="Number of SNPs to sample for fit [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide SAD HDF5 path")
    else:
        sad_h5_path = args[0]

    # retrieve chromosome SAD HDF5 files
    chr_sad_h5_files = sorted(glob.glob("%s/*/sad.h5" % sad_h5_path))
    assert len(chr_sad_h5_files) > 0

    # clean out any existing fits
    # count SNPs across chromosomes
    num_snps = 0
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad_h5 = h5py.File(chr_sad_h5_file, "r+")

        # delete fit params
        if "target_cauchy_fit_loc" in chr_sad_h5.keys():
            del chr_sad_h5["target_cauchy_fit_loc"]
            del chr_sad_h5["target_cauchy_fit_scale"]

        # delete norm params
        if "target_cauchy_norm_loc" in chr_sad_h5.keys():
            del chr_sad_h5["target_cauchy_norm_loc"]
            del chr_sad_h5["target_cauchy_norm_scale"]

        # count SNPs
        num_snps += chr_sad_h5["SAD"].shape[0]
        num_targets = chr_sad_h5["SAD"].shape[-1]

        chr_sad_h5.close()

    # sample SNPs across chromosomes
    sad = sample_sad(chr_sad_h5_files, options.sample, num_snps, num_targets)

    # initialize fit parameters
    target_cauchy_fit_loc = np.zeros(num_targets)
    target_cauchy_fit_scale = np.zeros(num_targets)

    # fit parameters
    for ti in range(num_targets):
        print("Fitting t%d" % ti, flush=True)
        cp = cauchy.fit(sad[:, ti])
        target_cauchy_fit_loc[ti] = cp[0]
        target_cauchy_fit_scale[ti] = cp[1]
    del sad

    # write across chromosomes
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad_h5 = h5py.File(chr_sad_h5_file, "r+")
        chr_sad_h5.create_dataset("target_cauchy_fit_loc", data=target_cauchy_fit_loc)
        chr_sad_h5.create_dataset(
            "target_cauchy_fit_scale", data=target_cauchy_fit_scale
        )
        chr_sad_h5.close()

    # compute normalization parameters
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad5 = SAD5(chr_sad_h5_file)

    # QC fit table
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)
    fit_out = open("%s/fits.txt" % options.out_dir, "w")
    for ti in range(num_targets):
        print(
            "%-4d %7.1e %7.1e"
            % (ti, target_cauchy_fit_loc[ti], target_cauchy_fit_scale[ti]),
            file=fit_out,
        )
    fit_out.close()

    # QC quantiles
    quantile_dir = "%s/quantiles" % options.out_dir
    if not os.path.isdir(quantile_dir):
        os.mkdir(quantile_dir)
    sad_qc = sample_sad(chr_sad_h5_files, 2048, num_snps, num_targets)
    for ti in np.linspace(0, num_targets - 1, 64, dtype="int"):
        # compute cauchy and argsort quantiles
        cauchy_q = cauchy.cdf(
            sad_qc[:, ti],
            loc=target_cauchy_fit_loc[ti],
            scale=target_cauchy_fit_scale[ti],
        )
        sort_i = np.argsort(sad_qc[:, ti])

        quantile_pdf = "%s/t%d.pdf" % (quantile_dir, ti)

        jointplot(
            np.linspace(0, 1, len(sort_i)),
            cauchy_q[sort_i],
            quantile_pdf,
            square=True,
            cor=None,
            x_label="Empirical",
            y_label="Cauchy",
        )

    # QC plots
    norm_dir = "%s/norm" % options.out_dir
    if not os.path.isdir(norm_dir):
        os.mkdir(norm_dir)
    chr_sad5 = SAD5(chr_sad_h5_files[0])
    qc_sample = 2048
    if qc_sample < chr_sad5.num_snps:
        ri = sorted(
            np.random.choice(
                np.arange(chr_sad5.num_snps), size=qc_sample, replace=False
            )
        )
    else:
        ri = np.arange(chr_sad5.num_snps)
    qc_sad_raw = chr_sad5.sad_matrix[ri]
    qc_sad_norm = chr_sad5[ri]
    for ti in np.linspace(0, num_targets - 1, 32, dtype="int"):
        plt.figure()
        sns.jointplot(
            qc_sad_raw[:, ti], qc_sad_norm[:, ti], joint_kws={"alpha": 0.5, "s": 10}
        )
        plt.savefig("%s/t%d.pdf" % (norm_dir, ti))
        plt.close()
def plot_histogram(filename, column_names=[], skip_cols=[], nbins=10,
                   trimends=False, autosave=False, save_directory='',
                   save_format='svg', delimiter=None):
    """
    Plots a histogram formed from the columns of the specified file.

    If column_names is specified, the titles of the plots will be renamed
    accordingly. Otherwise "Title" is inserted instead.

    skip_cols specifies any columns in the data that should be skipped.
    Columns at the end of the line may be skipped by using negative numbers.
    In this scheme the last column in a row is -1.
    """
    infile = open(filename, 'r')
    if delimiter:
        data = loadtxt(infile, dtype=float, delimiter=',')
    else:
        data = loadtxt(infile, dtype=float)
    infile.close()

    end_col = data.shape[1]

    norm_stats = list()
    cauchy_stats = list()

    # Reinterpret any negative numbers in skip_cols to be at the end of the line
    for column in range(0, len(skip_cols)):
        if skip_cols[column] < 0:
            skip_cols[column] = end_col + skip_cols[column]

    namecol = 0
    for column in range(0, end_col):
        # Skip the column if instructed to do so:
        if column in skip_cols:
            continue

        # extract the data column:
        temp = data[:, column]

        if trimends:
            minval = min(temp)
            maxval = max(temp)
            temp = filter(lambda x: x > minval, temp)
            temp = filter(lambda x: x < maxval, temp)

        # plot a histogram of the data:
        [n, bins, patches] = plt.hist(temp, bins=nbins, normed=True,
                                      label='Binned data')

        # fit a normal distribution:
        [norm_mu, norm_sigma] = norm.fit(temp)
        y = mlab.normpdf(bins, norm_mu, norm_sigma)
        legend_gauss = r'Normal: $\mu=%.3f,\ \sigma=%.3f$' % (norm_mu, norm_sigma)
        l = plt.plot(bins, y, 'r--', linewidth=2, label=legend_gauss)

        # fit a Lorentz/Cauchy distribution:
        # bug workaround for http://projects.scipy.org/scipy/ticket/1530
        # - specify a starting centroid value for the fit
        [cauchy_mu, cauchy_gamma] = cauchy.fit(temp, loc=norm_mu)
        y = cauchy.pdf(bins, loc=cauchy_mu, scale=cauchy_gamma)
        legend_cauchy = r'Cauchy: $\mu=%.3f,\ \gamma=%.3f$' % (cauchy_mu, cauchy_gamma)
        l = plt.plot(bins, y, 'g--', linewidth=2, label=legend_cauchy)

        # now setup the axes labels:
        try:
            title = column_names[namecol]
            namecol += 1
        except IndexError:
            title = "Title"

        plt.title(title)
        plt.xlabel("Value")
        plt.ylabel("Frequency")
        plt.legend(loc='best')

        if autosave:
            plt.savefig(save_directory + '/stats_hist_' + title + '.' +
                        save_format, transparent=True, format=save_format)
            plt.close()
        else:
            plt.show()

        # Add in the statistical information.
        norm_stats.append([title, norm_mu, norm_sigma])
        cauchy_stats.append([title, cauchy_mu, cauchy_gamma])

    # Now either print out or save the statistical information
    if not autosave:
        print "Normal Statistics:"
    write_statistics(save_directory + '/stats_normal.txt', norm_stats, autosave)

    if not autosave:
        print "Cauchy Statistics:"
    write_statistics(save_directory + '/stats_cauchy.txt', cauchy_stats, autosave)
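# The loc= seed in the Cauchy fit above matters: scipy's generic MLE fitter can
# start from a poor initial location (the ticket referenced in the comment). A
# tiny illustration of seeding the fit with the normal-fit centroid; synthetic
# data, offered as a sketch only:
import numpy as np
from scipy.stats import cauchy, norm

data = cauchy.rvs(loc=50.0, scale=2.0, size=5000, random_state=0)
mu0, _ = norm.fit(data)                     # rough centroid estimate
loc_d, gamma_d = cauchy.fit(data)           # default starting point
loc_s, gamma_s = cauchy.fit(data, loc=mu0)  # seeded, as in plot_histogram
print(loc_d, gamma_d)
print(loc_s, gamma_s)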
def assign_sex(sextable, Rx_init, soft='Cauchy'):
    """Assign sex to samples using k-means clustering."""
    Rx = list(map(operator.itemgetter(6), sextable))
    centroid, hard_classification = kmeans2(
        Rx, np.array([0.5 * Rx_init, Rx_init]), minit='matrix')
    Rx_m = [Rx[i] for i, j in enumerate(hard_classification) if j == 0]
    Rx_f = [Rx[i] for i, j in enumerate(hard_classification) if j == 1]
    m_mu, m_std = centroid[0], np.std(Rx_m)
    f_mu, f_std = centroid[1], np.std(Rx_f)

    if soft == 'Normal':
        # Rx~Normal
        m_dist = norm(m_mu, m_std)
        f_dist = norm(f_mu, f_std)
    elif soft == 'Beta':
        # Rx~Beta - nasty estimation and looks like the normal anyway
        # (Mx and Ma are assumed to be defined at module level)
        mMx = [Mx[i] for i, j in enumerate(hard_classification) if j == 0]
        mMa = [Ma[i] for i, j in enumerate(hard_classification) if j == 0]
        fMx = [Mx[i] for i, j in enumerate(hard_classification) if j == 1]
        fMa = [Ma[i] for i, j in enumerate(hard_classification) if j == 1]
        mloc, mscale = beta.fit_loc_scale(Rx_m, np.median(mMx), np.median(mMa))
        ma, mb, mloc, mscale = beta.fit(Rx_m, floc=mloc, fscale=mscale)
        #print(ma, mb, mloc, mscale)
        floc, fscale = beta.fit_loc_scale(Rx_f, np.median(fMx), np.median(fMa))
        fa, fb, floc, fscale = beta.fit(Rx_f, floc=floc, fscale=fscale)
        #print(fa, fb, floc, fscale)
        m_dist = beta(ma, mb, loc=mloc, scale=mscale)
        f_dist = beta(fa, fb, loc=floc, scale=fscale)
    elif soft == 'Cauchy':
        # Rx~Cauchy - assumption of independence for num/denom is violated,
        # but otherwise seems sensible
        m_loc, m_scale = cauchy.fit(Rx_m)
        f_loc, f_scale = cauchy.fit(Rx_f)
        m_dist = cauchy(m_loc, m_scale)
        f_dist = cauchy(f_loc, f_scale)
        # use Cauchy central tendency
        m_mu = m_loc
        f_mu = f_loc

    m_bound = m_dist.ppf(0.95)
    f_bound = f_dist.ppf(0.05)
    m_int = m_dist.interval(0.99)
    f_int = f_dist.interval(0.99)
    m_int = (max(0.0, m_int[0]), min(1.0, m_int[1]))
    f_int = (max(0.0, f_int[0]), min(1.0, f_int[1]))

    soft_classification = []
    for x in Rx:
        if x < m_bound:
            soft_classification.append(0)
        elif x > f_bound:
            soft_classification.append(1)
        else:
            soft_classification.append(None)

    # numerically integrate P(counts | sex) over each sex's Rx interval
    zn = 100.0
    logit_pm = np.empty(len(Rx))
    cntrd = centroid
    for i, _ in enumerate(Rx):
        # leave-one-out variant, disabled:
        #Rx_i = Rx[:i] + Rx[i + 1:]
        #cntrd, _ = kmeans2(Rx_i, np.array([0.5*Rx_init, Rx_init]), minit='matrix')
        x, a = sextable[i][1], sextable[i][3]
        p_m = 0
        p_f = 0
        for z in np.linspace(m_int[0], m_int[1], int(zn)):
            p_m += binom.pmf(x, x + a, z) * (m_dist.cdf(z + 0.5 / zn) -
                                             m_dist.cdf(z - 0.5 / zn))
        for z in np.linspace(f_int[0], f_int[1], int(zn)):
            p_f += binom.pmf(x, x + a, z) * (f_dist.cdf(z + 0.5 / zn) -
                                             f_dist.cdf(z - 0.5 / zn))
        #p_m /= zn
        #p_f /= zn
        if p_m == 0.0:
            logit_pm[i] = -math.log(p_f)
        elif p_f == 0.0:
            logit_pm[i] = math.log(p_m)
        else:
            logit_pm[i] = math.log(p_m) - math.log(p_f)
        #print(p_m, p_f, logit_pm[i])
        #p_m = binom.logsf(x-1, x+a, cntrd[0])
        #p_f = binom.logcdf(x-1, x+a, cntrd[1])
        #logit_pm[i] = p_m - p_f

    return (hard_classification, soft_classification, m_dist, f_dist,
            m_mu, f_mu, m_bound, f_bound, logit_pm)
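# Hypothetical usage of assign_sex: build a minimal sextable whose columns 1, 3,
# and 6 carry the x count, a count, and Rx ratio the function reads (this column
# layout is an assumption for the sketch, as are the per-sex Rx values).
import numpy as np

rng = np.random.default_rng(0)
rows = []
for is_female in rng.integers(0, 2, size=60):
    p = 0.5 if is_female else 0.25  # illustrative expected Rx per sex
    x = rng.binomial(1000, p)
    a = 1000 - x
    rows.append(('sample', x, None, a, None, None, x / float(x + a)))

out = assign_sex(rows, Rx_init=0.5, soft='Cauchy')
hard, soft_cls = out[0], out[1]
print(list(hard)[:10], soft_cls[:10])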