def correlations(outputdir, genos, probesetfreeze):
    print probesetfreeze
    probesetfreezeid = probesetfreeze[0]
    probesetfreezename = probesetfreeze[1]
    probesetfreezefullname = probesetfreeze[2]
    #
    # one tab-separated output file per ProbeSetFreeze, with a header row
    outputfile = open("%s/%d_%s.txt" % (outputdir, probesetfreezeid, probesetfreezename), "w+")
    outputfile.write("%s\t" % "ProbeSet Id")
    outputfile.write("%s\t" % "ProbeSet Name")
    outputfile.write("%s\t" % "Geno Name")
    outputfile.write("%s\t" % "Overlap Number")
    outputfile.write("%s\t" % "Pearson r")
    outputfile.write("%s\t" % "Pearson p")
    outputfile.write("%s\t" % "Spearman r")
    outputfile.write("%s\t" % "Spearman p")
    outputfile.write("\n")
    outputfile.flush()
    #
    probesetxrefs = probesets.get_probesetxref(probesetfreezeid)
    print "Get %d probesetxrefs" % (len(probesetxrefs))
    #
    for probesetxref in probesetxrefs:
        #
        probesetid = probesetxref[0]
        probesetdataid = probesetxref[1]
        probeset = probesets.get_probeset(probesetid)
        probesetname = probeset[1]
        # transpose the data rows and index expression values by lower-cased strain name
        probesetdata = probesets.get_probesetdata(probesetdataid)
        probesetdata = zip(*probesetdata)
        probesetdata = utilities.to_dic([strain.lower() for strain in probesetdata[1]], probesetdata[2])
        #
        for geno in genos:
            genoname = geno['locus']
            outputfile.write("%s\t" % probesetid)
            outputfile.write("%s\t" % probesetname)
            outputfile.write("%s\t" % genoname)
            #
            # correlate genotype values with expression values over the shared strains
            dic1 = geno['dicvalues']
            dic2 = probesetdata
            keys, values1, values2 = utilities.overlap(dic1, dic2)
            rs = calculate.correlation(values1, values2)
            #
            outputfile.write("%s\t" % len(keys))
            outputfile.write("%s\t" % rs[0][0])
            outputfile.write("%s\t" % rs[0][1])
            outputfile.write("%s\t" % rs[1][0])
            outputfile.write("%s\t" % rs[1][1])
            outputfile.write("\n")
            outputfile.flush()
    #
    outputfile.close()
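# `utilities.overlap` and `calculate.correlation` are repo-internal helpers, so the
# sketch below is an assumption inferred from the call sites above: overlap() returns
# the keys shared by two {strain: value} dicts plus the paired value lists, and
# correlation() returns one (r, p) pair per method, indexed as rs[0] (Pearson)
# and rs[1] (Spearman).
from scipy import stats

def overlap(dic1, dic2):
    # keys present in both dicts, in a stable order
    keys = sorted(set(dic1) & set(dic2))
    values1 = [dic1[k] for k in keys]
    values2 = [dic2[k] for k in keys]
    return keys, values1, values2

def correlation(values1, values2):
    pearson = stats.pearsonr(values1, values2)
    spearman = stats.spearmanr(values1, values2)
    return [(pearson[0], pearson[1]), (spearman[0], spearman[1])]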
t_npm = array([t1_npm, t2_npm]).transpose()

# Identify dwells with momentum unloads
print('filtering dumps, nsm/ssm events, short dwells, perigees, and outliers...')
aounload = fetch.Msid('AOUNLOAD', t_start, t_stop)
dump = aounload.vals != 'MON '
if any(dump[:1]) | any(dump[-2:]):
    raise StandardError('Timeframe must not start or end with a momentum dump.')
i1_dump = ~dump[:-1] & dump[1:]
i2_dump = dump[:-1] & ~dump[1:]
if sum(i1_dump) != sum(i2_dump):
    raise StandardError('Dump start and stop times do not correlate.')
t1_dump = aounload.times[nonzero(i1_dump)[0] + 1]
t2_dump = aounload.times[nonzero(i2_dump)[0] + 1]
t_dump = array([t1_dump, t2_dump]).transpose()
bad_dump = overlap(t_npm, t_dump)

# Identify dwells during NSM and SSM events
t_nsm = str_to_secs(nsm)
bad_nsm = overlap(t_npm, t_nsm)
t_ssm = str_to_secs(ssm)
bad_ssm = overlap(t_npm, t_ssm)

# Identify dwells that are too short for accurate reading
bad_short = (t_npm[:, 1] - t_npm[:, 0]) < min_dur

# Identify dwells with low altitude (gravity gradient torques will dominate)
i1_npm_ind = nonzero(i1_npm)[0]
i2_npm_ind = nonzero(i2_npm)[0]
min_dwell_alt = array([min(x['DIST_SATEARTH'].vals[i1_npm_ind[i]:i2_npm_ind[i]])
                       for i in range(len(i1_npm_ind))])
bad_low = min_dwell_alt < min_alt
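# overlap() above is defined elsewhere in this module; from its call sites it is
# assumed to take two Nx2 / Mx2 arrays of [start, stop] times and flag every row
# of the first array that intersects any row of the second -- a minimal sketch
# under that assumption:
from numpy import atleast_2d, zeros

def overlap(intervals, events):
    # True where a dwell [a1, a2] intersects any event [t1, t2]:
    # the two spans intersect when a1 < t2 and a2 > t1
    bad = zeros(len(intervals), dtype=bool)
    for t1, t2 in atleast_2d(events):
        bad |= (intervals[:, 0] < t2) & (intervals[:, 1] > t1)
    return bad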
]].values
is_val = np.full(n_frg, fill_value=True)
fi = 0
# TODO: a better merging strategy would keep the top MQs, but that requires a
# pairwise comparison of all fragments => expensive
while fi < n_frg - 1:
    if fi % 1e6 == 0:
        print('\t{:12,d} fragments are checked for overlap, to be merged.'.format(fi))
    if (frg_np[fi, 0] != frg_np[fi + 1, 0]) or (frg_np[fi, 1] != frg_np[fi + 1, 1]):
        fi += 1
        continue

    # check overlap (ignoring strand)
    fi_be = fi
    while overlap(frg_np[fi_be, 2:4], frg_np[fi + 1:fi + 2, 2:4])[0]:
        fi += 1
        if fi == n_frg - 1:
            break
    if fi_be != fi:  # bam_pd.loc[fi_be:fi]; frg_np[fi_be:fi + 1, :]
        # collapse the run of overlapping fragments onto the first one:
        # widest span, best mapping quality, and the number of merged fragments
        frg_np[fi_be, 2] = np.min(frg_np[fi_be:fi + 1, 2])
        frg_np[fi_be, 3] = np.max(frg_np[fi_be:fi + 1, 3])
        frg_np[fi_be, 4] = np.max(frg_np[fi_be:fi + 1, 4])
        frg_np[fi_be, 5] = fi - fi_be
        is_val[fi_be + 1:fi + 1] = False
    fi += 1
print('\t{:,d} overlapping fragments are merged.'.format(np.sum(~is_val)))
bam_pd[['map_start', 'map_end', 'mq', 'map_#merge']] = frg_np[:, 2:6]
bam_pd = bam_pd.loc[is_val].reset_index(drop=True)
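# overlap() in this loop is the package's utility for genomic spans; the call
# pattern above (a [start, end] pair tested against a slice of rows, then
# indexing the result with [0]) suggests a vectorized closed-interval test --
# a sketch under that assumption:
import numpy as np

def overlap(query, targets):
    # query: [start, end]; targets: Nx2 array of [start, end] rows
    targets = np.atleast_2d(targets)
    return (query[0] <= targets[:, 1]) & (query[1] >= targets[:, 0])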
with pysam.AlignmentFile(inp_args.input_bam, 'rb') as src_fid:
    # hint: no need to check continuity (uniqueness) of the read_ids; we will do
    # this in the make_dataset script: better use of memory
    for rd_idx, read in enumerate(get_read(src_fid)):
        if rd_idx % 1e6 == 0:
            print('\t{:,d} reads are processed'.format(rd_idx))
        n_read += 1

        # check overlap with probes/VPs
        hit_vps = {}
        hit_overlap_size = np.zeros(n_expr + 1, dtype=int)
        for frg in read:
            frg_crd = [chr2nid[frg.reference_name], frg.reference_start, frg.reference_end]
            is_ol = overlap(frg_crd, vp_crds)
            if any(is_ol):
                vp_idx = np.where(is_ol)[0]
                assert len(vp_idx) == 1, '[e] A single fragment is mapped to multiple viewpoints!'
                vp_idx = vp_idx[0]
                hit_overlap_size[vp_idx] += frg.get_overlap(vp_crds[vp_idx, 1], vp_crds[vp_idx, 2])
                if vp_idx not in hit_vps:
                    # coloring is based on the first fragment that maps to the VP
                    clr_ratio = float(np.mean(frg_crd[1:]) - vp_crds[vp_idx, 1]) / (
                        vp_crds[vp_idx, 2] - vp_crds[vp_idx, 1])
                    hit_vps[vp_idx] = {
                        'color':
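# get_read() is a repo-level generator rather than a pysam call; judging from its
# use above it yields one list of mapped fragments per sequencing read, assuming
# the BAM records are grouped by query name -- a minimal sketch:
def get_read(sam_fid):
    frg_set = []
    for frg in sam_fid:
        if frg.is_unmapped:
            continue
        if frg_set and frg.query_name != frg_set[0].query_name:
            yield frg_set
            frg_set = []
        frg_set.append(frg)
    if frg_set:
        yield frg_set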
sig_pd = bin_pd.loc[is_sig].reset_index(drop=True)
print('#bins: {:4d} loaded, {:4d} enriched '.format(bin_pd.shape[0], sig_pd.shape[0]), end='')
del bin_pd  # bin_pd.loc[is_sig]; bin_pd.loc[~is_sig]
if len(sig_pd) == 0:
    print()
    continue

# marking neighbor bins
enrich_crd = sig_pd[['chr', 'pos', 'pos']].values
enrich_crd[:, 2] += bin_width
nei_idxs = np.arange(len(sig_pd))
for ci in range(len(sig_pd)):
    has_ol = overlap(enrich_crd[ci], enrich_crd, offset=inp_args.neighborhood_width)
    if np.sum(has_ol) > 1:
        is_sel = np.isin(nei_idxs, nei_idxs[has_ol])  # sig_pd.loc[has_ol]
        nei_idxs[is_sel] = np.min(nei_idxs[is_sel])  # sig_pd.loc[is_sel]
sig_pd['nei_idx'] = nei_idxs
del enrich_crd

# merging neighbor bins
sig_pd = sig_pd.sort_values(by='#cpt_zscr', ascending=False).reset_index(drop=True)
nei_grp = sig_pd.groupby(by='nei_idx', sort=False)
for rank_idx, (nei_idx, nei_pd) in enumerate(nei_grp):
    itm_crd = [
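# The neighbor-marking loop above is a simple label propagation: every bin starts
# with its own index, and each set of mutually overlapping bins collapses onto the
# smallest index among them. A standalone toy run over hypothetical 1-D bin
# positions (no repo helpers involved):
import numpy as np

pos = np.array([100, 150, 400, 430, 900])  # made-up bin starts
width = 100                                # each bin spans [pos, pos + width)
nei_idxs = np.arange(len(pos))
for ci in range(len(pos)):
    has_ol = (pos[ci] < pos + width) & (pos[ci] + width > pos)
    if np.sum(has_ol) > 1:
        is_sel = np.isin(nei_idxs, nei_idxs[has_ol])
        nei_idxs[is_sel] = np.min(nei_idxs[is_sel])
print(nei_idxs)  # -> [0 0 2 2 4]: three clusters of neighboring bins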
# bin_info['cmb_zscr'] = np.maximum(norm.ppf(1 - bin_info['cmb_qval']), 0)

# correct p-values for multiple testing
# TODO: the correction factor for cis-bins is too strong; there are not enough
# background bins to reach small p-values
# bin_info['#cpt_qval'] = np.minimum(bin_info['#cpt_pval'] * bin_info.shape[0], 1)
# bin_info['cmb_qval'] = np.minimum(bin_info['cmb_pval'] * bin_info.shape[0], 1)

####################################################################################################################
# Output all windows
if inp_args.store_all_enrichments:
    os.makedirs(os.path.dirname(out_fpath_all), exist_ok=True)
    bin_info.to_csv(out_fpath_all, sep='\t', na_rep='nan', index=False, compression='gzip')
    print('All bin scores are saved to: {:s}'.format(out_fpath_all))

# Output top windows
is_roi = overlap([vp_info['vp_chr'], vp_info['vp_be'], vp_info['vp_en']],
                 bin_info[['chr', 'pos', 'pos']].values, offset=inp_args.roi_width)
is_enriched = bin_info['#cpt_zscr'] >= 5.0
bin_nroi = bin_info.loc[(~is_roi) & is_enriched].sort_values(by='#cpt_zscr', ascending=False).reset_index(drop=True)
del is_roi, is_enriched
os.makedirs(os.path.dirname(out_fpath_top), exist_ok=True)
bin_nroi.to_csv(out_fpath_top, sep='\t', na_rep='nan', index=False)
print('{:d} bins with elevated #captures (i.e., z-score >= 5.0) are stored in: {:s}'.format(len(bin_nroi), out_fpath_top))
# assert bin_nroi['#cpt_zscr'].iat[-1] < 8.0, 'Some "top bins" could be cropped; increase #top_bins that are stored (current={:d}).'.format(inp_args.n_topbins)

# Plotting
if inp_args.draw_plot:
    plt.figure(figsize=(25, 7))
    ax_h = plt.gca()
    ax_h.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: '{:,.0f}'.format(x)))

    # Plot important areas
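# The commented-out correction near the top of this block is a plain Bonferroni
# adjustment followed by a conversion of the corrected q-value back into a
# one-sided z-score; a standalone sketch of that transformation with made-up
# p-values:
import numpy as np
from scipy.stats import norm

pvals = np.array([1e-8, 1e-4, 0.02, 0.6])   # hypothetical per-bin p-values
qvals = np.minimum(pvals * len(pvals), 1)   # Bonferroni: q = min(p * n, 1)
zscrs = np.maximum(norm.ppf(1 - qvals), 0)  # one-sided z equivalent, floored at 0
print(qvals, zscrs)                         # q = 1 maps to z = 0, not -inf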
def bxd_geno_pheno_correlations(file):
    #
    file = open(file, 'w')
    inbredsetid = 1
    genofile = "/home/leiyan/gn/web/genotypes/BXD.geno"
    #
    t = genotypes.load_genos(genofile)
    genostrains = t[0]
    genos = t[1]
    print("From geno file, get %d strains" % (len(genostrains)))
    print("From geno file, get %d genos" % (len(genos)))
    #
    publishxrefs = phenotypes.get_publishxrefs(inbredsetid)
    print("get %d publishxrefs" % (len(publishxrefs)))
    #
    file.write("%s\t" % "PhenotypeID")
    file.write("%s\t" % "PhenotypeName")
    file.write("%s\t" % "MarkerName")
    file.write("%s\t" % "MarkerChromosome")
    file.write("%s\t" % "MarkerCentimorgan")
    file.write("%s\t" % "MarkerMb")
    file.write("%s\t" % "PearsonCorrelation")
    file.write("%s\t" % "PearsonPvalue")
    file.write("%s\t" % "SpearmanCorrelation")
    file.write("%s\t" % "SpearmanPvalue")
    file.write("%s\t" % "Number_of_BXDs_used")
    file.write("\n")
    file.flush()
    #
    for publishxref in publishxrefs:
        #
        publishxrefid = publishxref[0]
        phenotypeid = publishxref[1]
        phenotype = phenotypes.get_phenotype(phenotypeid)
        publicationid = publishxref[2]
        publication = phenotypes.get_publication(publicationid)
        publishdataid = publishxref[3]
        # transpose the data rows and index phenotype values by lower-cased strain name
        publishdata = phenotypes.get_publishdata(publishdataid)
        publishdata = zip(*publishdata)
        if len(publishdata) != 3:
            print("publishdata - %s: %d" % (publishxrefid, len(publishdata)))
            continue
        publishdata = utilities.to_dic([strain.lower() for strain in publishdata[1]], publishdata[2])
        #
        for geno in genos:
            #
            # correlate marker genotypes with phenotype values over the shared strains
            dic1 = geno['dicvalues']
            dic2 = publishdata
            keys, values1, values2 = utilities.overlap(dic1, dic2)
            rs = calculate.correlation(values1, values2)
            #
            file.write("%s\t" % publishxrefid)
            file.write("%s;%s;%s\t" % (phenotype[0], phenotype[1], phenotype[2]))
            file.write("%s\t" % geno['locus'])
            file.write("%s\t" % geno['chr'])
            file.write("%s\t" % geno['cm'])
            file.write("%s\t" % geno['mb'])
            file.write("%s\t" % rs[0][0])
            file.write("%s\t" % rs[0][1])
            file.write("%s\t" % rs[1][0])
            file.write("%s\t" % rs[1][1])
            file.write("%s\t" % len(keys))
            file.write("\n")
            file.flush()
    #
    file.close()
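# utilities.to_dic is repo-internal; the call sites above suggest it simply pairs
# two parallel sequences (lower-cased strain names, values) into a dict -- a
# sketch under that assumption:
def to_dic(keys, values):
    dic = {}
    for key, value in zip(keys, values):
        dic[key] = value
    return dic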
enrichments = pd.read_csv(enrichment_fpath, sep='\t')
print('\t[{:2d}/{:d}] Loading enrichments in: {:s}'.format(ei + 1, len(vp_infos), enrichment_fpath))

# filtering enrichments
is_sel = enrichments['bin_width'].isin(inp_args.bin_widths)
enrichments = enrichments.loc[is_sel].reset_index(drop=True)
enrichments = enrichments.sort_values(by=inp_args.enrichment_score, ascending=False).reset_index(drop=True)

# finding overlapping calls across bin_widths/Gaussian_widths/n_steps
enrich_crd = enrichments[['enrich_chr', 'enrich_beg', 'enrich_end']].values
ovl_idxs = np.arange(len(enrich_crd))
for ci in range(len(enrich_crd)):
    has_ol = overlap(enrich_crd[ci], enrich_crd, offset=inp_args.neighborhood_width)
    if np.sum(has_ol) > 1:
        is_in = np.isin(ovl_idxs, ovl_idxs[has_ol])  # clc_pd.loc[has_ol]
        ovl_idxs[is_in] = np.min(ovl_idxs[is_in])  # clc_pd.loc[is_in]
enrichments['ovl_idx'] = np.unique(ovl_idxs, return_inverse=True)[1]
del enrich_crd, ovl_idxs

# select significant calls: the overlaps are determined, so we don't need insignificant calls anymore
is_cis = enrichments['vp_chr'] == enrichments['enrich_chr']
is_sig = (is_cis & (enrichments[inp_args.enrichment_score] >= inp_args.significance_threshold_cis)) | \
         (~is_cis & (enrichments[inp_args.enrichment_score] >= inp_args.significance_threshold))
enrichments = enrichments.loc[is_sig].reset_index(drop=True)
del is_cis, is_sig

# combine across scales
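# np.unique(..., return_inverse=True)[1] above compacts the propagated overlap
# labels into consecutive group ids (0, 1, 2, ...) regardless of which original
# indices survived the min-assignment; a standalone demonstration:
import numpy as np

ovl_idxs = np.array([0, 0, 3, 3, 7])  # labels left behind by the overlap loop
group_ids = np.unique(ovl_idxs, return_inverse=True)[1]
print(group_ids)                       # -> [0 0 1 1 2]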