def align_list_to_bed(*, align_list):
    """Collapse alignment hits into a BedTool of merged intervals.

    BED records carry a single chromosome field, so ``assembly_id`` and
    ``sequence_id`` are temporarily joined with a comma, the intervals are
    sorted and merged on that combined key, and the key is split apart again
    afterwards.

    Args:
        align_list: pandas DataFrame providing the columns ``assembly_id``,
            ``sequence_id``, ``sequence_from`` and ``sequence_to``.  The two
            id columns are concatenated with ``+``, so they are assumed to be
            strings; ``assembly_id`` must not contain a comma because the
            combined key is split on the first comma.

    Returns:
        A BedTool with the fields ``sequence_id``, ``start``, ``end`` and
        ``assembly_id`` (in that order), or an empty BedTool when merging
        produced no intervals.
    """
    keyed_align_list = align_list.assign(
        assembly_id_sequence_id=lambda x: x['assembly_id'] + ',' + x['sequence_id'])
    temp_bed = BedTool.from_dataframe(keyed_align_list[[
        'assembly_id_sequence_id', 'sequence_from', 'sequence_to'
    ]])

    # Every intermediate BedTool is backed by its own temporary file; track
    # them all so none is left behind, whichever branch runs and even if a
    # bedtools call raises.
    intermediates = [temp_bed]
    try:
        temp_sorted_bed = temp_bed.sort()
        intermediates.append(temp_sorted_bed)
        temp_merged_bed = temp_sorted_bed.merge()
        intermediates.append(temp_merged_bed)

        if temp_merged_bed.count() > 0:
            temp_merged_bed_df = temp_merged_bed.to_dataframe()
            # Split the combined key back into its two identifiers.
            id_columns = temp_merged_bed_df['chrom'].str.split(
                ',', n=1, expand=True).rename(columns={
                    0: 'assembly_id',
                    1: 'sequence_id'
                })
            temp_merged_bed_df = pandas.concat(
                [id_columns, temp_merged_bed_df[['start', 'end']]], axis=1)
            bed = BedTool.from_dataframe(
                temp_merged_bed_df[['sequence_id', 'start', 'end', 'assembly_id']])
        else:
            bed = BedTool('', from_string=True)
    finally:
        for intermediate in intermediates:
            if os.path.exists(intermediate.fn):
                os.remove(intermediate.fn)
    return bed
def aggregate_by_tad(all_TADs_by_celltype, aggregations, other, extension=0.1, n_windows=100):
    """Aggregate features over a fixed number of windows spanning each TAD.

    Each TAD is extended on both sides by ``extension`` (a fraction of its
    length), cut into equally numbered windows, and the columns of ``other``
    listed in ``aggregations`` are summarised per window.

    Args:
        all_TADs_by_celltype: DataFrame holding the ``coords`` columns and a
            string ``tad_uid`` column.
        aggregations: mapping of column name in ``other`` -> bedtools map
            operation.
        other: DataFrame of features handed to ``coverage_by_window``.
        extension: fraction of the TAD length added on each side.
        n_windows: number of windows covering the TAD itself.

    Returns:
        Tuple ``(aggregations_by_tad, tad_start_window, tad_end_window)``:
        a dict of pivot tables (TAD x window number) keyed by aggregated
        column, plus the window indices at which the TAD body starts and ends.
    """
    flank_windows = int(n_windows * extension)
    tad_start_window = flank_windows
    tad_end_window = n_windows + flank_windows
    tot_windows = n_windows + flank_windows * 2

    regions = all_TADs_by_celltype[coords + ['tad_uid']].copy()
    # window_maker names windows "<id>_<n>"; strip underscores from the ids so
    # that the split below yields exactly two fields.
    regions['tad_uid'] = regions.tad_uid.map(lambda uid: uid.replace("_", "-"))

    extended_regions = BedTool.from_dataframe(regions).slop(
        l=extension, r=extension, pct=True, genome="hg19")
    windows = BedTool().window_maker(
        b=extended_regions, n=tot_windows, i='srcwinnum'
    ).to_dataframe(names=coords + ['window_uid'])

    windows_idxs = windows.window_uid.str.split("_", expand=True)
    windows_idxs.columns = ['tad_uid', 'win_num']
    windows = pd.concat((windows, windows_idxs), axis=1)
    windows['win_num'] = windows['win_num'].astype(int)
    windows = windows.sort_values(coords).reset_index(drop=True)

    windows_with_ctcfs = coverage_by_window(windows, other, aggregations)

    aggregations_by_tad = {}
    for column in aggregations:
        # Overwrite the progress line in place.
        print(" " * 100, end='\r')
        print(column, end="\r")
        aggregations_by_tad[column] = windows_with_ctcfs.pivot_table(
            index='tad_uid', columns='win_num', values=column
        ).sort_index(axis=1)
    return aggregations_by_tad, tad_start_window, tad_end_window
def map_signal_on_sites(ctcf_sites, sample, name, slop=0):
    """Annotate each site with the maximum overlapping signal from ``sample``.

    Args:
        ctcf_sites: DataFrame of sites in BED column order.
        sample: DataFrame of signal intervals; the value is read from its
            4th column.
        name: name of the new column holding the mapped signal.
        slop: number of bases added to both sides of every site before
            mapping (applied only when greater than 0).

    Returns:
        DataFrame with the columns of ``ctcf_sites`` plus ``name``; sites for
        which bedtools reports ``'.'`` get 0.
    """
    def _signal_or_zero(raw):
        return float(raw) if raw != '.' else 0

    sites_bed = BedTool.from_dataframe(ctcf_sites)
    if slop > 0:
        sites_bed = sites_bed.slop(b=slop, genome='hg19')

    sample_bed = BedTool.from_dataframe(sample).sort()
    output_columns = ctcf_sites.columns.tolist() + [name]
    ctcf_on_sample = sites_bed.map(sample_bed, c=4, o='max').to_dataframe(
        names=output_columns)
    ctcf_on_sample[name] = ctcf_on_sample[name].map(_signal_or_zero)
    return ctcf_on_sample
def windowing_by_size(centered_boundaries, window_size):
    """Cut every region into consecutive windows of ``window_size`` bases.

    Args:
        centered_boundaries: DataFrame in BED column order whose last column
            is the region identifier.
        window_size: width of each window, in bases.

    Returns:
        DataFrame with the same columns as the input (the last one again
        holding the region identifier, as a string) plus ``w_num``, the
        0-based index of the window within its region, sorted by ``coords``.
    """
    id_column = centered_boundaries.columns[-1]
    windows = BedTool().window_maker(
        b=BedTool.from_dataframe(centered_boundaries),
        w=window_size,
        i='srcwinnum'
    ).to_dataframe(names=centered_boundaries.columns.tolist())

    # 'srcwinnum' names each window "<source id>_<window number>".  Split on
    # the LAST underscore only, so that source ids which themselves contain a
    # varying number of underscores are recovered intact.
    uid_parts = windows[id_column].str.rsplit("_", n=1, expand=True)
    windows[id_column] = uid_parts[0]
    # bedtools numbers windows from 1; expose a 0-based index.
    windows['w_num'] = uid_parts[1].astype(int) - 1

    windows = windows.sort_values(coords).reset_index(drop=True)
    return windows
def windowing_by_number(all_TADs_by_celltype, n_windows):
    """Cut every region into ``n_windows`` windows.

    Args:
        all_TADs_by_celltype: DataFrame in BED column order whose last column
            is the region identifier.
        n_windows: number of windows to create per region.

    Returns:
        DataFrame with the same columns as the input (the last one again
        holding the region identifier, as a string) plus ``w_num``, the
        0-based index of the window within its region, sorted by ``coords``.
    """
    id_column = all_TADs_by_celltype.columns[-1]
    windows = BedTool().window_maker(
        b=BedTool.from_dataframe(all_TADs_by_celltype),
        n=n_windows,
        i='srcwinnum'
    ).to_dataframe(names=all_TADs_by_celltype.columns.tolist())

    # 'srcwinnum' names each window "<source id>_<window number>".  Split on
    # the LAST underscore only, so that source ids which themselves contain a
    # varying number of underscores are recovered intact.
    uid_parts = windows[id_column].str.rsplit("_", n=1, expand=True)
    windows[id_column] = uid_parts[0]
    # bedtools numbers windows from 1; expose a 0-based index.
    windows['w_num'] = uid_parts[1].astype(int) - 1

    windows = windows.sort_values(coords).reset_index(drop=True)
    return windows
def coverage_by_window(windows, ctcfs, agg, null=0):
    """Summarise columns of ``ctcfs`` over each window with ``bedtools map``.

    Args:
        windows: DataFrame of windows in BED column order.
        ctcfs: DataFrame of features in BED column order.
        agg: mapping of column name in ``ctcfs`` -> bedtools map operation.
        null: value passed to bedtools for windows without overlaps.

    Returns:
        DataFrame with the columns of ``windows`` followed by one column per
        key of ``agg`` (same names, same order).

    Raises:
        ValueError: if a key of ``agg`` is not a column of ``ctcfs``.
    """
    feature_columns = ctcfs.columns.tolist()
    cns = list(agg)
    # bedtools addresses columns by 1-based position.
    c = [feature_columns.index(cn) + 1 for cn in cns]
    o = [agg[cn] for cn in cns]

    windows_bed = BedTool.from_dataframe(windows)
    features_bed = BedTool.from_dataframe(ctcfs)
    mapped = windows_bed.map(features_bed, c=c, o=o, null=null)
    return mapped.to_dataframe(names=windows.columns.tolist() + cns)
def merge_intervals(panels):
    """Build the genomic intervals used to filter VCF files from panel file(s)

    Accepts:
        panels(list) : path to one or more panel bed files

    Returns:
        merged_panels(BedTool): the first panel alone when only one is given,
                                otherwise the result of concatenating all
                                panels with ``BedTool.cat``
    """
    merged_panels = BedTool(panels[0])
    additional_panels = panels[1:]
    if len(additional_panels) == 0:
        return merged_panels
    return merged_panels.cat(*additional_panels)
def enrichment(triangles_in, ctcfs, aggregations, centers='start',
               extended=1000*1000, window_size=10*1000,
               value_function = lambda x: x.forward if x.forward > x.reverse \
                                          else -x.reverse):
    """Profile a per-window value around a reference point of each triangle.

    Every triangle is collapsed to the position given by its ``centers``
    column, extended by ``extended`` bases on both sides, cut into windows of
    ``window_size`` bases and annotated with the aggregated ``ctcfs`` columns.
    ``value_function`` then reduces each window row to a single number.

    Args:
        triangles_in: DataFrame holding the ``coords`` columns and
            ``triangle_uid``; it is not modified.
        ctcfs: DataFrame of features handed to ``coverage_by_window``.
        aggregations: mapping of column name in ``ctcfs`` -> bedtools map
            operation.
        centers: name of the column of ``triangles_in`` used as the centre.
        extended: number of bases added on each side of the centre.
        window_size: width of each window, in bases.
        value_function: callable applied row-wise to the annotated windows.
            The default reads the ``forward`` and ``reverse`` columns and
            returns ``forward`` when it is larger, otherwise ``-reverse``.

    Returns:
        Pivot table indexed by ``triangle_uid`` with one column per ``w_num``.
    """
    center_positions = triangles_in[centers]
    centered_triangles = triangles_in.copy()
    centered_triangles['start'] = center_positions
    centered_triangles['end'] = center_positions

    slopped = BedTool.from_dataframe(centered_triangles).slop(
        b=extended, genome='hg19')
    centered_triangles = slopped.to_dataframe(names=centered_triangles.columns)

    # Keep only regions that received the full extension on both sides.
    region_width = centered_triangles.end - centered_triangles.start
    centered_triangles = centered_triangles[region_width == extended * 2]

    windows = windowing_by_size(centered_triangles[coords + ['triangle_uid']],
                                window_size=window_size)
    windows_with_ctcf = coverage_by_window(windows.sort_values(coords),
                                           ctcfs.sort_values(coords),
                                           aggregations)

    triangle_annotations = centered_triangles.drop(coords, axis=1)
    windows_with_ctcf = windows_with_ctcf.merge(triangle_annotations,
                                                on='triangle_uid')
    windows_with_ctcf['value'] = windows_with_ctcf.apply(value_function, axis=1)

    return windows_with_ctcf.pivot_table(index='triangle_uid',
                                         columns='w_num',
                                         values='value')
def genes_to_bedtool(gene_collection, hgnc_ids=None, ensembl_ids=None, build="GRCh37"):
    """Build a BedTool holding the coordinates of the requested genes

    Accepts:
        gene_collection(pymongo.collection.Collection)
        hgnc_ids(list): a list of hgnc genes ids
        ensembl_ids(list): a list of ensembl gene ids, used only when no
                           hgnc id is given
        build(str): genome build, GRCh37 or GRCh38

    Returns:
        bt(pybedtools.bedtool.BedTool): a BedTool object containing gene intervals,
                                        or None when no ids were given or no
                                        gene matched the query
    """
    # Either HGNC or ENSEMBL ids end up in the query, never both.
    if hgnc_ids:
        id_filter = {"hgnc_id": {"$in": hgnc_ids}}
    elif ensembl_ids:
        id_filter = {"ensembl_id": {"$in": ensembl_ids}}
    else:
        return None  # No gene was specified to filter VCF file with

    query = {"build": build, **id_filter}

    # One tab-separated BED line per gene returned by the database
    bed_lines = [
        "\t".join([gene["chromosome"], str(gene["start"]), str(gene["end"])]) + "\n"
        for gene in gene_collection.find(query)
    ]
    if not bed_lines:
        return None

    return BedTool("".join(bed_lines), from_string=True)
def shift_test(cons_TADs_pos, int_on_TADs, shifts):
    """Count intervals covering each position after shifting it by each offset.

    For every value in ``shifts`` each position is moved by that many bases
    (clamped at 0), turned into a zero-length interval (start == end) and the
    overlapping rows of ``int_on_TADs`` are counted with ``bedtools map``.

    Args:
        cons_TADs_pos: DataFrame in BED column order with a ``start`` column.
        int_on_TADs: DataFrame of intervals in BED column order.
        shifts: iterable of offsets, in bases.

    Returns:
        One DataFrame concatenating the results of all shifts, with the
        columns of the shifted positions plus ``id`` (row number of the
        position), ``count`` and ``shift``.

    Raises:
        ValueError: from ``pd.concat`` when ``shifts`` is empty.
    """
    n_cross = cons_TADs_pos.reset_index(drop=True).copy()
    n_cross['id'] = n_cross.index

    # Loop-invariant: write the interval table to disk once instead of once
    # per shift.
    intervals_bed = BedTool.from_dataframe(int_on_TADs)

    res = []
    for shift in shifts:
        # Overwrite the progress line in place.
        print(" "*100, end='\r')
        print("\t{}".format(shift), end='\r')

        shifted_bounds = n_cross.copy()
        # Vectorised clamp: positions cannot move below coordinate 0.
        pos = (shifted_bounds['start'] + shift).clip(lower=0)
        shifted_bounds['start'] = pos
        shifted_bounds['end'] = pos

        gh_with_bounds = BedTool.from_dataframe(shifted_bounds)\
            .map(intervals_bed, c=1, o='count', null=0, f=1)\
            .to_dataframe(names=shifted_bounds.columns.tolist() + ['count'])
        gh_with_bounds['shift'] = shift
        res.append(gh_with_bounds)
    return pd.concat(res, axis=0, ignore_index=True)
def _compute_intersections(vcf_file, filter):
    """Keep the entries of a VCF file that overlap the given intervals

    Accepts:
        vcf_file(str): path to the VCF file
        filter(BedTool): intervals to intersect the VCF entries with

    Returns:
        intersections(BedTool): the overlapping VCF entries, header included
    """
    vcf_bed = BedTool(vcf_file)

    n_intervals = filter.count()
    n_entries = vcf_bed.count()
    LOG.info(
        "Extracting %s intervals from the %s total entries of the VCF file.",
        n_intervals,
        n_entries,
    )

    intersections = vcf_bed.intersect(filter, header=True)
    LOG.info("Number of variants found in the intervals:%s", intersections.count())
    return intersections
def cluster_boundary_positions(all_boundary_positions, window):
    """Group boundaries from different cell types that lie close together.

    Each boundary is extended by half of ``window`` on both sides.  Two
    boundaries are linked when their extended intervals overlap and they
    differ in both ``boundary_uid`` and ``cell_type``.  The resulting graph
    (with every boundary added as a node) is partitioned with
    ``best_partition``.

    Args:
        all_boundary_positions: DataFrame in BED column order that includes
            the columns ``boundary_uid`` and ``cell_type``.
        window: total width, in bases, used to extend each boundary.

    Returns:
        ``all_boundary_positions`` merged with a ``cluster`` column holding
        the partition label of each boundary.
    """
    half_window = int(window/2)
    extended_bounds = BedTool.from_dataframe(all_boundary_positions).slop(
        r=half_window, l=half_window, genome='hg19')

    input_columns = all_boundary_positions.columns
    pair_columns = ["b1_" + x for x in input_columns] + \
                   ["b2_" + x for x in input_columns]
    # Self-intersection: every row pairs a boundary (b1_*) with one that
    # overlaps it (b2_*).
    pairs = extended_bounds.intersect(
        extended_bounds, wa=True, wb=True, loj=True
    ).to_dataframe(names=pair_columns)

    different_boundary = pairs.b1_boundary_uid != pairs.b2_boundary_uid
    different_cell_type = pairs.b1_cell_type != pairs.b2_cell_type
    pairs = pairs[different_boundary & different_cell_type]

    graph = nx.from_pandas_edgelist(
        pairs[['b1_boundary_uid', 'b2_boundary_uid']],
        source='b1_boundary_uid',
        target='b2_boundary_uid',
        create_using=nx.Graph)
    # Boundaries without any partner must still receive a cluster label.
    graph.add_nodes_from(all_boundary_positions.boundary_uid)

    communities = pd.Series(best_partition(graph)).to_frame(name='cluster')
    communities = communities.reset_index().rename(
        columns={'index': 'boundary_uid'})
    return all_boundary_positions.merge(communities)
def get_epi_features(conserved_tads, gr_peaks, epigenetics, id_name='triangle_uid'):
    """Annotate TADs with GR peak counts and epigenetic mark summaries.

    Args:
        conserved_tads: DataFrame in BED column order that includes
            ``id_name`` as well as ``triangle_type``, ``side`` and ``length``.
        gr_peaks: DataFrame in BED column order with a ``peak_id`` column.
        epigenetics: DataFrame holding the ``coords`` columns; only those
            columns are used, to count overlapping regions (``all_epi``).
        id_name: name of the TAD identifier column.

    Returns:
        ``conserved_tads`` merged with ``all_epi``, the summed mark columns,
        ``epi_TA_UP``, ``epi_TA_DOWN`` and ``n_GR_peaks``.
    """
    # --- number of GR peaks overlapping each TAD ---------------------------
    tad_id_column = 'TAD_{}'.format(id_name)
    tads_bed = BedTool.from_dataframe(conserved_tads).sort()
    peaks_bed = BedTool.from_dataframe(gr_peaks).sort()
    overlap_columns = conserved_tads.columns.map(lambda x: 'TAD_' + x).tolist() + \
        gr_peaks.columns.map(lambda x: 'GR_' + x).tolist()
    overlaps = tads_bed.intersect(peaks_bed, wa=True, wb=True).to_dataframe(
        names=overlap_columns)

    peaks_per_tad = overlaps.groupby(tad_id_column)['GR_peak_id'].count()
    # TADs without any overlapping peak are absent from the groupby: add them
    # back with a count of 0.
    tads_with_gr = peaks_per_tad.reindex(conserved_tads[id_name].values)\
        .fillna(0).astype(int).to_frame('n_GR_peaks')
    tads_with_gr.index.name = id_name

    # --- number of epigenetic regions overlapping each TAD -----------------
    epi_regions_bed = BedTool.from_dataframe(
        epigenetics[coords].sort_values(coords))
    counted_columns = conserved_tads.columns.tolist() + ['all_epi']
    tads_with_count = BedTool.from_dataframe(conserved_tads)\
        .map(epi_regions_bed, c=3, o='count', null=0)\
        .to_dataframe(names=counted_columns)

    # --- summed signal of the individual marks -----------------------------
    # NOTE(review): `epigenetic_marks` is not a parameter of this function; it
    # is presumably a module-level DataFrame (or was meant to be
    # `epigenetics`) — confirm.  Its columns 7-10 are summed per TAD.
    summed_columns = tads_with_count.columns.tolist() + \
        epigenetic_marks.columns[6:].tolist()
    tads_with_epi = BedTool.from_dataframe(tads_with_count)\
        .map(BedTool.from_dataframe(epigenetic_marks),
             c=[7,8,9,10],
             o=['sum', 'sum', 'sum', 'sum'],
             null=0)\
        .to_dataframe(names=summed_columns)

    tads_with_epi = tads_with_epi.set_index(id_name)\
        .drop(coords + ['triangle_type', 'side', 'length'], axis=1)
    tads_with_epi['epi_TA_UP'] = \
        tads_with_epi.TA_UP_h3k27ac + tads_with_epi.TA_UP_h3k4me3
    tads_with_epi['epi_TA_DOWN'] = \
        tads_with_epi.TA_DOWN_h3k27ac + tads_with_epi.TA_DOWN_h3k4me3

    # --- combine -----------------------------------------------------------
    tads_with_features = conserved_tads.merge(tads_with_epi,
                                              left_on=id_name,
                                              right_index=True)
    # NOTE(review): this joins on the row index of the merged frame against
    # the `id_name` index of the peak counts — verify that the two indexes
    # are meant to line up.
    tads_with_features = tads_with_features.merge(tads_with_gr,
                                                  left_index=True,
                                                  right_index=True)
    return tads_with_features
def cluster_ctcf_sites(ctcfs, distance):
    """Label sites with a cluster id using ``bedtools cluster``.

    Args:
        ctcfs: DataFrame of sites in BED column order.
        distance: value passed to bedtools as ``d`` (the maximum distance
            between sites placed in the same cluster).

    Returns:
        DataFrame with the columns of ``ctcfs`` plus ``cluster``.
    """
    output_columns = ctcfs.columns.tolist() + ['cluster']
    clustered = BedTool.from_dataframe(ctcfs).cluster(d=distance)
    return clustered.to_dataframe(names=output_columns)
# Script excerpt: window the neighbourhood of every consensus boundary and
# annotate the windows with aggregated CTCF features.
consensus_boundaries = pd.read_csv(interim_data_path / "consensus_boundaries.tsv", sep="\t")
# Use the row number as a stable boundary identifier.
consensus_boundaries['boundary_uid'] = consensus_boundaries.index

extended = 250 * 1000  # bases added on each side of the boundary centre
window_size = 5 * 1000  # width of each window, in bases

# Collapse every boundary to its midpoint, then extend it symmetrically.
centered_boundaries = consensus_boundaries.copy()
centers = ((centered_boundaries.start + centered_boundaries.end) / 2).astype(int)
centered_boundaries['start'] = centers
centered_boundaries['end'] = centers
centered_boundaries = BedTool.from_dataframe(centered_boundaries)\
    .slop(b=extended, genome='hg19')\
    .to_dataframe(names=centered_boundaries.columns)
# Keep only regions that received the full extension on both sides
# (presumably the others were truncated at a chromosome end — confirm).
centered_boundaries = centered_boundaries[
    centered_boundaries.end - centered_boundaries.start == extended * 2]

windows = windowing_by_size(centered_boundaries[coords + ['boundary_uid']], window_size=window_size)
windows_with_ctcf = coverage_by_window(windows, ctcfs, aggregations)
# Re-attach the per-boundary annotations (everything except the coordinates).
windows_with_ctcf = windows_with_ctcf.merge(consensus_boundaries.drop(coords, axis=1), on='boundary_uid')

aggregations_by_bound = {}
# NOTE(review): the loop body may continue beyond this excerpt.
for nc in sorted(windows_with_ctcf.n_cell_types.unique()):
    # Blank the progress line.
    print(" " * 100, end='\r')
# Script excerpt: annotate the two anchors of every loop with the CTCF sites
# they overlap.
# Prefix chromosome names with 'chr'.
hiccups['chr1'] = 'chr' + hiccups.chr1.astype(str)
hiccups['chr2'] = 'chr' + hiccups.chr2.astype(str)
# Use the row number as loop identifier.
hiccups['loop_id'] = hiccups.index

# Unique, sorted source anchors (chr1, x1, x2).
source_anchors = hiccups[['chr1', 'x1', 'x2']].copy()
source_anchors = source_anchors.drop_duplicates().sort_values(
    ['chr1', 'x1', 'x2']).reset_index(drop=True)
print("\tSource anchors: {}".format(source_anchors.shape[0]))

# Unique, sorted target anchors (chr2, y1, y2).
target_anchors = hiccups[['chr2', 'y1', 'y2']].copy()
target_anchors = target_anchors.drop_duplicates().sort_values(
    ['chr2', 'y1', 'y2']).reset_index(drop=True)
print("\tTarget anchors: {}".format(target_anchors.shape[0]))

# One output row per (source anchor, overlapping CTCF site) pair.
source_names = source_anchors.columns.tolist() + ctcf_scores.columns.tolist()
source_anchors_to_ctcfs = BedTool.from_dataframe(source_anchors)\
    .intersect(BedTool.from_dataframe(ctcf_scores), wa=True, wb=True)\
    .to_dataframe(names=source_names)
source_anchors_to_ctcfs = source_anchors_to_ctcfs[[
    'chr1', 'x1', 'x2', 'ctcf_id', 'orientation'
]]
# Sites on the source anchor are labelled as the "left" side.
source_anchors_to_ctcfs.columns = [
    'chr1', 'x1', 'x2', 'left_id', 'left_orientation'
]

# Same procedure for the target anchors.
target_names = target_anchors.columns.tolist() + ctcf_scores.columns.tolist()
target_anchors_to_ctcfs = BedTool.from_dataframe(target_anchors)\
    .intersect(BedTool.from_dataframe(ctcf_scores), wa=True, wb=True)\
    .to_dataframe(names=target_names)
target_anchors_to_ctcfs = target_anchors_to_ctcfs[[
    'chr2', 'y1', 'y2', 'ctcf_id', 'orientation'
]]
# Script excerpt: window the neighbourhood of every TAD centre and annotate
# the windows with aggregated CTCF features.
di_centers = ins_with_neigh[ins_with_neigh.point_of_interest == 'tad_center'].copy()
# Keep only the centres whose coordinates match a row of `tads`.
di_centers = di_centers.merge(tads[['chr', 'center_start', 'center_end']],
                              left_on=coords,
                              right_on=['chr', 'center_start', 'center_end'])
di_centers['di_center_uid'] = np.arange(di_centers.shape[0], dtype=int)

extended = 250 * 1000  # bases added on each side of the centre
window_size = 5 * 1000  # width of each window, in bases

# Collapse every region to its start position, then extend it symmetrically.
centered_di = di_centers.copy()
centers = centered_di.start
centered_di['start'] = centers
centered_di['end'] = centers
centered_di = BedTool.from_dataframe(centered_di).slop(b=extended, genome='hg19')\
    .to_dataframe(names=centered_di.columns)
# Keep only regions that received the full extension on both sides.
centered_di = centered_di[centered_di.end - centered_di.start == extended * 2]

windows = windowing_by_size(centered_di[coords + ['di_center_uid']], window_size=window_size)
windows_with_ctcf = coverage_by_window(windows, ctcfs, aggregations)
# Re-attach the per-centre annotations (everything except the coordinates).
windows_with_ctcf = windows_with_ctcf.merge(di_centers.drop(coords, axis=1), on='di_center_uid')

aggregations_by_tad_center_tot = {}
# NOTE(review): the loop body may continue beyond this excerpt.
for c in aggregations.keys():
    # Overwrite the progress line in place.
    print(" " * 100, end='\r')
    print("\t{}".format(c), end="\r")
    # Centres as rows, window numbers (ascending) as columns.
    cagg = windows_with_ctcf.pivot_table(index='di_center_uid', columns='w_num', values=c)\
        .sort_index(axis=1)
# Script excerpt: compare consensus boundaries with GM12878 boundaries.
# NOTE(review): this accumulation presumably belongs to a loop over
# chromosomes whose header lies outside this excerpt — confirm its nesting.
hg19_total_length += chromsizes[chrom]
# Fraction of the total length covered by each consensus boundary.
consensus_bounds[
    'covered_genome'] = consensus_bounds.length / hg19_total_length
plot_consensus_boundary_properties(consensus_bounds)

print("Conserved boundaries VS GM12878 boundaries")
gm12878_bounds = pd.read_csv(processed_data_path / "GM12878_25kb_1Mb_boundary_strength.bed", sep="\t")
gm12878_bounds.columns = coords + ['bound_strenght', 'cluster_id']
gm12878_bounds = gm12878_bounds.sort_values(coords).reset_index(drop=True)
print("GM12878 bounds:", gm12878_bounds.shape[0])

half_window = 25000  # bases added on each side of the boundary centre
consensus_bounds_fixed = get_region_center(consensus_bounds)
consensus_bounds_fixed = BedTool.from_dataframe(consensus_bounds_fixed)\
    .slop(b=half_window, genome='hg19')
# Count the GM12878 boundaries overlapping each extended consensus boundary.
consensus_boundaries_fixed_gm12878 = consensus_bounds_fixed\
    .map(BedTool.from_dataframe(gm12878_bounds), c=1, o='count')\
    .to_dataframe(names=consensus_bounds.columns.tolist() + \
                  ['n_gm12878_bounds'])
plot_consensus_boundaries_intersection_with_GM12878(
    consensus_boundaries_fixed_gm12878)

print("Conserved boundaries VS GM12878 directionality index")
hic_measures = {}
# DI
ins = pd.read_csv(processed_data_path / "di_score_r25kb_w1Mb.txt", index_col=0, sep="\t")
# Script excerpt: score CTCF sites, then compare the spacing of real sites
# with that of randomly shuffled ones.
# Aggregate score: product of the ranks of the two individual scores.
ctcfs_ms_cs['rank_score_aggregate'] = ctcfs_ms_cs.ChipSeqScore.rank() * ctcfs_ms_cs.MotifScore.rank()
ctcfs_ms_cs.to_csv(interim_data_path / "ctcf_scores.tsv", sep='\t', index=False, header=True)
plot_rank_score_distribution(ctcfs_ms_cs)
plot_chipseq_score_distribution(ctcfs_ms_cs)
plot_motif_score_distribution(ctcfs_ms_cs)

print("Distance between CTCF sites")
# Drop the sites whose orientation is 'o'.
ctcfs = ctcfs[ctcfs.orientation != 'o'].reset_index(drop=True)
# Gap between the end of each site and the start of the next row
# (assumes `ctcfs` is sorted by position — TODO confirm).
distances_between_ctcfs = ctcfs.shift(-1).start - ctcfs.end
# Keep strictly positive gaps below 1e7; this presumably also removes the
# pairs that span two chromosomes — verify.
distances_between_ctcfs = distances_between_ctcfs[(distances_between_ctcfs > 0) & (distances_between_ctcfs < 1e7)]

print("Distance between Shuffled CTCF sites")
gaps = get_gaps()
# Shuffle the sites with bedtools, passing the sorted gap file as the
# exclusion set.
shuffled_ctcf_sites = BedTool.from_dataframe(ctcfs)\
    .shuffle(genome='hg19', chrom=True, noOverlapping=True,
             excl=BedTool.from_dataframe(gaps).sort().fn)\
    .sort().to_dataframe(names=ctcfs.columns.tolist())
distances_between_shuffled_ctcfs = shuffled_ctcf_sites.shift(-1).start - shuffled_ctcf_sites.end
distances_between_shuffled_ctcfs = distances_between_shuffled_ctcfs[(distances_between_shuffled_ctcfs > 0) & (distances_between_shuffled_ctcfs < 1e7)]

# Density curves of the log10 distances, real versus shuffled.
fig = plt.figure()
sns.distplot(distances_between_shuffled_ctcfs.map(np.log10), label='random', hist=False)
ax = sns.distplot(distances_between_ctcfs.map(np.log10), label='real', hist=False)
# Two-sided Mann-Whitney U test on the untransformed distances.
pvalue = mannwhitneyu(distances_between_ctcfs, distances_between_shuffled_ctcfs, alternative='two-sided').pvalue
plt.text(0.2,0.7, "p-value = {}".format(pvalue), transform=ax.transAxes)
# The x axis is in log10 units: relabel the ticks as powers of ten.
xticks, _ = plt.xticks()
plt.xticks(xticks, labels=["$10^{{{}}}$".format(int(x)) for x in xticks])
plt.xlabel("Distance between adjacent CTCF sites (bp)")
plt.ylabel("Density")
def get_consensus_tads(sel_bounds, gaps):
    """Derive TAD intervals as the genome left uncovered by boundaries.

    Args:
        sel_bounds: DataFrame of boundaries holding the ``coords`` columns.
        gaps: DataFrame of intervals, in BED column order, to subtract from
            the result.

    Returns:
        DataFrame with the ``coords`` columns: the hg19 complement of the
        boundaries, minus ``gaps``.
    """
    boundaries_bed = BedTool.from_dataframe(sel_bounds[coords])
    between_boundaries = boundaries_bed.complement(genome='hg19')
    gaps_bed = BedTool.from_dataframe(gaps)
    return between_boundaries.subtract(gaps_bed).to_dataframe(names=coords)