Esempio n. 1
0
def align_list_to_bed(*, align_list):
    """Merge alignment hits into one interval set per (assembly, sequence).

    ``assembly_id`` and ``sequence_id`` are joined with a comma into a single
    chromosome-like key so that the sort/merge step only merges intervals
    that belong to the same pair; the key is split back into two columns
    afterwards (on the first comma only).

    Args:
        align_list: DataFrame providing at least the columns ``assembly_id``,
            ``sequence_id``, ``sequence_from`` and ``sequence_to``.

    Returns:
        A BedTool whose columns are ``sequence_id``, ``start``, ``end``,
        ``assembly_id``, or an empty BedTool when the merge yields no
        intervals.
    """
    keyed_hits = align_list.assign(assembly_id_sequence_id=lambda x: x[
        'assembly_id'] + ',' + x['sequence_id'])

    # Every intermediate BedTool is file backed; remember the paths so they
    # are all deleted, whichever branch is taken and even if a step raises.
    # (Previously the merged file leaked in the empty case and the sorted
    # intermediate was never removed.)
    scratch_files = []
    try:
        temp_bed = BedTool.from_dataframe(keyed_hits[[
            'assembly_id_sequence_id', 'sequence_from', 'sequence_to'
        ]])
        scratch_files.append(temp_bed.fn)

        temp_sorted_bed = temp_bed.sort()
        scratch_files.append(temp_sorted_bed.fn)

        temp_merged_bed = temp_sorted_bed.merge()
        scratch_files.append(temp_merged_bed.fn)

        if temp_merged_bed.count() > 0:
            merged_df = temp_merged_bed.to_dataframe()
            # Undo the "assembly,sequence" key built above.
            id_columns = merged_df['chrom'].str.split(
                ',', n=1, expand=True).rename(columns={
                    0: 'assembly_id',
                    1: 'sequence_id'
                })
            merged_df = pandas.concat(
                [id_columns, merged_df[['start', 'end']]], axis=1)
            bed = BedTool.from_dataframe(
                merged_df[['sequence_id', 'start', 'end', 'assembly_id']])
        else:
            bed = BedTool('', from_string=True)
    finally:
        for scratch_path in scratch_files:
            if os.path.exists(scratch_path):
                os.remove(scratch_path)

    return bed
def aggregate_by_tad(all_TADs_by_celltype,
                     aggregations,
                     other,
                     extension=0.1,
                     n_windows=100,
                     genome="hg19"):
    """Aggregate features over a fixed number of windows tiled on each TAD.

    Each region is extended on both sides by ``extension`` (passed to
    ``slop`` with ``pct=True``), split into a fixed number of windows, and
    the columns of ``other`` named in ``aggregations`` are summarised per
    window through ``coverage_by_window``.

    Args:
        all_TADs_by_celltype: DataFrame holding the ``coords`` columns
            (module-level name defined outside this function) and a string
            ``tad_uid`` column.
        aggregations: mapping ``column name in other -> bedtools map
            operation``.
        other: DataFrame of features forwarded to ``coverage_by_window``.
        extension: flank size handed to ``slop`` as ``l``/``r`` with
            ``pct=True``.
        n_windows: number of windows covering the un-extended region.
        genome: genome name forwarded to ``slop``; defaults to the
            previously hard-coded ``"hg19"``.

    Returns:
        Tuple ``(aggregations_by_tad, tad_start_window, tad_end_window)``:
        a dict with one pivot table (rows ``tad_uid``, columns ``win_num``)
        per aggregated column, plus the window offsets
        ``int(n_windows * extension)`` and ``n_windows + int(n_windows *
        extension)``. Note that underscores in ``tad_uid`` are replaced by
        dashes in the result index.
    """
    flank_windows = int(n_windows * extension)
    tot_windows = n_windows + flank_windows * 2
    tad_start_window = flank_windows
    tad_end_window = n_windows + flank_windows

    regions = all_TADs_by_celltype[coords + ['tad_uid']].copy()
    # window_maker(i='srcwinnum') emits "<name>_<number>"; removing "_" from
    # the ids keeps the split below at exactly two fields.
    regions['tad_uid'] = regions.tad_uid.map(lambda x: x.replace("_", "-"))

    extended_regions = BedTool.from_dataframe(regions)\
                              .slop(l=extension, r=extension,
                                    pct=True, genome=genome)
    windows = BedTool().window_maker(b=extended_regions,
                                     n=tot_windows, i='srcwinnum')\
                       .to_dataframe(names=coords + ['window_uid'])

    windows_idxs = windows.window_uid.str.split("_", expand=True)
    windows_idxs.columns = ['tad_uid', 'win_num']
    windows = pd.concat((windows, windows_idxs), axis=1)
    windows['win_num'] = windows['win_num'].astype(int)
    windows = windows.sort_values(coords).reset_index(drop=True)

    windows_with_ctcfs = coverage_by_window(windows, other, aggregations)
    aggregations_by_tad = {}
    for c in aggregations:
        # Progress indicator rewritten in place on a single terminal line.
        print(" " * 100, end='\r')
        print(c, end="\r")
        aggregations_by_tad[c] = windows_with_ctcfs.pivot_table(
            index='tad_uid', columns='win_num',
            values=c).sort_index(axis=1)
    return aggregations_by_tad, tad_start_window, tad_end_window
Esempio n. 3
0
def map_signal_on_sites(ctcf_sites, sample, name, slop=0, genome='hg19'):
    """Attach the maximum overlapping signal of ``sample`` to every site.

    Args:
        ctcf_sites: DataFrame of sites convertible to a BedTool.
            NOTE(review): only ``sample`` is sorted here; this presumably
            expects ``ctcf_sites`` to arrive already sorted - confirm
            against callers.
        sample: DataFrame of signal intervals; its 4th column is the value
            that is aggregated (``c=4``).
        name: name of the output column holding the aggregated value.
        slop: when greater than 0, each site is extended by this amount on
            both sides before mapping.
        genome: genome name forwarded to ``slop``; defaults to the
            previously hard-coded ``'hg19'``.

    Returns:
        DataFrame with the columns of ``ctcf_sites`` plus ``name``; entries
        reported as ``'.'`` by the map step are replaced with 0, everything
        else is converted with ``float``.
    """
    sites_bed = BedTool.from_dataframe(ctcf_sites)
    if slop > 0:
        sites_bed = sites_bed.slop(b=slop, genome=genome)

    signal_bed = BedTool.from_dataframe(sample).sort()
    output_columns = ctcf_sites.columns.tolist() + [name]
    ctcf_on_sample = sites_bed.map(signal_bed, c=4, o='max')\
                              .to_dataframe(names=output_columns)

    # '.' is what comes back for sites without a value; store 0 instead.
    ctcf_on_sample[name] = ctcf_on_sample[name].map(
        lambda y: float(y) if y != '.' else 0)
    return ctcf_on_sample
Esempio n. 4
0
def windowing_by_size(centered_boundaries, window_size):
    """Tile every region with windows of ``window_size``.

    The last column of ``centered_boundaries`` is used as the region
    identifier. ``window_maker`` (``i='srcwinnum'``) returns it as
    ``<id>_<number>``; the trailing number minus one is stored in a new
    ``w_num`` column and the identifier column gets the remaining part
    (re-joined with ``_`` so ids containing underscores survive, as
    strings).

    Returns the windows sorted by the module-level ``coords`` columns with
    a fresh index.
    """
    column_names = centered_boundaries.columns.tolist()
    id_column = centered_boundaries.columns[-1]

    source_bed = BedTool.from_dataframe(centered_boundaries)
    windows = BedTool().window_maker(b=source_bed, w=window_size,
                                     i='srcwinnum')
    windows = windows.to_dataframe(names=column_names)

    # Everything before the last "_" is the id, the last field the number.
    pieces = windows[id_column].str.split("_", expand=True)
    region_ids = pieces.iloc[:, :-1].apply(
        lambda fields: "_".join(fields), axis=1)
    window_numbers = pieces.iloc[:, -1].astype(int) - 1

    windows[id_column] = region_ids
    windows['w_num'] = window_numbers
    return windows.sort_values(coords).reset_index(drop=True)
Esempio n. 5
0
def windowing_by_number(all_TADs_by_celltype, n_windows):
    """Split every region into ``n_windows`` windows.

    The last column of ``all_TADs_by_celltype`` is used as the region
    identifier. ``window_maker`` (``i='srcwinnum'``) returns it as
    ``<id>_<number>``; the trailing number minus one is stored in a new
    ``w_num`` column and the identifier column gets the remaining part
    (re-joined with ``_`` so ids containing underscores survive, as
    strings).

    Returns the windows sorted by the module-level ``coords`` columns with
    a fresh index.
    """
    column_names = all_TADs_by_celltype.columns.tolist()
    id_column = all_TADs_by_celltype.columns[-1]

    source_bed = BedTool.from_dataframe(all_TADs_by_celltype)
    windows = BedTool().window_maker(b=source_bed, n=n_windows,
                                     i='srcwinnum')
    windows = windows.to_dataframe(names=column_names)

    # Everything before the last "_" is the id, the last field the number.
    pieces = windows[id_column].str.split("_", expand=True)
    region_ids = pieces.iloc[:, :-1].apply(
        lambda fields: "_".join(fields), axis=1)
    window_numbers = pieces.iloc[:, -1].astype(int) - 1

    windows[id_column] = region_ids
    windows['w_num'] = window_numbers
    return windows.sort_values(coords).reset_index(drop=True)
Esempio n. 6
0
def coverage_by_window(windows, ctcfs, agg, null=0):
    """Summarise columns of ``ctcfs`` over each window with ``BedTool.map``.

    Args:
        windows: DataFrame of windows (converted to a BedTool as is).
        ctcfs: DataFrame of features (converted to a BedTool as is).
        agg: mapping ``column name in ctcfs -> map operation``; iteration
            order defines the order of the appended output columns.
        null: value forwarded to ``map`` as ``null``.

    Returns:
        DataFrame with the columns of ``windows`` followed by one column
        per key of ``agg``.

    Raises:
        ValueError: if a key of ``agg`` is not a column of ``ctcfs``.
    """
    feature_columns = ctcfs.columns.tolist()
    requested = list(agg.items())

    column_names = [column for column, _ in requested]
    operations = [operation for _, operation in requested]
    # map addresses columns by 1-based position, hence the + 1.
    column_numbers = [feature_columns.index(column) + 1
                      for column in column_names]

    mapped = BedTool.from_dataframe(windows).map(
        BedTool.from_dataframe(ctcfs),
        c=column_numbers, o=operations, null=null)
    return mapped.to_dataframe(names=windows.columns.tolist() + column_names)
Esempio n. 7
0
def merge_intervals(panels):
    """Build the interval set used to filter VCF files from panel files.

    Accepts:
        panels(list) : path to one or more panel bed files

    Returns:
        merged_panels(BedTool): the first panel on its own when only one
            path is given, otherwise the result of ``cat`` over all panels

    NOTE(review): with a single panel the file is returned untouched, i.e.
    no merging is applied to it - confirm this is intended.
    """
    first_panel = BedTool(panels[0])
    if len(panels) <= 1:
        return first_panel
    return first_panel.cat(*panels[1:])
Esempio n. 8
0
def enrichment(triangles_in, ctcfs, aggregations,
               centers='start',
               extended=1000*1000, window_size=10*1000,
               value_function = lambda x: x.forward if x.forward > x.reverse \
                                                         else -x.reverse,
               genome='hg19'):
    """Profile a per-window value around a chosen anchor of each triangle.

    Every row of ``triangles_in`` is collapsed to the position stored in
    column ``centers``, extended by ``extended`` on both sides, tiled with
    windows of ``window_size`` and annotated through
    ``coverage_by_window``. ``value_function`` then reduces each annotated
    window row to a single number.

    Args:
        triangles_in: DataFrame with the module-level ``coords`` columns
            and a ``triangle_uid`` column; it is not modified.
        ctcfs: DataFrame of features forwarded to ``coverage_by_window``.
        aggregations: mapping ``column -> operation`` forwarded to
            ``coverage_by_window``.
        centers: name of the column providing the anchor position.
        extended: extension applied on each side of the anchor.
        window_size: size of the windows tiling the extended region.
        value_function: callable applied row-wise to the annotated
            windows; the default reads the ``forward`` and ``reverse``
            columns, so ``aggregations`` must produce them when it is used.
        genome: genome name forwarded to ``slop``; defaults to the
            previously hard-coded ``'hg19'``.

    Returns:
        Pivot table with one row per ``triangle_uid`` and one column per
        ``w_num``.
    """
    centered_triangles = triangles_in.copy()
    # Read the anchor from the untouched input so that overwriting
    # 'start' below cannot affect the values written to 'end'.
    center_positions = triangles_in[centers]
    centered_triangles['start'] = center_positions
    centered_triangles['end'] = center_positions

    centered_triangles = BedTool.from_dataframe(centered_triangles)\
                                .slop(b=extended, genome=genome)\
                                .to_dataframe(names=centered_triangles.columns)

    # Keep only regions whose extended span is exactly 2 * extended
    # (presumably dropping the ones shortened at chromosome ends).
    full_span = (centered_triangles.end
                 - centered_triangles.start) == extended * 2
    centered_triangles = centered_triangles[full_span]

    windows = windowing_by_size(centered_triangles[coords + ['triangle_uid']],
                                window_size=window_size)

    windows_with_ctcf = coverage_by_window(windows.sort_values(coords),
                                           ctcfs.sort_values(coords),
                                           aggregations)
    # Bring back the non-coordinate triangle columns for value_function.
    windows_with_ctcf = windows_with_ctcf.merge(
        centered_triangles.drop(coords, axis=1), on='triangle_uid')
    windows_with_ctcf['value'] = windows_with_ctcf.apply(value_function,
                                                         axis=1)
    triangles_vs_ctcfs = windows_with_ctcf.pivot_table(index='triangle_uid',
                                                       columns='w_num',
                                                       values='value')
    return triangles_vs_ctcfs
Esempio n. 9
0
def genes_to_bedtool(gene_collection, hgnc_ids=None, ensembl_ids=None, build="GRCh37"):
    """Create a Bedtool object with gene coordinates from a list of genes contained in the database

    Accepts:
        gene_collection(pymongo.collection.Collection)
        hgnc_ids(list): a list of hgnc genes ids
        ensembl_ids(list): a list of ensembl gene ids, only used when
            hgnc_ids is empty or None
        build(str): genome build, GRCh37 or GRCh38

    Returns:
        bt(pybedtools.bedtool.BedTool): a BedTool object containing gene intervals,
            or None when no id was provided or no gene matched the query
    """
    if not (hgnc_ids or ensembl_ids):
        return None  # No gene was specified to filter VCF file with

    query = {"build": build}
    # Either HGNC or ENSEMBL IDs end up in the query, never both;
    # HGNC ids take precedence when both are given.
    if hgnc_ids:
        query["hgnc_id"] = {"$in": hgnc_ids}
    else:
        query["ensembl_id"] = {"$in": ensembl_ids}

    # One tab-separated "chromosome start end" line per gene returned by the database
    interval_lines = [
        "\t".join([gene["chromosome"], str(gene["start"]), str(gene["end"])])
        for gene in gene_collection.find(query)
    ]
    if not interval_lines:
        return None

    # Every line, including the last one, is newline terminated
    return BedTool("\n".join(interval_lines) + "\n", from_string=True)
Esempio n. 10
0
def shift_test(cons_TADs_pos, int_on_TADs, shifts):
    """Count overlapping intervals at positions shifted by several offsets.

    For every value in ``shifts`` each row of ``cons_TADs_pos`` is turned
    into a zero-length interval located at ``start + shift`` (never below
    0), and the intervals of ``int_on_TADs`` overlapping it are counted
    with ``BedTool.map`` (``o='count'``, ``null=0``, ``f=1``).

    Args:
        cons_TADs_pos: DataFrame of positions; needs a numeric ``start``
            column. It is not modified.
        int_on_TADs: DataFrame of intervals to count.
        shifts: iterable of offsets added to ``start``.

    Returns:
        One DataFrame stacking the results of all shifts, with the input
        columns plus ``id`` (row number), ``count`` and ``shift``.

    Raises:
        ValueError: from ``pd.concat`` when ``shifts`` is empty.
    """
    n_cross = cons_TADs_pos.reset_index(drop=True).copy()
    n_cross['id'] = n_cross.index

    # The interval set is identical for every shift: convert it only once.
    intervals_bed = BedTool.from_dataframe(int_on_TADs)

    res = []
    for shift in shifts:
        # Progress indicator rewritten in place on a single terminal line.
        print(" "*100, end='\r')
        print("\t{}".format(shift), end='\r')

        shifted_bounds = n_cross.copy()
        # Vectorised max(0, start + shift) for all rows at once.
        pos = (n_cross['start'] + shift).clip(lower=0)
        shifted_bounds['start'] = pos
        shifted_bounds['end'] = pos

        gh_with_bounds = BedTool.from_dataframe(shifted_bounds)\
                                .map(intervals_bed, c=1, o='count', null=0, f=1)\
                                .to_dataframe(names=shifted_bounds.columns.tolist() + ['count'])
        gh_with_bounds['shift'] = shift
        res.append(gh_with_bounds)
    return pd.concat(res, axis=0, ignore_index=True)
Esempio n. 11
0
def _compute_intersections(vcf_file, filter):
    """Keep the VCF entries that intersect the provided intervals

    Accepts:
        vcf_file(str): path to the VCF file
        filter: object providing the intervals; it must offer ``count()``
            and be accepted by ``BedTool.intersect`` (presumably a BedTool)

    Returns:
        intersections(BedTool): result of intersecting the VCF with
            ``filter``, requested with ``header=True``
    """
    variants = BedTool(vcf_file)
    LOG.info(
        "Extracting %s intervals from the %s total entries of the VCF file.",
        filter.count(),
        variants.count(),
    )

    kept_variants = variants.intersect(filter, header=True)
    LOG.info("Number of variants found in the intervals:%s", kept_variants.count())
    return kept_variants
Esempio n. 12
0
def cluster_boundary_positions(all_boundary_positions, window, genome='hg19'):
    """Group nearby boundaries coming from different cell types.

    Every boundary is extended by ``int(window / 2)`` on each side and
    intersected with all the others. Pairs of distinct boundaries with
    different ``cell_type`` become the edges of an undirected graph whose
    nodes are then partitioned with ``best_partition``.

    Args:
        all_boundary_positions: DataFrame convertible to a BedTool that
            has ``boundary_uid`` and ``cell_type`` columns.
        window: total width of the neighbourhood; half of it is added on
            each side.
        genome: genome name forwarded to ``slop``; defaults to the
            previously hard-coded ``'hg19'``.

    Returns:
        ``all_boundary_positions`` merged with a ``cluster`` column (the
        partition id assigned to each ``boundary_uid``).
    """
    half_window = int(window / 2)
    all_boundary_extended = BedTool.from_dataframe(all_boundary_positions)\
                                   .slop(r=half_window, l=half_window,
                                         genome=genome)

    # Self-intersection: each output row pairs boundary b1 with an
    # overlapping boundary b2.
    pair_columns = ["b1_" + x for x in all_boundary_positions.columns] + \
                   ["b2_" + x for x in all_boundary_positions.columns]
    bound_pos_VS_bound_pos = all_boundary_extended.intersect(
        all_boundary_extended, wa=True, wb=True, loj=True)\
        .to_dataframe(names=pair_columns)

    # Drop self matches and pairs belonging to the same cell type.
    bound_pos_VS_bound_pos = bound_pos_VS_bound_pos[
        (bound_pos_VS_bound_pos.b1_boundary_uid !=
         bound_pos_VS_bound_pos.b2_boundary_uid)
        & (bound_pos_VS_bound_pos.b1_cell_type !=
           bound_pos_VS_bound_pos.b2_cell_type)]

    bound_pos_G = nx.from_pandas_edgelist(
        bound_pos_VS_bound_pos[['b1_boundary_uid', 'b2_boundary_uid']],
        source='b1_boundary_uid',
        target='b2_boundary_uid',
        create_using=nx.Graph)
    # Boundaries without any partner still need a node to get a cluster.
    bound_pos_G.add_nodes_from(all_boundary_positions.boundary_uid)

    bound_communities = best_partition(bound_pos_G)
    bound_communities = pd.Series(bound_communities).to_frame(name='cluster')\
                          .reset_index().rename(columns={'index': 'boundary_uid'})
    return all_boundary_positions.merge(bound_communities)
Esempio n. 13
0
def get_epi_features(conserved_tads,
                     gr_peaks,
                     epigenetics,
                     id_name='triangle_uid'):
    """Annotate each TAD with GR peak counts and epigenetic mark sums.

    Args:
        conserved_tads: DataFrame of TADs; besides the module-level
            ``coords`` columns and ``id_name`` it must contain
            ``triangle_type``, ``side`` and ``length`` (they are dropped
            from the intermediate table).
        gr_peaks: DataFrame of peaks with a ``peak_id`` column.
        epigenetics: DataFrame whose ``coords`` columns are counted per
            TAD (``all_epi``).
        id_name: name of the TAD identifier column.

    Returns:
        ``conserved_tads`` joined with ``all_epi``, the summed mark
        columns, ``epi_TA_UP``, ``epi_TA_DOWN`` and ``n_GR_peaks``.

    NOTE(review): the per-mark sums are read from a module-level
    ``epigenetic_marks`` table (columns 7-10), not from the ``epigenetics``
    argument - confirm this is intended.
    NOTE(review): the final join with the GR counts matches the row index
    of the TAD table against ``id_name`` values - verify they coincide.
    """
    # --- number of GR peaks overlapping each TAD -------------------------
    tad_bed = BedTool.from_dataframe(conserved_tads).sort()
    gr_bed = BedTool.from_dataframe(gr_peaks).sort()
    overlap_columns = ['TAD_' + column for column in conserved_tads.columns] + \
                      ['GR_' + column for column in gr_peaks.columns]
    overlaps = tad_bed.intersect(gr_bed, wa=True, wb=True)\
                      .to_dataframe(names=overlap_columns)

    peaks_per_tad = overlaps.groupby('TAD_{}'.format(id_name))['GR_peak_id'].count()
    # TADs without any peak are absent from the groupby: add them as 0.
    peaks_per_tad = peaks_per_tad.reindex(conserved_tads[id_name].values)
    tads_with_gr = peaks_per_tad.fillna(0).astype(int).to_frame('n_GR_peaks')
    tads_with_gr.index.name = id_name

    # --- epigenetic features per TAD --------------------------------------
    all_epi_bed = BedTool.from_dataframe(epigenetics[coords].sort_values(coords))
    tads_with_epi = BedTool.from_dataframe(conserved_tads).map(
        all_epi_bed, c=3, o='count', null=0)
    tads_with_epi = tads_with_epi.to_dataframe(
        names=conserved_tads.columns.tolist() + ['all_epi'])

    annotated_bed = BedTool.from_dataframe(tads_with_epi)
    marks_bed = BedTool.from_dataframe(epigenetic_marks)
    mark_columns = tads_with_epi.columns.tolist() + \
        epigenetic_marks.columns[6:].tolist()
    tads_with_epi = annotated_bed.map(
        marks_bed, c=[7, 8, 9, 10], o=['sum', 'sum', 'sum', 'sum'], null=0)
    tads_with_epi = tads_with_epi.to_dataframe(names=mark_columns)

    tads_with_epi = tads_with_epi.set_index(id_name)
    tads_with_epi = tads_with_epi.drop(
        coords + ['triangle_type', 'side', 'length'], axis=1)
    tads_with_epi['epi_TA_UP'] = (tads_with_epi['TA_UP_h3k27ac']
                                  + tads_with_epi['TA_UP_h3k4me3'])
    tads_with_epi['epi_TA_DOWN'] = (tads_with_epi['TA_DOWN_h3k27ac']
                                    + tads_with_epi['TA_DOWN_h3k4me3'])

    # --- put everything together ------------------------------------------
    tads_with_features = conserved_tads.merge(tads_with_epi,
                                              left_on=id_name,
                                              right_index=True)
    return tads_with_features.merge(tads_with_gr,
                                    left_index=True,
                                    right_index=True)
Esempio n. 14
0
def cluster_ctcf_sites(ctcfs, distance):
    """Run ``BedTool.cluster`` with ``d=distance`` on ``ctcfs``.

    Returns a DataFrame with the columns of ``ctcfs`` plus ``cluster``.
    """
    output_columns = ctcfs.columns.tolist() + ['cluster']
    clustered_sites = BedTool.from_dataframe(ctcfs).cluster(d=distance)
    return clustered_sites.to_dataframe(names=output_columns)
# Script fragment: build fixed-size windows around the midpoint of every
# consensus boundary and annotate them via coverage_by_window.
# Relies on names defined outside this fragment: interim_data_path, coords,
# ctcfs, aggregations.
consensus_boundaries = pd.read_csv(interim_data_path /
                                   "consensus_boundaries.tsv",
                                   sep="\t")
# Use the row number as the boundary identifier.
consensus_boundaries['boundary_uid'] = consensus_boundaries.index

# Half-width of the region around each midpoint, and window size.
extended = 250 * 1000
window_size = 5 * 1000

# Collapse every boundary to its (integer) midpoint.
centered_boundaries = consensus_boundaries.copy()
centers = ((centered_boundaries.start + centered_boundaries.end) /
           2).astype(int)
centered_boundaries['start'] = centers
centered_boundaries['end'] = centers

# Extend the midpoint by `extended` on both sides.
centered_boundaries = BedTool.from_dataframe(centered_boundaries)\
                             .slop(b=extended, genome='hg19')\
                             .to_dataframe(names=centered_boundaries.columns)

# Keep only regions whose span is exactly 2 * extended (presumably dropping
# the ones shortened at chromosome ends).
centered_boundaries = centered_boundaries[
    centered_boundaries.end - centered_boundaries.start == extended * 2]
windows = windowing_by_size(centered_boundaries[coords + ['boundary_uid']],
                            window_size=window_size)

windows_with_ctcf = coverage_by_window(windows, ctcfs, aggregations)
# Re-attach the non-coordinate boundary columns to every window.
windows_with_ctcf = windows_with_ctcf.merge(consensus_boundaries.drop(coords,
                                                                      axis=1),
                                            on='boundary_uid')

aggregations_by_bound = {}
# NOTE(review): in this excerpt the loop body only clears the progress line;
# the rest of the loop appears to be cut off.
for nc in sorted(windows_with_ctcf.n_cell_types.unique()):
    print(" " * 100, end='\r')
# Script fragment: extract the unique source/target anchors of the loops in
# `hiccups` and intersect them with `ctcf_scores`.
# Relies on names defined outside this fragment: hiccups, ctcf_scores.

# Prefix chromosome names with 'chr'.
hiccups['chr1'] = 'chr' + hiccups.chr1.astype(str)
hiccups['chr2'] = 'chr' + hiccups.chr2.astype(str)

# Use the row number as the loop identifier.
hiccups['loop_id'] = hiccups.index
# Unique, sorted source anchors (chr1, x1, x2).
source_anchors = hiccups[['chr1', 'x1', 'x2']].copy()
source_anchors = source_anchors.drop_duplicates().sort_values(
    ['chr1', 'x1', 'x2']).reset_index(drop=True)
print("\tSource anchors: {}".format(source_anchors.shape[0]))
# Unique, sorted target anchors (chr2, y1, y2).
target_anchors = hiccups[['chr2', 'y1', 'y2']].copy()
target_anchors = target_anchors.drop_duplicates().sort_values(
    ['chr2', 'y1', 'y2']).reset_index(drop=True)
print("\tTarget anchors: {}".format(target_anchors.shape[0]))

# Pair every source anchor with the ctcf_scores rows it overlaps
# (wa/wb: both records are written, anchor columns first).
source_names = source_anchors.columns.tolist() + ctcf_scores.columns.tolist()
source_anchors_to_ctcfs = BedTool.from_dataframe(source_anchors)\
                                    .intersect(BedTool.from_dataframe(ctcf_scores), wa=True, wb=True)\
                                    .to_dataframe(names=source_names)
# Keep the anchor coordinates plus the id and orientation of the site,
# renamed with a 'left_' prefix.
source_anchors_to_ctcfs = source_anchors_to_ctcfs[[
    'chr1', 'x1', 'x2', 'ctcf_id', 'orientation'
]]
source_anchors_to_ctcfs.columns = [
    'chr1', 'x1', 'x2', 'left_id', 'left_orientation'
]

# Same intersection for the target anchors.
target_names = target_anchors.columns.tolist() + ctcf_scores.columns.tolist()
target_anchors_to_ctcfs = BedTool.from_dataframe(target_anchors)\
                                    .intersect(BedTool.from_dataframe(ctcf_scores), wa=True, wb=True)\
                                    .to_dataframe(names=target_names)
target_anchors_to_ctcfs = target_anchors_to_ctcfs[[
    'chr2', 'y1', 'y2', 'ctcf_id', 'orientation'
]]
Esempio n. 17
0
# Script fragment: build fixed-size windows around the rows of
# `ins_with_neigh` flagged as 'tad_center' and annotate them via
# coverage_by_window.
# Relies on names defined outside this fragment: ins_with_neigh, tads,
# coords, ctcfs, aggregations, np.
di_centers = ins_with_neigh[ins_with_neigh.point_of_interest ==
                            'tad_center'].copy()
# Inner join: keeps the rows whose coordinates match a TAD center in `tads`.
di_centers = di_centers.merge(tads[['chr', 'center_start', 'center_end']],
                              left_on=coords,
                              right_on=['chr', 'center_start', 'center_end'])
# Sequential identifier for every retained center.
di_centers['di_center_uid'] = np.arange(di_centers.shape[0], dtype=int)

# Half-width of the region around each center, and window size.
extended = 250 * 1000
window_size = 5 * 1000

# Collapse every region to its start position.
centered_di = di_centers.copy()
centers = centered_di.start
centered_di['start'] = centers
centered_di['end'] = centers

# Extend by `extended` on both sides, then keep only regions whose span is
# exactly 2 * extended (presumably dropping the ones shortened at chromosome
# ends).
centered_di = BedTool.from_dataframe(centered_di).slop(b=extended, genome='hg19')\
                    .to_dataframe(names=centered_di.columns)
centered_di = centered_di[centered_di.end - centered_di.start == extended * 2]
windows = windowing_by_size(centered_di[coords + ['di_center_uid']],
                            window_size=window_size)

windows_with_ctcf = coverage_by_window(windows, ctcfs, aggregations)
# Re-attach the non-coordinate columns of each center to its windows.
windows_with_ctcf = windows_with_ctcf.merge(di_centers.drop(coords, axis=1),
                                            on='di_center_uid')

aggregations_by_tad_center_tot = {}
# One pivot table (rows: di_center_uid, columns: w_num) per aggregated column.
# NOTE(review): in this excerpt `cagg` is never stored; the rest of the loop
# appears to be cut off.
for c in aggregations.keys():
    print(" " * 100, end='\r')
    print("\t{}".format(c), end="\r")
    cagg = windows_with_ctcf.pivot_table(index='di_center_uid',
                                         columns='w_num', values=c)\
                            .sort_index(axis=1)
Esempio n. 18
0
    hg19_total_length += chromsizes[chrom]
# Script fragment: compare consensus boundaries with GM12878 boundaries and
# load the directionality index table.
# Relies on names defined outside this fragment: consensus_bounds,
# hg19_total_length, processed_data_path, coords, get_region_center and the
# plot_* helpers.

# Fraction of hg19_total_length covered by each boundary.
consensus_bounds[
    'covered_genome'] = consensus_bounds.length / hg19_total_length
plot_consensus_boundary_properties(consensus_bounds)

print("Conserved boundaries VS GM12878 boundaries")
gm12878_bounds = pd.read_csv(processed_data_path /
                             "GM12878_25kb_1Mb_boundary_strength.bed",
                             sep="\t")
# NOTE(review): 'bound_strenght' is misspelled; it is a runtime column name,
# so renaming it requires updating every consumer.
gm12878_bounds.columns = coords + ['bound_strenght', 'cluster_id']
gm12878_bounds = gm12878_bounds.sort_values(coords).reset_index(drop=True)
print("GM12878 bounds:", gm12878_bounds.shape[0])

# Re-center each consensus boundary and extend it by half_window per side.
half_window = 25000
consensus_bounds_fixed = get_region_center(consensus_bounds)
consensus_bounds_fixed = BedTool.from_dataframe(consensus_bounds_fixed)\
                                    .slop(b=half_window, genome='hg19')
# Count the GM12878 boundaries overlapping each extended consensus boundary.
# The output is labelled with the columns of consensus_bounds, which assumes
# get_region_center keeps the same columns - TODO confirm.
consensus_boundaries_fixed_gm12878 = consensus_bounds_fixed\
                                            .map(BedTool.from_dataframe(gm12878_bounds), c=1, o='count')\
                                            .to_dataframe(names=consensus_bounds.columns.tolist() + \
                                                                ['n_gm12878_bounds'])
plot_consensus_boundaries_intersection_with_GM12878(
    consensus_boundaries_fixed_gm12878)

print("Conserved boundaries VS GM12878 directionality index")

# Filled elsewhere (not in this excerpt).
hic_measures = {}

# DI
ins = pd.read_csv(processed_data_path / "di_score_r25kb_w1Mb.txt",
                  index_col=0,
                  sep="\t")
Esempio n. 19
0
# Script fragment: store an aggregate rank score for CTCF sites, then compare
# the distances between consecutive real sites with those of shuffled sites.
# Relies on names defined outside this fragment: ctcfs_ms_cs, ctcfs,
# interim_data_path, get_gaps, the plot_* helpers, plt, sns, np, mannwhitneyu.

# Aggregate score: product of the ranks of the two individual scores.
ctcfs_ms_cs['rank_score_aggregate'] = ctcfs_ms_cs.ChipSeqScore.rank() * ctcfs_ms_cs.MotifScore.rank()
ctcfs_ms_cs.to_csv(interim_data_path / "ctcf_scores.tsv", sep='\t', index=False, header=True)
plot_rank_score_distribution(ctcfs_ms_cs)
plot_chipseq_score_distribution(ctcfs_ms_cs)
plot_motif_score_distribution(ctcfs_ms_cs)

print("Distance between CTCF sites")
# Drop the sites whose orientation is 'o'.
ctcfs = ctcfs[ctcfs.orientation != 'o'].reset_index(drop=True)
# Gap between each site's end and the start of the next row; assumes `ctcfs`
# is already sorted by position - TODO confirm.
distances_between_ctcfs = ctcfs.shift(-1).start - ctcfs.end
# Keep positive gaps below 1e7 only (this also discards the NaN of the last row).
distances_between_ctcfs = distances_between_ctcfs[(distances_between_ctcfs > 0) & (distances_between_ctcfs < 1e7)]

print("Distance between Shuffled CTCF sites")
gaps = get_gaps()
# Shuffle the sites with chrom=True and noOverlapping=True, excluding the
# regions returned by get_gaps().
shuffled_ctcf_sites = BedTool.from_dataframe(ctcfs)\
                             .shuffle(genome='hg19',
                                      chrom=True, 
                                      noOverlapping=True,
                                      excl=BedTool.from_dataframe(gaps).sort().fn)\
                             .sort().to_dataframe(names=ctcfs.columns.tolist())
distances_between_shuffled_ctcfs = shuffled_ctcf_sites.shift(-1).start - shuffled_ctcf_sites.end
distances_between_shuffled_ctcfs = distances_between_shuffled_ctcfs[(distances_between_shuffled_ctcfs > 0) & (distances_between_shuffled_ctcfs < 1e7)]


# Density of log10(distance) for real vs shuffled sites, annotated with the
# p-value of a two-sided Mann-Whitney U test on the raw distances.
fig = plt.figure()
sns.distplot(distances_between_shuffled_ctcfs.map(np.log10), label='random', hist=False)
ax = sns.distplot(distances_between_ctcfs.map(np.log10), label='real', hist=False)
pvalue = mannwhitneyu(distances_between_ctcfs, distances_between_shuffled_ctcfs, alternative='two-sided').pvalue
plt.text(0.2,0.7, "p-value = {}".format(pvalue), transform=ax.transAxes)
# Relabel the log10 ticks as powers of ten.
xticks, _ = plt.xticks()
plt.xticks(xticks, labels=["$10^{{{}}}$".format(int(x)) for x in xticks])
plt.xlabel("Distance between adjacent CTCF sites (bp)")
plt.ylabel("Density")
Esempio n. 20
0
def get_consensus_tads(sel_bounds, gaps, genome='hg19'):
    """Derive TAD intervals as the space left between selected boundaries.

    The ``coords`` columns (module-level name defined outside this
    function) of ``sel_bounds`` are complemented over the genome and the
    intervals of ``gaps`` are subtracted from the result.

    Args:
        sel_bounds: DataFrame of boundaries containing the ``coords``
            columns.
        gaps: DataFrame of regions to subtract.
        genome: genome name forwarded to ``complement``; defaults to the
            previously hard-coded ``'hg19'``.

    Returns:
        DataFrame with the ``coords`` columns of the remaining intervals.
    """
    boundaries_bed = BedTool.from_dataframe(sel_bounds[coords])
    gaps_bed = BedTool.from_dataframe(gaps)
    consensus_tads = boundaries_bed.complement(genome=genome)\
                                   .subtract(gaps_bed)\
                                   .to_dataframe(names=coords)
    return consensus_tads