コード例 #1
0
def get_nonredundant_example_idx(ranges, width=200):
    """Get indices of examples whose central regions do not overlap.

    Args:
      ranges: pandas.DataFrame returned by basepair.cli.modisco.load_ranges;
        must contain at least 'chrom', 'start' and 'end' columns.
      width: size of the central region that should not overlap between
         any two intervals.

    Returns:
      pandas.Series of integer positions (into `ranges`) of the examples
      to keep.

    NOTE(review): mutates the caller's DataFrame by adding an
    'example_idx' column.
    """
    from pybedtools import BedTool
    from basepair.preproc import resize_interval
    # 1. resize ranges
    ranges['example_idx'] = np.arange(len(ranges))  # make sure
    r = ranges[['chrom', 'start', 'end',
                'example_idx']]  # add also the strand information
    r = resize_interval(r, width, ignore_strand=True)

    # Merge overlapping resized intervals; merged spans shorter than
    # 2*width can contain at most one non-overlapping central region.
    bt = BedTool.from_dataframe(r)
    btm = bt.sort().merge()
    df = btm.to_dataframe()
    df = df[(df.end - df.start) < width * 2]

    # Intersect originals with the kept merged spans, then keep one example
    # per merged span. 'score'/'strand'/'thickStart' are the auto-assigned
    # bedtools column names for the merged-span chrom/start/end; 'name' holds
    # the original example_idx.
    r_overlaps = bt.intersect(BedTool.from_dataframe(df),
                              wb=True).to_dataframe()
    keep_idx = r_overlaps.drop_duplicates(['score', 'strand',
                                           'thickStart'])['name'].astype(int)

    return keep_idx
コード例 #2
0
 def get_final_modules(ma=ma,
                       a='450kannotations.bed',
                       b='lola_vignette_data/activeDHS_universe.bed',
                       include_last=False,
                       min_capsule_len=2000):
     """Group CpGs into modules by intersecting annotations with universe bins.

     Args:
         ma: methylation-array object; ``ma.beta`` is a DataFrame whose
             columns are CpG ids. NOTE(review): the default ``ma=ma`` binds a
             module-level ``ma`` at definition time — confirm that is intended.
         a: path to the CpG annotation BED (450k annotations).
         b: path to the universe BED whose regions become the bins.
         include_last: if True, append one extra module containing every CpG
             not captured by any bin.
         min_capsule_len: minimum CpG count for a bin to be kept.

     Returns:
         (final_modules, modulecpgs, module_names): list of CpG-id lists,
         flat array of all module CpGs, and "chrom_start_end" labels.
     """
     allcpgs = ma.beta.columns.values
     # Normalize annotation chromosome names ("1.x" -> "chr1"), restrict to
     # CpGs present in ma.beta and reorder to chrom/start/end/name.
     df = BedTool(a).to_dataframe()
     df.iloc[:, 0] = df.iloc[:, 0].astype(str).map(
         lambda x: 'chr' + x.split('.')[0])
     df = df.set_index('name').loc[list(
         ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]]
     # Give each universe region a numeric feature id (columns 0-2 + id).
     df_bed = pd.read_table(b, header=None)
     df_bed['features'] = np.arange(df_bed.shape[0])
     df_bed = df_bed.iloc[:, [0, 1, 2, -1]]
     b = BedTool.from_dataframe(df)
     a = BedTool.from_dataframe(
         df_bed)  #('lola_vignette_data/activeDHS_universe.bed')
     # Count and collect distinct CpG names (column 8) per universe region.
     c = a.intersect(b, wa=True, wb=True).sort()
     d = c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct'))
     df2 = d.to_dataframe()
     df3 = df2.loc[df2.iloc[:, -2] > min_capsule_len]
     modules = [cpgs.split(',') for cpgs in df3.iloc[:, -1].values]
     modulecpgs = np.array(
         list(set(list(reduce(lambda x, y: x + y, modules)))))
     if include_last:
         missing_cpgs = np.setdiff1d(allcpgs, modulecpgs).tolist()
     final_modules = modules + ([missing_cpgs] if include_last else [])
     module_names = (df3.iloc[:, 0] + '_' + df3.iloc[:, 1].astype(str) +
                     '_' + df3.iloc[:, 2].astype(str)).tolist()
     return final_modules, modulecpgs, module_names
コード例 #3
0
def all_tads_present(pOriginTADs, pAccepted, pRejected):
    """Count original TADs found in the accepted plus the rejected set.

    Each file is read as a headerless tab-separated table; only the first
    three columns (chrom/start/end) are used. The accepted/rejected files
    skip a 4-line header. Returns the number of original TADs that overlap
    at least one accepted TAD plus the number that overlap at least one
    rejected TAD.
    """
    coords = [0, 1, 2]

    def _load(path, skip=None):
        # keep only the chrom/start/end columns
        return pd.read_csv(path, sep='\t', header=None, skiprows=skip)[coords]

    original_bt = BedTool.from_dataframe(_load(pOriginTADs))

    total = 0
    for path in (pAccepted, pRejected):
        other_bt = BedTool.from_dataframe(_load(path, skip=4))
        # intersect with c=True appends an overlap count as column 'name'
        hits = original_bt.intersect(other_bt, c=True).to_dataframe()
        total += (hits['name'] >= 1).sum()

    return total
コード例 #4
0
def intersect_bins(bins_1: pd.DataFrame, bins_2: pd.DataFrame,
                   **bedtools_kwargs) -> pd.DataFrame:
    '''Intersects two sets of genomic intervals using bedtools intersect.

       Formats the intersection in a clearer way than pybedtool auto names.
    
    '''
    # explicit output schema instead of pybedtools' auto-generated names
    column_names = [
        f"{field}_{side}"
        for side in ("1", "2")
        for field in ("chrom", "start", "end", "name")
    ]
    column_names.append("overlap")

    intersection = BedTool.from_dataframe(bins_1).intersect(
        BedTool.from_dataframe(bins_2), **bedtools_kwargs)

    return intersection.to_dataframe(
        disable_auto_names=True,
        header=None,
        index_col=False,
        names=column_names,
    )
コード例 #5
0
def intersect_variants(cnv, sv):
    """Intersect CNV and SV calls and label how each pair overlaps.

    Args:
        cnv: DataFrame of CNV calls in BED-like column order
            (chrom, start, end, svtype, details).
        sv: DataFrame of SV calls in the same column order.

    Returns:
        DataFrame with one row per same-SVTYPE CNV/SV overlap; the
        INTERSECTION column lists which criteria matched
        ('rec50', 'any', or 'any,rec50').

    NOTE(review): saveas() leaves 'intersection_rec50.bed' and
    'intersection_any.bed' behind in the working directory.
    """
    # intersects CNVs and SVs and returns dataframes of intersections

    # convert report dataframes to bedtools
    cnv_bed = BedTool.from_dataframe(cnv)
    sv_bed = BedTool.from_dataframe(sv)

    bed_cols = ['CNV_CHROM', 'CNV_START', 'CNV_END', 'CNV_SVTYPE', 'CNV_DETAILS',
                'SV_CHROM', 'SV_START', 'SV_END', 'SV_SVTYPE', 'SV_DETAILS']
    # intersect SVs and CNVs with 50% reciprocal overlap
    intersection_rec50 = cnv_bed.intersect(sv_bed, wa=True, wb=True, F=0.5,
                                           f=0.5).saveas('intersection_rec50.bed')
    # re-read the saved file to attach the named columns
    intersection_rec50 = pd.read_csv('intersection_rec50.bed', sep='\t', names=bed_cols)
    # make sure CNV and SV are same variant type
    intersection_rec50 = intersection_rec50[intersection_rec50['CNV_SVTYPE']
                                            == intersection_rec50['SV_SVTYPE']]
    intersection_rec50['INTERSECTION'] = ['rec50']*len(intersection_rec50)
    # intersect SVs and CNVs with any overlap
    intersection_any = cnv_bed.intersect(sv_bed, wa=True, wb=True).saveas('intersection_any.bed')
    intersection_any = pd.read_csv('intersection_any.bed', sep='\t', names=bed_cols)
    # make sure CNV and SV are same variant type
    intersection_any = intersection_any[intersection_any['CNV_SVTYPE']
                                            == intersection_any['SV_SVTYPE']]
    intersection_any['INTERSECTION'] = ['any']*len(intersection_any)

    # collapse duplicate pairs so each row lists every criterion it matched
    intersection_all = pd.concat([intersection_rec50, intersection_any], ignore_index=True)
    intersection_all = intersection_all.groupby(bed_cols)['INTERSECTION'].apply(','.join).reset_index()

    return intersection_all
コード例 #6
0
def exclusion_regions(blacklist_file, chip_seq_data):
    """
    This function takes as input a bound bed file (from multiGPS).
    The assumption is that the bed file reports the peak center
    For example: chr2   45  46
    It converts these peak centers into 501 base pair windows, and adds them to
    the exclusion list which will be used when constructing negative sets.
    It also adds the mm10 blacklisted windows to the exclusion list.
    Parameters:
        blacklist_file (str): Path to the blacklist file, or None to use
            only the peak windows.
        chip_seq_data (dataFrame): The pandas chip-seq data loaded by load_chipseq_data
    Returns:
        exclusion_windows (BedTool): A bedtools object containing all exclusion
        windows — the 501bp peak windows, concatenated with the blacklist
        regions when a blacklist file is given.
    """
    temp_chip_file = chip_seq_data.copy()  # Doesn't modify OG array.
    # widen each 1bp peak center to a 501bp window (center +/- 250)
    temp_chip_file['start'] = temp_chip_file['start'] - 250
    temp_chip_file['end'] = temp_chip_file['end'] + 250

    if blacklist_file is None:
        print('No blacklist file specified ...')
        exclusion_windows = BedTool.from_dataframe(
            temp_chip_file[['chr', 'start', 'end']])
    else:
        bound_exclusion_windows = BedTool.from_dataframe(
            temp_chip_file[['chr', 'start', 'end']])
        blacklist_exclusion_windows = BedTool(blacklist_file)
        # BedTool.cat concatenates (and by default merges) the two interval sets
        exclusion_windows = BedTool.cat(
            *[blacklist_exclusion_windows, bound_exclusion_windows])
    return exclusion_windows
コード例 #7
0
def map_ID2(motifs, geneIDtable, output_bed):
    """Rename the TF column values in a motifs.bed file to the Arabidopsis
    gene ID nomenclature using geneIDtable (for DAP-seq cistrome motifs
    only). Writes the chrom/start-sorted result to ``output_bed``.

    NOTE(review): mutates ``motifs`` in place (rewrites its ``name_rep``
    column) before merging.
    """
    # remove '_m' from end of name_rep value in motifs
    # NOTE(review): the code strips the literal substring "_m1", not "_m" as
    # the comment above says — confirm which suffix is intended.
    motifs.name_rep = motifs.name_rep.str.replace("_m1", "")
    merged = pd.merge(motifs, geneIDtable, on="name_rep")
    # print(merged.shape)
    # make bed file
    sorted_motifs = merged.sort_values(["chr", "start"])
    BedTool.from_dataframe(sorted_motifs).saveas(output_bed)
コード例 #8
0
def overlapTAD(pDataFrameTAD, pDataFrameProtein):
    """Return a boolean Series: True per TAD that overlaps >= 1 protein peak."""
    tads = BedTool.from_dataframe(pDataFrameTAD)

    log.debug('tad_bedtool_x {}'.format(tads))

    proteins = BedTool.from_dataframe(pDataFrameProtein)
    # c=True appends the overlap count, surfaced as the 'name' column
    counts = tads.intersect(proteins, c=True).to_dataframe()

    return pd.Series(counts['name'] >= 1)
コード例 #9
0
def annotate_peaks(peaks, ref_path):
    """
    peak to gene annotation strategy:
        1. if a peak overlaps with promoter region (-1kb, + 100) of any TSS, call it a promoter peak
        2. if a peak is within 200kb of the closest TSS, AND if it is not a promoter peak, call it a distal peak
        3. if a peak overlaps of a transcript, AND it is not a promoter nor a distal peak of the gene, call it a distal peak
            This step is optional
        4. call it an intergenic peak

    Args:
        peaks: BedTool of peak intervals.
        ref_path: path to a reference directory readable by ReferenceManager.

    Returns:
        list of per-peak annotations produced by get_peak_nearby_genes.
    """

    ref_mgr = ReferenceManager(ref_path)
    tss = BedTool(ref_mgr.tss_track)

    # if tss.bed contains the 7th column (gene type), then apply filter. Otherwise use all tss sites
    if tss.field_count() == 7:
        tss_filtered = tss.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
    else:
        # pad with a placeholder gene_type so downstream column counts match
        df_tss = tss.to_dataframe()
        df_tss['gene_type'] = '.'
        tss_filtered = BedTool.from_dataframe(df_tss).saveas()

    # including transcripts.bed is optional
    if ref_mgr.transcripts_track is None:
        transcripts_filtered = BedTool([])
    else:
        transcripts = BedTool(ref_mgr.transcripts_track)
        if transcripts.field_count() == 7:
            transcripts_filtered = transcripts.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
        else:
            df_tx = transcripts.to_dataframe()
            df_tx['gene_type'] = '.'
            transcripts_filtered = BedTool.from_dataframe(df_tx).saveas()

    # run bedtools closest for peaks against filtered tss, group by peaks and summarize annotations from select columns
    peaks_nearby_tss = peaks.closest(tss_filtered, D='b', g=ref_mgr.fasta_index).groupby(g=[1, 2, 3], c=[7, 11], o=['collapse']).saveas()

    results = []
    peaks_nearby_tss_butno_tx = peaks_nearby_tss.intersect(transcripts_filtered, v=True).saveas()

    # avoid error when no peaks overlap with any transcipts
    if len(peaks_nearby_tss_butno_tx) < len(peaks_nearby_tss):
        peaks_nearby_tss_and_tx = peaks_nearby_tss \
            .intersect(transcripts_filtered, wa=True, wb=True) \
            .groupby(g=[1, 2, 3, 4, 5], c=[9], o=['distinct'])

        for peak in peaks_nearby_tss_and_tx:
            results.append(get_peak_nearby_genes(peak))

    for peak in peaks_nearby_tss_butno_tx:
        results.append(get_peak_nearby_genes(peak))

    return results
コード例 #10
0
def preprocess_bam_to_bed(bam, output):
    '''
    Given a local bam file, convert reads to a set of 101bp intervals and
    output as a bed file, keeping only reads that pass the interval-size
    filter (``is_valid_interval``).

    Args:
        bam: path to the input BAM file.
        output: path the resulting BED file is moved to.

    Note: the original body mixed a space-indented docstring with
    tab-indented statements, which is a syntax error in Python 3; the
    indentation is normalized here with no behavioral change.
    '''
    # convert bam to bed
    vprint("Converting bam to bed...")
    bam = BedTool(bam)
    bed = bam.bam_to_bed()

    # filter intervals
    vprint("Filter reads by size...")
    bed_chunk_iter = bed.to_dataframe(chunksize=10000000)  # chunk large file
    chunks = []
    for chunk in bed_chunk_iter:
        # NOTE(review): desc=bam passes the BedTool object (bam was rebound
        # above), not the original path string — confirm intended.
        keep = (
            chunk[["start", "end"]]
            .swifter.progress_bar(enable=True, desc=bam)
            .apply(lambda row: is_valid_interval(row["start"], row["end"]), axis=1)
        )

        chunks.append(chunk[keep])

    bed_df = pd.concat(chunks)

    # 101bp interval for input
    vprint("Define 101bp intervals...")
    bed_df["end"] = (
        bed_df["start"].swifter.progress_bar(
            enable=True).apply(define_interval)
    )
    bed_df["name"] = "-"

    # remove duplicates
    vprint("Drop duplicate intervals...")
    bed_df.drop_duplicates(inplace=True)

    # keep only the canonical chromosomes chr1..chr22, chrX, chrY
    vprint("Remove extra chromosomes...")
    chromosomes = list(range(1, 23))
    chromosomes.append('X')
    chromosomes.append('Y')
    chromosomes = [f'chr{c}' for c in chromosomes]
    bed_df = bed_df.loc[bed_df['chrom'].isin(chromosomes)]

    # Save result
    vprint(f"Saving {bed_df.shape[0]} intervals...")
    BedTool.from_dataframe(bed_df).moveto(output)

    # cleanup tmp files
    pybedtools.cleanup(remove_all=True)

    vprint("Done.")
コード例 #11
0
ファイル: callhub.py プロジェクト: lux563624348/HiC_Hubs
def annotate_graph_with_feature_values(_input_graph, graph_name_col2bed,
                                       path_feature, feature_name,
                                       _feature_score):
    """Annotate graph vertices with overlap counts and scores from a feature BED.

    For each vertex (converted to a BED interval) this adds two vertex
    attributes: ``feature_name`` = number of overlapping feature intervals,
    and ``feature_name + '_' + _feature_score`` = max (string scores) or mean
    (numeric scores) of the overlapping features' score column.

    Args:
        _input_graph: igraph graph whose vertices carry genomic coordinates.
        graph_name_col2bed: vertex attribute used as the BED name column.
        path_feature: path to a tab-separated feature table with a header.
        feature_name: name for the new count attribute (e.g. 'Tcf1').
        _feature_score: score column name in the feature table (e.g. 'logFC').

    Returns:
        The same graph object, annotated in place.
    """
    input_graph = _input_graph
    name_col2bed = graph_name_col2bed  ## Default "name"
    Vs_Attrs_Name = feature_name  ## such as 'Tcf1'
    if (Vs_Attrs_Name not in _input_graph.vs.attributes()):
        ## Convert vs to bed format in order to annotate
        df_vs_bed = convert_vs2bed(input_graph, name_col2bed)
        ### df_vs_bed to be annotated
        Feature_vs = BedTool.from_dataframe(df_vs_bed).sort()

        PATH_Feature_A = path_feature  ##
        df_A = pd.read_csv(PATH_Feature_A, sep="\t")
        Feature_A = BedTool.from_dataframe(df_A).sort()

        ## annotate A in vs (feature must cover >= 30% of itself, F=0.3)
        Feature_vs_with_A = Feature_vs.intersect(Feature_A, wb=True, F=0.3)

        if (len(Feature_vs_with_A) > 0):
            # column names: vertex-bed columns followed by feature columns
            df_vs_with_A = pd.read_csv(Feature_vs_with_A.fn,
                                       sep="\t",
                                       names=df_vs_bed.columns.append(
                                           df_A.columns).values,
                                       header=None)
        else:
            df_vs_with_A = pd.DataFrame(
                columns=df_vs_bed.columns.append(df_A.columns))

        vs_score = _feature_score  ## 'such as logFC'
        vs_attrs_score = Vs_Attrs_Name + '_' + vs_score
        # default both attributes to 0 for vertices with no overlap
        input_graph.vs[Vs_Attrs_Name] = 0
        input_graph.vs[vs_attrs_score] = 0
        for df_vs in df_vs_with_A.groupby(
                name_col2bed):  ### Default Define vertex attribute "name"
            input_graph.vs.select(
                name=df_vs[0])[Vs_Attrs_Name] = df_vs[1].shape[0]
            ### max Tcf1 binding
            if (type(df_vs[1].loc[:, vs_score].head(1).values[0]) == str):
                input_graph.vs.select(
                    name=df_vs[0]
                )[vs_attrs_score] = df_vs[1].loc[:, vs_score].max()
            else:
                #print(df_vs[1].loc[:,vs_score])
                input_graph.vs.select(
                    name=df_vs[0]
                )[vs_attrs_score] = df_vs[1].loc[:, vs_score].mean()
        print("Annotate " + Vs_Attrs_Name + " is finished.")
    else:
        print("Feature of " + Vs_Attrs_Name + " is already annoated. Skip.")

    return input_graph
コード例 #12
0
ファイル: region_ds.py プロジェクト: jksr/ALLCools
    def get_bed(
        self,
        with_id=True,
        bedtools=False,
        slop=None,
        chrom_size_path=None,
        standardize_length=None,
    ):
        """Return this dataset's regions as a BED DataFrame or BedTool.

        Args:
            with_id: if True, add a "name" column from the region index.
            bedtools: if True, return a pybedtools.BedTool; otherwise a
                pandas.DataFrame indexed by region id.
            slop: extend each region by this many bp on both sides
                (bedtools slop); requires chrom_size_path.
            chrom_size_path: chrom sizes file for slop; falls back to
                self.chrom_size_path when None.
            standardize_length: if given, collapse each region to its center
                then slop to this total length (overrides `slop`).
        """
        if chrom_size_path is None:
            chrom_size_path = self.chrom_size_path  # will be none if not exist

        region_dim = self.region_dim

        bed_df = pd.DataFrame({
            "chrom": self.coords[f"{region_dim}_chrom"],
            "start": self.coords[f"{region_dim}_start"],
            "end": self.coords[f"{region_dim}_end"],
        })

        # standardize region length, used in motif enrichment analysis
        if standardize_length is not None:
            # standardize_length is an int number
            # collapse each region to a 1bp center interval first
            region_center = bed_df["start"] + (bed_df["end"] -
                                               bed_df["start"]) // 2
            bed_df["start"] = region_center - 1
            bed_df["end"] = region_center
            slop = (
                standardize_length // 2
            )  # use the bedtools slop to extend the center to standard length

        if with_id:
            bed_df["name"] = self.get_index(region_dim).tolist()

        bed = None
        if slop is not None and slop > 0:
            if chrom_size_path is None:
                raise ValueError(
                    "Must provide chrom_size_path when slop is not None.")
            with warnings.catch_warnings():
                # suppress pybedtools warnings during slop
                warnings.simplefilter("ignore")
                bed = BedTool.from_dataframe(bed_df).slop(b=slop,
                                                          g=chrom_size_path)
            if not bedtools:
                bed_df = bed.to_dataframe()

        if bedtools:
            if bed is None:
                bed = BedTool.from_dataframe(bed_df)
            return bed
        else:
            bed_df.index = self.get_index(self.region_dim)
            return bed_df
コード例 #13
0
def get_binned_modules(ma=None,
                       a=annotations450,
                       b='lola_vignette_data/activeDHS_universe.bed',
                       include_last=False,
                       min_capsule_len=2000):
    """Bin CpGs into modules by intersecting annotations with universe regions.

    Args:
        ma: methylation-array object; ``ma.beta`` is a DataFrame whose
            columns are CpG ids.
        a: CpG annotation BED (path or BedTool-compatible input).
        b: universe BED whose regions become the bins.
        include_last: if True, append one extra module containing every CpG
            not captured by any bin.
        min_capsule_len: minimum CpG count for a bin to be kept.

    Returns:
        (final_modules, modulecpgs, module_names): list of CpG-id lists,
        flat list of all module CpGs, and "chrom_start_end" labels.
    """
    allcpgs = ma.beta.columns.values
    a = BedTool(a)
    b = BedTool(b)
    a_orig = a
    # Restrict the annotation to CpGs present in ma.beta and reorder the
    # columns to chrom/start/end/name.
    df = BedTool(a).to_dataframe()
    df.iloc[:,
            0] = df.iloc[:,
                         0].astype(str)
    df = df.set_index('name').loc[list(
        ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]]
    a = BedTool.from_dataframe(df)
    df_bed = BedTool(b).to_dataframe()
    if df_bed.shape[1] < 4:
        # universe BED lacks a name column; use the row number as feature id
        df_bed['features'] = np.arange(df_bed.shape[0])
    b = BedTool.from_dataframe(df_bed)
    try:
        # count + collect distinct CpG names (column 8) per universe region
        c = b.intersect(a, wa=True, wb=True).sort()
        d = c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct'))
    # was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
    except Exception:
        # Likely a chromosome-naming mismatch (e.g. "1.x" vs "chr1"):
        # retry after normalizing annotation chromosome names to "chrN".
        df = BedTool(a_orig).to_dataframe()
        df.iloc[:, 0] = df.iloc[:, 0].astype(str).map(
            lambda x: 'chr' + x.split('.')[0])
        df = df.set_index('name').loc[list(
            ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]]
        a = BedTool.from_dataframe(df)
        c = b.intersect(a, wa=True, wb=True).sort()
        d = c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct'))
    df2 = d.to_dataframe()
    df3 = df2.loc[df2.iloc[:, -2] > min_capsule_len]
    modules = [cpgs.split(',') for cpgs in df3.iloc[:, -1].values]
    modulecpgs = np.array(list(set(list(reduce(lambda x, y: x + y, modules)))))
    if include_last:
        missing_cpgs = np.setdiff1d(allcpgs, modulecpgs).tolist()
    final_modules = modules + ([missing_cpgs] if include_last else [])
    module_names = (df3.iloc[:, 0] + '_' + df3.iloc[:, 1].astype(str) + '_' +
                    df3.iloc[:, 2].astype(str)).tolist()
    return final_modules, modulecpgs.tolist(), module_names
コード例 #14
0
def prepare_gat(
    df,
    promoter_TATA_intersect_bed,
    TATA_box_locations,
    file_names,
    output_genecat_prefix,
    promoterpref,
    variable1_name,
    variable2_name,
):
    """prepare files for running gat analysis - outputs a workspace file
    containing all promoters, a variable promoter file and a constitutive
    promoter file

    Args:
        df: promoter DataFrame with at least chr/start/stop/gene_type/strand/
            source/attributes/AGI columns.
        promoter_TATA_intersect_bed: output path for the promoter/TATA
            intersection.
        TATA_box_locations: BED file of TATA box locations.
        file_names: directory name component of the output paths.
        output_genecat_prefix: filename prefix of the outputs.
        promoterpref: promoter-set name component of the outputs.
        variable1_name: gene_type label of constitutive genes.
        variable2_name: gene_type label of variable genes.
    """
    # make buffer to save promoters
    buffer = io.StringIO()
    df.to_csv(buffer, sep="\t", header=None, index=False)
    buffer.seek(0)
    # select only constitutive and variable genes
    df = df[(df.gene_type == variable1_name) |
            (df.gene_type == variable2_name)]
    # reorder columns
    df_reordered = df[[
        "chr",
        "start",
        "stop",
        "gene_type",
        "strand",
        "source",
        "attributes",
        "AGI",
    ]]
    # sort by chromosome and start
    sorted_motifs = df_reordered.sort_values(["chr", "start"])
    # save bed file
    BedTool.from_dataframe(sorted_motifs).saveas(
        f"../../data/output/{file_names}/TATA/{output_genecat_prefix}_{promoterpref}_nocontrol.bed"
    )

    # run bedtools intersect between TATAbox_location_renamed.bed and the extracted promoters
    # (the buffer holds ALL promoters, not just the filtered subset)
    TATAlocations = BedTool(TATA_box_locations)
    promoters = BedTool(buffer)
    promoters.intersect(TATAlocations,
                        wao=True,
                        output=promoter_TATA_intersect_bed)
    # make a new gat workspace file with all promoters (first 3 columns)
    BedTool.from_dataframe(sorted_motifs[["chr", "start", "stop"]]).saveas(
        f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_workspace.bed"
    )
    # select only variable promoters
    variable_promoters_extended = sorted_motifs[sorted_motifs["gene_type"] ==
                                                variable2_name]
    sorted_variable = variable_promoters_extended.sort_values(["chr", "start"])
    BedTool.from_dataframe(sorted_variable).saveas(
        f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_{variable2_name}.bed"
    )
    # make a constitutive only file
    constitutive_promoters = sorted_motifs[sorted_motifs["gene_type"] ==
                                           variable1_name]
    sorted_constitutive = constitutive_promoters.sort_values(["chr", "start"])
    BedTool.from_dataframe(sorted_constitutive).saveas(
        f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_{variable1_name}.bed"
    )
コード例 #15
0
def overlapLoop(pDataFrameLoop, pDataFrameProtein):
    """Select loops where BOTH anchors overlap at least one protein interval.

    Anchor 1 is columns 0-2 of pDataFrameLoop, anchor 2 is columns 3-5.
    Returns a boolean mask aligned with pDataFrameLoop's rows.
    """
    protein_bt = BedTool.from_dataframe(pDataFrameProtein)

    anchor_masks = []
    for anchor_cols in ([0, 1, 2], [3, 4, 5]):
        anchor_bt = BedTool.from_dataframe(pDataFrameLoop[anchor_cols])
        # c=True appends an overlap count, surfaced as the 'name' column
        counts = anchor_bt.intersect(protein_bt, c=True).to_dataframe()
        anchor_masks.append(counts['name'] >= 1)

    return anchor_masks[0] & anchor_masks[1]
コード例 #16
0
ファイル: callhub.py プロジェクト: lux563624348/HiC_Hubs
def calculate_pvalue_for_hub(_PATH_interaction, _df_Hubs, _col_fore,
                             _col_back):
    """Assign a Wilcoxon signed-rank p-value to each hub.

    Args:
        _PATH_interaction: path to a tab-separated interaction table; the
            second and third columns are interaction anchor bins and the
            table contains the foreground/background score columns.
        _df_Hubs: DataFrame of hubs with 'hub_name' and 'Num_vertices'
            columns, convertible to BED via convert_cluster2bed.
        _col_fore: name of the foreground score column.
        _col_back: name of the background score column.

    Returns:
        DataFrame of hubs with a 'pvalue' column, sorted ascending by pvalue.
    """
    ## Calculate pvalue for each hub
    PATH_interaction = _PATH_interaction
    col_fore = _col_fore
    col_back = _col_back
    df_Hub_top = convert_cluster2bed(_df_Hubs,
                                     'hub_name').reset_index().drop('index',
                                                                    axis=1)

    ## Associated each Hub with interaction and pvalue
    ########################################################################################################
    df_inter = pd.read_csv(PATH_interaction, sep="\t").fillna(0)
    # drop self-interactions (same anchor bin on both sides)
    df_inter = df_inter[df_inter.iloc[:, 1] != df_inter.iloc[:, 2]]
    df_inter.loc[:, '#chr'] = 'chr' + df_inter.iloc[:, 0].astype(str)
    Feature_interaction = BedTool.from_dataframe(df_inter).sort()
    Feature_hub = BedTool.from_dataframe(df_Hub_top).sort()
    ########################################################################################################
    ## calculate all interactions inside a hub (F=1.0: interaction fully contained)
    Feature_Hub_interaction = Feature_hub.intersect(Feature_interaction,
                                                    wa=True,
                                                    wb=True,
                                                    F=1.0)
    col_name = df_Hub_top.columns.append(df_inter.columns)
    df_Feature_Hub_interaction = pd.read_csv(Feature_Hub_interaction.fn,
                                             sep='\t',
                                             names=col_name)
    df_Feature_Hub_interaction_group = df_Feature_Hub_interaction.groupby(
        'hub_name')

    ########################################################################################################
    ### calculate a pvalue for each hub
    hub_sum = []
    for hub in df_Feature_Hub_interaction_group:
        #print (hub[0])
        df_hub = hub[1]
        # paired differences background - foreground per interaction
        data_for_test = df_hub.loc[:, col_back] - df_hub.loc[:, col_fore]
        w, pvalue_hub = stats.wilcoxon(data_for_test)  #, alternative='less')
        hub_sum.append([hub[0], df_hub.Num_vertices.unique()[0], pvalue_hub])
        #break

    df_hub_summary = pd.DataFrame(
        data=hub_sum, columns=['hub_name', 'Num_vertices', 'pvalue'])
    df_hub_summary = df_Hub_top.merge(df_hub_summary,
                                      on=['hub_name', 'Num_vertices'],
                                      how='inner').sort_values(by='pvalue')

    return df_hub_summary
コード例 #17
0
def get_tss_info(peak_str_list, ref_genome, verbose=True):
    """
    Get annotation about Transcription Starting Site (TSS).

    Args:
        peak_str_list (list of str): list of peak_id. e.g.,
            ["chr5_0930303_9499409", "chr11_123445555_123445577"]
        ref_genome (str): reference genome name.
        verbose (bool): verbosity.

    Returns:
        The result of ``annotate_tss`` for the given peaks.

    Raises:
        ValueError: if ``ref_genome`` is not among the supported genomes.
    """
    # flatten the supported-genome mapping into a single list of names
    SUPPORTED_REF_GENOME_LIST = []
    for refs in SUPPORTED_REF_GENOME.values():
        SUPPORTED_REF_GENOME_LIST += refs

    if ref_genome not in SUPPORTED_REF_GENOME_LIST:
        raise ValueError(
            ref_genome,
            " is not supported currently. Supported refgenomes are ",
            SUPPORTED_REF_GENOME)

    ref = _load_tss_ref_data(ref_genome=ref_genome)

    # convert "chrom_start_end" peak ids to a BED-like frame, then to BedTool
    queue = list_peakstr_to_df(peak_str_list)
    queue = BedTool.from_dataframe(queue)

    annotated = annotate_tss(tss_ref_bed=ref, queue_bed=queue, verbose=verbose)

    return annotated
コード例 #18
0
def get_fasta(intervals, reference, tab=True, s=True, name=True, mirna=False):
    """Extract sequences from a reference genome for the given intervals.

    Parameters:
        intervals: pandas DataFrame of BED intervals, or (when mirna=True)
            anything pybedtools.BedTool accepts directly (e.g. a file path).
        reference: path to the reference genome fasta.
        tab: write tab-delimited output (id<TAB>sequence).
        s: force strandedness.
        name: use the interval name in the fasta id.
        mirna: if True, pass `intervals` straight to BedTool instead of
            converting from a DataFrame.

    Returns:
        DataFrame with columns 'fasta_id' and 'binding_sequence'
        (sequences upper-cased).
    """
    if mirna:
        bed_obj = BedTool(intervals)
    else:
        bed_obj = BedTool.from_dataframe(intervals)

    extracted = bed_obj.sequence(fi=BedTool(reference), tab=tab, s=s, name=name)

    sequences = pd.read_csv(
        extracted.seqfn, header=None, names=["fasta_id", "binding_sequence"], sep="\t"
    )
    sequences["binding_sequence"] = sequences["binding_sequence"].str.upper()
    return sequences
コード例 #19
0
def remove_black_list_region(adata, black_list_path, f=0.2):
    """
    Remove regions overlap (bedtools intersect -f {f}) with regions in the black_list_path

    Parameters
    ----------
    adata
    black_list_path
        Path to the black list bed file
    f
        Fraction of overlap when calling bedtools intersect
    Returns
    -------
    None
    """
    with warnings.catch_warnings():
        # silence pybedtools warnings during the intersection
        warnings.simplefilter("ignore")
        var_bed_df = adata.var[['chrom', 'start', 'end']]
        var_bed = BedTool.from_dataframe(var_bed_df)
        blacklist = BedTool(black_list_path)
        flagged = var_bed.intersect(blacklist, f=f, wa=True)
        flagged_index = flagged.to_dataframe().set_index(
            ['chrom', 'start', 'end']).index

    # map flagged coordinates back to their region ids
    black_feature_id = pd.Index(var_bed_df.reset_index().set_index(
        ['chrom', 'start', 'end']).loc[flagged_index]['region'])

    print(f'{black_feature_id.size} features removed due to overlapping'
          f' (bedtools intersect -f {f}) with black list regions.')

    # drop the flagged features from adata in place
    adata._inplace_subset_var(~adata.var_names.isin(black_feature_id))
    return
コード例 #20
0
def main(args=None):
    """Merge loop files, deduplicate and write the merged loops to disk.

    Args:
        args: optional argv list forwarded to the argument parser; parsed
            options supply the input files, output file name and the lowest
            resolution used by mergeLoops.
    """
    args = parse_arguments().parse_args(args)

    lowest_resolution = args.lowestResolution

    files = args.inputFiles
    outfile_name = args.outFileName

    # Concatenate all input files. pd.concat replaces DataFrame.append,
    # which was deprecated in pandas 1.4 and removed in pandas 2.0.
    frames = [readFile(file) for file in files]
    dataframe = pd.concat(frames, ignore_index=True) if frames else None

    dataframe_bedtool = BedTool.from_dataframe(dataframe)
    dataframe = dataframe_bedtool.sort().to_dataframe(disable_auto_names=True,
                                                      header=None)

    # keep=False drops ALL rows that have a duplicate, not just repeats
    dataframe.drop_duplicates(keep=False, inplace=True)

    # anchor coordinates: columns 0-2 (x) and 3-5 (y)
    tuples_x = [tuple(x) for x in dataframe[[0, 1, 2]].values]
    tuples_y = [tuple(x) for x in dataframe[[3, 4, 5]].values]

    result_list_index = mergeLoops(dataframe, lowest_resolution, tuples_x,
                                   tuples_y)
    result_dataframe = dataframe.iloc[sorted(result_list_index), :]
    result_dataframe.to_csv(outfile_name, sep='\t', header=False, index=False)
コード例 #21
0
ファイル: mcds.py プロジェクト: lhqing/ALLCools
    def remove_black_list_region(self, var_dim, black_list_path, f=0.2):
        """
        Remove regions overlap (bedtools intersect -f {f}) with regions in the black_list_path

        Parameters
        ----------
        var_dim
            Name of var_dim
        black_list_path
            Path to the black list bed file
        f
            Fraction of overlap when calling bedtools intersect
        Returns
        -------
        MCDS
        """
        with warnings.catch_warnings():
            # silence pybedtools warnings during the intersection
            warnings.simplefilter("ignore")
            feature_bed_df = self.get_feature_bed(var_dim=var_dim)
            feature_bed = BedTool.from_dataframe(feature_bed_df)
            black_list_bed = BedTool(black_list_path)
            black_feature = feature_bed.intersect(black_list_bed, f=f, wa=True)
            black_feature_index = black_feature.to_dataframe().set_index(
                ['chrom', 'start', 'end']).index
        # map flagged coordinates back to their feature ids
        black_feature_id = pd.Index(feature_bed_df.reset_index().set_index(
            ['chrom', 'start', 'end']).loc[black_feature_index][var_dim])

        print(
            f'{black_feature_id.size} {var_dim} features removed due to overlapping'
            f' (bedtools intersect -f {f}) with black list regions.')
        # avoid dask's large-chunk warning while selecting the kept features
        with dask.config.set(**{'array.slicing.split_large_chunks': False}):
            mcds = self.sel(
                {var_dim: ~self.get_index(var_dim).isin(black_feature_id)})
        return mcds
コード例 #22
0
    def remove_black_list_region(self, var_dim, black_list_path, f=0.2):
        """
        Remove regions overlap (bedtools intersect -f {f}) with regions in the black_list_path

        Parameters
        ----------
        var_dim
            Name of var_dim
        black_list_path
            Path to the black list bed file
        f
            Fraction of overlap when calling bedtools intersect

        Returns
        -------
        MCDS (xr.Dataset)
        """
        bed_df = self.get_feature_bed(var_dim=var_dim)
        features = BedTool.from_dataframe(bed_df)

        # flag every feature overlapping the blacklist by >= f of its length
        blacklist = BedTool(black_list_path)
        flagged = features.intersect(blacklist, f=f, wa=True)
        flagged_index = flagged.to_dataframe().set_index(
            ['chrom', 'start', 'end']).index
        # map flagged coordinates back to their feature ids
        black_feature_id = pd.Index(bed_df.reset_index().set_index(
            ['chrom', 'start', 'end']).loc[flagged_index][var_dim])

        print(
            f'{black_feature_id.size} {var_dim} features removed due to overlapping'
            f' (bedtools intersect -f {f}) with black list regions.')

        keep_mask = ~self.get_index(var_dim).isin(black_feature_id)
        return self.sel({var_dim: keep_mask})
コード例 #23
0
def get_mutations_MB_not_CGC():
    """Build a mappable 1Mb-window file with CGC exonic regions subtracted.

    Reads the hg19 mappable megabase windows and the CGC exonic regions,
    removes the exonic intervals with ``bedtools subtract`` and writes the
    result as a gzipped, headerless bed file.
    """
    mapp_file = 'data/megabase_probability/hg19.mappable.1Mb.windows.bed.extra.gz'

    # CGC exonic regions; input chromosome names lack the 'chr' prefix
    cgc = pd.read_csv(
        'data/megabase_probability/cgc_exonic_regions.tsv',
        sep='\t',
        names=['chr', 'p1', 'p2', 'strand', 'ID1', 'ID2', 'symbol'])
    cgc['CHR'] = cgc['chr'].apply(lambda x: 'chr{}'.format(x))

    exon_bed = BedTool.from_dataframe(cgc[['CHR', 'p1', 'p2']])
    windows = BedTool(mapp_file)

    # subtract the exonic intervals from the mappable windows
    no_cgc = windows.subtract(exon_bed, sorted=True)

    col_names = [
        'chr',
        'start',
        'end',
        'val',
        'chr1',
        'start1',
        'end1',
        'overlapp',
        'ID',
        'real_start',
    ]
    no_cgc.to_dataframe(names=col_names).to_csv(
        'data/megabase_probability/mappable_file.nocgc.bed.gz',
        sep='\t',
        index=False,
        header=False,
        compression='gzip')
コード例 #24
0
def load_files(mutation_file, cnvs_file, purity_file):
    """Load somatic mutations, CNV segments and purity info for one sample.

    Parameters
    ----------
    mutation_file
        Path to the VCF with somatic mutations (parsed via ``vcf_reader``).
    cnvs_file
        Path to the PURPLE copy-number segments file (TSV). Exits the
        process if missing.
    purity_file
        Path to the PURPLE purity file (TSV). Exits the process if missing.

    Returns
    -------
    tuple
        (mutations DataFrame restricted to canonical chromosomes with the
        PASS filter, CNV segments as a BedTool, purity score as a float,
        gender string).
    """
    # load cnvs file from purple output
    if os.path.isfile(cnvs_file) is not True:
        print('CNS file {} does not exist, exiting...'.format(cnvs_file))
        sys.exit()

    # read CNS file and store it in BedTool format
    df_cns = pd.read_csv(cnvs_file, sep='\t')
    cnv_bed = BedTool.from_dataframe(
        df_cns[['#chromosome', 'start', 'end', 'copyNumber', 'baf']])

    # load purity and gender
    if os.path.isfile(purity_file) is not True:
        print('Purity file {} does not exist'.format(purity_file))
        sys.exit()

    df_purity = pd.read_csv(purity_file, sep='\t')
    # np.float was removed in NumPy 1.24; the builtin float is the
    # documented replacement and behaves identically here
    purity_score = float(df_purity['#Purity'].tolist()[0])
    gender = df_purity['Gender'].tolist()[0]

    # read vcf
    df = vcf_reader(mutation_file)

    # get only canonical chromosomes (1-22, X, Y)
    wantedchroms = [str(i) for i in range(1, 23)]
    wantedchroms.append('Y')
    wantedchroms.append('X')

    # select only variants with the PASS filter and in the chromosomes we are interested in
    df = df[df['CHROM'].isin(wantedchroms)]
    df = df[df['FILTER'] == 'PASS']

    return df, cnv_bed, purity_score, gender
コード例 #25
0
def remove_black_list_region(adata, black_list_path, f=0.2):
    """
    Subset adata in place, dropping features that overlap
    (bedtools intersect -f {f}) regions in the black_list_path

    Parameters
    ----------
    adata
        AnnData object whose ``var`` table holds "chrom", "start", "end"
    black_list_path
        Path to the black list bed file
    f
        Fraction of overlap when calling bedtools intersect
    Returns
    -------
    None
    """
    coord_cols = ["chrom", "start", "end"]
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        features = adata.var[coord_cols]
        overlaps = BedTool.from_dataframe(features).intersect(
            BedTool(black_list_path), f=f, wa=True)
        try:
            overlap_coords = overlaps.to_dataframe().set_index(
                coord_cols).index
            # map overlapping coordinates back to the feature ids in var
            lookup = features.reset_index().set_index(coord_cols)
            black_feature_id = pd.Index(
                lookup.loc[overlap_coords][features.index.name])
            print(
                f"{black_feature_id.size} features removed due to overlapping"
                f" (bedtools intersect -f {f}) with black list regions.")
            adata._inplace_subset_var(~adata.var_names.isin(black_feature_id))
        except pd.errors.EmptyDataError:
            # intersect produced no rows: nothing overlaps the black list
            pass
    return
コード例 #26
0
def chromsize2bed(chromsize, bed_file):
    """Convert a chrom-sizes table into a whole-chromosome bed file.

    Parameters
    ----------
    chromsize
        Path to a headerless two-column (chrom, size) tab-separated file.
    bed_file
        Output path for the resulting bed file.
    """
    sizes = pd.read_table(chromsize, sep="\t", header=None)
    sizes.columns = ["chr", "stop"]
    sizes["start"] = 1
    sizes = sizes[["chr", "start", "stop"]]
    # extra columns so the output is compatible with FIMO_filter.py
    extras = {
        "gene": sizes.chr,
        "dot": ".",
        "strand": "+",
        "source": "manual",
        "type": "promoter",
        "dot2": ".",
        "details": "none",
    }
    for column, value in extras.items():
        sizes[column] = value
    BedTool.from_dataframe(sizes.sort_values(["chr", "start"])).saveas(bed_file)
コード例 #27
0
def main(args=None):
    """Entry point: validate loop locations against protein peak positions.

    Parses CLI arguments, sorts the loop and protein bed files, bins the
    protein peaks to the requested resolution, determines which loops match
    a protein peak, prints summary statistics and optionally writes the
    matched loops plus a statistics file.
    """
    args = parse_arguments().parse_args(args)

    if args.method != 'loops':
        return

    loop_df = readLoopFile(args.data, args.addChrPrefixLoops)
    if loop_df is None:
        log.error('Empty loop file')
        return
    # pybedtools sort guarantees coordinate order for the overlap step
    loop_df = BedTool.from_dataframe(loop_df).sort().to_dataframe(
        disable_auto_names=True, header=None)

    protein_df = readProtein(args.protein, args.addChrPrefixProtein)
    if protein_df is None:
        log.error('Empty protein file')
        return
    protein_df = BedTool.from_dataframe(protein_df).sort().to_dataframe(
        disable_auto_names=True, header=None)

    binned_protein = applyBinning(protein_df, args.resolution)

    match_mask = overlapLoop(loop_df, binned_protein)
    matched_loops = loop_df[match_mask]
    print('Protein peaks: {}'.format(len(binned_protein)))
    print('Matched Loops: {}'.format(len(matched_loops)))
    print('Total Loops: {}'.format(len(loop_df)))

    print('Loops match protein: {}'.format(len(matched_loops) / len(loop_df)))

    if args.outFileName:
        matched_loops = loop_df[match_mask]
        writeLoopFile(args.outFileName + '_matched_locations', matched_loops)

        with open(args.outFileName + '_statistics', 'w') as file:
            file.write('# HiCExplorer hicValidateLocations {}\n'.format(
                __version__))
            file.write(
                '# Overlap of loop file {} with protein file {}\n#\n'.
                format(args.data, args.protein))
            file.write('Protein peaks: {}\n'.format(len(binned_protein)))
            file.write('Matched Loops: {}\n'.format(len(matched_loops)))
            file.write('Total Loops: {}\n'.format(len(loop_df)))
            file.write('Loops match protein: {}\n'.format(
                len(matched_loops) / len(loop_df)))
コード例 #28
0
ファイル: cnas_ccf.py プロジェクト: oriolpich/mutfootprints
def get_CNAS_ccf(mutation_file, outpath):
    """Annotate copy-number segments with mutation counts, purity and gender.

    Looks up the CNA and purity files for the sample encoded in the mutation
    file name, derives major/minor copy numbers per segment from BAF, counts
    the mutations falling in each segment and writes the combined table as a
    gzipped TSV under ``outpath``.
    """
    dic_cnas = load_CNAS()

    basename = os.path.basename(mutation_file)
    # only the name of the tumoral sample
    name = basename.split('_')[1].split('.')[0]
    # full name
    fullname = basename.split('.')[0]
    outpath = '{}/{}.cna.gz'.format(outpath, fullname)

    cna_file = dic_cnas[name]
    purity_file = cna_file.replace('.cnv', '.purity')

    # read CNAs and derive allele-specific copy numbers from BAF
    cnas = pd.read_csv(cna_file, sep='\t')
    cnas['#chromosome'] = cnas['#chromosome'].apply(
        lambda x: 'chr{}'.format(x))
    cnas['major_cn'] = round(cnas['baf'] * cnas['copyNumber'])
    cnas['minor_cn'] = round((1 - cnas['baf']) * cnas['copyNumber'])
    cnas.replace(-0.0, 0, inplace=True)
    filtered = cnas[(cnas['major_cn'] >= 0) & (cnas['minor_cn'] >= 0)]

    # read purity and gender
    pur = pd.read_csv(purity_file, sep='\t')
    purity = pur['#Purity'].tolist()[0]
    gender = pur['Gender'].tolist()[0].lower()

    # read the mutations and turn them into 0-based single-base intervals
    muts = pd.read_csv(mutation_file, sep='\t')
    muts['POS-1'] = muts['POS'] - 1
    mut_bed = BedTool.from_dataframe(muts[['CHROM', 'POS-1', 'POS']])

    # count mutations falling in each CNA segment
    seg_bed = BedTool.from_dataframe(
        filtered[['#chromosome', 'start', 'end', 'major_cn', 'minor_cn']])
    counted = seg_bed.coverage(mut_bed, counts=True).to_dataframe(names=[
        'chromosome', 'start', 'end', 'major_cn', 'minor_cn', 'n.snv_mnv'
    ])

    counted['clonal_frequency'] = purity
    counted['gender'] = gender

    out_cols = [
        'chromosome', 'start', 'end', 'major_cn', 'minor_cn',
        'clonal_frequency', 'gender', 'n.snv_mnv'
    ]
    counted[out_cols].to_csv(
        outpath, header=True, index=False, sep='\t', compression='gzip')
コード例 #29
0
def make_fasta(entry):
    """Write the sequence of one bed record to its own fasta file.

    Relies on the module-level ``bed`` DataFrame, ``args`` namespace and
    ``strd`` strandedness flag; the output file is named after the record's
    range id (the part of the ``name`` column after ``range_id_``).
    """
    record = bed.iloc[[entry]]
    range_id = record['name'].to_string().split('range_id_')[1]
    BedTool.from_dataframe(record).sequence(
        fi=args.genome_file,
        s=strd,
        name=True,
        fo=args.outfile + '.datamatrix/temp/fastas/' + range_id + '.fa')
コード例 #30
0
ファイル: common.py プロジェクト: diderote/chrome_chip
def load_bedtool(file):
    """Load a bed file as a BedTool.

    Gzipped files are read through pandas and trimmed to their first three
    columns (chrom, start, end); plain files are handed to BedTool directly.
    """
    import pandas as pd
    from pybedtools import BedTool

    if not file.endswith('.gz'):
        return BedTool(file)
    coords = pd.read_csv(
        file, compression='gzip', index_col=None, header=None,
        sep="\t").iloc[:, :3]
    return BedTool.from_dataframe(coords)