def get_nonredundant_example_idx(ranges, width=200):
    """Get non-overlapping intervals (in the central region).

    Args:
        ranges: pandas.DataFrame returned by basepair.cli.modisco.load_ranges;
            must contain at least 'chrom', 'start', 'end' columns.
        width: size of the central region that should not overlap between
            any two intervals.

    Returns:
        pandas.Series of integer example indices (positions in `ranges`)
        to keep.
    """
    from pybedtools import BedTool
    from basepair.preproc import resize_interval
    # 1. resize ranges to the central `width` window around each interval
    # NOTE(review): this adds a column to the caller's DataFrame in place.
    ranges['example_idx'] = np.arange(len(ranges))  # make sure
    r = ranges[['chrom', 'start', 'end', 'example_idx']]  # add also the strand information
    r = resize_interval(r, width, ignore_strand=True)
    # 2. merge overlapping resized intervals
    bt = BedTool.from_dataframe(r)
    btm = bt.sort().merge()
    df = btm.to_dataframe()
    # keep only merged regions shorter than 2x width, i.e. regions that
    # cannot contain two non-overlapping `width`-sized intervals
    df = df[(df.end - df.start) < width * 2]
    # 3. map original intervals back onto the retained merged regions
    r_overlaps = bt.intersect(BedTool.from_dataframe(df), wb=True).to_dataframe()
    # drop_duplicates on the merged-region coordinate columns keeps one
    # representative interval per merged region; 'name' holds example_idx.
    # NOTE(review): column labels ('score', 'strand', 'thickStart', 'name')
    # are pybedtools' positional auto-names, not semantic fields.
    keep_idx = r_overlaps.drop_duplicates(['score', 'strand', 'thickStart'])['name'].astype(int)
    return keep_idx
def get_final_modules(ma=ma, a='450kannotations.bed', b='lola_vignette_data/activeDHS_universe.bed', include_last=False, min_capsule_len=2000):
    """Group CpGs into modules (capsules) by overlap with genomic regions.

    NOTE(review): the default `ma=ma` is bound at definition time to a
    module-level object; callers should normally pass `ma` explicitly.

    Args:
        ma: methylation array object exposing a `beta` DataFrame whose
            columns are CpG ids.
        a: path to a CpG annotation BED file (450k annotations).
        b: path to a BED file of regions defining the modules.
        include_last: if True, append one extra module holding all CpGs
            not captured by any region.
        min_capsule_len: minimum number of CpGs for a region to become
            a module.

    Returns:
        (final_modules, modulecpgs, module_names) — list of CpG-id lists,
        the unique CpGs covered, and 'chrom_start_end' names per module.
    """
    allcpgs = ma.beta.columns.values
    df = BedTool(a).to_dataframe()
    # normalize chromosome labels to 'chrN' form
    df.iloc[:, 0] = df.iloc[:, 0].astype(str).map(
        lambda x: 'chr' + x.split('.')[0])
    # restrict annotation to CpGs present in ma.beta; reorder to BED layout
    # (chrom, start, end, name)
    df = df.set_index('name').loc[list(
        ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]]
    df_bed = pd.read_table(b, header=None)
    # give each region a numeric 'features' id used as the groupby key
    df_bed['features'] = np.arange(df_bed.shape[0])
    df_bed = df_bed.iloc[:, [0, 1, 2, -1]]
    # NOTE(review): `a` and `b` are deliberately swapped here — regions
    # become the query (`a`) and CpGs the subject (`b`).
    b = BedTool.from_dataframe(df)
    a = BedTool.from_dataframe(
        df_bed)  # ('lola_vignette_data/activeDHS_universe.bed')
    c = a.intersect(b, wa=True, wb=True).sort()
    # per region: count of overlapping CpGs and comma-joined distinct ids
    # (column 8 of the intersection holds the CpG name)
    d = c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct'))
    df2 = d.to_dataframe()
    df3 = df2.loc[df2.iloc[:, -2] > min_capsule_len]
    modules = [cpgs.split(',') for cpgs in df3.iloc[:, -1].values]
    modulecpgs = np.array(
        list(set(list(reduce(lambda x, y: x + y, modules)))))
    if include_last:
        missing_cpgs = np.setdiff1d(allcpgs, modulecpgs).tolist()
    final_modules = modules + ([missing_cpgs] if include_last else [])
    module_names = (df3.iloc[:, 0] + '_' + df3.iloc[:, 1].astype(str) + '_' +
                    df3.iloc[:, 2].astype(str)).tolist()
    return final_modules, modulecpgs, module_names
def all_tads_present(pOriginTADs, pAccepted, pRejected):
    """Count how many original TADs appear in the accepted or rejected sets.

    Each input is a tab-separated file whose first three columns are
    chrom/start/end; the accepted/rejected files carry a 4-line header
    that is skipped. Returns the number of original TADs hit by at
    least one accepted interval plus the number hit by at least one
    rejected interval.
    """
    def _read_tads(path, skip=0):
        # keep only the coordinate columns
        return pd.read_csv(path, sep='\t', header=None, skiprows=skip)[[0, 1, 2]]

    original_bt = BedTool.from_dataframe(_read_tads(pOriginTADs))
    accepted_bt = BedTool.from_dataframe(_read_tads(pAccepted, skip=4))
    rejected_bt = BedTool.from_dataframe(_read_tads(pRejected, skip=4))

    # `c=True` appends an overlap count per original TAD; pybedtools
    # auto-names that 4th column 'name'
    hits_accepted = original_bt.intersect(accepted_bt, c=True).to_dataframe()['name'] >= 1
    hits_rejected = original_bt.intersect(rejected_bt, c=True).to_dataframe()['name'] >= 1
    return hits_accepted.sum() + hits_rejected.sum()
def intersect_bins(bins_1: pd.DataFrame, bins_2: pd.DataFrame, **bedtools_kwargs) -> pd.DataFrame:
    '''Intersects two sets of genomic intervals using bedtools intersect.

    Formats the intersection in a clearer way than pybedtool auto names.
    '''
    # explicit column names for the (chrom, start, end, name) x 2 + overlap layout
    column_names = (
        [f"{field}_1" for field in ("chrom", "start", "end", "name")]
        + [f"{field}_2" for field in ("chrom", "start", "end", "name")]
        + ["overlap"]
    )
    intersection = BedTool.from_dataframe(bins_1).intersect(
        BedTool.from_dataframe(bins_2), **bedtools_kwargs
    )
    return intersection.to_dataframe(
        disable_auto_names=True,
        header=None,
        index_col=False,
        names=column_names,
    )
def intersect_variants(cnv, sv):
    """Intersect CNV and SV calls and return one row per matching pair.

    NOTE(review): writes 'intersection_rec50.bed' and 'intersection_any.bed'
    into the current working directory as a side effect.

    Returns a DataFrame with the paired CNV/SV coordinates plus an
    'INTERSECTION' column listing which criteria matched ('rec50', 'any',
    or 'rec50,any').
    """
    # intersects CNVs and SVs and returns dataframes of intersections
    # convert report dataframes to bedtools
    cnv_bed = BedTool.from_dataframe(cnv)
    sv_bed = BedTool.from_dataframe(sv)
    bed_cols = ['CNV_CHROM', 'CNV_START', 'CNV_END', 'CNV_SVTYPE', 'CNV_DETAILS',
                'SV_CHROM', 'SV_START', 'SV_END', 'SV_SVTYPE', 'SV_DETAILS']
    # intersect SVs and CNVs with 50% reciprocal overlap (f/F both 0.5)
    intersection_rec50 = cnv_bed.intersect(sv_bed, wa=True, wb=True, F=0.5, f=0.5).saveas('intersection_rec50.bed')
    intersection_rec50 = pd.read_csv('intersection_rec50.bed', sep='\t', names=bed_cols)
    # make sure CNV and SV are same variant type
    intersection_rec50 = intersection_rec50[intersection_rec50['CNV_SVTYPE'] == intersection_rec50['SV_SVTYPE']]
    intersection_rec50['INTERSECTION'] = ['rec50']*len(intersection_rec50)
    # intersect SVs and CNVs with any overlap
    intersection_any = cnv_bed.intersect(sv_bed, wa=True, wb=True).saveas('intersection_any.bed')
    intersection_any = pd.read_csv('intersection_any.bed', sep='\t', names=bed_cols)
    # make sure CNV and SV are same variant type
    intersection_any = intersection_any[intersection_any['CNV_SVTYPE'] == intersection_any['SV_SVTYPE']]
    intersection_any['INTERSECTION'] = ['any']*len(intersection_any)
    # combine both passes; a pair found by both gets 'rec50,any'
    intersection_all = pd.concat([intersection_rec50, intersection_any], ignore_index=True)
    intersection_all = intersection_all.groupby(bed_cols)['INTERSECTION'].apply(','.join).reset_index()
    return intersection_all
def exclusion_regions(blacklist_file, chip_seq_data):
    """
    This function takes as input a bound bed file (from multiGPS).
    The assumption is that the bed file reports the peak center
    For example: chr2 45 46
    It converts these peak centers into 501 base pair windows, and adds them to
    the exclusion list which will be used when constructing negative sets.
    It also adds the mm10 blacklisted windows to the exclusion list.

    Parameters:
        blacklist_file (str): Path to the blacklist file, or None to use
            only the bound windows.
        chip_seq_data (dataFrame): The pandas chip-seq data loaded by
            load_chipseq_data

    Returns:
        exclusion_windows (BedTool): A bedtools object containing all
            exclusion windows (bound windows, plus blacklist windows when
            a blacklist file is given).
    """
    temp_chip_file = chip_seq_data.copy()  # Doesn't modify OG array.
    # widen each 1-bp peak center by 250 bp on each side -> 501 bp window
    temp_chip_file['start'] = temp_chip_file['start'] - 250
    temp_chip_file['end'] = temp_chip_file['end'] + 250
    if blacklist_file is None:
        print('No blacklist file specified ...')
        exclusion_windows = BedTool.from_dataframe(
            temp_chip_file[['chr', 'start', 'end']])
    else:
        bound_exclusion_windows = BedTool.from_dataframe(
            temp_chip_file[['chr', 'start', 'end']])
        blacklist_exclusion_windows = BedTool(blacklist_file)
        # concatenate blacklist and bound windows into one BedTool
        exclusion_windows = BedTool.cat(
            *[blacklist_exclusion_windows, bound_exclusion_windows])
    return exclusion_windows
def map_ID2(motifs, geneIDtable, output_bed): """function to rename the TF column values in a motifs.bed file to the Arabidopsis gene ID nomenclature using geneIDtable. (for DAP-seq cistrome motifs only). Outputs a bed file.""" # remove '_m' from end of name_rep value in motifs motifs.name_rep = motifs.name_rep.str.replace("_m1", "") merged = pd.merge(motifs, geneIDtable, on="name_rep") # print(merged.shape) # make bed file sorted_motifs = merged.sort_values(["chr", "start"]) BedTool.from_dataframe(sorted_motifs).saveas(output_bed)
def overlapTAD(pDataFrameTAD, pDataFrameProtein):
    """Return a boolean Series marking TADs that overlap >= 1 protein peak."""
    tads = BedTool.from_dataframe(pDataFrameTAD)
    log.debug('tad_bedtool_x {}'.format(tads))
    peaks = BedTool.from_dataframe(pDataFrameProtein)
    # c=True appends an overlap count column (auto-named 'name')
    counts = tads.intersect(peaks, c=True).to_dataframe()
    return pd.Series(counts['name'] >= 1)
def annotate_peaks(peaks, ref_path):
    """
    peak to gene annotation strategy:
        1. if a peak overlaps with promoter region (-1kb, + 100) of any TSS, call it a promoter peak
        2. if a peak is within 200kb of the closest TSS, AND if it is not a promoter peak, call it a distal peak
        3. if a peak overlaps of a transcript, AND it is not a promoter nor a distal peak of the gene, call it a distal peak
           This step is optional
        4. call it an intergenic peak

    Returns a list of per-peak annotation records produced by
    get_peak_nearby_genes.
    """
    ref_mgr = ReferenceManager(ref_path)
    tss = BedTool(ref_mgr.tss_track)
    # if tss.bed contains the 7th column (gene type), then apply filter. Otherwise use all tss sites
    if tss.field_count() == 7:
        tss_filtered = tss.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
    else:
        df_tss = tss.to_dataframe()
        # pad a placeholder gene_type column so downstream column indices line up
        df_tss['gene_type'] = '.'
        tss_filtered = BedTool.from_dataframe(df_tss).saveas()
    # including transcripts.bed is optional
    if ref_mgr.transcripts_track is None:
        transcripts_filtered = BedTool([])
    else:
        transcripts = BedTool(ref_mgr.transcripts_track)
        if transcripts.field_count() == 7:
            transcripts_filtered = transcripts.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
        else:
            df_tx = transcripts.to_dataframe()
            df_tx['gene_type'] = '.'
            transcripts_filtered = BedTool.from_dataframe(df_tx).saveas()
    # run bedtools closest for peaks against filtered tss, group by peaks and summarize annotations from select columns
    peaks_nearby_tss = peaks.closest(tss_filtered, D='b', g=ref_mgr.fasta_index).groupby(g=[1, 2, 3], c=[7, 11], o=['collapse']).saveas()
    results = []
    # peaks with no transcript overlap at all
    peaks_nearby_tss_butno_tx = peaks_nearby_tss.intersect(transcripts_filtered, v=True).saveas()
    # avoid error when no peaks overlap with any transcipts
    if len(peaks_nearby_tss_butno_tx) < len(peaks_nearby_tss):
        peaks_nearby_tss_and_tx = peaks_nearby_tss \
            .intersect(transcripts_filtered, wa=True, wb=True) \
            .groupby(g=[1, 2, 3, 4, 5], c=[9], o=['distinct'])
        for peak in peaks_nearby_tss_and_tx:
            results.append(get_peak_nearby_genes(peak))
    for peak in peaks_nearby_tss_butno_tx:
        results.append(get_peak_nearby_genes(peak))
    return results
def preprocess_bam_to_bed(bam, output): ''' Given local bam file, convert reads to set of 101bp intervals and output as bed file. Filter for reads thats are ''' # convert bam to bed vprint("Converting bam to bed...") bam = BedTool(bam) bed = bam.bam_to_bed() # filter intervals vprint("Filter reads by size...") bed_chunk_iter = bed.to_dataframe(chunksize=10000000) # chunk large file chunks = [] for chunk in bed_chunk_iter: keep = ( chunk[["start", "end"]] .swifter.progress_bar(enable=True, desc=bam) .apply(lambda row: is_valid_interval(row["start"], row["end"]), axis=1) ) chunks.append(chunk[keep]) bed_df = pd.concat(chunks) # 101bp interval for input vprint("Define 101bp intervals...") bed_df["end"] = ( bed_df["start"].swifter.progress_bar( enable=True).apply(define_interval) ) bed_df["name"] = "-" # remove duplicates vprint("Drop duplicate intervals...") bed_df.drop_duplicates(inplace=True) # TODO extraneous chromosomes? vprint("Remove extra chromosomes...") chromosomes = list(range(1, 23)) chromosomes.append('X') chromosomes.append('Y') chromosomes = [f'chr{c}' for c in chromosomes] bed_df = bed_df.loc[bed_df['chrom'].isin(chromosomes)] # Save result vprint(f"Saving {bed_df.shape[0]} intervals...") BedTool.from_dataframe(bed_df).moveto(output) # cleanup tmp files pybedtools.cleanup(remove_all=True) vprint("Done.")
def annotate_graph_with_feature_values(_input_graph, graph_name_col2bed, path_feature, feature_name, _feature_score):
    """Annotate graph vertices with overlap counts and scores from a BED-like
    feature table.

    For each vertex (converted to a BED interval), counts overlapping
    features (stored under `feature_name`) and stores an aggregate of the
    feature score column (max for string scores, mean for numeric) under
    `feature_name + '_' + _feature_score`. Skips work if the attribute
    already exists. Returns the (mutated) graph.
    """
    input_graph = _input_graph
    name_col2bed = graph_name_col2bed  ## Default "name"
    Vs_Attrs_Name = feature_name  ## such as 'Tcf1'
    if (Vs_Attrs_Name not in _input_graph.vs.attributes()):
        ## Convert vs to bed format in order to annotate
        df_vs_bed = convert_vs2bed(input_graph, name_col2bed)  ### df_vs_bed to be annoted
        Feature_vs = BedTool.from_dataframe(df_vs_bed).sort()
        PATH_Feature_A = path_feature  ##
        df_A = pd.read_csv(PATH_Feature_A, sep="\t")
        Feature_A = BedTool.from_dataframe(df_A).sort()
        ## annotate A in vs: require feature to be >= 30% covered (F=0.3)
        Feature_vs_with_A = Feature_vs.intersect(Feature_A, wb=True, F=0.3)
        if (len(Feature_vs_with_A) > 0):
            df_vs_with_A = pd.read_csv(Feature_vs_with_A.fn,
                                       sep="\t",
                                       names=df_vs_bed.columns.append(
                                           df_A.columns).values,
                                       header=None)
        else:
            # empty frame with the combined columns so groupby below is a no-op
            df_vs_with_A = pd.DataFrame(
                columns=df_vs_bed.columns.append(df_A.columns))
        vs_score = _feature_score  ## 'such as logFC'
        vs_attrs_score = Vs_Attrs_Name + '_' + vs_score
        # initialize both attributes to 0 on every vertex
        input_graph.vs[Vs_Attrs_Name] = 0
        input_graph.vs[vs_attrs_score] = 0
        for df_vs in df_vs_with_A.groupby(
                name_col2bed):  ### Default Define vertex attribute "name"
            # overlap count per vertex
            input_graph.vs.select(
                name=df_vs[0])[Vs_Attrs_Name] = df_vs[1].shape[0]
            ### max Tcf1 binding
            if (type(df_vs[1].loc[:, vs_score].head(1).values[0]) == str):
                input_graph.vs.select(
                    name=df_vs[0]
                )[vs_attrs_score] = df_vs[1].loc[:, vs_score].max()
            else:
                #print(df_vs[1].loc[:,vs_score])
                input_graph.vs.select(
                    name=df_vs[0]
                )[vs_attrs_score] = df_vs[1].loc[:, vs_score].mean()
        print("Annotate " + Vs_Attrs_Name + " is finished.")
    else:
        print("Feature of " + Vs_Attrs_Name + " is already annoated. Skip.")
    return input_graph
def get_bed(
    self,
    with_id=True,
    bedtools=False,
    slop=None,
    chrom_size_path=None,
    standardize_length=None,
):
    """Build a BED representation of this dataset's regions.

    Parameters
    ----------
    with_id
        If True, add a 'name' column with the region index.
    bedtools
        If True, return a pybedtools.BedTool; otherwise a DataFrame
        indexed by region id.
    slop
        Extend each region by `slop` bp on both sides (bedtools slop);
        requires chrom_size_path.
    chrom_size_path
        Chromosome sizes file; falls back to self.chrom_size_path.
    standardize_length
        If given (int), collapse each region to its center and set slop
        so every region ends up `standardize_length` bp long.
    """
    if chrom_size_path is None:
        chrom_size_path = self.chrom_size_path  # will be none if not exist
    region_dim = self.region_dim
    bed_df = pd.DataFrame({
        "chrom": self.coords[f"{region_dim}_chrom"],
        "start": self.coords[f"{region_dim}_start"],
        "end": self.coords[f"{region_dim}_end"],
    })
    # standardize region length, used in motif enrichment analysis
    if standardize_length is not None:
        # standardize_length is an int number
        # collapse each region to a 1-bp center, then slop restores length
        region_center = bed_df["start"] + (bed_df["end"] - bed_df["start"]) // 2
        bed_df["start"] = region_center - 1
        bed_df["end"] = region_center
        slop = (
            standardize_length // 2
        )  # use the bedtools slop to extend the center to standard length
    if with_id:
        bed_df["name"] = self.get_index(region_dim).tolist()
    bed = None
    if slop is not None and slop > 0:
        if chrom_size_path is None:
            raise ValueError(
                "Must provide chrom_size_path when slop is not None.")
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            bed = BedTool.from_dataframe(bed_df).slop(b=slop, g=chrom_size_path)
            if not bedtools:
                # caller wants a DataFrame: materialize the slopped intervals
                bed_df = bed.to_dataframe()
    if bedtools:
        if bed is None:
            bed = BedTool.from_dataframe(bed_df)
        return bed
    else:
        bed_df.index = self.get_index(self.region_dim)
        return bed_df
def get_binned_modules(ma=None, a=annotations450, b='lola_vignette_data/activeDHS_universe.bed', include_last=False, min_capsule_len=2000):
    """Group CpGs into modules (capsules) by overlap with binned regions.

    NOTE(review): the default `a=annotations450` is bound at definition
    time to a module-level object.

    Args:
        ma: methylation array object exposing a `beta` DataFrame whose
            columns are CpG ids.
        a: CpG annotation BED (path or BedTool-compatible input).
        b: BED file of regions defining the modules.
        include_last: if True, append one extra module holding all CpGs
            not captured by any region.
        min_capsule_len: minimum number of CpGs for a region to become
            a module.

    Returns:
        (final_modules, modulecpgs, module_names) — list of CpG-id lists,
        list of unique covered CpGs, and 'chrom_start_end' names.
    """
    allcpgs = ma.beta.columns.values
    a = BedTool(a)
    b = BedTool(b)
    # keep the original annotation in case the first intersect attempt
    # fails due to chromosome-name mismatch (see except branch below)
    a_orig = a
    df = BedTool(a).to_dataframe()
    df.iloc[:, 0] = df.iloc[:, 0].astype(str)
    # restrict annotation to CpGs present in ma.beta; reorder to BED layout
    df = df.set_index('name').loc[list(
        ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]]
    a = BedTool.from_dataframe(df)
    df_bed = BedTool(b).to_dataframe()
    if df_bed.shape[1] < 4:
        # add a numeric id column so groupby has a 4th field
        df_bed['features'] = np.arange(df_bed.shape[0])
    b = BedTool.from_dataframe(df_bed)
    try:
        c = b.intersect(a, wa=True, wb=True).sort()
        # per region: CpG count and comma-joined distinct CpG ids (col 8)
        d = c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct'))
    except Exception:
        # was a bare `except:` — narrowed so Ctrl-C/SystemExit still propagate.
        # Retry after normalizing chromosome labels to 'chrN' form.
        df = BedTool(a_orig).to_dataframe()
        df.iloc[:, 0] = df.iloc[:, 0].astype(str).map(
            lambda x: 'chr' + x.split('.')[0])
        df = df.set_index('name').loc[list(
            ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]]
        a = BedTool.from_dataframe(df)
        c = b.intersect(a, wa=True, wb=True).sort()
        d = c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct'))
    df2 = d.to_dataframe()
    df3 = df2.loc[df2.iloc[:, -2] > min_capsule_len]
    modules = [cpgs.split(',') for cpgs in df3.iloc[:, -1].values]
    modulecpgs = np.array(list(set(list(reduce(lambda x, y: x + y, modules)))))
    if include_last:
        missing_cpgs = np.setdiff1d(allcpgs, modulecpgs).tolist()
    final_modules = modules + ([missing_cpgs] if include_last else [])
    module_names = (df3.iloc[:, 0] + '_' + df3.iloc[:, 1].astype(str) + '_' +
                    df3.iloc[:, 2].astype(str)).tolist()
    return final_modules, modulecpgs.tolist(), module_names
def prepare_gat( df, promoter_TATA_intersect_bed, TATA_box_locations, file_names, output_genecat_prefix, promoterpref, variable1_name, variable2_name, ): """prepare files for running gat analysis - outputs a workspace file containing all promoters, a variable promoter file and a constitutive promoter file""" # make buffer to save promoters buffer = io.StringIO() df.to_csv(buffer, sep="\t", header=None, index=False) buffer.seek(0) # select only constitutive and variable genes df = df[(df.gene_type == variable1_name) | (df.gene_type == variable2_name)] # reorder columns df_reordered = df[[ "chr", "start", "stop", "gene_type", "strand", "source", "attributes", "AGI", ]] # sort by chromosome and start sorted_motifs = df_reordered.sort_values(["chr", "start"]) # save bed file BedTool.from_dataframe(sorted_motifs).saveas( f"../../data/output/{file_names}/TATA/{output_genecat_prefix}_{promoterpref}_nocontrol.bed" ) # run bedtools intersect between TATAbox_location_renamed.bed and the extracted promoters TATAlocations = BedTool(TATA_box_locations) promoters = BedTool(buffer) promoters.intersect(TATAlocations, wao=True, output=promoter_TATA_intersect_bed) # make a new gat workspace file with all promoters (first 3 columns) BedTool.from_dataframe(sorted_motifs[["chr", "start", "stop"]]).saveas( f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_workspace.bed" ) # select only variable promoters variable_promoters_extended = sorted_motifs[sorted_motifs["gene_type"] == variable2_name] sorted_variable = variable_promoters_extended.sort_values(["chr", "start"]) BedTool.from_dataframe(sorted_variable).saveas( f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_{variable2_name}.bed" ) # make a constitutive only file constitutive_promoters = sorted_motifs[sorted_motifs["gene_type"] == variable1_name] sorted_constitutive = constitutive_promoters.sort_values(["chr", "start"]) 
BedTool.from_dataframe(sorted_constitutive).saveas( f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_{variable1_name}.bed" )
def overlapLoop(pDataFrameLoop, pDataFrameProtein):
    """Return a boolean Series: True where BOTH loop anchors overlap a
    protein peak.

    Columns 0-2 of pDataFrameLoop are the left anchor, columns 3-5 the
    right anchor.
    """
    peaks = BedTool.from_dataframe(pDataFrameProtein)

    def _anchor_hits(columns):
        # c=True appends a per-interval overlap count (auto-named 'name')
        anchor = BedTool.from_dataframe(pDataFrameLoop[columns])
        return anchor.intersect(peaks, c=True).to_dataframe()['name'] >= 1

    return _anchor_hits([0, 1, 2]) & _anchor_hits([3, 4, 5])
def calculate_pvalue_for_hub(_PATH_interaction, _df_Hubs, _col_fore, _col_back):
    ## Calculate pvalue for each hub
    """For every hub, collect the interactions fully contained in it and
    run a Wilcoxon signed-rank test on (background - foreground) signal.

    Args:
        _PATH_interaction: path to a tab-separated interaction table.
        _df_Hubs: hub table convertible to BED via convert_cluster2bed.
        _col_fore / _col_back: foreground / background column names.

    Returns:
        DataFrame of hubs with 'hub_name', 'Num_vertices' and 'pvalue',
        sorted by p-value.
    """
    PATH_interaction = _PATH_interaction
    col_fore = _col_fore
    col_back = _col_back
    df_Hub_top = convert_cluster2bed(_df_Hubs,
                                     'hub_name').reset_index().drop('index',
                                                                    axis=1)
    ## Associated each Hub with interaction and pvalue
    ########################################################################################################
    df_inter = pd.read_csv(PATH_interaction, sep="\t").fillna(0)
    # drop self-interactions (same start and end bin)
    df_inter = df_inter[df_inter.iloc[:, 1] != df_inter.iloc[:, 2]]
    df_inter.loc[:, '#chr'] = 'chr' + df_inter.iloc[:, 0].astype(str)
    Feature_interaction = BedTool.from_dataframe(df_inter).sort()
    Feature_hub = BedTool.from_dataframe(df_Hub_top).sort()
    ########################################################################################################
    ## calculate all interactions inside a hub (F=1.0: interaction fully within hub)
    Feature_Hub_interaction = Feature_hub.intersect(Feature_interaction,
                                                    wa=True,
                                                    wb=True,
                                                    F=1.0)
    col_name = df_Hub_top.columns.append(df_inter.columns)
    df_Feature_Hub_interaction = pd.read_csv(Feature_Hub_interaction.fn,
                                             sep='\t',
                                             names=col_name)
    df_Feature_Hub_interaction_group = df_Feature_Hub_interaction.groupby(
        'hub_name')
    ########################################################################################################
    ### calculate a pvalue for each hub
    hub_sum = []
    for hub in df_Feature_Hub_interaction_group:
        #print (hub[0])
        df_hub = hub[1]
        # paired difference background - foreground per interaction
        data_for_test = df_hub.loc[:, col_back] - df_hub.loc[:, col_fore]
        w, pvalue_hub = stats.wilcoxon(data_for_test)  #, alternative='less')
        hub_sum.append([hub[0], df_hub.Num_vertices.unique()[0], pvalue_hub])
        #break
    df_hub_summary = pd.DataFrame(
        data=hub_sum, columns=['hub_name', 'Num_vertices', 'pvalue'])
    df_hub_summary = df_Hub_top.merge(df_hub_summary,
                                      on=['hub_name', 'Num_vertices'],
                                      how='inner').sort_values(by='pvalue')
    return df_hub_summary
def get_tss_info(peak_str_list, ref_genome, verbose=True):
    """
    Get annotation about Transcription Starting Site (TSS).

    Args:
        peak_str_list (list of str): list of peak_id. e.g.,
            ["chr5_0930303_9499409", "chr11_123445555_123445577"]
        ref_genome (str): reference genome name.
        verbose (bool): verbosity.

    Raises:
        ValueError: if ref_genome is not in SUPPORTED_REF_GENOME.

    Returns:
        Result of annotate_tss for the given peaks.
    """
    # flatten the {assembly: [genomes]} mapping into one list of names
    SUPPORTED_REF_GENOME_LIST = []
    for refs in SUPPORTED_REF_GENOME.values():
        SUPPORTED_REF_GENOME_LIST += refs
    if ref_genome not in SUPPORTED_REF_GENOME_LIST:
        raise ValueError(
            ref_genome, " is not supported currently. Supported refgenomes are ",
            SUPPORTED_REF_GENOME)
    ref = _load_tss_ref_data(ref_genome=ref_genome)
    # convert "chrom_start_end" strings to a BED-style BedTool
    queue = list_peakstr_to_df(peak_str_list)
    queue = BedTool.from_dataframe(queue)
    annotated = annotate_tss(tss_ref_bed=ref, queue_bed=queue, verbose=verbose)
    return annotated
def get_fasta(intervals, reference, tab=True, s=True, name=True, mirna=False):
    """Extract FASTA sequences from a reference genome based on BED intervals.

    Returns a pandas DataFrame with columns 'fasta_id' and
    'binding_sequence' (upper-cased).

    Parameters:
        intervals: pandas DataFrame of intervals, or (when mirna=True)
            any input accepted by BedTool (e.g. a BED file path).
        reference: path to the reference genome FASTA.
        tab: write tab-delimited output (id<TAB>sequence).
        s: force strandedness (reverse-complement minus-strand intervals).
        name: use the interval name as the FASTA id.
        mirna: treat `intervals` as a BedTool input instead of a DataFrame.
    """
    if not mirna:
        bed_obj = BedTool.from_dataframe(intervals)
    else:
        bed_obj = BedTool(intervals)
    ref_obj = BedTool(reference)
    # sequences are written to a temp file exposed as a.seqfn
    a = bed_obj.sequence(fi=ref_obj, tab=tab, s=s, name=name)
    seq_tab = pd.read_csv(
        a.seqfn, header=None, names=["fasta_id", "binding_sequence"], sep="\t"
    )
    seq_tab["binding_sequence"] = seq_tab["binding_sequence"].str.upper()
    return seq_tab
def remove_black_list_region(adata, black_list_path, f=0.2):
    """
    Remove regions overlap (bedtools intersect -f {f}) with regions in the black_list_path

    Parameters
    ----------
    adata
        AnnData-like object with 'chrom', 'start', 'end' in adata.var.
    black_list_path
        Path to the black list bed file
    f
        Fraction of overlap when calling bedtools intersect

    Returns
    -------
    None
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        feature_bed_df = adata.var[['chrom', 'start', 'end']]
        feature_bed = BedTool.from_dataframe(feature_bed_df)
        black_list_bed = BedTool(black_list_path)
        black_feature = feature_bed.intersect(black_list_bed, f=f, wa=True)
        try:
            black_feature_index = black_feature.to_dataframe().set_index(
                ['chrom', 'start', 'end']).index
        except pd.errors.EmptyDataError:
            # no feature overlaps the black list: to_dataframe() raises on an
            # empty result, which previously crashed here; nothing to remove.
            return
        black_feature_id = pd.Index(feature_bed_df.reset_index().set_index(
            ['chrom', 'start', 'end']).loc[black_feature_index]['region'])
        print(f'{black_feature_id.size} features removed due to overlapping'
              f' (bedtools intersect -f {f}) with black list regions.')
        adata._inplace_subset_var(~adata.var_names.isin(black_feature_id))
    return
def main(args=None):
    """Merge loop files: concatenate inputs, sort, drop exact duplicates,
    merge loops within the lowest resolution, and write the result.

    Side effect: writes the merged loops to args.outFileName (TSV, no
    header/index).
    """
    args = parse_arguments().parse_args(args)
    lowest_resolution = args.lowestResolution
    files = args.inputFiles
    outfile_name = args.outFileName
    # DataFrame.append was removed in pandas 2.0 (and was O(n^2) in a loop);
    # read all inputs and concatenate once instead.
    dataframe = pd.concat([readFile(file) for file in files],
                          ignore_index=True)
    dataframe_bedtool = BedTool.from_dataframe(dataframe)
    dataframe = dataframe_bedtool.sort().to_dataframe(disable_auto_names=True,
                                                      header=None)
    # keep=False removes every member of a duplicated group
    dataframe.drop_duplicates(keep=False, inplace=True)
    # left anchors (cols 0-2) and right anchors (cols 3-5) as tuples
    tuples_x = [tuple(x) for x in dataframe[[0, 1, 2]].values]
    tuples_y = [tuple(x) for x in dataframe[[3, 4, 5]].values]
    result_list_index = mergeLoops(dataframe, lowest_resolution, tuples_x,
                                   tuples_y)
    result_dataframe = dataframe.iloc[sorted(result_list_index), :]
    result_dataframe.to_csv(outfile_name, sep='\t', header=False, index=False)
def remove_black_list_region(self, var_dim, black_list_path, f=0.2):
    """
    Remove regions overlap (bedtools intersect -f {f}) with regions in the black_list_path

    Parameters
    ----------
    var_dim
        Name of var_dim
    black_list_path
        Path to the black list bed file
    f
        Fraction of overlap when calling bedtools intersect

    Returns
    -------
    MCDS
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        feature_bed_df = self.get_feature_bed(var_dim=var_dim)
        feature_bed = BedTool.from_dataframe(feature_bed_df)
        black_list_bed = BedTool(black_list_path)
        black_feature = feature_bed.intersect(black_list_bed, f=f, wa=True)
        # coordinates of features overlapping the black list
        black_feature_index = black_feature.to_dataframe().set_index(
            ['chrom', 'start', 'end']).index
        # map coordinates back to feature ids in var_dim
        black_feature_id = pd.Index(feature_bed_df.reset_index().set_index(
            ['chrom', 'start', 'end']).loc[black_feature_index][var_dim])
        print(
            f'{black_feature_id.size} {var_dim} features removed due to overlapping'
            f' (bedtools intersect -f {f}) with black list regions.')
    # avoid dask's large-chunk warning when selecting a subset
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        mcds = self.sel(
            {var_dim: ~self.get_index(var_dim).isin(black_feature_id)})
    return mcds
def remove_black_list_region(self, var_dim, black_list_path, f=0.2):
    """
    Remove regions overlap (bedtools intersect -f {f}) with regions in the black_list_path

    Parameters
    ----------
    var_dim
        Name of var_dim
    black_list_path
        Path to the black list bed file
    f
        Fraction of overlap when calling bedtools intersect

    Returns
    -------
    MCDS (xr.Dataset)
    """
    feature_bed_df = self.get_feature_bed(var_dim=var_dim)
    feature_bed = BedTool.from_dataframe(feature_bed_df)
    black_list_bed = BedTool(black_list_path)
    black_feature = feature_bed.intersect(black_list_bed, f=f, wa=True)
    # coordinates of features overlapping the black list
    black_feature_index = black_feature.to_dataframe().set_index(
        ['chrom', 'start', 'end']).index
    # map coordinates back to feature ids in var_dim
    black_feature_id = pd.Index(feature_bed_df.reset_index().set_index(
        ['chrom', 'start', 'end']).loc[black_feature_index][var_dim])
    print(
        f'{black_feature_id.size} {var_dim} features removed due to overlapping'
        f' (bedtools intersect -f {f}) with black list regions.')
    mcds = self.sel(
        {var_dim: ~self.get_index(var_dim).isin(black_feature_id)})
    return mcds
def get_mutations_MB_not_CGC():
    """Subtract CGC exonic regions from 1Mb mappable windows and write the
    result to data/megabase_probability/mappable_file.nocgc.bed.gz.
    """
    mapp_file = 'data/megabase_probability/hg19.mappable.1Mb.windows.bed.extra.gz'
    genes = pd.read_csv(
        'data/megabase_probability/cgc_exonic_regions.tsv',
        sep='\t',
        names=['chr', 'p1', 'p2', 'strand', 'ID1', 'ID2', 'symbol'])
    # normalize chromosome names to 'chrN'
    genes['CHR'] = genes['chr'].apply(lambda x: 'chr{}'.format(x))
    genes_bed = BedTool.from_dataframe(genes[['CHR', 'p1', 'p2']])
    mapp_bedtool = BedTool(mapp_file)
    # remove the CGC exonic spans from the mappability windows
    # (sorted=True: both inputs assumed chrom/start sorted)
    mapping_no_CGC = mapp_bedtool.subtract(genes_bed, sorted=True)
    mapp_no_CGC = mapping_no_CGC.to_dataframe(names=[
        'chr',
        'start',
        'end',
        'val',
        'chr1',
        'start1',
        'end1',
        'overlapp',
        'ID',
        'real_start',
    ])
    mapp_no_CGC.to_csv('data/megabase_probability/mappable_file.nocgc.bed.gz',
                       sep='\t',
                       index=False,
                       header=False,
                       compression='gzip')
def load_files(mutation_file, cnvs_file, purity_file):
    """Load PURPLE CNV calls, sample purity/gender and somatic mutations.

    Exits the process if the CNV or purity file is missing.

    Returns:
        (df, cnv_bed, purity_score, gender) — PASS mutations on canonical
        chromosomes, CNVs as a BedTool, purity as float, gender string.
    """
    # load cnvs file from purple output
    if not os.path.isfile(cnvs_file):
        print('CNS file {} does not exist, exiting...'.format(cnvs_file))
        sys.exit()
    # read CNS file and store it in BedTool format
    df_cns = pd.read_csv(cnvs_file, sep='\t')
    cnv_bed = BedTool.from_dataframe(
        df_cns[['#chromosome', 'start', 'end', 'copyNumber', 'baf']])
    # load purity and gender
    if not os.path.isfile(purity_file):
        print('Purity file file {} does not exist'.format(purity_file))
        sys.exit()
    df_purity = pd.read_csv(purity_file, sep='\t')
    # np.float was removed in NumPy 1.24; builtin float is equivalent here
    purity_score = float(df_purity['#Purity'].tolist()[0])
    gender = df_purity['Gender'].tolist()[0]
    # read vcf
    df = vcf_reader(mutation_file)
    # get only canonical chromosomes
    wantedchroms = [str(i) for i in range(1, 23)]
    wantedchroms.append('Y')
    wantedchroms.append('X')
    # select only variants with the PASS filter and in the chromosomes we are interested in
    df = df[df['CHROM'].isin(wantedchroms)]
    df = df[df['FILTER'] == 'PASS']
    return df, cnv_bed, purity_score, gender
def remove_black_list_region(adata, black_list_path, f=0.2):
    """
    Remove regions overlap (bedtools intersect -f {f}) with regions in the black_list_path

    Parameters
    ----------
    adata
        AnnData-like object with 'chrom', 'start', 'end' columns in adata.var.
    black_list_path
        Path to the black list bed file
    f
        Fraction of overlap when calling bedtools intersect

    Returns
    -------
    None
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        feature_bed_df = adata.var[["chrom", "start", "end"]]
        feature_bed = BedTool.from_dataframe(feature_bed_df)
        black_list_bed = BedTool(black_list_path)
        black_feature = feature_bed.intersect(black_list_bed, f=f, wa=True)
        try:
            # coordinates of features overlapping the black list
            black_feature_index = (black_feature.to_dataframe().set_index(
                ["chrom", "start", "end"]).index)
            # map coordinates back to feature ids
            black_feature_id = pd.Index(feature_bed_df.reset_index().set_index(
                ["chrom", "start",
                 "end"]).loc[black_feature_index][feature_bed_df.index.name])
            print(
                f"{black_feature_id.size} features removed due to overlapping"
                f" (bedtools intersect -f {f}) with black list regions.")
            adata._inplace_subset_var(~adata.var_names.isin(black_feature_id))
        except pd.errors.EmptyDataError:
            # no overlap with black list
            pass
    return
def chromsize2bed(chromsize, bed_file):
    """Convert a chromosome-sizes file to a BED-like file covering each
    whole chromosome, padded with the extra columns FIMO_filter.py expects.
    """
    chrom_df = pd.read_table(chromsize, sep="\t", header=None)
    chrom_df.columns = ["chr", "stop"]
    chrom_df["start"] = 1
    chrom_df = chrom_df[["chr", "start", "stop"]]
    # add extra columns so compatible with FIMO_filter.py
    chrom_df["gene"] = chrom_df.chr
    filler_columns = [
        ("dot", "."),
        ("strand", "+"),
        ("source", "manual"),
        ("type", "promoter"),
        ("dot2", "."),
        ("details", "none"),
    ]
    for column, value in filler_columns:
        chrom_df[column] = value
    ordered = chrom_df.sort_values(["chr", "start"])
    BedTool.from_dataframe(ordered).saveas(bed_file)
def main(args=None):
    """Validate loop locations against protein peaks.

    Sorts both inputs, bins the protein peaks at the given resolution,
    keeps loops whose both anchors overlap a peak, prints match
    statistics, and optionally writes matched loops plus a statistics
    file when args.outFileName is set.
    """
    args = parse_arguments().parse_args(args)
    if args.method == 'loops':
        loop_df = readLoopFile(args.data, args.addChrPrefixLoops)
        if loop_df is None:
            log.error('Empty loop file')
            return
        # sort loops via bedtools for deterministic ordering
        loop_df_bedtool = BedTool.from_dataframe(loop_df)
        loop_df = loop_df_bedtool.sort().to_dataframe(disable_auto_names=True,
                                                      header=None)
        protein_df = readProtein(args.protein, args.addChrPrefixProtein)
        if protein_df is None:
            log.error('Empty protein file')
            return
        protein_df_bedtool = BedTool.from_dataframe(protein_df)
        protein_df = protein_df_bedtool.sort().to_dataframe(
            disable_auto_names=True, header=None)
        # snap peaks to the matrix resolution before overlap testing
        protein_df_resolution = applyBinning(protein_df, args.resolution)
        # boolean mask: loops whose both anchors overlap a peak
        overlap_mask_df = overlapLoop(loop_df, protein_df_resolution)
        loop_df_ = loop_df[overlap_mask_df]
        print('Protein peaks: {}'.format(len(protein_df_resolution)))
        print('Matched Loops: {}'.format(len(loop_df_)))
        print('Total Loops: {}'.format(len(loop_df)))
        print('Loops match protein: {}'.format(len(loop_df_) / len(loop_df)))
        if args.outFileName:
            loop_df_ = loop_df[overlap_mask_df]
            writeLoopFile(args.outFileName + '_matched_locations', loop_df_)
            with open(args.outFileName + '_statistics', 'w') as file:
                file.write('# HiCExplorer hicValidateLocations {}\n'.format(
                    __version__))
                file.write(
                    '# Overlap of loop file {} with protein file {}\n#\n'.
                    format(args.data, args.protein))
                file.write('Protein peaks: {}\n'.format(
                    len(protein_df_resolution)))
                file.write('Matched Loops: {}\n'.format(len(loop_df_)))
                file.write('Total Loops: {}\n'.format(len(loop_df)))
                file.write('Loops match protein: {}\n'.format(
                    len(loop_df_) / len(loop_df)))
def get_CNAS_ccf(mutation_file, outpath): dic_cnas = load_CNAS() # only the name of the tumoral sample name = os.path.basename(mutation_file).split('_')[1].split('.')[0] # full name fullname = os.path.basename(mutation_file).split('.')[0] outpath = '{}/{}.cna.gz'.format(outpath, fullname) cna_file = dic_cnas[name] purity_file = dic_cnas[name].replace('.cnv', '.purity') # read CNAs df = pd.read_csv(cna_file, sep='\t') df['#chromosome'] = df['#chromosome'].apply(lambda x: 'chr{}'.format(x)) df['major_cn'] = round(df['baf'] * df['copyNumber']) df['minor_cn'] = round((1 - df['baf']) * df['copyNumber']) df.replace(-0.0, 0, inplace=True) filtered_df = df[(df['major_cn'] >= 0) & (df['minor_cn'] >= 0)] # read purity and gender pur = pd.read_csv(purity_file, sep='\t') purity = pur['#Purity'].tolist()[0] gender = pur['Gender'].tolist()[0].lower() # read the mutations df_muts = pd.read_csv(mutation_file, sep='\t') df_muts['POS-1'] = df_muts['POS'] - 1 mut_bed = BedTool.from_dataframe(df_muts[['CHROM', 'POS-1', 'POS']]) mut_cnas = BedTool.from_dataframe( filtered_df[['#chromosome', 'start', 'end', 'major_cn', 'minor_cn']]).coverage(mut_bed, counts=True) dfc = mut_cnas.to_dataframe(names=[ 'chromosome', 'start', 'end', 'major_cn', 'minor_cn', 'n.snv_mnv' ]) dfc['clonal_frequency'] = purity dfc['gender'] = gender dfc[[ 'chromosome', 'start', 'end', 'major_cn', 'minor_cn', 'clonal_frequency', 'gender', 'n.snv_mnv' ]].to_csv(outpath, header=True, index=False, sep='\t', compression='gzip')
def make_fasta(entry):
    """Write the FASTA sequence for one row of the module-level `bed`
    DataFrame to its own file under <outfile>.datamatrix/temp/fastas/.

    NOTE(review): relies on module-level globals `bed`, `args` and `strd`;
    the output filename is the part of the row's 'name' after 'range_id_'.
    """
    df_slice = bed.iloc[[entry]]
    b = BedTool.from_dataframe(df_slice)
    b.sequence(fi=args.genome_file,
               s=strd,
               name=True,
               fo=args.outfile + '.datamatrix/temp/fastas/' +
               df_slice['name'].to_string().split('range_id_')[1] + '.fa')
def load_bedtool(file):
    """Load a BED input as a BedTool.

    Gzipped files are read via pandas and truncated to their first three
    columns (chrom/start/end); anything else is handed to BedTool directly.
    """
    import pandas as pd
    from pybedtools import BedTool
    if not file.endswith('.gz'):
        return BedTool(file)
    table = pd.read_csv(file,
                        compression='gzip',
                        index_col=None,
                        header=None,
                        sep="\t")
    return BedTool.from_dataframe(table.iloc[:, :3])