def annotate_peaks(peaks, ref_path):
    """
    Peak-to-gene annotation strategy:
        1. If a peak overlaps the promoter region (-1 kb, +100 bp) of any TSS, call it a promoter peak.
        2. If a peak is within 200 kb of the closest TSS, AND it is not a promoter peak, call it a distal peak.
        3. If a peak overlaps a transcript, AND it is neither a promoter nor a distal peak of the gene, call it a distal peak. This step is optional.
        4. Otherwise, call it an intergenic peak.
    """
    ref_mgr = ReferenceManager(ref_path)
    tss = BedTool(ref_mgr.tss_track)

    # if tss.bed contains a 7th column (gene type), apply the filter; otherwise use all TSS sites
    if tss.field_count() == 7:
        tss_filtered = tss.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
    else:
        df_tss = tss.to_dataframe()
        df_tss['gene_type'] = '.'
        tss_filtered = BedTool.from_dataframe(df_tss).saveas()

    # including transcripts.bed is optional
    if ref_mgr.transcripts_track is None:
        transcripts_filtered = BedTool([])
    else:
        transcripts = BedTool(ref_mgr.transcripts_track)
        if transcripts.field_count() == 7:
            transcripts_filtered = transcripts.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
        else:
            df_tx = transcripts.to_dataframe()
            df_tx['gene_type'] = '.'
            transcripts_filtered = BedTool.from_dataframe(df_tx).saveas()

    # run bedtools closest for peaks against the filtered TSS, group by peaks and summarize annotations from select columns
    peaks_nearby_tss = peaks.closest(tss_filtered, D='b', g=ref_mgr.fasta_index).groupby(g=[1, 2, 3], c=[7, 11], o=['collapse']).saveas()

    results = []
    peaks_nearby_tss_butno_tx = peaks_nearby_tss.intersect(transcripts_filtered, v=True).saveas()

    # avoid an error when no peaks overlap any transcripts
    if len(peaks_nearby_tss_butno_tx) < len(peaks_nearby_tss):
        peaks_nearby_tss_and_tx = peaks_nearby_tss \
            .intersect(transcripts_filtered, wa=True, wb=True) \
            .groupby(g=[1, 2, 3, 4, 5], c=[9], o=['distinct'])

        for peak in peaks_nearby_tss_and_tx:
            results.append(get_peak_nearby_genes(peak))

    for peak in peaks_nearby_tss_butno_tx:
        results.append(get_peak_nearby_genes(peak))

    return results
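# Hedged usage sketch for annotate_peaks above, assuming ReferenceManager,
# TRANSCRIPT_ANNOTATION_GENE_TYPES and get_peak_nearby_genes are importable
# from the same module; the peak BED and reference-bundle paths are
# illustrative assumptions, not files shipped with this snippet.
from pybedtools import BedTool

peaks = BedTool("peaks.bed").sort()
peak_annotations = annotate_peaks(peaks, "/refs/GRCh38")
for row in peak_annotations[:5]:
    print(row)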
def filter_bed(bedfile, snp_list, outfile=sys.stdout):
    """Filter a bedfile to only include snps in snp_list, print to outfile.

    :bedfile:  A bed file of all the SNPs, can be gzipped.
    :snp_list: List/tuple/set/frozenset of snp names.
    :outfile:  Something .bed or .bed.gz, default STDOUT.
    :returns:  0 on success, -1 on failure.
    """
    try:
        from pybedtools import BedTool
    except ImportError:
        logme.log('pybedtools is not installed.\n' +
                  'Please install and try again. You can get it from here:\n' +
                  'https://github.com/daler/pybedtools',
                  level='error')
        return -1
    if not isinstance(snp_list, (tuple, list, set, frozenset)):
        raise Exception('snp_list must be tuple/list/set/frozenset ' +
                        'it is: {}'.format(type(snp_list)))
    bed = BedTool(bedfile)
    filtered = bed.filter(lambda a: a.name in snp_list)
    with open_zipped(outfile, 'w') as fout:
        fout.write(str(filtered))
    return 0
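# Hedged usage sketch for filter_bed: write only the named SNPs to a new BED
# file. The input path and the rsIDs are assumptions.
keep = {'rs123', 'rs456', 'rs789'}
filter_bed('all_snps.bed.gz', keep, outfile='subset.bed')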
def clean_bed(beds=None, size_cut=None):
    '''This function separates a list of beds into small and large regions
        based on a size_cut, intersects small regions, merges large regions,
        and then merges the small and large regions.

    Parameters
    ----------
    beds : list or array
        full paths to bed files (python Path objects from pathlib)
    size_cut : int
        cutoff value to separate large and small regions

    Returns
    -------
    clean_bed : BedTool object
        resulting clean bed object
    '''
    small_regions = list()
    large_regions = list()
    for bed in beds:
        bed = BedTool(bed)
        small_regions.append(bed.filter(lambda b: b.stop - b.start < size_cut))
        # use >= so regions exactly size_cut long are not silently dropped
        large_regions.append(bed.filter(lambda b: b.stop - b.start >= size_cut))
    small_bed = intersect_bed(beds=small_regions)
    large_bed = merge_bed(beds=large_regions)
    clean_bed = large_bed.cat(small_bed).merge().sort()
    return clean_bed
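# Hedged usage sketch for clean_bed above, assuming the intersect_bed and
# merge_bed helpers it calls are importable; the file names and the 1 kb
# cutoff are assumptions.
from pathlib import Path

replicates = [Path('rep1.bed'), Path('rep2.bed')]
cleaned = clean_bed(beds=replicates, size_cut=1000)
cleaned.saveas('clean_regions.bed')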
def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain a chromosome name after a dash,
        like hg19-chr20; in the case of a BED file, the returned BedTool will
        have the corresponding filter applied.
    :return: BedTool object if it's a BED file, or filepath
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BedTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)

        return bed
    else:
        return path
def main():
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('peaks', help='peaks bed')
    p.add_argument('exons', help='refseq exons from UCSC')
    p.add_argument('gtf', help='refseq gtf with feature of interest')
    p.add_argument('feature', help='feature of interest in the gtf')
    p.add_argument('-v', '--verbose', action="store_true", help='maximum verbosity')
    args = p.parse_args()

    if args.verbose:
        sys.stderr.write(">> building exon library...\n")
    exon_lib = make_exon_lib(args.exons)

    peaks = BedTool(args.peaks)
    exons = BedTool(args.exons)
    full_ref = BedTool(args.gtf)

    if args.verbose:
        sys.stderr.write(">> filtering for feature...\n")
    filtered_ref = full_ref.filter(lambda gtf: gtf[2] == args.feature)

    if args.verbose:
        sys.stderr.write(">> selecting exonic peaks...\n")
    exonic_peaks = peaks.intersect(exons, wo=True)

    if args.verbose:
        sys.stderr.write(">> calculating distance fractions...\n")
    # D for distance (returns negative if upstream)
    for peak in exonic_peaks.closest(filtered_ref, D="a"):
        try:
            p = ComplexLine(peak)
            corrected_distance = 0.0
            total_exon_length = 0.0

            # parse gtf attrs; note str.lstrip strips a character *set*,
            # so strip the 'gene_id' key and quotes explicitly instead
            gene_id = p.gtfattrs.split(';')[0].replace('gene_id', '').strip().strip('"')

            # looking downstream wrt peak
            if p.gtfdistance > 0:
                # exon with peak
                corrected_distance = p.exonstop - p.peakstop
                for exon in exon_lib[p.exoninfo.name]:
                    # add downstream exon lengths
                    if exon > p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
            # looking upstream wrt peak
            else:
                # exon with peak
                corrected_distance = p.peakstart - p.exonstart
                for exon in exon_lib[p.exoninfo.name]:
                    # add upstream exon lengths
                    if exon < p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]

            for exon in exon_lib[p.exoninfo.name]:
                total_exon_length += exon_lib[p.exoninfo.name][exon]

            # fraction
            print(corrected_distance / total_exon_length)
        except ValueError:
            continue
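# Hedged CLI sketch for the distance-fraction script above; the script and
# file names are assumptions:
#   python peak_exon_fraction.py peaks.bed refseq_exons.bed refseq.gtf stop_codon -v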
def getCDSs(bedfilename, reffilename, strand): """ return iterator of coding sequences """ bed = BedTool(bedfilename) bed = bed.filter(lambda x: x.strand == strand) fasta = reffilename bed = bed.sequence(fi=fasta, s=True) return SeqIO.parse(bed.seqfn, "fasta")
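# Hedged usage sketch for getCDSs: iterate plus-strand coding sequences and
# translate them with Biopython; the BED and FASTA paths are assumptions.
for record in getCDSs("cds.bed", "genome.fa", "+"):
    print(record.id, record.seq.translate(to_stop=True))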
def window_genome(window_width_, filtered_save_name): if genome_size_file is not None: genome_windowed = BedTool().window_maker(g=genome_size_file, w=window_width_) genome_windowed.saveas(filtered_save_name) else: genome_windowed = BedTool().window_maker(genome=genome, w=window_width_) genome_windowed = genome_windowed.filter( lambda p: p.chrom in valids) genome_windowed.saveas(filtered_save_name)
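# Hedged usage sketch for window_genome, assuming the module-level
# genome_size_file / genome / valids globals it reads are already configured;
# the window width and output name are assumptions.
window_genome(10000, "genome_10kb_windows.bed")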
def filterReadsByLength(inbam, minlength, maxlength):
    '''
    Takes a bam file and selects intervals whose lengths fall strictly
    between the defined bounds.
    Input: bam file and min/max lengths
    Output: BedTool
    '''
    # convert bam to bed
    intervals = BedTool(inbam).bam_to_bed()
    filt = intervals.filter(lambda x: len(x) > minlength and len(x) < maxlength).saveas()
    return filt
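# Hedged usage sketch for filterReadsByLength: keep roughly nucleosome-sized
# fragments; the BAM path and the bounds are assumptions.
nucleosomal = filterReadsByLength("sample.bam", 146, 201)
nucleosomal.saveas("nucleosomal_fragments.bed")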
def clean_bed(bed_fpath, work_dir): clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean') if not can_reuse(clean_fpath, bed_fpath): pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp'))) bed = BedTool(bed_fpath) bed = bed.filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser'])) bed = bed.remove_invalid() with file_transaction(work_dir, clean_fpath) as tx_out_file: bed.saveas(tx_out_file) verify_bed(clean_fpath, is_critical=True) debug('Saved clean BED file into ' + clean_fpath) return clean_fpath
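# Hedged usage sketch for clean_bed above, assuming its helper functions
# (intermediate_fname, can_reuse, file_transaction, verify_bed, ...) are
# importable; both paths are assumptions.
clean_path = clean_bed('target_regions.bed', 'work_dir')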
def _bed(self): def by_name(rec): # Drop first part before underscore. if "_" in self.name: name = "_".join(self.name.split("_")[1:]) else: name = self.name return (name + "*" in rec.name) or (name == rec.name) bt = BedTool(self.path) if not self.custom and '_all' not in self.name: bt = bt.filter(by_name).saveas() if len(bt) > 0 and len(bt[0].fields) > 6: bt = bt.bed6().saveas() return bt
class GenomicSubset(object): def __init__(self, name, path=paths.genome_subsets, assembly='hg19'): self.assembly = assembly self.name = name self.bedtool = BedTool(path + name + '.bed').sort() # Intersect the pathway with the appropriate genome build # TODO: this step should be unnecessary if the pathways are correct if name != self.assembly: self.bedtool = GenomicSubset.reference_genome( self.assembly).bedtool.intersect(self.bedtool).sort().saveas() def expand_by(self, expansion_in_each_direction_Mb): window_size_str = str(expansion_in_each_direction_Mb) + 'Mb' print('total size before window addition:', self.bedtool.total_coverage(), 'bp') # compute the flanks # TODO: use 1cM instead of 1Mb print('computing flanks') flanks = self.bedtool.flank( genome=self.assembly, b=expansion_in_each_direction_Mb*1000000).sort().merge().saveas() # compute the union of the flanks and the pathway print('computing union') union = self.bedtool.cat(flanks, postmerge=False).sort() merged = union.merge().saveas() print('total size after window addition:', merged.total_coverage(), 'bp') self.bedtool = merged def restricted_to_chrom_bedtool(self, chrnum): return self.bedtool.filter( lambda x : x[0] == 'chr' + str(int(chrnum))).saveas() @classmethod def reference_genome(cls, assembly='hg19'): return GenomicSubset(assembly, path=paths.reference, assembly=assembly) @classmethod def reference_chrom_bedtool(cls, chrnum, assembly='hg19'): return cls.reference_genome(assembly=assembly).restricted_to_chrom_bedtool(chrnum) @classmethod def whole_genome(cls, assembly='hg19'): return cls(assembly, path=paths.reference)
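# Hedged usage sketch for GenomicSubset: load a subset BED from
# paths.genome_subsets, widen it by 1 Mb per side, and restrict it to chr1.
# The subset name is an assumption about what lives in that directory.
subset = GenomicSubset('my_pathway')
subset.expand_by(1)
chr1_bt = subset.restricted_to_chrom_bedtool(1)
print(chr1_bt.total_coverage(), 'bp on chr1')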
def generateBedfileFromBam(outFile, bamFile, delta):
    # for the provided BAM file, generate a bed file indicating adequately covered regions
    # TODO: check whether bams are indexed and sorted, then perform indexing or sorting if needed
    print("generating adequately covered bed file for ", bamFile)
    bedFile = BedTool(bamFile).genome_coverage(
        bg=True)  # bg=True gives read depth in bedgraph format
    # keep only the regions with read depth >= adequate sample coverage "delta";
    # the interval object carries the fourth column as attribute "name"
    filteredBedFile = bedFile.filter(lambda x: int(x.name) >= delta)
    mergedBedFile = filteredBedFile.merge()  # merge adequately covered regions together
    mergedBedFile.saveas(outFile)
    print("bed file generation is done, written to ", outFile)
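# Hedged usage sketch for generateBedfileFromBam: write regions covered by at
# least 10 reads; both paths are assumptions.
generateBedfileFromBam('adequately_covered.bed', 'sample.bam', delta=10)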
def _find_cnv_cpx_redundancies(potentially_clusterable: pybedtools.BedTool, is_carrier: Mapping[Text, numpy.ndarray], min_cpx_reciprocal_overlap: float, cnv_cpx_reciprocal_overlap: float, cnv_cpx_sample_overlap: float) -> Set[Text]: """ Subset potentially clusterable intervals to those that meet required minimum overlap with a CPX event. Then find clusters, and remove redundant CNVs from those clusters. Parameters ---------- potentially_clusterable: BedTool bed object with intervals that could potentially be used for clustering is_carrier: Mapping[Text, numpy.ndarray] Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and False otherwise (including no-call). min_cpx_reciprocal_overlap: float Minimum reciprocal overlap with a CPX interval for a CNV interval to be clusterable. cnv_cpx_reciprocal_overlap: float Minimum reciprocal overlap between two intervals to be part of a cluster. cnv_cpx_sample_overlap: float Minimum Jaccard index for variant interval to have with sample cluster in order for it to be redundant. Returns ------- vids_to_remove: Set[Text] Set of variant IDs that are redundant and should be removed from the output VCF. """ # find all potentially clusterable intervals that meet required minimum overlap with CPX precluster_subset = potentially_clusterable.intersect( potentially_clusterable.filter(_is_cpx), u=True, f=min_cpx_reciprocal_overlap, r=True, sorted=True, nonamecheck=True) # find clusters of intervals with high reciprocal overlap, then check each cluster for redundant variant IDs return { variant_id for cluster in _get_clusters( precluster_subset, min_reciprocal_overlap=cnv_cpx_reciprocal_overlap) for variant_id in _get_redundant_cluster_cnv_cpx_vids( cluster, is_carrier, cnv_cpx_sample_overlap=cnv_cpx_sample_overlap) }
def _get_genome_bedtool(self, genome_name, region, genes=None): """get the bedtool object for a genome depending on the name and the region""" genome = Genome.path_by_name(genome_name) mapping = { "any": "all", "CDS": "cds", "3prime": "3_utr", "5prime": "5_utr", "intron": "intron", "intergenic": "intergenic" } if region not in mapping: raise ValueError("Invalid region: %r" % region) else: bed = BedTool(path.join(genome, "%s.gff" % mapping[region])) # Optionally, filter by gene. if genes is None or 'all' in genes: return bed else: return bed.filter(lambda x: x.name in genes).saveas()
def add_bed(self, bedfile): """Add a list of pybedtools Interval objects to self as self.bed. Requires pybedtools, adds only records for snps in this individual. Note: This is a slow operation. :returns: True on success, False on failure. """ try: from pybedtools import BedTool except ImportError: logme.log('add_bed() failed.\n' + 'pybedtools is not installed.\n' + 'Please install and try again. You can get it from here:\n' + 'https://github.com/daler/pybedtools', level='error') return False bed = BedTool(bedfile) self.bed = [i for i in bed.filter(lambda a: a.name in self.snps)] return True
def make_regions(region_size: int, ambiguity_thr: float = 0.5):
    path = os.path.join(
        "/tmp/", f"cached.make_regions({region_size}, {ambiguity_thr}).bed")
    if not os.path.exists(path):
        regions = BedTool().window_maker(w=region_size, genome="hg19")  # type: BedTool

        # pybedtools is not very consistent when dealing with Interval objects.
        # For instance, str(interval) will sometimes return only 3 fields
        # (chrom, start, end); other times, when fields are specified
        # explicitly, 4 or more fields are printed. It is possible to invoke
        # an intersection with '3-field' Intervals and receive mixed Intervals:
        # intersected ones are wrongly read, with strand and name holding
        # interval-hit information, while non-intersected ones are turned into
        # intervals with default additional fields. The workaround is to
        # recreate each interval so that all have the same number of fields;
        # strand must be included.
        regions = BedTool([Interval(x.chrom, x.start, x.end) for x in regions]).saveas()
        regions = regions.filter(lambda r: r.length == region_size and
                                 r.chrom in hg19.annotation.CHROMOSOME_SIZE).saveas()
        regions = dataset.filter.by_ambiguity(
            regions, BedTool(hg19.annotation.AMBIGUOUS), ambiguity_thr)
        regions.saveas(path, compressed=False)
    return BedTool(path)
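# Hedged usage sketch for make_regions, assuming the hg19 and dataset helper
# modules it references are importable.
tiles = make_regions(1000)
print(len(tiles), "uniform 1 kb windows")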
print("Running in auto mode. Finding region types present in the GTF.") df = gtf_ref.to_dataframe().dropna() feature_list = list(df.feature.value_counts().index) else: feature_list = list(args.region_list) print print("Extracting the following regions: " + str(feature_list)) p = Pool(args.ncores) p.map(extract_features, feature_list) if args.do_introns == True: print print("Extracting intron positions and generating GTF") genes = gtf_ref.filter(lambda x: x[2] == 'gene').saveas() exons = gtf_ref.filter(lambda x: x[2] == 'exon').saveas() introns = genes.subtract(exons, s=True, nonamecheck=True).saveas() introns.saveas('gtf_regions/' + args.outfile + '_intron.gtf') if args.split_introns == True: print print( "Spliting intron in proximal and distal regions and generating GTF" ) introns_distal = introns.to_dataframe().copy() introns_distal.start = introns_distal.start + 500 introns_distal.end = introns_distal.end - 500 introns_distal_bed = BedTool.from_dataframe( introns_distal).remove_invalid().saveas('gtf_regions/' +
def py_peak_calling(bedgraph, threshold, min_length, inter_peak_distance,
                    max_length=10000, generate_ID=True, output_name=None):
    """
    Written by Pete Skene ([email protected]). Free for academic use.

    - need to install a more up-to-date version of bedtools before invoking Jupyter;
      type: module load bedtools/2.21.0
    - (1) filters bedgraph based on threshold; (2) merges adjacent basepairs that are
      over threshold; (3) retains peaks that satisfy min/max length criteria;
      (4) merges any peaks that are closer than the inter-peak distance cutoff
    - max_length is typically defaulted to be very large
    - outputs a bed file (default col4 is the sum of the bedgraph scores;
      sorted by chrom; start; stop)
    - generate_ID: will auto-generate an integer list as an ID number
      (1 ... number of peaks). This will be reported as column 4 and the bedgraph
      scores will be shifted to column 5 as per standard bed format
    - note the peak score for a merged peak is *just* the sum of the two individual
      peaks, not the total score in the merged region (i.e. there could be some
      sub-threshold scores in the intervening space that won't be included)
    - assumes bedgraph in standard format <chr> <start> <stop> <score>
    - output_name = option for user-defined name (type with '...'), otherwise will
      generate the name bedgraph_peaks.bed
    """
    import pybedtools
    import glob
    from pybedtools import BedTool
    import pandas as pd

    # generate name for output
    bedgraph_name = glob.glob(bedgraph)
    if output_name != None:
        filename = output_name
    elif output_name == None:
        filename = bedgraph_name[0].replace('.bg', '_peaks.bed')

    print 'input bedgraph file: ' + bedgraph_name[0]
    print 'output filename: ' + filename

    # import data as BedTool
    data = BedTool(bedgraph)

    # retains intervals above threshold
    above_thresh = data.filter(lambda b: float(b.name) >= threshold)

    # merge adjacent above-threshold regions and sum bedgraph scores (assumes
    # bedgraph score in col 4); increasing the d value also merges nearby regions
    merge_regions = above_thresh.merge(d=0, c=4, o='sum')

    # filter based on length criteria
    peaks = BedTool(merge_regions.filter(lambda x: len(x) >= min_length and len(x) <= max_length))

    # merge the bona fide peaks if they are closer than the inter-peak distance,
    # sum scores and sort
    merge_peaks = peaks.merge(d=inter_peak_distance, c=4, o='sum').sort()

    print 'number of peaks found: ' + str(merge_peaks.count())

    if not generate_ID:
        print 'saving sorted peak bed file with no ID'
        merge_peaks.saveas(filename)

    if generate_ID:
        print 'saving sorted peak bed file with ID names'
        # change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()
        # insert new column with id: 1 ... # of peaks
        DF_peaks.insert(3, 'id', ['id' + str(item) for item in range(1, (len(DF_peaks) + 1))])
        # save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)

    return 'Finished'
def create( cls: Type[T], outdir: str, data_files: List[str], enhancer_file: str, annotation_file: str, genome: str, window: Optional[int] = 2000, anno_file: Optional[str] = None, anno_from: Optional[str] = None, anno_to: Optional[str] = None, gene_mapping: Optional[str] = None, threshold: Optional[float] = 1.0, version: Optional[str] = "0.1.0", ) -> T: outdir = Path(outdir) basename = outdir.name meanstd_file = outdir / f"{basename}.{genome}.meanstd.tsv.gz" target_file = outdir / f"{basename}.{genome}.target.npz" gene_file = outdir / "annotation.tss.merged1kb.bed" link_file = outdir / "enhancers2genes.feather" g = Genome(genome) if not os.path.exists(outdir): os.makedirs(outdir) info = { "genes": "genes.txt", "enhancers": "enhancers.feather", "link_file": os.path.basename(link_file), "genome": genome, "window": window, "meanstd_file": os.path.basename(meanstd_file), "target_file": os.path.basename(target_file), "gene_file": os.path.basename(gene_file), "version": version, "schema_version": __schema_version__, } if anno_file is not None: if not os.path.exists(anno_file): raise ValueError(f"{anno_file} does not exist") if anno_from is None or anno_to is None: raise ValueError("Need anno_from and anno_to columns!") copyfile(anno_file, outdir / os.path.basename(anno_file)) info.update({ "anno_file": os.path.basename(anno_file), "anno_from": anno_from, "anno_to": anno_to, }) if gene_mapping is not None: if not os.path.exists(gene_mapping): raise ValueError(f"{gene_mapping} does not exist") copyfile(gene_mapping, outdir / os.path.basename(gene_mapping)) info["gene_mapping"] = os.path.basename(gene_mapping) logger.info("processing gene annotation") # Convert gene annotation b = BedTool(annotation_file) chroms = set([f.chrom for f in BedTool(enhancer_file)]) b = b.filter(lambda x: x.chrom in chroms) b = (b.flank(g=g.sizes_file, l=1, r=0).sort().merge(d=1000, c=4, o="distinct")) # noqa: E741 b.saveas(str(gene_file)) logger.info("processing data files") # create coverage_table df = coverage_table( enhancer_file, data_files, window=window, log_transform=True, normalization="quantile", ncpus=12, ) df.index.rename("loc", inplace=True) df.reset_index().to_feather(f"{outdir}/enhancers.feather") np.savez(target_file, target=df.iloc[:, 0].sort_values()) meanstd = pd.DataFrame(index=df.index, ) meanstd["mean"] = df.mean(1) meanstd["std"] = df.std(1) meanstd = meanstd.reset_index().rename(columns={"loc": "index"}) meanstd.to_csv(meanstd_file, compression="gzip", index=False, sep="\t") df.index.rename("loc", inplace=True) df = df.sub(df.mean(1), axis=0) df = df.div(df.std(1), axis=0) df.reset_index().to_feather(f"{outdir}/enhancers.feather") link = create_link_file(meanstd_file, gene_file, genome=genome) link.to_feather(link_file) genes = _create_gene_table( df, meanstd_file, gene_file, gene_mapping, genome=genome, link_file=link_file, threshold=threshold, ) genes.to_csv(f"{outdir}/genes.txt", sep="\t") with open(f"{outdir}/info.yaml", "w") as f: yaml.dump(info, f) return ScepiaDataset(outdir)
def _update_cnv_cnv_redundances(vids_to_remove: Set[Text],
                                potentially_clusterable: pybedtools.BedTool,
                                is_carrier: Mapping[Text, numpy.ndarray],
                                is_ref: Mapping[Text, numpy.ndarray],
                                cnv_cnv_reciprocal_overlap: float,
                                cnv_cnv_sample_overlap: float):
    """
    Update vids_to_remove by finding CNVs that are redundant with other CNVs (as opposed to CPX).
        - Find CNVs with very high reciprocal overlap and very high carrier-sample Jaccard index.
        - For each CNV that is connected to any other CNVs:
            - Add that CNV and all its connections to vids_to_remove.
            - Find the "best" CNV: the maximum, choosing 1st by number of carriers, 2nd by number of called refs.
            - Add the best CNV to the set of vids that will be put back in (no matter what, even if previously
              or subsequently "removed").
        - Update vids_to_remove by removing the "best" variant IDs.

    Parameters
    ----------
    vids_to_remove: Set[Text]
        Set of variant IDs that are redundant and should be removed.
        NOTE: this function updates this set in place.
    potentially_clusterable: BedTool
        bed object with intervals that could potentially be used for clustering
    is_carrier: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and
        False otherwise (including no-call).
    is_ref: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called ref for this Variant, and False
        otherwise (including no-call).
    cnv_cnv_reciprocal_overlap: float
        minimum reciprocal overlap for two CNVs to be connected
    cnv_cnv_sample_overlap: float
        minimum carrier-samples Jaccard index for two CNVs to be connected
    """
    # for each non-CPX interval, find all non-CPX intervals it has sufficient reciprocal overlap and sample
    # overlap with
    variant_pairwise_connections = {}
    non_cpx_potentially_clusterable = potentially_clusterable.filter(_is_not_cpx).saveas()
    for name_1, name_2 in _iter_pairwise_connections(
            non_cpx_potentially_clusterable,
            min_reciprocal_overlap=cnv_cnv_reciprocal_overlap,
            min_sample_overlap=cnv_cnv_sample_overlap,
            is_carrier=is_carrier):
        variant_pairwise_connections[name_1] = \
            variant_pairwise_connections.get(name_1, (name_1,)) + (name_2,)
    # set all the clustered variants to be removed
    vids_to_remove.update(variant_pairwise_connections.keys())

    # for each of these variants and its direct connections
    # - choose one "best" variant to represent it, with priority given to most carriers, followed by most ref calls
    # - keep the "best" variant (even if it's previously or subsequently "removed") and remove all others
    num_carrier = {
        variant_id: variant_is_carrier.sum()
        for variant_id, variant_is_carrier in is_carrier.items()
    }
    num_ref = {
        variant_id: variant_is_ref.sum()
        for variant_id, variant_is_ref in is_ref.items()
    }

    def _best_variant_id(variant_id: Text) -> (int, int, str):
        return num_carrier[variant_id], num_ref[variant_id], variant_id

    # keep the best variant in each cluster by removing it from the removal set
    vids_to_remove.difference_update(
        max(variant_id_cluster, key=_best_variant_id)
        for variant_id_cluster in variant_pairwise_connections.values())
maxdist = 6

seqnames = list()
for a in amplicons:
    seqnames.append(a.chrom)

def chrom_filter(feature, chrom):
    return feature.chrom == chrom

for chr in set(seqnames):
    left_lengths = dict()
    right_lengths = dict()
    a_starts = dict()
    a_ends = dict()
    amplicons_chrom = amplicons.filter(chrom_filter, chrom=chr)
    for a in amplicons_chrom:
        left_lengths[a.name] = map(int, a.fields[10].split(","))[0]
        right_lengths[a.name] = map(int, a.fields[10].split(","))[1]
        a_starts[a.name] = a.start
        a_ends[a.name] = a.stop
    chrom = chr[3:]
    print chrom
    for read in samfile.fetch(str(chrom)):
        if read.is_reverse:
            if read.is_unmapped == False:
                dists = dict()
                abs_dists = dict()
                for k, v in a_ends.items():
                    dists[k] = read.reference_end - v
def py_peak_calling(bedgraph, threshold, min_length, inter_peak_distance,
                    merge_close_peaks, keep_highest_close_peak, max_length,
                    generate_ID, output_name, delete_overlap_bed):
    import sys
    import csv
    import glob
    import pybedtools
    from pybedtools import BedTool
    import pandas as pd

    if merge_close_peaks == keep_highest_close_peak:
        print 'Exiting... merge_close_peaks and keep_highest_close_peak set the same'
        sys.exit()

    # generate name for output
    bedgraph_name = glob.glob(bedgraph)
    filtered_name = bedgraph_name[0].replace('.bedgraph', '_filtered.bedgraph')

    if output_name != 'None':
        filename = output_name
    elif output_name == 'None':
        filename = bedgraph_name[0].replace('.bedgraph', '_peaks.bed')

    print 'input bedgraph file: ' + bedgraph_name[0]
    print 'output filename: ' + filename

    # import data as BedTool
    data = BedTool(bedgraph)
    print 'total sites read: ',
    print len(data)

    # retains intervals above threshold
    above_thresh = data.filter(
        lambda b: float(b.name) >= float(threshold)).saveas(filtered_name)
    print 'sites above threshold: ',
    print len(above_thresh)

    if len(above_thresh) == 0:
        print 'no regions are above the threshold\n'
        sys.exit()

    # merge adjacent above-threshold regions and sum bedgraph scores (assumes
    # bedgraph score in col 4); d = max distance between merged peaks, c = column modified
    merge_regions = above_thresh.merge(d=10, c=4, o='sum').saveas('temp.bed')

    # filter based on length criteria
    peaks = BedTool(
        merge_regions.filter(
            lambda x: len(x) >= min_length and len(x) <= max_length)).saveas('temp2.bed')
    print 'number of regions identified: ' + str(peaks.count())

    if merge_close_peaks == 'True':
        if len(peaks) > 0:
            # merge the bona fide peaks if they are closer than the inter-peak
            # distance, sum scores and sort
            print 'merging peaks that are closer than: ' + str(inter_peak_distance)
            merge_peaks = peaks.merge(d=inter_peak_distance, c=4,
                                      o='sum').sort().saveas('temp3.bed')
        else:
            print 'no regions can be merged'
            sys.exit()

    if keep_highest_close_peak == 'True':
        # read each line to find close peaks and throw away the one with the
        # lowest score out of the two
        print 'entering loop'
        peaks.saveas('temp_input.bed')
        last_line = [
            str(item) for item in (BedTool('temp_input.bed').to_dataframe().tail(
                n=1).iloc[0, :].tolist())
        ]
        with open('temp_input.bed') as myfile:
            with open('test_output.bed', 'w') as output:
                file_output = csv.writer(output, delimiter='\t')
                prev_line = None
                for line in csv.reader(myfile, delimiter='\t'):
                    print 'testing line: ' + str(line)
                    if prev_line is None:
                        prev_line = line
                    elif float(prev_line[2]) + float(inter_peak_distance) <= float(line[1]):
                        # features far apart, so write out the previous peak
                        file_output.writerow(prev_line)
                        prev_line = line
                    else:
                        # features are close: keep the higher-scoring of the two
                        if float(prev_line[3]) < float(line[3]):
                            prev_line = line
                # flush the final peak
                if line == last_line:
                    file_output.writerow(prev_line)
        merge_peaks = BedTool('test_output.bed')

    print 'number of peaks found: ' + str(merge_peaks.count())

    if delete_overlap_bed != None:
        print 'delete_overlap_bed provided: ' + delete_overlap_bed
        merge_peaks = merge_peaks.intersect(b=delete_overlap_bed, v=True)
        print 'number of peaks retained: ' + str(merge_peaks.count())

    if not generate_ID:
        print 'saving sorted peak bed file with no ID'
        merge_peaks.saveas(filename)

    if generate_ID:
        print 'saving sorted peak bed file with ID names'
        # change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()
        # insert new column with id: 1 ... # of peaks
        DF_peaks.insert(3, 'id', ['id' + str(item) for item in range(1, (len(DF_peaks) + 1))])
        # save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)

    return 'Finished'
def find_longest_transcript(input, output, clip_start=0, clip_end=0,
                            clip_strand_specific=False, output_strand_specific=False):
    tmp_file = tempfile.NamedTemporaryFile(delete=False)

    # Load GFF file
    annotations = BedTool(input)

    # Select only transcripts and convert to BED format
    transcripts = annotations.filter(filter_transcript).\
        each(gff2bed, name_field="gene_id").sort().\
        saveas(tmp_file.name).\
        to_dataframe().\
        assign(length=lambda x: x.end - x.start + 1)

    # Select longest transcript per gene
    transcripts_longest = transcripts.loc[transcripts.reset_index().groupby(['name'])['length'].idxmax()].\
        drop(['length'], axis=1)

    # Clip at the beginning or the end of the gene (can be strand specific).
    # On the '-' strand the transcription start corresponds to the BED "end"
    # coordinate, so clip_start applies there (and vice versa for clip_end).
    pos_strand = (transcripts_longest["strand"] == "+").values | np.invert(clip_strand_specific)
    if clip_start > 0:
        transcripts_longest.loc[pos_strand, "start"] = transcripts_longest.loc[pos_strand, "start"] + clip_start
        transcripts_longest.loc[~pos_strand, "end"] = transcripts_longest.loc[~pos_strand, "end"] - clip_start
    if clip_end > 0:
        transcripts_longest.loc[pos_strand, "end"] = transcripts_longest.loc[pos_strand, "end"] - clip_end
        transcripts_longest.loc[~pos_strand, "start"] = transcripts_longest.loc[~pos_strand, "start"] + clip_end

    # Notify about genes with negative length
    transcripts_negative_length = transcripts_longest.query("start >= end").name.values
    if len(transcripts_negative_length) > 0:
        transcripts_longest = transcripts_longest.query("end > start")
        print("Removing transcripts with negative length from the output file: {}"
              .format(", ".join(transcripts_negative_length)))

    # make a copy of genes and reverse strand
    transcripts_longest_reversed = transcripts_longest.copy()
    transcripts_longest_reversed["strand"] = [
        "+" if s == "-" else "-"
        for s in transcripts_longest_reversed["strand"].values
    ]
    transcripts_longest_reversed["name"] = transcripts_longest_reversed["name"] + "_rev"

    transcripts_longest_stack = pd.concat([transcripts_longest, transcripts_longest_reversed]).\
        sort_values(["chrom", "start", "name", "strand"]).\
        reset_index(drop=True)

    # Save final data frame to file
    if not output_strand_specific:
        transcripts_longest_stack.to_csv(output, index=False, sep="\t", header=False)
    else:
        for s, s_name in {'+': 'pos', '-': 'neg'}.items():
            transcripts_longest_strand = transcripts_longest_stack.query("strand == @s")
            transcripts_longest_strand.to_csv("{}_{}".format(output, s_name),
                                              index=False, sep="\t", header=False)
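# Hedged usage sketch for find_longest_transcript, assuming the
# filter_transcript and gff2bed helpers it calls are importable; the paths
# and clip sizes are assumptions.
find_longest_transcript("annotation.gff3", "longest_transcripts.bed",
                        clip_start=500, clip_end=500, clip_strand_specific=True)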
import numpy as np import pandas as pd os.chdir( '/Users/dem44/Documents/Manuscripts/cuRRBS/Figures/Figure_3/3X/exon_intron_sites/' ) #### 1. Obtain the coordinates for the 5' end and 3' end of all the introns in the # human genome (hg38). print('Retrieving introns from the human genome ...') path_to_gencode_ann = "/Users/dem44/Documents/Manuscripts/cuRRBS/Figures/Figure_1/1C/Annotation_files/gencode.v25.basic.annotation.gtf" gencode_ann = BedTool(path_to_gencode_ann).sort() protein_coding_genes_ann = gencode_ann.filter(lambda x: x[2] == 'gene').filter( lambda x: 'gene_type "protein_coding"' in x[8]).sort() exon_protein_coding_ann = gencode_ann.filter(lambda x: x[2] == 'exon').filter( lambda x: 'gene_type "protein_coding"' in x[8]).sort() intron_protein_coding_ann = protein_coding_genes_ann.subtract( exon_protein_coding_ann, s=True).sort() five_prime_ends = [x.start for x in intron_protein_coding_ann] # 0-based coordinates three_prime_ends = [(x.end - 1) for x in intron_protein_coding_ann] # 0-based coordinates chromosomes = [str(x.chrom) for x in intron_protein_coding_ann] #### 2. Find the coordinates of all the CpG sites that are found in +- 5 bp of both # of the ends of the intron (i.e. close to the exon-intron boundary). print('Finding the CpG coordinates ...')
def main():
    parser = argparse.ArgumentParser(
        description='Use a sliding window to aggregate breaks in bed file')
    parser.add_argument('genome', help='Name of the model used to produce input')
    parser.add_argument('input', help='Input .bed file with detected breaks')
    parser.add_argument(
        'annotations',
        help='Annotation file. If the annotation file has a gtf or gff extension '
        '(possibly .gz) then only transcripts are selected. If a .bed file is '
        'provided then all annotations from the bed file are used')
    parser.add_argument('output',
                        help='Output file with break counts aggregated over sliding windows')
    parser.add_argument('-w', '--window-size', dest="window_size",
                        default=int(1e5), type=int,
                        help='Window at which to aggregate breaks number')
    parser.add_argument('-s', '--window-step', dest="window_step",
                        default=int(1e4), type=int,
                        help='Step after each window')
    parser.add_argument('-f', '--features', dest="features",
                        action="append", nargs="*",
                        help='Additional features to annotate input file')
    args = parser.parse_args()

    start = time.time()

    if args.features is None:
        features = []
    else:
        features = list(itertools.chain.from_iterable(args.features))

    output_dir = os.path.dirname(args.output)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print('Processing "{input}" using annotation="{annotation}" window {window}/{step}. Writing output to "{output}"...'
          .format(input=args.input, window=args.window_size, step=args.window_step,
                  output=args.output, annotation=args.annotations))

    # Create temporary files
    tmp = {
        n: tempfile.NamedTemporaryFile(delete=False).name
        for n in [
            "genome_bin_pos", "genome_bin_neg", "genome_bin", "breaks_bin",
            "results", "all_transcripts", "transcripts"
        ]
    }

    # Create windows template for sliding window
    genome_bin_pos = BedTool().window_maker(
        genome=args.genome, w=args.window_size,
        s=args.window_step).each(strand, "+").saveas(tmp["genome_bin_pos"])
    genome_bin_neg = BedTool().window_maker(
        genome=args.genome, w=args.window_size,
        s=args.window_step).each(strand, "-").saveas(tmp["genome_bin_neg"])
    genome_bin = genome_bin_pos.cat(
        genome_bin_neg, postmerge=False).sort().saveas(tmp["genome_bin"])

    # Read input file
    dna_breaks = BedTool(args.input)

    # Read annotation file
    if re.search(r"\.(gtf|gff)(\.gz)?$", args.annotations):
        annotations = BedTool(args.annotations)
        annotations = annotations.filter(filter_transcript).\
            each(gff2bed, name_field="gene_id").sort().\
            saveas(tmp["all_transcripts"]).\
            groupby(g="1,2,3,6", c="4,5", o="distinct").\
            cut([0, 1, 2, 4, 5, 3]).\
            saveas(tmp["transcripts"])
    elif re.search(r"\.bed$", args.annotations):
        annotations = BedTool(args.annotations)
    else:
        parser.error("Annotation has to be either in gtf/gff or in bed format")

    bin_breaks = BedTool().intersect(a=genome_bin, b=dna_breaks, wa=True, c=True, s=True). \
        saveas(tmp["breaks_bin"])

    # Map breaks statistics to annotation file
    results = BedTool().map(a=bin_breaks, b=annotations, c="4", o="distinct").\
        cut([0, 1, 2, 7, 6, 5]).sort().saveas(tmp["results"])
    results_df = splitDataFrameList(results.to_dataframe(), "name", ",")
    results_df = results_df[results_df.name != "."]
    results_df.to_csv(args.output, sep="\t", header=True, index=False)

    # Remove old temporary files
    for f in tmp.values():
        os.remove(f)

    end = time.time()
    print("Total time: {:.1f} minutes".format((end - start) / 60))
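# Hedged CLI sketch for the break-aggregation script above; the script name is
# an assumption:
#   python aggregate_breaks.py hg19 breaks.bed annotation.gtf.gz out.tsv \
#       --window-size 100000 --window-step 10000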
def py_peak_calling(bedgraph, threshold, min_length, inter_peak_distance,
                    merge_close_peaks=True, keep_highest_close_peak=False,
                    max_length=10000, generate_ID=True, output_name=None,
                    delete_overlap_bed=None):
    """
    - need to install a more up-to-date version of bedtools before invoking Jupyter;
      type: module load bedtools/2.21.0

    (1) filters bedgraph based on threshold; (2) merges adjacent basepairs that are
    over threshold; (3) retains peaks that satisfy min/max length criteria;
    (4) merges any peaks that are closer than the inter-peak distance cutoff -or-
    alternatively keeps just the highest peak (this is beta functionality)

    - max_length is typically defaulted to be very large
    - outputs a bed file (default col4 is the sum of the bedgraph scores;
      sorted by chrom; start; stop)
    - generate_ID: will auto-generate an integer list as an ID number
      (1 ... number of peaks). This will be reported as column 4 and the bedgraph
      scores will be shifted to column 5 as per standard bed format
    - note the peak score for a merged peak is *just* the sum of the two individual
      peaks, not the total score in the merged region (i.e. there could be some
      sub-threshold scores in the intervening space that won't be included)
    - assumes bedgraph in standard format <chr> <start> <stop> <score>
    - output_name = option for user-defined name (type with '...'), otherwise will
      generate the name bedgraph_peaks.bed
    - delete_overlap_bed = option to add a path to a bedfile (as string), whereby
      any peaks that overlap this bed file will be discarded
    """
    import pybedtools
    import glob
    from pybedtools import BedTool
    import pandas as pd
    import csv

    if merge_close_peaks == keep_highest_close_peak:
        return 'Exiting... merge_close_peaks and keep_highest_close_peak set the same'

    # generate name for output
    bedgraph_name = glob.glob(bedgraph)
    if output_name != None:
        filename = output_name
    elif output_name == None:
        filename = bedgraph_name[0].replace('.bg', '_peaks.bed')

    print 'input bedgraph file: ' + bedgraph_name[0]
    print 'output filename: ' + filename

    # import data as BedTool
    data = BedTool(bedgraph)

    # retains intervals above threshold
    above_thresh = data.filter(lambda b: float(b.name) >= threshold)

    # merge adjacent above-threshold regions and sum bedgraph scores (assumes
    # bedgraph score in col 4); increasing the d value also merges nearby regions
    merge_regions = above_thresh.merge(d=0, c=4, o='sum')

    # filter based on length criteria
    peaks = BedTool(
        merge_regions.filter(
            lambda x: len(x) >= min_length and len(x) <= max_length))
    # print 'number of regions identified before merging or filtering: ' + str(peaks.count())

    if merge_close_peaks == True:
        # merge the bona fide peaks if they are closer than the inter-peak distance,
        # sum scores and sort
        print 'merging peaks that are closer than: ' + str(inter_peak_distance)
        merge_peaks = peaks.merge(d=inter_peak_distance, c=4, o='sum').sort()

    if keep_highest_close_peak == True:
        # need to read each line to find close peaks and throw away the one with
        # the lowest score out of the two
        print 'entering loop'
        peaks.saveas('temp_input.bed')
        print 'before keeping highest, number of regions identified: ' + str(
            BedTool('temp_input.bed').count())
        last_line = [
            str(item) for item in (BedTool('temp_input.bed').to_dataframe().tail(
                n=1).iloc[0, :].tolist())
        ]
        with open('temp_input.bed') as myfile:
            with open('test_output.bed', 'w') as output:
                file_output = csv.writer(output, delimiter='\t')
                prev_line = None
                for line in csv.reader(myfile, delimiter='\t'):
                    if prev_line is None:
                        prev_line = line
                    elif float(prev_line[2]) + float(inter_peak_distance) <= float(line[1]):
                        # features far apart, so write out the previous peak
                        file_output.writerow(prev_line)
                        prev_line = line
                    else:
                        # features are close: keep the higher-scoring of the two
                        if float(prev_line[3]) < float(line[3]):
                            prev_line = line
                # flush the final peak
                if line == last_line:
                    file_output.writerow(prev_line)
        merge_peaks = BedTool('test_output.bed')

    print 'number of peaks found: ' + str(merge_peaks.count())

    if delete_overlap_bed != None:
        print 'delete_overlap_bed provided: ' + delete_overlap_bed
        merge_peaks = merge_peaks.intersect(b=delete_overlap_bed, v=True)
        print 'number of peaks retained: ' + str(merge_peaks.count())

    if not generate_ID:
        print 'saving sorted peak bed file with no ID'
        merge_peaks.saveas(filename)

    if generate_ID:
        print 'saving sorted peak bed file with ID names'
        # change to pandas dataframe
        DF_peaks = merge_peaks.to_dataframe()
        # insert new column with id: 1 ... # of peaks
        DF_peaks.insert(3, 'id', ['id' + str(item) for item in range(1, (len(DF_peaks) + 1))])
        # save output
        DF_peaks.to_csv(filename, sep='\t', header=False, index=False)

    return 'Finished'
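# Hedged usage sketch for py_peak_calling above: call it on a bedgraph with a
# score threshold of 5; the path and parameters are assumptions.
py_peak_calling('coverage.bg', threshold=5, min_length=50,
                inter_peak_distance=100, output_name='sample_peaks.bed')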
get_gff(GFF_URL, GENOMICS_DIR) if not os.path.isfile(CPG_PATH): logging.info("Downloading CpG metadata at " + CPG_PATH) get_cpgs(CPG_URL, GENOMICS_DIR) # derive promoter gff and extract sequences from genome if os.path.isfile(PROMS_GFF_PATH): proms_bed = BedTool(PROMS_GFF_PATH) logging.info("Found proms gff at " + PROMS_GFF_PATH) else: # point to genome gff3 genome_bed = BedTool(GENOME_GFF_PATH) # filter for genes genes_bed = (genome_bed.filter(lambda x: (x[2] == 'gene') and ( x.chrom in CHROMOSOMES)).saveas(GENES_GFF_PATH)) logging.info("Extracted # genes = " + str(len(genes_bed)) + ", saved at " + GENES_GFF_PATH) # extract promoters from genes features proms_bed = genes_bed.each(func=five_prime, upstream=UPSTREAM_LENGTH, downstream=0) proms_bed, genes_data = add_geneIDs(proms_bed) with open(GENOMICS_DIR + "genes.json", 'w+') as f: json.dump(genes_data, f) proms_bed.saveas(PROMS_GFF_PATH) logging.info("Extracted # promoters = " + str(len(proms_bed)) + ", saved at " + PROMS_GFF_PATH)
def getListOfBlackZones(chrom): blackList = BedTool('../wgEncodeDacMapabilityConsensusExcludable.bed') blackListChrom = blackList.filter(lambda b: b.chrom == chrom) return [(i.start, i.end) for i in blackListChrom]
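# Hedged usage sketch for getListOfBlackZones; the relative blacklist path
# hard-coded inside the function must exist for this to run.
for start, end in getListOfBlackZones('chr1')[:5]:
    print(start, end)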
perc_intergenic_sites = [] perc_CGI_sites = [] perc_shore_sites = [] perc_shelf_sites = [] perc_promoter_CGI_sites = [] perc_promoter_non_CGI_sites = [] ## Annotation files. print('Calculating annotations ...') gencode_ann = BedTool(path_to_gencode_ann).sort() protein_coding_genes_ann = gencode_ann.filter(lambda x: x[2] == 'gene').filter( lambda x: 'gene_type "protein_coding"' in x[8]).sort() exon_protein_coding_ann = gencode_ann.filter(lambda x: x[2] == 'exon').filter( lambda x: 'gene_type "protein_coding"' in x[8]).sort() intron_protein_coding_ann = protein_coding_genes_ann.subtract( exon_protein_coding_ann, s=True).sort() non_coding_RNA_genes_ann = gencode_ann.filter(lambda x: x[2] == 'gene').filter( lambda x: ('gene_type "Mt_rRNA"' in x[8]) or ('gene_type "Mt_tRNA"' in x[8]) or ('gene_type "miRNA"' in x[8]) or ('gene_type "misc_RNA"' in x[8]) or ('gene_type "rRNA"' in x[8]) or ('gene_type "scRNA"' in x[8]) or ('gene_type "snRNA"' in x[8]) or ('gene_type "snoRNA"' in x[8]) or ('gene_type "ribozyme"' in x[8]) or ('gene_type "sRNA"' in x[8]) or ('gene_type "scaRNA"' in x[8]) or ('gene_type "lincRNA"' in x[8])).sort() intragenic_ann = protein_coding_genes_ann.slop(g=path_to_chr_lengths, b=2500).merge().sort() intergenic_ann = intragenic_ann.complement(g=path_to_chr_lengths).sort()