def processBed(inbed, col, posF, posS, delimeter, outbasename):
    logging.info("Creating BedTool object.")
    myBedTool = BedTool(inbed)
    logging.info("Retrieving feature length.")
    bedobj = myBedTool.each(addFeatureLengthToScoreCol, col, posF, posS,
                            delimeter).saveas(outbasename + ".bed")
    plotFeaturesLength(bedobj, col, posF, posS, delimeter, outbasename)
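# pybedtools' each() applies a callback to every Interval, passing any extra
# positional arguments through. A minimal sketch of what the
# addFeatureLengthToScoreCol callback used above might look like -- the real
# helper lives elsewhere in this module, and how it uses col/posF/posS/
# delimeter is an assumption:
def addFeatureLengthToScoreCol(feature, col, posF, posS, delimeter):
    # write the feature length into the score column; the real helper
    # presumably uses col, posF, posS and delimeter to splice the length
    # into a delimited field instead
    feature.score = str(feature.end - feature.start)
    return feature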
def promoter(bed, n, genome, delimiter, index, out):
    if os.path.exists(out):
        os.remove(out)
    gnm_d = process_genome(genome)
    bedobj = BedTool(bed)
    # each() is lazy; iterate to actually run the name-column check
    for _ in bedobj.each(check_name_col):
        pass
    fnames_list = [f.name.split(delimiter)[index] for f in bedobj]
    previous_g, previous_f, firstLine, obj_idx, nrecords = "", "", True, 0, len(bedobj)
    with open(out, 'w') as outfile:
        for f in bedobj:
            gname = get_feature_name(f.name, delimiter, index)
            if obj_idx == len(fnames_list) - 1:  # last record (the index checks below would fail)
                if gname != previous_g and f.strand == "+":
                    outfile.write(add_bp_positive_feature(f, n))
                elif f.strand == "+":
                    outfile.write(str(f))
                elif f.strand == "-":
                    outfile.write(add_bp_negative_feature(f, n, gnm_d))
            elif gname != previous_g and f.strand == "+":
                outfile.write(add_bp_positive_feature(f, n))
            elif gname != previous_g and obj_idx != nrecords and fnames_list[obj_idx + 1] == gname:
                # first feature of a multi-feature gene on the negative strand
                outfile.write(str(f))
            elif gname != previous_g:
                # negative-strand gene with a single feature
                outfile.write(add_bp_negative_feature(f, n, gnm_d))
            elif gname == previous_g and f.strand == "+" or obj_idx != nrecords and fnames_list[obj_idx + 1] == gname:
                # positive strand, or not the last feature of a negative-strand gene
                outfile.write(str(f))
            elif gname == previous_g and fnames_list[obj_idx + 1] != gname:
                # last feature of a negative-strand gene
                outfile.write(add_bp_negative_feature(f, n, gnm_d))
            obj_idx += 1
            previous_g = gname
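# add_bp_positive_feature and add_bp_negative_feature are defined elsewhere;
# hedged sketches of their likely shape, assuming process_genome() returns a
# dict of chromosome lengths (the clipping behavior is an assumption). Both
# return strings because promoter() writes their results directly.
def add_bp_positive_feature(f, n):
    # extend a +-strand feature n bp upstream of its start, clipping at 0
    f.start = max(0, f.start - n)
    return str(f)

def add_bp_negative_feature(f, n, gnm_d):
    # extend a --strand feature n bp past its end, clipping at the
    # chromosome length looked up in gnm_d
    f.end = min(int(gnm_d[f.chrom]), f.end + n)
    return str(f)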
def processIntronicBed(inbed, lengths, outbasename, featureType):
    logging.info("Creating BedTool object.")
    myBedTool = BedTool(inbed)
    logging.info("Writing " + featureType + " length.")
    bedobj = myBedTool.each(addIntronLengthToScoreCol).saveas(outbasename + ".bed")
    plotIntrons(bedobj, outbasename, featureType, 0)
    for l in lengths:
        smallIntObj = bedobj.each(filterByLength, l).saveas(
            outbasename + "_smaller" + str(l) + ".bed")
        plotIntrons(smallIntObj, outbasename, featureType, l)
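# Hedged sketches of the two callbacks used above (defined elsewhere in this
# module; the max-length parameter name is an assumption). each() drops
# features for which the callback returns a value evaluating to False, which
# is what lets filterByLength act as a filter.
def addIntronLengthToScoreCol(feature):
    # store the interval length in the BED score column
    feature.score = str(feature.end - feature.start)
    return feature

def filterByLength(feature, max_len):
    # keep only intervals of at most max_len bp
    if feature.end - feature.start <= max_len:
        return feature
    return False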
def get_merged_exons(genes, gtf, genome_fasta, strand):
    '''
    Get all exons from the specified genes, merging any overlapping exonic
    regions; also return their respective sequences in a dictionary object.
    '''
    gene_gtf = gtf[gtf.gene.isin(genes)]
    if len(gene_gtf) == 0:
        return pd.DataFrame(), {}

    gene_gtf = gene_gtf.drop('gene', axis=1)
    gene_strand = gene_gtf.strand.values[0]
    strand = gene_strand if strand == '' else strand

    with tempfile.NamedTemporaryFile(mode='r+') as temp_gtf:
        gene_gtf.to_csv(temp_gtf.name, index=False, header=False, sep='\t')

        # load gene GTF info, extract and merge exons
        g = BedTool(temp_gtf.name)
        exons = BedTool(subset_featuretypes(g, 'exon'))
        exons = exons.remove_invalid().sort().merge()

        exseq = exons.each(add_strand, strand)
        exseq = exseq.sequence(fi=genome_fasta, s=True)
        block_seqs = get_block_seqs(exseq)

        blocks = pd.DataFrame()
        with tempfile.NamedTemporaryFile(mode='r+') as temp_exons:
            exons.saveas(temp_exons.name)
            blocks = pd.read_csv(temp_exons, header=None, sep='\t',
                                 names=['chr', 'start', 'end'])

    if type(genes) == str:
        blocks['name'] = genes
    else:
        blocks['name'] = genes[0]  # use first gene as representative
    blocks['score'] = '.'
    blocks['strand'] = strand
    blocks['chr'] = blocks['chr'].map(str)
    blocks.start = blocks.start.map(int)
    blocks.end = blocks.end.map(int)

    # reverse numbering if the gene is on the reverse strand
    if gene_strand == '-':
        block_names = ['|' + str(i) for i in reversed(range(1, len(blocks) + 1))]
    else:
        block_names = ['|' + str(i) for i in range(1, len(blocks) + 1)]
    blocks['name'] = blocks['name'] + block_names

    return (blocks, block_seqs)
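# Hedged sketches of two helpers used above; the real definitions live
# elsewhere. subset_featuretypes assumes standard GTF column order
# (feature type in column 3).
def add_strand(feature, strand):
    # stamp the requested strand onto each merged interval so that
    # sequence(s=True) extracts strand-aware sequences
    feature.strand = strand
    return feature

def subset_featuretypes(g, featuretype):
    # keep only GTF records of the given feature type
    return g.filter(lambda f: f[2] == featuretype).saveas()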
# first flank the introns by $flank either side
def flank_introns(feature):
    feature.start = feature.start - int(flank)
    feature.end = feature.end + int(flank)
    return feature

#if not os.path.exists(os.path.dirname(outFile)):
#    os.mkdir(os.path.dirname(outFile))
#if not outFile.endswith("csv"):
#    outFile = outFile + "_" + str(flank) + "_coverage.csv"

# flank each intron by $flank to capture exons either side
flanked = introns.each(flank_introns)

# intersect the flanked introns with the iCLIP clusters
#intersect = introns.intersect(clusters, s=True, wa=True, wb=True)
intersect = flanked.intersect(clusters, s=stranded, wa=True, wb=True)
print(type(intersect))

# store as dictionary
intervals = dict()
for feature in intersect:
    coord = str(feature.chrom) + ":" + str(feature.start) + "-" + str(feature.stop)
    length = int(feature.stop) - int(feature.start)
    strand = str(feature.strand)
    start_pos = int(feature.start)
    if coord not in intervals:
if args.debug:
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
elif args.verbose:
    logging.basicConfig(level=logging.INFO,
                        format="%(filename)s - %(levelname)s - %(message)s")
else:
    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")

logging.info("Parsed arguments:")
if args.outfile:
    logging.info("  outfile: enabled writing to file")
    logging.info("  outfile: '{}'".format(args.outfile))
logging.info("")

# data processing
alns = BedTool(args.infile)

# select either from 5'- or 3'-end
if args.threeprime:
    clnts = alns.each(three_prime, upstream=0, downstream=1)
else:
    clnts = alns.each(five_prime, upstream=1, downstream=0)

# write to file or to stdout
if args.outfile:
    clnts.saveas(args.outfile)
else:
    tmptool = clnts.saveas()
    logging.debug("results written to temporary file: " + tmptool.fn)
    with open(tmptool.fn) as tmp:
        for line in tmp:
            stdout.write(line)
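# five_prime and three_prime collapse each alignment to a 1-nt window at the
# chosen end (here: one nt upstream of the 5' end, or one nt downstream of
# the 3' end -- the usual iCLIP crosslink convention). pybedtools ships
# similar helpers in pybedtools.featurefuncs; if this script defines its own,
# they plausibly look like this strand-aware sketch (half-open BED
# coordinates assumed):
def five_prime(feature, upstream=1, downstream=0):
    # window covering `upstream` bp before and `downstream` bp after the
    # 5'-most position of the feature
    if feature.strand == "-":
        feature.start, feature.end = feature.end - downstream, feature.end + upstream
    else:
        feature.start, feature.end = max(0, feature.start - upstream), feature.start + downstream
    return feature

def three_prime(feature, upstream=0, downstream=1):
    # same idea, anchored on the 3'-most position
    if feature.strand == "-":
        feature.start, feature.end = max(0, feature.start - downstream), feature.start + upstream
    else:
        feature.start, feature.end = feature.end - upstream, feature.end + downstream
    return feature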
        feature.strand = "+"
        return feature
    elif feature.strand == "+":
        feature.end = feature.start
        feature.start = feature.start - 10
        # switch strand
        feature.strand = "-"
        return feature
    else:
        print("feature must be + or -")
        return feature

print("getting flanking coordinates")
downstream = clusters.each(get_downstream_sequence)

# get fasta sequence for each coordinate
print("retrieving genomic sequences")
fasta = downstream.sequence(fi=reference, s=True, tab=True)

# apply tests: 1. is there a stretch of 6*A, or are there >= 7 A in any order?
print("testing each cluster for A-rich sequence")
# random_primers = []
# test = []
# with open(fasta.seqfn, "r") as f:
#     for line in f:
#         seq = line.strip().upper().split("\t")[1]
#         if "AAAAAA" in seq or seq.count("A") >= 7:
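# A hedged completion of the commented-out test above: the list names and the
# A-rich condition come straight from the comments, while sorting clusters
# into the two lists is an assumption.
random_primers = []
test = []
with open(fasta.seqfn, "r") as f:
    for line in f:
        name, seq = line.strip().split("\t")
        seq = seq.upper()
        # a run of six A's, or at least seven A's in any order, marks the
        # cluster as a likely internal-priming (A-rich) artifact
        if "AAAAAA" in seq or seq.count("A") >= 7:
            random_primers.append(name)
        else:
            test.append(name)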
core_reader = reader(core_tabseq, delimiter="\t")
fdown_reader = reader(fdown_tabseq, delimiter="\t")
for fup, core, fdown in izip(fup_reader, core_reader, fdown_reader):
    assert fup[0] == core[0] == fdown[0], \
        "Error: sequence ids of cores and flanks don't match."
    # set up fasta headers and sequences
    fa_header = ">" + core[0]
    seq_viewpoint = fup[1].lower() + core[1].upper() + fdown[1].lower()
    # seq_normal = fup[1].upper() + core[1].upper() + fdown[1].upper()
    viewpointfa.write(fa_header + "\n")
    viewpointfa.write(seq_viewpoint + "\n")
viewpointfa.close()

# prepare input coordinates
bsites = BedTool(args.bsites_fn).sort().saveas()
centers = bsites.each(midpoint).saveas()

# prepare positive instances
logging.info("preparing positive instances")
if args.chromosome_limits:
    logging.debug("using chromosome_limits " + args.chromosome_limits)
    cores = centers.slop(
        s=True,
        l=int(args.core_length / 2),
        # -1 to account for the center nucleotide!
        r=int(args.core_length / 2) + (args.core_length % 2) - 1,
        g=args.chromosome_limits).each(offset_zero_by_one).saveas(pos_core_bed_fn)
else:
    cores = centers.slop(
        s=True,
        l=int(args.core_length / 2),
        # -1 to account for the center nucleotide!
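# midpoint and offset_zero_by_one are defined elsewhere in this script (a
# midpoint also ships in pybedtools.featurefuncs); minimal sketches of what
# they plausibly do -- the off-by-one convention is an assumption:
def midpoint(feature):
    # collapse the interval to its single central base
    mid = (feature.start + feature.end) // 2
    feature.start = mid
    feature.end = mid + 1
    return feature

def offset_zero_by_one(feature):
    # slop with a genome file can clip starts to 0; bump those to 1
    if feature.start == 0:
        feature.start = 1
    return feature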
def subset(bed, n, keep_small, out):
    print("Applying subset")
    bedobj = BedTool(bed)
    bedobj.each(apply_subset, n, keep_small).saveas(out)
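# apply_subset is not shown; purely to illustrate the callback shape (its
# real semantics are unknown), one plausible reading truncates each feature
# to n bp and uses keep_small to decide whether already-small features are
# kept. Hypothetical -- the real apply_subset may do something different.
def apply_subset(feature, n, keep_small):
    if feature.end - feature.start <= n:
        # feature already fits within n bp: keep or drop it; each() drops
        # features for which the callback returns a falsy value
        return feature if keep_small else False
    # otherwise truncate to the first n bp
    feature.end = feature.start + n
    return feature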
#!/usr/bin/env python
import os
import sys
import subprocess
sys.path.insert(0, '/mnt/lustre/home/cusanovich/Programs/lib/python2.6/site-packages/pybedtools-0.6.2-py2.6-linux-x86_64.egg/pybedtools')
from pybedtools import BedTool, featurefuncs

windowsize = 10000
windowname = str(windowsize // 1000) + 'kb'
indir = '/mnt/lustre/home/cusanovich/Kd_Arrays/GenomeAnnotations/StartAnnots/'
outdir = '/mnt/lustre/home/cusanovich/Kd_Arrays/GenomeAnnotations/FinalAnnots/'
annots = ['GSE31388_eQtlTable_cleaned.bed',
          'sorted_PritchardQTLs_merged.bed',
          'gwascatalog_ucsc_merged.bed']
#annots = ['GSE31388_eQtlTable_cleaned.bed']
jacked = BedTool('/mnt/lustre/home/cusanovich/centipede/hg19_jack_centipede_sorted_pwms_clean.bed')
tss = BedTool('/mnt/lustre/home/cusanovich/Kd_Arrays/Centipede/Annotation/HT12ensemblTSScombinedsorted.bed')

for annot in annots:
    print(annot)
    currannot = BedTool(indir + annot)
    currout = annot.split('.')[0]
    print('Intersecting...')
    inter = jacked.intersect(currannot, wa=True, wb=True).moveto(
        outdir + windowname + '_' + currout + '_centipede_intersect.bed')
    inter = BedTool(outdir + windowname + '_' + currout + '_centipede_intersect.bed')
    print('Calculating midpoints...')
    intermid = inter.each(featurefuncs.midpoint).moveto(
        outdir + windowname + '_' + currout + '_centipede_intersect_midpoint.bed')
    print('Finding TSSs...')
    inter = BedTool(outdir + windowname + '_' + currout + '_centipede_intersect_midpoint.bed')
    outter = tss.window(intermid, w=windowsize).moveto(
        outdir + windowname + '_' + currout + '_insite.bed')
def main(use_config=True, bed1=None, bed2=None, method=None, tempdir=None,
         md=None, largewindow=None, scanner=None, debug=False, label1=None,
         label2=None, jobid=None):
    '''This is the main script of the combine function that is called within
        TFEA. Default arguments are assigned to variables within config.vars.

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables.
    bed1 : list
        A list of strings specifying full paths to bed files corresponding
        to a single condition (replicates)
    bed2 : list
        A list of strings specifying full paths to bed files corresponding
        to a single condition (replicates)
    method : str
        Method for combining input bed files into a single bed file
    tempdir : str
        Full path to a directory where files will be saved
    md : boolean
        Whether md-score bed files are generated
    largewindow : int
        Half-length of window size to use when generating md-score related
        bed files
    scanner : str
        Scanner method to use in SCANNER module. Only needed if md also
        specified. If equal to 'genome hits', md bed files generated will
        only contain one base and be centered at the middle of the region

    Returns
    -------
    None - Assigns variables within config if use_config is set to True

    Raises
    ------
    FileEmptyError
        If any resulting file is empty
    '''
    start_time = time.time()
    md_bedfile1 = md_bedfile2 = None
    if use_config:
        bed1 = config.vars['BED1']
        bed2 = config.vars['BED2']
        method = config.vars['COMBINE']
        tempdir = config.vars['TEMPDIR']
        md = config.vars['MD']
        md_bedfile1 = config.vars['MD_BEDFILE1']
        md_bedfile2 = config.vars['MD_BEDFILE2']
        largewindow = config.vars['LARGEWINDOW']
        scanner = config.vars['SCANNER']
        label1 = config.vars['LABEL1']
        label2 = config.vars['LABEL2']
        debug = config.vars['DEBUG']
        jobid = config.vars['JOBID']
    print("Combining Regions...", end=' ', flush=True, file=sys.stderr)

    if md_bedfile1 and md_bedfile2:
        centered_md_bedfile1 = tempdir / 'md_bedfile1.centered.bed'
        centered_md_bedfile2 = tempdir / 'md_bedfile2.centered.bed'
        # Boolean to determine whether to generate MD bed files
        md = md and (not md_bedfile1 or not md_bedfile2)
        md_pybedtool1 = BedTool(str(md_bedfile1))
        md_pybedtool1.each(center_feature).each(
            extend_feature, size=largewindow).remove_invalid().saveas(centered_md_bedfile1)
        md_pybedtool2 = BedTool(str(md_bedfile2))
        md_pybedtool2.each(center_feature).each(
            extend_feature, size=largewindow).remove_invalid().saveas(centered_md_bedfile2)
        if use_config:
            config.vars['MD_BEDFILE1'] = centered_md_bedfile1
            config.vars['MD_BEDFILE2'] = centered_md_bedfile2

    # Use MuMerge to merge bed files
    if method == 'mumerge':
        mumerge_input = tempdir / 'mumerge_input.txt'
        combined_file = tempdir / 'combined_file.mumerge'
        # Write MuMerge input file
        # with open(mumerge_input, 'w') as F:
        #     F.write("#file\tsampid\tgroup\n")
        #     for i, bedpath in enumerate(bed1, 1):
        #         F.write(f'{bedpath}\t{label1}{i}\t{label1}\n')
        #     for i, bedpath in enumerate(bed2, 1):
        #         F.write(f'{bedpath}\t{label2}{i}\t{label2}\n')

        # MuMerge command - output to combined_file.mumerge.bed
        combined_file = mumerge(mumerge_input, combined_file, bed1=bed1,
                                bed2=bed2, label1=label1, label2=label2)
        clean_combined_file = tempdir / 'combined_file.mumerge.clean.bed'
        combined_pybedtool = BedTool(str(combined_file))
        combined_pybedtool.remove_invalid().saveas(clean_combined_file)
        combined_file = clean_combined_file
        # combined_file = Path(str(combined_file) + '_MUMERGE.bed')

        # Perform simple merge same as merge all for md bed files
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.mumerge"
            md_mumerge_input1 = tempdir / "md_mumerge_input1.txt"
            md_bedfile1 = mumerge(md_mumerge_input1, md_bedfile1, bed1=bed1,
                                  label1=label1, label2=label2)
            md_pybedtool1 = BedTool(str(md_bedfile1))
            md_bedfile1 = tempdir / "md_bedfile1.mumerge.final.bed"
            md_pybedtool1.each(center_feature).each(
                extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile1)

            md_bedfile2 = tempdir / "md_bedfile2.mumerge"
            md_mumerge_input2 = tempdir / "md_mumerge_input2.txt"
            md_bedfile2 = mumerge(md_mumerge_input2, md_bedfile2, bed2=bed2,
                                  label1=label1, label2=label2)
            md_pybedtool2 = BedTool(str(md_bedfile2))
            md_bedfile2 = tempdir / "md_bedfile2.mumerge.final.bed"
            md_pybedtool2.each(center_feature).each(
                extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile2)
            # md_merged_bed1 = merge_bed(beds=bed1).each(featurefuncs.extend_fields, 4)
            # md_merged_bed2 = merge_bed(beds=bed2).each(featurefuncs.extend_fields, 4)
            # md_merged_bed1.each(center_feature).each(extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile1)
            # md_merged_bed2.each(center_feature).each(extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile2)

    # Merge all bed regions, for MD merge condition replicates
    elif method == 'mergeall':
        combined_file = tempdir / "combined_file.mergeall.bed"
        merged_bed = merge_bed(beds=bed1 + bed2)
        # merged_bed.each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        merged_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.merge.bed"
            md_bedfile2 = tempdir / "md_bedfile2.merge.bed"
            # md_merged_bed1 = merge_bed(beds=bed1).each(featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            # md_merged_bed2 = merge_bed(beds=bed2).each(featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            md_merged_bed1 = merge_bed(beds=bed1).each(featurefuncs.extend_fields, 4)
            md_merged_bed2 = merge_bed(beds=bed2).each(featurefuncs.extend_fields, 4)
            md_merged_bed1.each(center_feature).each(
                extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile1)
            # md_merged_bed1.saveas(md_bedfile1)
            md_merged_bed2.each(center_feature).each(
                extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile2)
            # md_merged_bed2.saveas(md_bedfile2)

    elif method == 'tfitclean':
        # combined_file = tfit_clean(beds=bed1+bed2, tempdir=tempdir)
        combined_file = tempdir / "combined_file.tfitclean.bed"
        size_cut = 200
        cleaned_bed = clean_bed(beds=bed1 + bed2, size_cut=size_cut)
        # cleaned_bed.each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        cleaned_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.clean.bed"
            md_bedfile2 = tempdir / "md_bedfile2.clean.bed"
            md_cleaned_bed1 = clean_bed(beds=bed1)
            md_cleaned_bed2 = clean_bed(beds=bed2)
            # md_cleaned_bed1.each(center_feature).each(extend_feature, size=largewindow).saveas(md_bedfile1)
            md_cleaned_bed1.saveas(md_bedfile1)
            # md_cleaned_bed2.each(center_feature).each(extend_feature, size=largewindow).saveas(md_bedfile2)
            md_cleaned_bed2.saveas(md_bedfile2)

    # Intersect all bed regions, for MD intersect condition replicates
    elif method == 'intersectall':
        combined_file = tempdir / 'combined_file.intersectall.bed'
        intersected_bed = intersect_bed(beds=bed1 + bed2)
        # intersected_bed.each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        intersected_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.intersect.bed"
            md_bedfile2 = tempdir / "md_bedfile2.intersect.bed"
            md_intersected_bed1 = intersect_bed(beds=bed1)
            md_intersected_bed2 = intersect_bed(beds=bed2)
            md_intersected_bed1.each(center_feature).each(
                extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile1)
            md_intersected_bed2.each(center_feature).each(
                extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile2)

    # Merge all regions, filter small regions. For MD perform this for each condition
    elif method == 'tfitremovesmall':
        # combined_file = tfit_remove_small(beds=bed1+bed2, tempdir=tempdir)
        size_cut = 200
        combined_file = tempdir / "combined_file.mergeallnosmall.bed"
        merged_bed = merge_bed(beds=bed1 + bed2)
        # merged_bed.filter(lambda b: b.stop - b.start > size_cut).each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        merged_bed.filter(lambda b: b.stop - b.start > size_cut).saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.merge.bed"
            md_bedfile2 = tempdir / "md_bedfile2.merge.bed"
            md_merged_bed1 = merge_bed(beds=bed1)
            md_merged_bed2 = merge_bed(beds=bed2)
            # md_merged_bed1.filter(lambda b: b.stop - b.start > size_cut).each(center_feature).each(extend_feature, size=largewindow).saveas(md_bedfile1)
            md_merged_bed1.filter(lambda b: b.stop - b.start > size_cut).saveas(md_bedfile1)
            # md_merged_bed2.filter(lambda b: b.stop - b.start > size_cut).each(center_feature).each(extend_feature, size=largewindow).saveas(md_bedfile2)
            md_merged_bed2.filter(lambda b: b.stop - b.start > size_cut).saveas(md_bedfile2)

    # Intersect replicates, merge conditions. For MD intersect condition replicates
    elif method == 'intersect/merge':
        # combined_file = intersect_merge_bed(bed1=bed1, bed2=bed2, tempdir=tempdir)
        combined_file = tempdir / 'combined_file.intermerge.bed'
        intersected_bed1 = intersect_bed(beds=bed1)
        intersected_bed2 = intersect_bed(beds=bed2)
        merged_bed = intersected_bed1.cat(intersected_bed2).merge().sort()
        # merged_bed.each(center_feature).each(extend_feature, size=largewindow).saveas(combined_file)
        merged_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.intersect.bed"
            md_bedfile2 = tempdir / "md_bedfile2.intersect.bed"
            md_intersected_bed1 = intersect_bed(beds=bed1).each(
                featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            md_intersected_bed2 = intersect_bed(beds=bed2).each(
                featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            md_intersected_bed1.each(center_feature).each(
                extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile1)
            # md_intersected_bed1.saveas(md_bedfile1)
            md_intersected_bed2.each(center_feature).each(
                extend_feature, size=largewindow).remove_invalid().saveas(md_bedfile2)
            # md_intersected_bed2.saveas(md_bedfile2)
    else:
        raise exceptions.InputError("Error: COMBINE option not recognized.")

    # Check to make sure no files are empty
    if os.stat(combined_file).st_size == 0:
        raise exceptions.FileEmptyError(
            "Error in COMBINE module. Resulting bed file is empty.")
    if md:
        if os.stat(md_bedfile1).st_size == 0 or os.stat(md_bedfile2).st_size == 0:
            raise exceptions.FileEmptyError(
                "Error in COMBINE module. Resulting md bed file is empty.")
        if use_config:
            # Assign MD_BEDFILE variables in config
            config.vars['MD_BEDFILE1'] = md_bedfile1
            config.vars['MD_BEDFILE2'] = md_bedfile2

    # Assign COMBINED_FILE variable in config
    if use_config:
        config.vars['COMBINED_FILE'] = combined_file

    # Record time, print
    total_time = time.time() - start_time
    if use_config:
        config.vars['COMBINEtime'] = total_time
    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          ". Processing", len(combined_file.read_text().split('\n')),
          "regions", file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)
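# center_feature and extend_feature are TFEA helpers not shown here; judging
# by the docstring (md bed files centered at the middle of each region, with
# half-length largewindow), they plausibly look like these sketches:
def center_feature(feature):
    # collapse the region to its single center base
    center = (feature.start + feature.stop) // 2
    feature.start = center
    feature.stop = center + 1
    return feature

def extend_feature(feature, size=0):
    # pad the region by `size` bp on each side (clipping at 0); applied
    # after center_feature this yields windows of about 2*size bp
    feature.start = max(0, feature.start - size)
    feature.stop = feature.stop + size
    return feature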
def newber(binner, factor):
    """Return a bed record with all binding for a factor merged and renamed."""
    newbie = BedTool(binner).sort().merge(nms=True).each(featurefuncs.midpoint)
    newbie = newbie.each(featurefuncs.rename, factor)
    return newbie
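# Usage sketch (file names and factor label are hypothetical): merge all
# peaks for one factor, collapse each merged region to its midpoint, and
# rename every record to the factor label.
ctcf_midpoints = newber("ctcf_peaks.bed", "CTCF")
ctcf_midpoints.saveas("ctcf_midpoints.bed")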