def test_call():
    """Check that failed BEDTools invocations raise the expected exceptions.

    Two failure modes are exercised: an unknown program name handed to
    ``call_bedtools`` and a BEDTools path that does not exist.  The default
    path is restored even when an assertion fails, so a failure here cannot
    leak the bogus path into later tests.
    """
    tmp = os.path.join(pybedtools.get_tempdir(), 'test.output')
    from pybedtools.helpers import call_bedtools, BEDToolsError
    assert_raises(BEDToolsError, call_bedtools, *(['intersectBe'], tmp))

    # NOTE: an earlier check,
    #   assert_raises(BEDToolsError, a.intersect, a=a.fn, b=a.fn, z=True)
    # was disabled.  The stderr redirection that existed only to silence it
    # opened a file that was never closed, so it has been dropped as well.

    pybedtools.set_bedtools_path('nonexistent')
    try:
        a = pybedtools.example_bedtool('a.bed')
        assert_raises(OSError, a.intersect, a)
    finally:
        # Always fall back to the default path, pass or fail.
        pybedtools.set_bedtools_path()
    assert a.intersect(a, u=True) == a
def test_call():
    """Check that failed BEDTools invocations raise the expected exceptions.

    Two failure modes are exercised: an unknown program name handed to
    ``call_bedtools`` and a BEDTools path that does not exist.  The default
    path is restored even when an assertion fails, so a failure here cannot
    leak the bogus path into later tests.
    """
    tmp = os.path.join(pybedtools.get_tempdir(), "test.output")
    from pybedtools.helpers import call_bedtools, BEDToolsError

    with pytest.raises(BEDToolsError):
        call_bedtools(*(["intersectBe"], tmp))

    # NOTE: the former stderr redirection guarded no statement at all and
    # opened a file that was never closed, so it has been removed.

    pybedtools.set_bedtools_path("nonexistent")
    try:
        a = pybedtools.example_bedtool("a.bed")
        with pytest.raises(NotImplementedError):
            a.intersect(a)
    finally:
        # Always fall back to the default path, pass or fail.
        pybedtools.set_bedtools_path()

    a = pybedtools.example_bedtool("a.bed")
    assert a.intersect(a, u=True) == a
def __init__(self, arg, log, *array, **dicts):
    """Store the run configuration and derive the working directories.

    Parameters:
        arg: options object.  It is mutated in place: ``scriptdir``,
            ``datadir``, ``Bam``, ``Fetch``, ``Search``, ``Merge``,
            ``Region``, ``Cheak``, ``CNV`` and ``Pipe`` are set on it.
            It must already provide ``indir``, ``outdir``, the ``*dir``
            sub-directory names, ``commands``, ``pipeline`` and ``bedtools``.
        log: logger-like object, stored as-is.
        *array: extra positional arguments, stored as ``self.array``.
        **dicts: extra keyword arguments, stored as ``self.dicts``.

    Side effects:
        Sets ``NUMEXPR_MAX_THREADS`` and extends ``PATH`` in ``os.environ``,
        reloads the ``bt`` module and points it at ``arg.bedtools``.
    """
    self.arg = arg
    self.log = log
    self.array = array
    self.dicts = dicts

    self.arg.scriptdir = os.path.dirname(os.path.realpath(__file__))
    self.arg.datadir = self.arg.scriptdir + '/../Data/'

    # Input BAMs live under indir; every other stage writes under outdir.
    self.arg.Bam = '%s/%s' % (self.arg.indir, self.arg.bamdir)
    self.arg.Fetch = '%s/%s' % (self.arg.outdir, self.arg.fetchdir)
    self.arg.Search = '%s/%s' % (self.arg.outdir, self.arg.searchdir)
    self.arg.Merge = '%s/%s' % (self.arg.outdir, self.arg.mergedir)
    self.arg.Region = '%s/%s' % (self.arg.outdir, self.arg.regiondir)
    # 'Cheak' (sic) is kept: the attribute name may be read elsewhere.
    self.arg.Cheak = '%s/%s' % (self.arg.outdir, self.arg.checkdir)
    self.arg.CNV = '%s/%s' % (self.arg.outdir, self.arg.cnvdir)

    # 'Auto' means "run the configured pipeline"; anything else is taken
    # literally as the command to run.
    if self.arg.commands == 'Auto':
        self.arg.Pipe = self.arg.pipeline
    else:
        self.arg.Pipe = self.arg.commands

    os.environ['NUMEXPR_MAX_THREADS'] = '1000'

    # Append the bedtools directory to PATH exactly once, tolerating an
    # unset PATH and using the platform's separator.
    current_path = os.environ.get('PATH', '')
    path_entries = current_path.split(os.pathsep) if current_path else []
    if self.arg.bedtools not in path_entries:
        path_entries.append(self.arg.bedtools)
        os.environ['PATH'] = os.pathsep.join(path_entries)

    # NOTE(review): presumably the reload lets `bt` pick up the updated
    # PATH before the explicit path is set -- confirm against `bt`.
    importlib.reload(bt)
    bt.set_bedtools_path(self.arg.bedtools)
def gcbias_lite(coveragefile, bedfilename, reference, fileout, graphtitle=None, executiongranted=None, status=None, bedTools=False):
    """************************************************************************************************************************************************************
    Task: draws coverage as a function of gc content. IMPROVED VERSION of gcbias that avoids the use of bedtools (pybedtools)
    Input:
        coveragefile: string containing the full path of the bam.coverage file to analyze. This file has been built according to 1-base format
        bedfilename: target file -> assumes original-standard bed file
        reference: fasta file with reference genome
        fileout: string containing the full path of the image file where the resulting figure will be saved.
        graphtitle: title of the figure. Titles longer than 25 characters are truncated and '...' is appended. If None, no title is drawn.
        executiongranted: optional object exposing acquire()/release(). It is acquired on entry and always released on exit,
            including when an exception is raised or the process exits.
        status: optional object whose 'value' attribute is set to True when the mean GC content lies within [45,55], False otherwise.
        bedTools: whether pybedtools are used instead of the own method
    Output: an image file will be created named "fileout" where a graph that compares gc content and mean coverage will be saved.
        If the coverage file yields at most one region only a warning is printed. With the own method, the process exits with
        status 1 when no GC content value can be calculated.
    ************************************************************************************************************************************************************"""

    if executiongranted is not None:
        executiongranted.acquire()

    try:
        coverage = region_coverage(coveragefile)  # Calculate mean coverage per region

        if len(coverage) <= 1:
            print('WARNING: only one region found in the bed file. Skipping GC bias calculation.')
            return

        if not bedTools:  # Own method
            # Chromosomes to be examined (the ones contained in the coverage keys)
            chromosomes = set(key[0] for key in coverage)

            # Load BED file -> since coverage information is in 1-base format, BED format must be transformed to 1-base
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1!!! This generates a BED file in base 1 (Non-standard BED)
            finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1 (Non-standard BED)
            finalBed.load_custom(-1)  # Load chromosome and positions in base 1

            # Load FASTA file
            storeSequence = False
            sequenceChunks = []  # Pieces of the current chromosome; joined once per chromosome
            currentChromosome = ''
            gccontent = {}

            with open(reference, 'r') as fastaFile:
                for line in fastaFile:
                    if line.startswith('>'):  # New chromosome starts
                        if storeSequence:  # A wanted chromosome has been fully read -> measure it
                            gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))
                            storeSequence = False

                        # Header format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                        currentChromosome = re.split(' +', line)[0]
                        currentChromosome = currentChromosome.split('>')[1].strip()

                        if currentChromosome in chromosomes:
                            storeSequence = True
                            sequenceChunks = []
                    elif storeSequence and '>' not in line:
                        sequenceChunks.append(line.rstrip())  # Drop the trailing newline

            if storeSequence:  # For the last chromosome
                gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))

            region_ids = list(coverage)

            if len(gccontent) == 0:
                print('ERROR: G+C content values can not be calculated. Probably the provided reference file ' + reference + ' does not match with ')
                print(' the target file ' + bedfilename + '. That is, sequences of regions in the target file are probably not included within the')
                print(' reference file.')
                sys.exit(1)
        else:
            print('Calculating nt content by means of pybedtools...')
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # base one!!!
            finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1
            bedfd = pybedtools.BedTool(finalBed.filename)
            bedfd = bedfd.remove_invalid()  # Remove negative coordinates or features with length=0, which do not work with bedtools
            pybedtools._bedtools_installed = True
            pybedtools.set_bedtools_path(BEDTOOLSPATH)
            ntcontent = bedfd.nucleotide_content(reference)

            # Each entry in ntcontent is parsed to extract the gc content of each exon
            # gccontent keys in dictionary: chromosome, exon init, exon end
            gccontent = {}
            for entry in ntcontent:
                key = (entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))
                gccontent[key] = float(entry.fields[-8]) * 100
            print(' Done.')

            # Pybedtools does not work with regions with zero length -> remove them (there are a few of them)
            region_ids = [key for key in coverage if key[1] != key[2]]

        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # NOTE(review): the grid spans [xmin, xmax] but the image extent ends at 100 on the x axis -- confirm this is intended.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')

        # graphtitle defaults to None: only measure/draw it when one was given
        if graphtitle is not None:
            if len(graphtitle) > 25:
                ax.set_title(graphtitle[:25] + '...')
            else:
                ax.set_title(graphtitle)

        fig.savefig(fileout)
        matplotlib.pyplot.close(fig)

        if status is not None:
            meanvalue = gccontentarray.mean()
            status.value = (meanvalue >= 45 and meanvalue <= 55)
    finally:
        # Release the execution slot no matter how the function is left
        if executiongranted is not None:
            executiongranted.release()
def gcbias(filelist, fileoutlist, bedfilelist):
    """************************************************************************************************************************************************************
    Task: draws coverage as a function of gc content
    Input:
        filelist: list of strings, each containing the full path of the bam file to analyze.
        fileoutlist: list of strings, each containing the full path of the png file where the corresponding figure will be saved.
        bedfilelist: list of strings, each containing the path of the bed file used for the bam file at the same position in filelist.
    Output: one image file per bam will be created, named after the matching entry in fileoutlist, where a graph that compares gc content
        and mean coverage will be saved.
    ************************************************************************************************************************************************************"""

    pid = str(os.getpid())

    numpy.random.seed(1)

    bamlist = []

    # Load each bam, creating its index first when missing (needed for pysam use)
    for filename in filelist:
        if not os.path.isfile(filename + '.bai'):
            print('Creating index for ' + filename)
            pysam.index(filename)
            print(' Done.')

        bamlist.append(bam_file.bam_file(filename))

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()

    print('The smaller bam is ' + filelist[sizes.argmin()] + ' and contains ' + str(minsize) + ' reads.')

    # Process each file and store counting results
    for i, bamfile in enumerate(bamlist):
        print('Processing ' + bamfile.filename)
        print('Results will be written at ' + fileoutlist[i])

        # Check whether normalization should be run
        if normalize:
            normalizedbam = bamfile.normalize(minsize)
        else:
            normalizedbam = bamfile

        coveragefile = TMP + '/' + pid + '.coverage'
        print('Calculating coverage per position...')
        # NOTE(review): file names are interpolated unquoted into a command string; if `run` goes through a shell,
        # paths containing spaces or shell metacharacters will break or be interpreted -- confirm against `run`.
        run(BEDTOOLSPATH + 'coverageBed -d -abam ' + normalizedbam.filename + ' -b ' + bedfilelist[i] + ' > ' + coveragefile)

        coverage = region_coverage(coveragefile)

        print('Calculating nt content...')
        bedfd = pybedtools.BedTool(bedfilelist[i])
        pybedtools._bedtools_installed = True
        pybedtools.set_bedtools_path(BEDTOOLSPATH)
        ntcontent = bedfd.nucleotide_content(REF)

        # Each entry in ntcontent is parsed to extract the gc content of each exon
        gccontent = {}
        for entry in ntcontent:
            key = (entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))
            gccontent[key] = float(entry.fields[-8]) * 100
        print(' Done.')

        region_ids = list(coverage)
        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        # Only one figure is created per bam; an extra, never-closed 13x6 figure used to be opened here on every iteration.
        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # NOTE(review): the grid spans [xmin, xmax] but the image extent ends at 100 on the x axis -- confirm this is intended.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')
        fig.savefig(fileoutlist[i])
        matplotlib.pyplot.close(fig)

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
def gcbias_lite(coveragefile, bedfilename, reference, fileout, graphtitle=None, executiongranted=None, status=None, bedTools=False):
    """************************************************************************************************************************************************************
    Task: draws coverage as a function of gc content. IMPROVED VERSION of gcbias that avoids the use of bedtools (pybedtools)
    Input:
        coveragefile: string containing the full path of the bam.coverage file to analyze. This file has been built according to 1-base format
        bedfilename: target file -> assumes original-standard bed file
        reference: fasta file with reference genome
        fileout: string containing the full path of the image file where the resulting figure will be saved.
        graphtitle: title of the figure. Titles longer than 25 characters are truncated and '...' is appended. If None, no title is drawn.
        executiongranted: optional object exposing acquire()/release(). It is acquired on entry and always released on exit,
            including when an exception is raised or the process exits.
        status: optional object whose 'value' attribute is set to True when the mean GC content lies within [45,55], False otherwise.
        bedTools: whether pybedtools are used instead of the own method
    Output: an image file will be created named "fileout" where a graph that compares gc content and mean coverage will be saved.
        If the coverage file yields at most one region only a warning is printed. With the own method, the process exits with
        status 1 when no GC content value can be calculated.
    ************************************************************************************************************************************************************"""

    if executiongranted is not None:
        executiongranted.acquire()

    try:
        coverage = region_coverage(coveragefile)  # Calculate mean coverage per region

        if len(coverage) <= 1:
            print('WARNING: only one region found in the bed file. Skipping GC bias calculation.')
            return

        if not bedTools:  # Own method
            # Chromosomes to be examined (the ones contained in the coverage keys)
            chromosomes = set(key[0] for key in coverage)

            # Load BED file -> since coverage information is in 1-base format, BED format must be transformed to 1-base
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1!!! This generates a BED file in base 1 (Non-standard BED)
            finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1 (Non-standard BED)
            finalBed.load_custom(-1)  # Load chromosome and positions in base 1

            # Load FASTA file
            storeSequence = False
            sequenceChunks = []  # Pieces of the current chromosome; joined once per chromosome
            currentChromosome = ''
            gccontent = {}

            with open(reference, 'r') as fastaFile:
                for line in fastaFile:
                    if line.startswith('>'):  # New chromosome starts
                        if storeSequence:  # A wanted chromosome has been fully read -> measure it
                            gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))
                            storeSequence = False

                        # Header format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                        currentChromosome = re.split(' +', line)[0]
                        currentChromosome = currentChromosome.split('>')[1].strip()

                        if currentChromosome in chromosomes:
                            storeSequence = True
                            sequenceChunks = []
                    elif storeSequence and '>' not in line:
                        sequenceChunks.append(line.rstrip())  # Drop the trailing newline

            if storeSequence:  # For the last chromosome
                gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))

            region_ids = list(coverage)

            if len(gccontent) == 0:
                print('ERROR: G+C content values can not be calculated. Probably the provided reference file ' + reference + ' does not match with ')
                print(' the target file ' + bedfilename + '. That is, sequences of regions in the target file are probably not included within the')
                print(' reference file.')
                sys.exit(1)
        else:
            print('Calculating nt content by means of pybedtools...')
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # base one!!!
            finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1
            bedfd = pybedtools.BedTool(finalBed.filename)
            bedfd = bedfd.remove_invalid()  # Remove negative coordinates or features with length=0, which do not work with bedtools
            pybedtools._bedtools_installed = True
            pybedtools.set_bedtools_path(BEDTOOLSPATH)
            ntcontent = bedfd.nucleotide_content(reference)

            # Each entry in ntcontent is parsed to extract the gc content of each exon
            # gccontent keys in dictionary: chromosome, exon init, exon end
            gccontent = {}
            for entry in ntcontent:
                key = (entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))
                gccontent[key] = float(entry.fields[-8]) * 100
            print(' Done.')

            # Pybedtools does not work with regions with zero length -> remove them (there are a few of them)
            region_ids = [key for key in coverage if key[1] != key[2]]

        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # NOTE(review): the grid spans [xmin, xmax] but the image extent ends at 100 on the x axis -- confirm this is intended.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')

        # graphtitle defaults to None: only measure/draw it when one was given
        if graphtitle is not None:
            if len(graphtitle) > 25:
                ax.set_title(graphtitle[:25] + '...')
            else:
                ax.set_title(graphtitle)

        fig.savefig(fileout)
        matplotlib.pyplot.close(fig)

        if status is not None:
            meanvalue = gccontentarray.mean()
            status.value = (meanvalue >= 45 and meanvalue <= 55)
    finally:
        # Release the execution slot no matter how the function is left
        if executiongranted is not None:
            executiongranted.release()
def gcbias(filelist, fileoutlist, bedfilelist):
    """************************************************************************************************************************************************************
    Task: draws coverage as a function of gc content
    Input:
        filelist: list of strings, each containing the full path of the bam file to analyze.
        fileoutlist: list of strings, each containing the full path of the png file where the corresponding figure will be saved.
        bedfilelist: list of strings, each containing the path of the bed file used for the bam file at the same position in filelist.
    Output: one image file per bam will be created, named after the matching entry in fileoutlist, where a graph that compares gc content
        and mean coverage will be saved.
    ************************************************************************************************************************************************************"""

    pid = str(os.getpid())

    numpy.random.seed(1)

    bamlist = []

    # Load each bam, creating its index first when missing (needed for pysam use)
    for filename in filelist:
        if not os.path.isfile(filename + '.bai'):
            print('Creating index for ' + filename)
            pysam.index(filename)
            print(' Done.')

        bamlist.append(bam_file.bam_file(filename))

    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()

    print('The smaller bam is ' + filelist[sizes.argmin()] + ' and contains ' + str(minsize) + ' reads.')

    # Process each file and store counting results
    for i, bamfile in enumerate(bamlist):
        print('Processing ' + bamfile.filename)
        print('Results will be written at ' + fileoutlist[i])

        # Check whether normalization should be run
        if normalize:
            normalizedbam = bamfile.normalize(minsize)
        else:
            normalizedbam = bamfile

        coveragefile = TMP + '/' + pid + '.coverage'
        print('Calculating coverage per position...')
        # NOTE(review): file names are interpolated unquoted into a command string; if `run` goes through a shell,
        # paths containing spaces or shell metacharacters will break or be interpreted -- confirm against `run`.
        run(BEDTOOLSPATH + 'coverageBed -d -abam ' + normalizedbam.filename + ' -b ' + bedfilelist[i] + ' > ' + coveragefile)

        coverage = region_coverage(coveragefile)

        print('Calculating nt content...')
        bedfd = pybedtools.BedTool(bedfilelist[i])
        pybedtools._bedtools_installed = True
        pybedtools.set_bedtools_path(BEDTOOLSPATH)
        ntcontent = bedfd.nucleotide_content(REF)

        # Each entry in ntcontent is parsed to extract the gc content of each exon
        gccontent = {}
        for entry in ntcontent:
            key = (entry.fields[0], int(entry.fields[1]), int(entry.fields[2]))
            gccontent[key] = float(entry.fields[-8]) * 100
        print(' Done.')

        region_ids = list(coverage)
        coveragearray = numpy.array([coverage[region] for region in region_ids])
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        # Only one figure is created per bam; an extra, never-closed 13x6 figure used to be opened here on every iteration.
        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # NOTE(review): the grid spans [xmin, xmax] but the image extent ends at 100 on the x axis -- confirm this is intended.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')
        fig.savefig(fileoutlist[i])
        matplotlib.pyplot.close(fig)

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
from pybedtools import BedTool, set_bedtools_path
from nrlbio.pybedtools_extension import doublebed2dict, generate_overlaping_intervals
from nrlbio.interaction import Interaction

# Command-line interface: one optional positional input plus the merge and output options.
parser = argparse.ArgumentParser(
    description='converts bed-like file of chimeric reads into interactions. That is merging chimeras with intersecting regions'
)
parser.add_argument(
    'path', metavar='N', nargs='?', type=str,
    help="Path to chimeras bed-like file",
)
parser.add_argument(
    '-d', '--distance', nargs='?', default=-12, type=int,
    help="minimum overlap(negative number)/maximum distance(positive number) in nucleotides to merge intervals",
)
parser.add_argument(
    '-n', '--name', nargs='?', required=True, type=str,
    help="name for interactions",
)
parser.add_argument(
    '-oi', '--interactions', nargs='?', required=True, type=str,
    help="path to output interactions file",
)
parser.add_argument(
    '-od', '--dictionary', nargs='?', required=True, type=str,
    help="path to output \"interaction to read id\" file",
)
parser.add_argument(
    '--order', nargs='?', default=False, const=True, type=int,
    help="keeps order of left to right parts in interactions",
)
parser.add_argument(
    '--bedtools', nargs='?', default='', type=str,
    help="path to a bedtools binaries",
)
args = parser.parse_args()

# Point pybedtools at the binaries given on the command line.
set_bedtools_path(path=args.bedtools)


def intervals2interaction(intervals, distance, number, order=False):
    """Build an Interaction from chimera intervals, or return None.

    With ``order`` set, each interval is placed in one of two groups chosen
    by the integer after the last "|" in its name.  Otherwise ``intervals``
    is sorted IN PLACE by (chrom, strand, start) and grouped by
    ``generate_overlaping_intervals`` using ``distance``.

    An Interaction named "<args.name>_<number>" is returned only when the
    grouping yields exactly two regions; in every other case the result is
    None.
    """
    if order:
        merged_regions = [[], []]
        for interval in intervals:
            part_index = int(interval.name.split("|")[-1])
            merged_regions[part_index].append(interval)
    else:
        intervals.sort(key=attrgetter('chrom', 'strand', 'start'))
        merged_regions = list(generate_overlaping_intervals(intervals, distance))

    if len(merged_regions) != 2:
        return None

    interaction_id = "%s_%d" % (args.name, number)
    return Interaction.from_intervals(interaction_id, merged_regions)
from operator import itemgetter from sequencing_tools.stats_tools import p_adjust from sequencing_tools.fastq_tools import reverse_complement from concensus_seq import concensus from exon_coverage import ExonFilter import pyBigWig as pbw from tblout_parser import read_tbl from sequencing_tools.bam_tools import get_strand import pyximport pyximport.install() from junction_function import get_junction import dask.dataframe as dd import pyranges as pr import io from collections import Counter set_bedtools_path('/stor/work/Lambowitz/cdw2854/src/miniconda3/bin') set_tempdir('/stor/scratch/Lambowitz/cdw2854') class GeneMapper(): def __init__(self): self.HB_genes = '''chr11,5246694,5250625,HBB,- chr11,5253908,5264887,HBD,- chr11,5269309,5271089,HBG1,- chr11,5274420,5526835,HBG2,- chr11,5289575,5526882,HBE1,- chr16,202686,204502,HBZ,+ chr16,203891,216767,HBM,+ chr14,50037702,50065882,RPS29,- chr16,222875,223709,HBA2,+