Esempio n. 1
0
def test_call():
    """Check bedtools invocation errors and bedtools path handling.

    Verifies that:
      * call_bedtools raises BEDToolsError for the program name 'intersectBe';
      * with the bedtools path set to 'nonexistent', a.intersect raises OSError;
      * after resetting the path, intersecting a file with itself (u=True)
        gives back the same features.
    """
    from pybedtools.helpers import call_bedtools, BEDToolsError

    tmp = os.path.join(pybedtools.get_tempdir(), 'test.output')
    assert_raises(BEDToolsError, call_bedtools, ['intersectBe'], tmp)

    a = pybedtools.example_bedtool('a.bed')

    # Momentarily redirect stderr to a file so error messages don't spew all
    # over the place when testing.  The sink is closed by the ``with`` block
    # and stderr is always restored, even if the body raises.
    orig_stderr = sys.stderr
    with open(a._tmp(), 'w') as stderr_sink:
        sys.stderr = stderr_sink
        try:
            # The check that needed silencing is currently disabled:
            # assert_raises(BEDToolsError, a.intersect, a=a.fn, b=a.fn, z=True)
            pass
        finally:
            sys.stderr = orig_stderr

    pybedtools.set_bedtools_path('nonexistent')
    try:
        a = pybedtools.example_bedtool('a.bed')
        assert_raises(OSError, a.intersect, a)
    finally:
        # Always restore the default path so a failure here cannot break
        # every test that runs afterwards.
        pybedtools.set_bedtools_path()
    assert a.intersect(a, u=True) == a
Esempio n. 2
0
def test_call():
    """Check bedtools invocation errors and bedtools path handling.

    Verifies that:
      * call_bedtools raises BEDToolsError for the program name "intersectBe";
      * with the bedtools path set to "nonexistent", a.intersect raises
        NotImplementedError;
      * after resetting the path, intersecting a file with itself (u=True)
        gives back the same features.
    """
    from pybedtools.helpers import call_bedtools, BEDToolsError

    tmp = os.path.join(pybedtools.get_tempdir(), "test.output")

    with pytest.raises(BEDToolsError):
        call_bedtools(["intersectBe"], tmp)

    a = pybedtools.example_bedtool("a.bed")

    # Momentarily redirect stderr to a file so error messages don't spew all
    # over the place when testing.  The sink is closed by the ``with`` block
    # and stderr is always restored, even if the body raises.
    orig_stderr = sys.stderr
    with open(a._tmp(), "w") as stderr_sink:
        sys.stderr = stderr_sink
        try:
            pass  # no noisy call is currently made while stderr is redirected
        finally:
            sys.stderr = orig_stderr

    pybedtools.set_bedtools_path("nonexistent")
    try:
        a = pybedtools.example_bedtool("a.bed")
        with pytest.raises(NotImplementedError):
            a.intersect(a)
    finally:
        # Always restore the default path so a failure here cannot break
        # every test that runs afterwards.
        pybedtools.set_bedtools_path()
    a = pybedtools.example_bedtool("a.bed")
    assert a.intersect(a, u=True) == a
Esempio n. 3
0
    def __init__(self, arg, log,  *array, **dicts):
        """Store the run options and derive the working paths from them.

        Args:
            arg: options object, mutated in place.  Its ``indir``, ``outdir``,
                ``bamdir``, ``fetchdir``, ``searchdir``, ``mergedir``,
                ``regiondir``, ``checkdir``, ``cnvdir``, ``commands``,
                ``pipeline`` and ``bedtools`` attributes are read.
            log: stored unchanged as ``self.log``.
            *array: extra positional arguments, stored as ``self.array``.
            **dicts: extra keyword arguments, stored as ``self.dicts``.

        Side effects:
            Sets ``NUMEXPR_MAX_THREADS``, appends ``arg.bedtools`` to ``PATH``
            and reloads ``bt`` before pointing it at ``arg.bedtools``.
        """
        self.arg = arg
        self.log = log
        self.array = array
        self.dicts = dicts

        self.arg.scriptdir = os.path.dirname(os.path.realpath(__file__))
        self.arg.datadir = self.arg.scriptdir + '/../Data/'

        # Input directory holding the BAM files.
        self.arg.Bam = '%s/%s' % (self.arg.indir, self.arg.bamdir)

        # Per-step output directories, all located under ``outdir``.
        # ('Cheak' is the attribute name the rest of the code base sees.)
        for target, source in (('Fetch', 'fetchdir'),
                               ('Search', 'searchdir'),
                               ('Merge', 'mergedir'),
                               ('Region', 'regiondir'),
                               ('Cheak', 'checkdir'),
                               ('CNV', 'cnvdir')):
            setattr(self.arg, target,
                    '%s/%s' % (self.arg.outdir, getattr(self.arg, source)))

        # 'Auto' means "run the configured pipeline"; anything else is taken
        # as the explicit command selection.
        if self.arg.commands == 'Auto':
            self.arg.Pipe = self.arg.pipeline
        else:
            self.arg.Pipe = self.arg.commands

        # Fixed thread cap for numexpr (the unused core-count computation
        # that used to precede this line has been dropped).
        os.environ['NUMEXPR_MAX_THREADS'] = '1000'

        # Use the platform's PATH separator instead of a hard-coded ':'.
        os.environ['PATH'] += os.pathsep + self.arg.bedtools

        # NOTE(review): bt is reloaded after PATH is extended, presumably so
        # that it picks up the bedtools binaries from the new PATH -- confirm.
        importlib.reload(bt)
        bt.set_bedtools_path(self.arg.bedtools)
Esempio n. 4
0
def gcbias_lite(coveragefile, bedfilename, reference, fileout, graphtitle=None, executiongranted=None, status=None, bedTools=False):
    """Draw mean region coverage as a function of GC content.

    Variant of gcbias that, by default, avoids bedtools (pybedtools) and
    computes the GC content by scanning the reference FASTA file.

    Input:
        coveragefile: string containing the full path of the bam.coverage file
            to analyze. This file has been built according to 1-base format.
        bedfilename: target file -> assumes original-standard bed file.
        reference: fasta file with reference genome.
        fileout: string containing the full path of the file where the
            resulting figure will be saved.
        graphtitle: optional figure title. Titles longer than 25 characters
            are truncated and suffixed with '...'. None leaves the figure
            untitled.
        executiongranted: optional lock-like object; acquire() is called on
            entry and release() is always called on exit, including when the
            function raises or exits the process.
        status: optional object whose ``value`` attribute is set to whether
            the mean GC content lies in [45, 55].
        bedTools: whether pybedtools is used instead of the own method.
    Output: a figure named "fileout" comparing GC content and mean coverage.
        Nothing is drawn when fewer than two regions have coverage. The
        process exits with status 1 if the own method yields no GC content.
    """
    if executiongranted is not None:
        executiongranted.acquire()

    try:
        coverage = region_coverage(coveragefile)  # Mean coverage per region

        if len(coverage) <= 1:
            print('WARNING: only one region found in the bed file. Skipping GC bias calculation.')
            return

        if not bedTools:  # Own method
            gccontent = _gc_content_from_fasta(coverage, bedfilename, reference)
            region_ids = list(coverage.keys())

            if len(gccontent) == 0:
                print('ERROR: G+C content values can not be calculated. Probably the provided reference file ' + reference + ' does not match with ')
                print('\tthe target file ' + bedfilename + '. That is, sequences of regions in the target file are probably not included within the')
                print('\treference file.')
                sys.exit(1)
        else:
            print('Calculating nt content by means of pybedtools...')
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # base one!!!
            finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1
            bedfd = pybedtools.BedTool(finalBed.filename)
            # Remove negative coordinates or features with length=0, which do
            # not work with bedtools
            bedfd = bedfd.remove_invalid()
            pybedtools._bedtools_installed = True
            pybedtools.set_bedtools_path(BEDTOOLSPATH)
            ntcontent = bedfd.nucleotide_content(reference)

            # Each entry in ntcontent is parsed to extract the gc content of
            # each exon. Keys: (chromosome, exon init, exon end).
            gccontent = {}
            for entry in ntcontent:
                fields = entry.fields
                gccontent[(fields[0], int(fields[1]), int(fields[2]))] = float(fields[-8]) * 100
            print('\tDone.')

            # Pybedtools does not work with regions with zero length ->
            # remove them (there are a few of them)
            region_ids = [key for key in coverage.keys() if key[1] != key[2]]

        coveragearray = numpy.array([coverage[region] for region in region_ids])
        # The pybedtools branch stores percentages (fraction * 100).
        # NOTE(review): the scale returned by measureGCbias is not visible
        # here -- confirm it is a percentage as well.
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        # Public entry point; the stats.kde namespace is deprecated in SciPy.
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # NOTE(review): the density grid spans [xmin, xmax] but the image is
        # drawn over [xmin, 100] -- confirm the intended x extent.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')

        # graphtitle defaults to None; len(None) used to raise TypeError here.
        if graphtitle is not None:
            if len(graphtitle) > 25:
                ax.set_title(graphtitle[:25] + '...')
            else:
                ax.set_title(graphtitle)

        fig.savefig(fileout)
        matplotlib.pyplot.close(fig)

        if status is not None:
            meanvalue = gccontentarray.mean()
            status.value = (meanvalue >= 45 and meanvalue <= 55)
    finally:
        # Release even on errors or sys.exit so other workers are not blocked.
        if executiongranted is not None:
            executiongranted.release()


def _gc_content_from_fasta(coverage, bedfilename, reference):
    """Return the GC content of the target regions by scanning the FASTA file (no bedtools)."""
    # Chromosomes to be examined: the ones present in the coverage keys
    chromosomes = set(key[0] for key in coverage)

    # Load BED file -> since coverage information is in 1-base format, the BED
    # must be transformed to 1-base (non-standard BED)
    bed = bed_file.bed_file(bedfilename)
    sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
    nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1!!!
    finalBed = nonOverlappingBed.my_sort_bed()
    finalBed.load_custom(-1)  # Load chromosome and positions in base 1

    gccontent = {}
    storeSequence = False
    currentChromosome = ''
    sequenceChunks = []  # joined once per chromosome instead of repeated +=

    with open(reference, 'r') as fastaFile:
        for line in fastaFile:
            if line.startswith('>'):  # A new chromosome starts
                if storeSequence:  # A wanted chromosome has just been read
                    gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))
                # Format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                currentChromosome = re.split(' +', line)[0].split('>')[1].strip()
                # Only store sequences of chromosomes present in the target
                storeSequence = currentChromosome in chromosomes
                sequenceChunks = []
            elif storeSequence and '>' not in line:
                sequenceChunks.append(line.rstrip())  # Drop the trailing '\n'

    if storeSequence:  # For the last chromosome
        gccontent.update(measureGCbias(''.join(sequenceChunks), currentChromosome, finalBed))

    return gccontent
Esempio n. 5
0
def gcbias(filelist, fileoutlist, bedfilelist):
    """Draw mean region coverage as a function of GC content, one figure per bam.

    Input:
        filelist: list of strings, each containing the full path of the bam
            file to analyze. A missing '<bam>.bai' index is created first.
        fileoutlist: list of strings, each containing the full path of the
            file where the corresponding figure will be saved.
        bedfilelist: list of strings, each containing the path of the bed file
            used for the bam at the same position (coverage and nucleotide
            content are computed on its regions).
    Output: for each bam, a figure is saved at the matching "fileoutlist"
        entry comparing GC content and mean coverage.

    Relies on the module-level names normalize, TMP, BEDTOOLSPATH, REF and run.
    When normalize is true each bam is normalized to the read count of the
    smallest bam before coverage is computed.
    """
    pid = str(os.getpid())

    numpy.random.seed(1)
    bamlist = []

    # Load every bam, creating its index when needed (required for pysam use)
    for filename in filelist:
        if not os.path.isfile(filename + '.bai'):
            print('Creating index for ' + filename)
            pysam.index(filename)
            print('\tDone.')

        bamlist.append(bam_file.bam_file(filename))
    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()

    print('The smaller bam is ' + filelist[sizes.argmin()] + ' and contains ' + str(minsize) + ' reads.')

    # Process each file and store counting results
    for i, bamfile in enumerate(bamlist):

        print('Processing ' + bamfile.filename)
        print('Results will be written at ' + fileoutlist[i])

        # Check whether normalization should be run
        if normalize:
            normalizedbam = bamfile.normalize(minsize)
        else:
            normalizedbam = bamfile

        # The same per-process temporary file is reused for every bam.
        coveragefile = TMP + '/' + pid + '.coverage'
        print('Calculating coverage per position...')
        # NOTE(review): the command is assembled by plain concatenation, so
        # paths containing spaces or shell metacharacters are not quoted.
        run(BEDTOOLSPATH + 'coverageBed -d -abam ' + normalizedbam.filename + ' -b ' + bedfilelist[i] + ' > ' + coveragefile)

        coverage = region_coverage(coveragefile)

        print('Calculating nt content...')
        bedfd = pybedtools.BedTool(bedfilelist[i])
        pybedtools._bedtools_installed = True
        pybedtools.set_bedtools_path(BEDTOOLSPATH)
        ntcontent = bedfd.nucleotide_content(REF)

        # Each entry in ntcontent is parsed to extract the gc content of each
        # exon. Keys: (chromosome, exon init, exon end).
        gccontent = {}
        for entry in ntcontent:
            fields = entry.fields
            gccontent[(fields[0], int(fields[1]), int(fields[2]))] = float(fields[-8]) * 100
        print('\tDone.')

        region_ids = list(coverage.keys())
        coveragearray = numpy.array([coverage[region] for region in region_ids])
        # Percentages: the fraction reported per entry is multiplied by 100
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        # Public entry point; the stats.kde namespace is deprecated in SciPy.
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        # Only one figure is created per bam. An extra 13x6 figure used to be
        # opened here and never closed, leaking one figure per iteration.
        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # NOTE(review): the density grid spans [xmin, xmax] but the image is
        # drawn over [xmin, 100] -- confirm the intended x extent.
        sc = ax.imshow(rot90(Z), cmap=cm.gist_earth_r, extent=[xmin, 100, ymin, ymax], aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')
        fig.savefig(fileoutlist[i])
        matplotlib.pyplot.close(fig)

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
Esempio n. 6
0
def gcbias_lite(coveragefile,
                bedfilename,
                reference,
                fileout,
                graphtitle=None,
                executiongranted=None,
                status=None,
                bedTools=False):
    """Draw mean region coverage as a function of GC content.

    Variant of gcbias that, by default, avoids bedtools (pybedtools) and
    computes the GC content by scanning the reference FASTA file.

    Input:
        coveragefile: string containing the full path of the bam.coverage file
            to analyze. This file has been built according to 1-base format.
        bedfilename: target file -> assumes original-standard bed file.
        reference: fasta file with reference genome.
        fileout: string containing the full path of the file where the
            resulting figure will be saved.
        graphtitle: optional figure title. Titles longer than 25 characters
            are truncated and suffixed with '...'. None leaves the figure
            untitled.
        executiongranted: optional lock-like object; acquire() is called on
            entry and release() is always called on exit, including when the
            function raises or exits the process.
        status: optional object whose ``value`` attribute is set to whether
            the mean GC content lies in [45, 55].
        bedTools: whether pybedtools is used instead of the own method.
    Output: a figure named "fileout" comparing GC content and mean coverage.
        Nothing is drawn when fewer than two regions have coverage. The
        process exits with status 1 if the own method yields no GC content.
    """
    if executiongranted is not None:
        executiongranted.acquire()

    try:
        coverage = region_coverage(coveragefile)  # Mean coverage per region

        if len(coverage) <= 1:
            print('WARNING: only one region found in the bed file. Skipping GC bias calculation.')
            return

        if not bedTools:  # Own method
            gccontent = _gc_content_from_fasta(coverage, bedfilename, reference)
            region_ids = list(coverage.keys())

            if len(gccontent) == 0:
                print('ERROR: G+C content values can not be calculated. Probably the provided reference file ' + reference + ' does not match with ')
                print('\tthe target file ' + bedfilename + '. That is, sequences of regions in the target file are probably not included within the')
                print('\treference file.')
                sys.exit(1)
        else:
            print('Calculating nt content by means of pybedtools...')
            bed = bed_file.bed_file(bedfilename)
            sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
            nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # base one!!!
            finalBed = nonOverlappingBed.my_sort_bed()  # BED file in base 1
            bedfd = pybedtools.BedTool(finalBed.filename)
            # Remove negative coordinates or features with length=0, which do
            # not work with bedtools
            bedfd = bedfd.remove_invalid()
            pybedtools._bedtools_installed = True
            pybedtools.set_bedtools_path(BEDTOOLSPATH)
            ntcontent = bedfd.nucleotide_content(reference)

            # Each entry in ntcontent is parsed to extract the gc content of
            # each exon. Keys: (chromosome, exon init, exon end).
            gccontent = {}
            for entry in ntcontent:
                fields = entry.fields
                gccontent[(fields[0], int(fields[1]), int(fields[2]))] = float(fields[-8]) * 100
            print('\tDone.')

            # Pybedtools does not work with regions with zero length ->
            # remove them (there are a few of them)
            region_ids = [key for key in coverage.keys() if key[1] != key[2]]

        coveragearray = numpy.array([coverage[region] for region in region_ids])
        # The pybedtools branch stores percentages (fraction * 100).
        # NOTE(review): the scale returned by measureGCbias is not visible
        # here -- confirm it is a percentage as well.
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        # Public entry point; the stats.kde namespace is deprecated in SciPy.
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # NOTE(review): the density grid spans [xmin, xmax] but the image is
        # drawn over [xmin, 100] -- confirm the intended x extent.
        sc = ax.imshow(
            rot90(Z),
            cmap=cm.gist_earth_r,
            extent=[xmin, 100, ymin, ymax],
            aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')

        # graphtitle defaults to None; len(None) used to raise TypeError here.
        if graphtitle is not None:
            if len(graphtitle) > 25:
                ax.set_title(graphtitle[:25] + '...')
            else:
                ax.set_title(graphtitle)

        fig.savefig(fileout)
        matplotlib.pyplot.close(fig)

        if status is not None:
            meanvalue = gccontentarray.mean()
            status.value = (meanvalue >= 45 and meanvalue <= 55)
    finally:
        # Release even on errors or sys.exit so other workers are not blocked.
        if executiongranted is not None:
            executiongranted.release()


def _gc_content_from_fasta(coverage, bedfilename, reference):
    """Return the GC content of the target regions by scanning the FASTA file (no bedtools)."""
    # Chromosomes to be examined: the ones present in the coverage keys
    chromosomes = set(key[0] for key in coverage)

    # Load BED file -> since coverage information is in 1-base format, the BED
    # must be transformed to 1-base (non-standard BED)
    bed = bed_file.bed_file(bedfilename)
    sortedBed = bed.my_sort_bed()  # Sort bed avoiding bedtools
    nonOverlappingBed = sortedBed.non_overlapping_exons(1)  # Base 1!!!
    finalBed = nonOverlappingBed.my_sort_bed()
    finalBed.load_custom(-1)  # Load chromosome and positions in base 1

    gccontent = {}
    storeSequence = False
    currentChromosome = ''
    sequenceChunks = []  # joined once per chromosome instead of repeated +=

    with open(reference, 'r') as fastaFile:
        for line in fastaFile:
            if line.startswith('>'):  # A new chromosome starts
                if storeSequence:  # A wanted chromosome has just been read
                    gccontent.update(
                        measureGCbias(''.join(sequenceChunks),
                                      currentChromosome, finalBed))
                # Format: >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1
                currentChromosome = re.split(' +', line)[0].split('>')[1].strip()
                # Only store sequences of chromosomes present in the target
                storeSequence = currentChromosome in chromosomes
                sequenceChunks = []
            elif storeSequence and '>' not in line:
                sequenceChunks.append(line.rstrip())  # Drop the trailing '\n'

    if storeSequence:  # For the last chromosome
        gccontent.update(
            measureGCbias(''.join(sequenceChunks), currentChromosome,
                          finalBed))

    return gccontent
Esempio n. 7
0
def gcbias(filelist, fileoutlist, bedfilelist):
    """Draw mean region coverage as a function of GC content, one figure per bam.

    Input:
        filelist: list of strings, each containing the full path of the bam
            file to analyze. A missing '<bam>.bai' index is created first.
        fileoutlist: list of strings, each containing the full path of the
            file where the corresponding figure will be saved.
        bedfilelist: list of strings, each containing the path of the bed file
            used for the bam at the same position (coverage and nucleotide
            content are computed on its regions).
    Output: for each bam, a figure is saved at the matching "fileoutlist"
        entry comparing GC content and mean coverage.

    Relies on the module-level names normalize, TMP, BEDTOOLSPATH, REF and run.
    When normalize is true each bam is normalized to the read count of the
    smallest bam before coverage is computed.
    """
    pid = str(os.getpid())

    numpy.random.seed(1)
    bamlist = []

    # Load every bam, creating its index when needed (required for pysam use)
    for filename in filelist:
        if not os.path.isfile(filename + '.bai'):
            print('Creating index for ' + filename)
            pysam.index(filename)
            print('\tDone.')

        bamlist.append(bam_file.bam_file(filename))
    sizes = numpy.array([bam.nreads() for bam in bamlist])
    minsize = sizes.min()

    print('The smaller bam is ' + filelist[sizes.argmin()] +
          ' and contains ' + str(minsize) + ' reads.')

    # Process each file and store counting results
    for i, bamfile in enumerate(bamlist):

        print('Processing ' + bamfile.filename)
        print('Results will be written at ' + fileoutlist[i])

        # Check whether normalization should be run
        if normalize:
            normalizedbam = bamfile.normalize(minsize)
        else:
            normalizedbam = bamfile

        # The same per-process temporary file is reused for every bam.
        coveragefile = TMP + '/' + pid + '.coverage'
        print('Calculating coverage per position...')
        # NOTE(review): the command is assembled by plain concatenation, so
        # paths containing spaces or shell metacharacters are not quoted.
        run(BEDTOOLSPATH + 'coverageBed -d -abam ' + normalizedbam.filename +
            ' -b ' + bedfilelist[i] + ' > ' + coveragefile)

        coverage = region_coverage(coveragefile)

        print('Calculating nt content...')
        bedfd = pybedtools.BedTool(bedfilelist[i])
        pybedtools._bedtools_installed = True
        pybedtools.set_bedtools_path(BEDTOOLSPATH)
        ntcontent = bedfd.nucleotide_content(REF)

        # Each entry in ntcontent is parsed to extract the gc content of each
        # exon. Keys: (chromosome, exon init, exon end).
        gccontent = {}
        for entry in ntcontent:
            fields = entry.fields
            gccontent[(fields[0], int(fields[1]), int(fields[2]))] = float(fields[-8]) * 100
        print('\tDone.')

        region_ids = list(coverage.keys())
        coveragearray = numpy.array([coverage[region] for region in region_ids])
        # Percentages: the fraction reported per entry is multiplied by 100
        gccontentarray = numpy.array([gccontent[region] for region in region_ids])

        xmin = gccontentarray.min()
        xmax = gccontentarray.max()
        ymin = coveragearray.min()
        ymax = coveragearray.max()

        # Perform a kernel density estimator on the results
        X, Y = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = c_[X.ravel(), Y.ravel()]
        values = c_[gccontentarray, coveragearray]
        # Public entry point; the stats.kde namespace is deprecated in SciPy.
        kernel = stats.gaussian_kde(values.T)
        Z = reshape(kernel(positions.T).T, X.T.shape)

        # Only one figure is created per bam. An extra 13x6 figure used to be
        # opened here and never closed, leaking one figure per iteration.
        fig = pyplot.figure(figsize=(6, 6))
        ax = fig.add_subplot(111)
        # NOTE(review): the density grid spans [xmin, xmax] but the image is
        # drawn over [xmin, 100] -- confirm the intended x extent.
        sc = ax.imshow(
            rot90(Z),
            cmap=cm.gist_earth_r,
            extent=[xmin, 100, ymin, ymax],
            aspect="auto")
        cbar = fig.colorbar(sc, ticks=[numpy.min(Z), numpy.max(Z)])
        cbar.ax.set_yticklabels(['Low', 'High'])
        cbar.set_label('Density')
        ax.set_xlabel('GC content (%)')
        ax.set_ylabel('Mean coverage')
        fig.savefig(fileoutlist[i])
        matplotlib.pyplot.close(fig)

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Finished <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
Esempio n. 8
0
from pybedtools import BedTool, set_bedtools_path

from nrlbio.pybedtools_extension import doublebed2dict, generate_overlaping_intervals
from nrlbio.interaction import Interaction

# Command-line interface: one positional input file plus merge/output options.
parser = argparse.ArgumentParser(
    description='converts bed-like file of chimeric reads into interactions. That is merging chimeras with intersecting regions',
)

# Input
parser.add_argument(
    'path',
    metavar='N',
    nargs='?',
    type=str,
    help="Path to chimeras bed-like file",
)

# Merging behaviour
parser.add_argument(
    '-d', '--distance',
    nargs='?',
    default=-12,
    type=int,
    help="minimum overlap(negative number)/maximum distance(positive number) in nucleotides to merge intervals",
)
parser.add_argument(
    '-n', '--name',
    nargs='?',
    required=True,
    type=str,
    help="name for interactions",
)

# Outputs
parser.add_argument(
    '-oi', '--interactions',
    nargs='?',
    required=True,
    type=str,
    help="path to output interactions file",
)
parser.add_argument(
    '-od', '--dictionary',
    nargs='?',
    required=True,
    type=str,
    help="path to output \"interaction to read id\" file",
)

# Optional switches
parser.add_argument(
    '--order',
    nargs='?',
    default=False,
    const=True,
    type=int,
    help="keeps order of left to right parts in interactions",
)
parser.add_argument(
    '--bedtools',
    nargs='?',
    default='',
    type=str,
    help="path to a bedtools binaries",
)

args = parser.parse_args()

# Point pybedtools at the requested binaries (the default is an empty path).
set_bedtools_path(path=args.bedtools)



def intervals2interaction(intervals, distance, number, order=False):
    """Turn the intervals of one chimera into an Interaction, if it has two parts.

    Args:
        intervals: list of interval objects. When ``order`` is false the list
            is sorted in place by (chrom, strand, start).
        distance: passed to generate_overlaping_intervals when merging.
        number: integer appended to the module-level ``args.name`` to build
            the interaction name ("<name>_<number>").
        order: when true, intervals are not merged; each one goes to the left
            (0) or right (1) part according to the integer after the last
            "|" in its name.

    Returns:
        The result of Interaction.from_intervals when exactly two regions are
        obtained, otherwise None.
    """
    if order:
        # Bucket by the part index encoded at the end of the interval name.
        merged_regions = [[], []]
        for interval in intervals:
            part_index = int(interval.name.split("|")[-1])
            merged_regions[part_index].append(interval)
    else:
        sort_key = attrgetter('chrom', 'strand', 'start')
        intervals.sort(key=sort_key)
        merged_regions = list(generate_overlaping_intervals(intervals, distance))

    if len(merged_regions) != 2:
        return None

    interaction_name = "%s_%d" % (args.name, number)
    return Interaction.from_intervals(interaction_name, merged_regions)
Esempio n. 9
0
from operator import itemgetter
from sequencing_tools.stats_tools import p_adjust
from sequencing_tools.fastq_tools import reverse_complement
from concensus_seq import concensus
from exon_coverage import ExonFilter
import pyBigWig as pbw
from tblout_parser import read_tbl
from sequencing_tools.bam_tools import get_strand
import pyximport
pyximport.install()
from junction_function import get_junction
import dask.dataframe as dd
import pyranges as pr
import io
from collections import Counter
# Module-level configuration executed at import time, using absolute paths
# hard-coded for one specific system.
# NOTE(review): presumably pybedtools' set_bedtools_path / set_tempdir (their
# import is not visible in this chunk) -- confirm, and adjust both paths
# before running on another machine.
set_bedtools_path('/stor/work/Lambowitz/cdw2854/src/miniconda3/bin')
set_tempdir('/stor/scratch/Lambowitz/cdw2854')


class GeneMapper():
    def __init__(self):

        self.HB_genes = '''chr11,5246694,5250625,HBB,-
chr11,5253908,5264887,HBD,-
chr11,5269309,5271089,HBG1,-
chr11,5274420,5526835,HBG2,-
chr11,5289575,5526882,HBE1,-
chr16,202686,204502,HBZ,+
chr16,203891,216767,HBM,+
chr14,50037702,50065882,RPS29,-
chr16,222875,223709,HBA2,+