atacBam2Bed.py (-h | --help) Options: --minMapQ=<minMapQ> Minimum mapping quality of reads [default: 20] --size=<size> Size of open region around insertion [default: 50] --rmDup Flag to remove duplicate reads --help Output this message """ import pysam import collections from general_python import docopt from ngs_python.bam import pysamfunc # Extract arguments args = docopt.docopt(__doc__,version = 'v1') # Process size argument try: args['--size'] = int(args['--size']) except ValueError: raise IOError('size argument must be an integer divisible by 2') if args['--size'] % 2: raise IOError('size argument must be an integer divisible by 2') # Process size argument try: args['--minMapQ'] = int(args['--minMapQ']) except ValueError: raise IOError('minMapQ argument must be an integer') # Create counter to store processing metrics counter = collections.defaultdict(int) # Open input and output files
Usage: meanCoverageIntervals.py <intervals> <outfile> <bam>... [--minmap=<mm>] [--rmdup] [--onebased] [--header] Options: --minmap=<mm> Minimum mapping quality for read [default: 0]. --rmdup Skip duplicate reads in calculating coverage. --onebased Intervals have a one-based start. Otherwise a zero-based start is presumed. ''' # Load required modules from general_python import docopt from ngs_python.bam import pysam_coverage # Extract arguments args = docopt.docopt(__doc__, version='v1') args['--minmap'] = int(args['--minmap']) # Open interval list file and extract data intervalList = [] with open(args['<intervals>']) as intervalFile: for line in intervalFile: chrom, start, end = line.strip().split('\t')[:3] intervalList.append((chrom, int(start), int(end))) # Adjust intervals if they are one based if args['--onebased']: intervalList = [(x[0], x[1] - 1, x[2]) for x in intervalList] # Extract mean coverage for intervals bamCov = pysam_coverage.multiple_coverage(args['<bam>']) outDF = bamCov.mean_coverage(intervals=intervalList, map_quality=args['--minmap'], remove_dup=args['--rmdup'])
'''bam2bedgraph.py Usage: bam2bed.py <bam> <bed> ''' # Import required modules import pysam from general_python import docopt # Extract arguments args = docopt.docopt(__doc__, version='1.0') # Open input bam file and output bed bamFile = pysam.AlignmentFile(args['<bam>']) bedFile = open(args['<bed>'], 'w') # Create chromosome dictionary chromDict = {} for chrom in bamFile.references: chromDict[bamFile.gettid(chrom)] = chrom # Create strand and read dictionary strandDict = {False:'+', True:'-'} readDict = {True:'/1', False:'/2'} # Create output bed for read in bamFile: if read.is_unmapped: continue bedFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %( chromDict[read.reference_id], read.reference_start, read.reference_end, read.query_name + readDict[read.is_read1],
'''ensembl_name_gtf.py Usage: ensembl_name_gtf.py <gtffile> <outfile> '''
# Import required modules
import collections
import re
from general_python import docopt

# Extract arguments
args = docopt.docopt(__doc__, version='1.0')

# Create regular expressions capturing the quoted gene_id and gene_biotype
# attribute values. Raw strings keep '\s' as a regex escape rather than an
# (invalid) string escape.
eRE = re.compile(r'gene_id\s+"(.*?)";')
bRE = re.compile(r'gene_biotype\s+"(.*?)";')

# Read in input file to gene dictionary: gene id -> set of biotypes seen
geneDict = collections.defaultdict(set)
with open(args['<gtffile>'], 'r') as inFile:
    for line in inFile:
        # Skip comment lines and blank lines, which have no attribute column
        if line.startswith('#') or not line.strip():
            continue
        # Column nine (index 8) holds the attribute string
        data = line.strip().split('\t')[8]
        # A record lacking either attribute raises AttributeError here,
        # because search() returns None when nothing matches.
        ensembl = eRE.search(data).group(1)
        biotype = bRE.search(data).group(1)
        geneDict[ensembl].add(biotype)

# Create output file
# counter[0] counts the genes processed; the meaning of the other two slots
# is not visible in this excerpt.
counter = [0, 0, 0]
with open(args['<outfile>'], 'w') as outFile:
    # Here 'biotype' is the full set of biotypes recorded for the gene
    for ensembl, biotype in geneDict.items():
        counter[0] += 1
--minGene=<minGene> Minimum significant genes in GO set [default: 3]. --log2Col=<log2Col> Column for log2 fold change data. Supplying this results in positive and negative fold change genes being considered seperately. --includeCombined Include combined geneset alongside positive and negative genes sets. Only effectice with --log2Col argument. --onlyAnno Only consider genes with annotation. --noHeader Results file has no header. """ # Import required modules from ngs_python.gtf import gene_conversion from general_python import docopt # Extract arguments args = docopt.docopt(__doc__, version="v1") args["<geneCol>"] = int(args["<geneCol>"]) args["<statCol>"] = int(args["<statCol>"]) args["<statMax>"] = float(args["<statMax>"]) args["--minGO"] = int(args["--minGO"]) args["--maxGO"] = int(args["--maxGO"]) if args["--log2Col"] is not None: args["--log2Col"] = int(args["--log2Col"]) # Parse gmt geneAnno = gene_conversion.parse_gmt(args["<gmt>"]) if isinstance(args["--log2Col"], int): # Extract gene list allGenes, posGenes, negGenes = gene_conversion.extract_gene_results_posneg( results=args["<results>"], geneCol=args["<geneCol>"], log2Col=args["--log2Col"],