# Parse the command line and coerce arguments to their working types
args = docopt.docopt(__doc__, version='v1')
args['<threads>'] = int(args['<threads>'])
for listArg in ('<prefix>', '<indir>'):
    # Comma-separated values are turned into lists
    args[listArg] = args[listArg].split(',')
if not args['<outbam>'].endswith('.bam'):
    raise ValueError("Output file must end with '.bam'")
args['<outbam>'] = os.path.abspath(args['<outbam>'])

# Parse module file
pmDict = slurm.parsePathModule(args['<modules>'])

# Collect absolute paths of the FASTQ files found for every prefix
read1List = []
read2List = []
if args['--unpaired']:
    for prefix in args['<prefix>']:
        read1 = fastqFind.findFastq(
            prefix=prefix, dirList=args['<indir>'], pair=False)
        read1List.extend(os.path.abspath(fq) for fq in read1)
else:
    for prefix in args['<prefix>']:
        read1, read2 = fastqFind.findFastq(
            prefix=prefix, dirList=args['<indir>'], pair=True)
        read1List.extend(os.path.abspath(fq) for fq in read1)
        read2List.extend(os.path.abspath(fq) for fq in read2)

# Stop if no FASTQ files were identified
if not read1List:
    raise IOError('Failed to find FASTQ files')
# Echo the command line that invoked the script
print('%s\n' % (' '.join(sys.argv)))

###############################################################################
## Process command line arguments and create output directories
###############################################################################
# Extract arguments
args = docopt.docopt(__doc__, version='v1')

# <sampledata> is a comma-separated pair: FASTQ prefix, sample name
args['prefix'], args['name'] = args['<sampledata>'].split(',')

# Check supplied files
toolbox.check_var(args['<gtf>'], 'file')
toolbox.check_var(args['<rrna>'], 'file')

# Locate FASTQ files in the comma-separated input directories
if not args['--singleend']:
    args['read1'], args['read2'] = fastqFind.findFastq(
        prefix=args['prefix'],
        dirList=args['<indir>'].split(','),
        pair=True)
    # Paired input needs the same number of read1 and read2 files
    if len(args['read1']) != len(args['read2']):
        raise IOError('Unequal number of FASTQ files identified')
else:
    args['read1'] = fastqFind.findFastq(
        prefix=args['prefix'],
        dirList=args['<indir>'].split(','),
        pair=False)
if len(args['read1']) == 0:
    raise IOError('Insufficient number of FASTQ files identified')

# Convert numerical arguments from strings, in their original order
for numArg, numType in (
        ('--threads', int),
        ('--forprob', float),
        ('--minlength', int),
        ('--trimqual', int)):
    args[numArg] = numType(args[numArg])

# Generate and store standard output directories
args['fastqDir'] = os.path.join(args['<outdir>'], 'fastq')
--quality=<quality> Trimming quality [default: 20] --adapter=<adapter> Adapter sequence [default: AGATCGGAAGAGC] --path=<path> Path to cutadapt [default: cutadapt] --help Output this message """ # Import required modules import os from ngs_python.fastq import fastqFind, fastqTrim from general_python import docopt, toolbox, moab # Extract and process arguments args = docopt.docopt(__doc__,version = 'v1') args['--quality'] = int(args['--quality']) inDir, inPrefix = os.path.split(args['<inprefix>']) toolbox.checkArg(args['--path'], 'exc') # Extract fastq files and generate output file names read1In, read2In = fastqFind.findFastq(prefix = inPrefix, dirList = [inDir], pair = True, gzip = True) read1Out = args['<outprefix>'] + '.R1.fastq.gz' read2Out = args['<outprefix>'] + '.R2.fastq.gz' trimLog = args['<outprefix>'] + '.log' # Generate and submit trim command trimCommand = fastqTrim.cutadaptTrimPaired(read1In = read1In, read2In = read2In, read1Out = read1Out, read2Out = read2Out, quality = args['--quality'], adapter = 'AGATCGGAAGAGC', length = 25, path = args['--path'] ) jobID = moab.submitJob(trimCommand, stdout = trimLog, stderr = trimLog) print jobID
'maximum distance of concordant pairs: %s' %(args.maxSize), 'remove duplicate pairs: %s' %(args.rmDuplicates), 'remove concordant pairs: %s' %(args.rmConcordant) ) # Create output file names args.logFile = args.outDir + args.sampleName + '.log' args.outFastq = args.outDir + args.sampleName + '_trimmed.fastq.gz' args.nameSortBam = args.outDir + args.sampleName + "_nSort.bam" args.outPairs = args.outDir + args.sampleName + ".readPairs.gz" args.outFrags = args.outDir + args.sampleName + ".fragLigations.gz" ############################################################################### ## Process FASTQ files and perform alignment ############################################################################### # Extract fastq file names args.read1, args.read2 = fastqFind.findFastq(prefix = args.fastqPrefix, dirList = args.fastqDir.split(','), pair = True) if len(args.read1) > 1 or len(args.read2) > 1: raise NotImplemented('Multiple FASTQ file input not implemented') # Trim and merge fastq files pf = fastqIO.parseFastq( fastq1 = args.read1[0], fastq2 = args.read2[0] ) trimMetrics = pf.interleave_trim_reads( outFastq = args.outFastq, trim = args.cutSite, minLength = args.minLength ) # Print trim metrics print '\nTrim Metrics:\n\t%s\n\t%s\n\t%s\n\t%s' %( 'total: ' + str(trimMetrics['total']),
--path=<path> Path to cutadapt [default: cutadapt] --help Output this message """ # Import required modules import os from ngs_python.fastq import fastqFind, fastqTrim from general_python import docopt, toolbox, moab # Extract and process arguments args = docopt.docopt(__doc__, version='v1') args['--quality'] = int(args['--quality']) inDir, inPrefix = os.path.split(args['<inprefix>']) toolbox.checkArg(args['--path'], 'exc') # Extract fastq files and generate output file names read1In, read2In = fastqFind.findFastq(prefix=inPrefix, dirList=[inDir], pair=True, gzip=True) read1Out = args['<outprefix>'] + '.R1.fastq.gz' read2Out = args['<outprefix>'] + '.R2.fastq.gz' trimLog = args['<outprefix>'] + '.log' # Generate and submit trim command trimCommand = fastqTrim.cutadaptTrimPaired(read1In=read1In, read2In=read2In, read1Out=read1Out, read2Out=read2Out, quality=args['--quality'], adapter='AGATCGGAAGAGC', length=25, path=args['--path']) jobID = moab.submitJob(trimCommand, stdout=trimLog, stderr=trimLog) print jobID
###############################################################################
## Process command line arguments and create output directories
###############################################################################
# Extract arguments
args = docopt.docopt(__doc__, version='v1')

# <sampledata> is a comma-separated pair: FASTQ prefix, sample name
args['prefix'], args['name'] = args['<sampledata>'].split(',')

# Check supplied files
for suppliedFile in ('<gtf>', '<rrna>'):
    toolbox.check_var(args[suppliedFile], 'file')

# Locate FASTQ files in the comma-separated input directories
if not args['--singleend']:
    args['read1'], args['read2'] = fastqFind.findFastq(
        prefix=args['prefix'],
        dirList=args['<indir>'].split(','),
        pair=True)
    # Paired input needs the same number of read1 and read2 files
    if len(args['read1']) != len(args['read2']):
        raise IOError('Unequal number of FASTQ files identified')
else:
    args['read1'] = fastqFind.findFastq(
        prefix=args['prefix'],
        dirList=args['<indir>'].split(','),
        pair=False)
if len(args['read1']) == 0:
    raise IOError('Insufficient number of FASTQ files identified')

# Convert numerical arguments from strings
args['--threads'] = int(args['--threads'])
args['--forprob'] = float(args['--forprob'])
concatFastq.py prefix <outfastq> <prefix> concatFastq.py specify <outfastq> <infastq>.. ''' # Import modules import os import sys from general_python import moab, docopt, toolbox from ngs_python.fastq import fastqFind # Extract arguments args = docopt.docopt(__doc__, version = 'v1') # Find FASTQ files by prefix if args['prefix']: indir, prefix = os.path.split(args['<prefix>']) print indir, prefix args['<infastq>'] = fastqFind.findFastq(prefix = prefix, dirList = [indir], pair = False) args['<infastq>'].sort() # Check number of FASTQ files if len(args['<infastq>']) < 2: sys.exit('\nCannot concatenate %s files\n' %(len(args['<infastq>']))) # Check output file doesnt exist if os.path.isfile(args['<outfastq>']): sys.exit('\nOutput file exists. No command submitted\n') # Print input and out files print '\nInput files:\n%s\n\nOutput file:\n%s\n' %( '\n'.join(args['<infastq>']), args['<outfastq>']) # Get user response before concatenation print "Enter 'concat' to concatenate: " response = raw_input() # Submit command if response == 'concat':
args['prefix'], args['name'] = args['<sampledata>'].split(',') # Split input directories into a list args['<indir>'] = args['<indir>'].split(',') # Read in path file paths ={} with open(args['<pathfile>'], 'r') as pfile: for line in pfile: program, path = line.strip().split('\t') paths[program] = path # Create folder for log files args['logdir'] = os.path.join(args['<outdir>'], args['name'] + '_log') if not os.path.isdir(args['logdir']): os.mkdir(args['logdir']) # Find fastq files read1, read2 = fastqFind.findFastq( prefix = args['prefix'], dirList = args['<indir>'], pair = True, gzip = True ) if len(read1) != 1 and len(read2) != 1: raise IOError('Failure to find single paired FASTQ files') # Generate output files bamPrefix = os.path.join(args['<outdir>'], args['name']) logPrefix = os.path.join(args['logdir'], args['name']) outfiles = { 'initialbam' : bamPrefix + '.bam', 'dedupbam' : bamPrefix + '_dedup.bam', 'realignbam' : bamPrefix + '_dedup_realign.bam', 'recalbam' : bamPrefix + '_dedup_realign_recal.bam', 'listfile' : logPrefix + '_target.list', 'bsqrfile' : logPrefix + '_bsqr.grp', 'alignlog' : logPrefix + '_align.log', 'deduplog1' : logPrefix + '_dedup_1.log',