def submit(self, verbose = False):
    ''' Submit every command queued in self.commandDict to the Moab
    scheduler via msub.

    Args:
        verbose (bool): If True, print job ID, command and parameters
            for every successfully submitted job.

    Returns:
        list: One (moabID, command, processors, stdout, stderr,
            dependencies) tuple per submitted command, in submission
            order.

    Raises:
        IOError: If a job cannot be submitted after ten attempts.
    '''
    # Check arguments
    toolbox.check_var(verbose, 'bool')
    # Create list of successfully submitted jobs
    moabList = []
    # Extract commands and parameters
    # NOTE(review): dependency indices are resolved against moabList by
    # iteration order, so self.commandDict must iterate in submission
    # order (e.g. an OrderedDict keyed by command number) -- confirm
    # against the enclosing class.
    for commandNo in self.commandDict:
        # Unpack command and parameters
        command, processors, stdout, stderr, dependency = (
            self.commandDict[commandNo])
        # Create msub command and add node information
        msubCommand = ['/opt/moab/bin/msub', '-l']
        msubCommand.append('nodes=1:babs:ppn=%s' %(processors))
        # Add output information; merge stdout/stderr when identical
        if stdout == stderr:
            msubCommand.extend(['-j', 'oe', '-o', stdout])
        else:
            msubCommand.extend(['-o', stdout, '-e', stderr])
        # Add dependency: map queued command numbers to Moab job IDs of
        # previously submitted jobs
        dependList = []
        for d in dependency:
            dependList.append(moabList[d][0])
        if dependList:
            depend = 'x=depend:afterok:%s' %(':'.join(dependList))
            msubCommand.extend(['-W', depend])
        # Create output variable for function
        moabID = None
        # Try to submit the job up to ten times
        for _ in range(10):
            # Create msub process
            msubProcess = subprocess.Popen(
                msubCommand,
                stdin = subprocess.PIPE,
                stdout = subprocess.PIPE,
                stderr = subprocess.PIPE
            )
            # Submit command: the job script is piped to msub's stdin
            moab = msubProcess.communicate(input=command)[0]
            # Search for returned Moab ID
            # NOTE(review): pattern mixes '\s' with '\\.' in a non-raw
            # string; it works, but a raw string would be clearer.
            moabMatch = re.match('^\s+(Moab\\.\d+)\s+$',moab)
            # Check that Moab ID has been returned or wait and repeat
            if moabMatch:
                moabID = moabMatch.group(1)
                break
            else:
                time.sleep(10)
        # Store successfully submitted commands
        if moabID:
            moabList.append((moabID, command, processors, stdout, stderr,
                ' '.join(dependList)))
        else:
            raise IOError('Could not submit moab job')
        # Print commands if requested
        if verbose:
            print 'JOB ID: %s\nCOMMAND: %s\nPROCESSORS: %s\nSTDOUT: %s\n'\
                'STDERR: %s\nDEPENDENCIES: %s\n' %(moabID, command,
                processors, stdout, stderr, ' '.join(dependList))
    # Return submitted job data
    return(moabList)
def indel_homopolymer(sequence, position, ref, var, insertion):
    ''' Determine the homopolymer context of an indel.

    Args:
        sequence (str): Reference sequence containing the indel.
        position (int): Index of the first base of the indel within
            sequence.
        ref (str): Reference allele.
        var (str): Variant allele.
        insertion (bool): True for an insertion, False for a deletion.

    Returns:
        tuple: (indelMonomer, homoLengthRef, homoLengthVar) where
            indelMonomer is the repeating unit of the indel sequence and
            the two lengths are the monomer repeat counts in the
            reference and variant sequences respectively.

    Raises:
        ValueError: If position is not the first base of the indel or a
            deletion extends beyond the reference sequence.
    '''
    # Check argument types
    toolbox.check_var(sequence, 'str')
    toolbox.check_var(position, 'int')
    toolbox.check_var(ref, 'str')
    toolbox.check_var(var, 'str')
    toolbox.check_var(insertion, 'bool')
    # Convert sequences to uppercase
    sequence = sequence.upper()
    ref = ref.upper()
    var = var.upper()
    # Check that first base of indel matches the reference sequence
    # NOTE(review): the message says 1-based but sequence[position] is a
    # 0-based string index -- confirm the intended convention.
    if sequence[position] != ref:
        raise ValueError(
            'Position must be first base of indel (1-based index)')
    # Check that deletions are contained within the reference
    if not insertion and len(sequence) - position - len(var) < -1:
        raise ValueError('Deletion must be contained within reference')
    # Find homopolymer for insertion
    if insertion:
        # Extract insertion sequence and find any sub monomers
        indelSeq = var[1:]
        indelMonomer, monomerNo = toolbox.find_monomer(indelSeq)
        # Add one copy of the monomer to the reference at the insertion
        # point so the regex below always has a seed match
        inserted = sequence[:position] + indelMonomer + sequence[position:]
        # Create regex to find homopolymer spanning the insertion point
        regex = '^.{0,%s}?((%s)+).{0,%s}?$' % (
            position, indelMonomer, len(sequence) - position)
        # Find homopolymer and extract length; the renamed homoSeq fixes
        # the previously mangled identifier which was a syntax error
        homoSeq = re.match(regex, inserted).group(1)
        # Subtract the artificially inserted monomer copy (integer
        # division: monomer length divides the match length exactly)
        homoLengthRef = (len(homoSeq) / len(indelMonomer)) - 1
        homoLengthVar = homoLengthRef + monomerNo
    # Find homopolymer for deletion
    else:
        # Extract deleted sequence
        indelSeq = sequence[position:][:len(var)]
        indelMonomer, monomerNo = toolbox.find_monomer(indelSeq)
        # Create regex to find homopolymer spanning the deleted bases
        regex = '^.{0,%s}?((%s)+).{0,%s}?$' % (
            position, indelMonomer,
            len(sequence) - position - len(indelSeq))
        # Find homopolymer and extract length
        homoSeq = re.match(regex, sequence).group(1)
        homoLengthRef = len(homoSeq) / len(indelMonomer)
        homoLengthVar = homoLengthRef - monomerNo
    # Return data
    return (indelMonomer, homoLengthRef, homoLengthVar)
def extractPosition(openBam, chrom, position, minMapQ=20, minBaseQ=20,
        groupdel=False):
    ''' Tally base calls from reads overlapping one chromosomal position.

    Returns a dictionary mapping each observed base to a two element
    list of [total count, forward-strand count], together with the mean
    mapping quality across ALL reads spanning the position (including
    reads rejected by the quality filters).
    '''
    # check arguments
    toolbox.check_var(chrom, 'str')
    toolbox.check_var(position, 'int', mn=1)
    toolbox.check_var(minMapQ, 'int', mn=0)
    toolbox.check_var(minBaseQ, 'int', mn=0)
    toolbox.check_var(groupdel, 'bool')
    # Accumulators for mapping scores and per-base counts
    mapScores = []
    counts = {}
    # Examine every read overlapping the position
    for alignment in openBam.fetch(chrom, position - 1, position):
        # Record mapping quality for every spanning read, then drop
        # poorly mapped reads from the base tally
        mapScore = alignment.mapping_quality
        mapScores.append(mapScore)
        if mapScore < minMapQ:
            continue
        # Extract base calls for read and skip reads without a call at
        # the position of interest
        calls = baseCalls(alignment, groupdel)
        if (position - 1) not in calls:
            continue
        base, baseScore = calls[position - 1]
        # Skip bases of poor quality
        if baseScore < minBaseQ:
            continue
        # Tally the base; second element counts forward-strand reads
        if base not in counts:
            counts[base] = [1, 0]
        else:
            counts[base][0] += 1
        if not alignment.is_reverse:
            counts[base][1] += 1
    # Mean mapping quality over all spanning reads (0 when none seen)
    meanMap = sum(mapScores) / len(mapScores) if mapScores else 0
    return (counts, meanMap)
def extract_fasta(fasta, chrom, start, end):
    ''' Function to extract sequence from FASTA file using the pysam
    module.

    Args:
        fasta: string of full path to faidx indexed FASTA file or open
            pysam.FastaFile.
        chrom (str): name of chromosome.
        start (int): start of sequence to extract (1-based index).
        end (int): end of sequence to extract (1-based index).

    Returns:
        str: a string of the reference sequence.

    Raises:
        ValueError: If desired interval not contained on chromosome.
    '''
    # Check arguments
    toolbox.check_var(chrom, 'str')
    toolbox.check_var(start, 'int', mn=1)
    toolbox.check_var(end, 'int', mn=start)
    # Open FASTA if string supplied; remember whether we opened it so it
    # can be closed again (previously a path argument leaked the handle)
    openedHere = False
    if isinstance(fasta, str):
        fasta = pysam.FastaFile(fasta)
        openedHere = True
    try:
        # Extract chromosome length and check end value
        chromLength = fasta.get_reference_length(chrom)
        if end > chromLength:
            raise ValueError('Interval extends beyond chromosome')
        # Extract sequence (pysam fetch uses 0-based half-open intervals)
        seq = fasta.fetch(chrom, start - 1, end)
    finally:
        # Only close handles this function created; caller-supplied
        # FastaFile objects remain open
        if openedHere:
            fasta.close()
    return (seq)
def extractPositionComplete(openBam, chrom, position, groupdel=False):
    ''' Function to extract all mapped base information from chromosomal
    position.

    Returns a dictionary mapping each observed base to a list of
    (base quality, mapping quality, strand) tuples, one per mapped read
    carrying that base.
    '''
    # check arguments
    toolbox.check_var(chrom, 'str')
    toolbox.check_var(position, 'int', mn=1)
    toolbox.check_var(groupdel, 'bool')
    # Dictionary of per-base read data
    baseCounts = {}
    # Examine every mapped read overlapping the position
    for read in openBam.fetch(chrom, position - 1, position):
        # Skip unmapped reads
        if read.is_unmapped:
            continue
        # Extract base calls for read; skip reads without a call at the
        # position of interest
        calls = baseCalls(read, groupdel)
        if (position - 1) not in calls:
            continue
        base, baseQ = calls[position - 1]
        # Build (base quality, mapping quality, strand) record
        record = (baseQ, read.mapping_quality,
            '-' if read.is_reverse else '+')
        # Append record under the observed base
        baseCounts.setdefault(base, []).append(record)
    # Return data
    return (baseCounts)
def filterVarscan(
        inFile, outFile, filterFile = None, minCovNormal = 10,
        minCovTumour = 10, minFreqTumour = 0.05, maxFreqNormal = 1,
        minVarTumour = 2, maxPvalue = 0.05, somatic = True, flank = 25,
        maxNeighbour = 0
    ):
    ''' Filter variants in a varscan somatic output file.

    Args:
        inFile (str): Input varscan variant file (tab delimited with a
            single header line).
        outFile (str): Output file for variants passing all filters.
        filterFile (str): Optional additional variant file whose
            positions also count towards the neighbour filter.
        minCovNormal (int): Minimum normal sample coverage.
        minCovTumour (int): Minimum tumour sample coverage.
        minFreqTumour (num): Minimum tumour variant frequency.
        maxFreqNormal (num): Maximum normal variant frequency.
        minVarTumour (int): Minimum tumour variant read count.
        maxPvalue (num): Maximum somatic p-value.
        somatic (bool): Only keep variants flagged 'Somatic'.
        flank (int): Distance either side of a variant searched for
            neighbouring variants.
        maxNeighbour (int): Maximum allowed neighbouring variants.

    Returns:
        collections.OrderedDict: Count of variants removed by each
            filter, plus totals read and passed.
    '''
    # Create counter of filtering outcomes
    logData = collections.OrderedDict([
        ('Total', 0),
        ('Somatic status', 0),
        ('P-value', 0),
        ('Tumour coverage', 0),
        ('Tumour frequency', 0),
        ('Tumour count', 0),
        ('Normal coverage', 0),
        ('Normal frequency', 0),
        ('Neighbours', 0),
        ('Passed filters', 0)
    ])
    # Check variables
    # NOTE(review): check_var is applied to filterFile even when it is
    # None -- confirm toolbox.check_var accepts None for 'file'.
    toolbox.check_var(inFile, 'file')
    toolbox.check_var(filterFile, 'file')
    toolbox.check_var(minCovNormal, 'int', mn = 1)
    toolbox.check_var(minCovTumour, 'int', mn = 1)
    toolbox.check_var(minFreqTumour, 'num', gt = 0, mx = 1)
    toolbox.check_var(maxFreqNormal, 'num', mn = 0, mx = 1)
    toolbox.check_var(minVarTumour, 'int', mn = 1)
    toolbox.check_var(maxPvalue, 'num', gt = 0)
    toolbox.check_var(somatic, 'bool')
    toolbox.check_var(flank, 'int', mn = 0)
    toolbox.check_var(maxNeighbour, 'int', mn = 0)
    # Create dictionary to store variant positions per chromosome
    varPos = {}
    # Extract coordinates for neighbour filtering from both files
    for varFile in [inFile, filterFile]:
        if varFile is None:
            continue
        with open(varFile) as varIn:
            # Skip header line (Python 2 file iterator)
            header = varIn.next()
            for line in varIn:
                chrom, pos = line.split('\t')[:2]
                if chrom in varPos:
                    varPos[chrom].append(int(pos))
                else:
                    varPos[chrom] = [int(pos)]
    # Sort positions so bisect can be used below
    for key in varPos:
        varPos[key].sort()
    # Open input and output files
    with open(inFile) as varin:
        with open(outFile, 'w') as varout:
            # Copy header straight through
            varout.write(varin.next())
            # Loop through input
            for line in varin:
                # Count and extract data
                logData['Total'] += 1
                varData = line.split('\t')
                # Check somatic status and p-value
                status = str(varData[12])
                pValue = float(varData[14])
                if somatic and status != 'Somatic':
                    logData['Somatic status'] += 1
                    continue
                if pValue > maxPvalue:
                    logData['P-value'] += 1
                    continue
                # Check coverage and frequency; columns 4/5 are normal
                # ref/var read counts, columns 8/9 are tumour ref/var
                covNormal = int(varData[4]) + int(varData[5])
                freqNormal = int(varData[5]) / float(covNormal)
                covTumour = int(varData[8]) + int(varData[9])
                freqTumour = int(varData[9]) / float(covTumour)
                varTumour = int(varData[9])
                if covTumour < minCovTumour:
                    logData['Tumour coverage'] += 1
                    continue
                if freqTumour < minFreqTumour:
                    logData['Tumour frequency'] += 1
                    continue
                if varTumour < minVarTumour:
                    logData['Tumour count'] += 1
                    continue
                if covNormal < minCovNormal:
                    logData['Normal coverage'] += 1
                    continue
                if freqNormal > maxFreqNormal:
                    logData['Normal frequency'] += 1
                    continue
                # Check flanking mutations: the variant's own position is
                # in varPos, hence the '- 1' when counting neighbours
                chrom = varData[0]
                start = int(varData[1]) - flank
                end = int(varData[1]) + flank
                startIndex = bisect.bisect_left(varPos[chrom], start)
                endIndex = bisect.bisect_right(varPos[chrom], end,
                    lo = startIndex)
                neighbourCount = (endIndex - startIndex) - 1
                if neighbourCount > maxNeighbour:
                    logData['Neighbours'] += 1
                    continue
                # Write output line
                logData['Passed filters'] += 1
                varout.write(line)
    # Return log
    return(logData)
def filterSomatic(
        inFile, outFile, minCov = 10, minReads = 2, minStrands = 1,
        minAvgQ = 10, minVarFreq = 0.1, pValue = 0.05, indelFile = None,
        javaPath = 'java', varscanPath = 'varscan.jar'
    ):
    ''' Build the shell command for varscan somaticFilter.

    1) minCov - Minimum read depth.
    2) minReads - Minimum supporting reads for a variant.
    3) minStrands - Minimum number of strands on which variant observed.
    4) minAvgQ - Minimum average base quality for variant-supporting reads.
    5) minVarFreq - Minimum variant allele frequency threshold.
    6) pValue - Default p-value threshold for calling variants.
    7) indelFile - File of indels for filtering nearby SNPs.
    8) outFile - Output file for filtered variants.
    '''
    # Validate numeric thresholds
    toolbox.check_var(minCov, 'int', mn = 1)
    toolbox.check_var(minReads, 'int', mn = 1)
    toolbox.check_var(minStrands, 'int', mn = 1, mx = 2)
    toolbox.check_var(minAvgQ, 'int', mn = 2)
    toolbox.check_var(minVarFreq, 'num', gt = 0, mx = 1)
    toolbox.check_var(pValue, 'num', gt = 0, mx = 1)
    # Assemble the varscan somaticFilter invocation piece by piece
    command = [javaPath, '-jar', varscanPath, 'somaticFilter', inFile]
    command += ['--min-coverage', str(minCov)]
    command += ['--min-reads2', str(minReads)]
    command += ['--min-strands2', str(minStrands)]
    command += ['--min-avg-qual', str(minAvgQ)]
    command += ['--min-var-freq', str(minVarFreq)]
    command += ['--p-value', str(pValue)]
    command += ['--output-file', outFile]
    # Append indel file if supplied
    if indelFile:
        command += ['--indel-file', indelFile]
    # Return the joined command string
    return ' '.join(command)
def copynumber( mpileup1, mpileup2, outPrefix, minBaseQ = 20, minMapQ = 20, minCov = 20, minSegSize = 10, maxSegSize = 100, pValue = 0.01, dataRatio = None, javaPath = 'java', varscanPath = 'varscan.jar' ): ''' Function to generate command to perform copynumber calling using the varscan program. Function takes the following X arguments: 1) pileup - The normal-tumour pileup. 2) outPrefix - The prefix of the output files. 3) minBaseQ - Minimum base quality for coverage. 4) minMapQ - Minimum read mapping quality for coverage. 5) minCov - Minimum coverage for copynumber segments. 6) minSegSize - Minimum segment size. 7) maxSegSize - Maximum segment size. 8) pValue - P-value for significant copynumber change-point. 9) dataRatio - The normal/tumor input data ratio. ''' # Check commands toolbox.check_var(minBaseQ, 'int', gt = 0) toolbox.check_var(minMapQ, 'int', gt = 0) toolbox.check_var(minCov, 'int', gt = 0) toolbox.check_var(minSegSize, 'int', gt = 0) toolbox.check_var(maxSegSize, 'int', mn = minSegSize) toolbox.check_var(pValue, 'num', mn = 0, mx = 1) toolbox.check_var(dataRatio, 'num', mn = 0.01, mx = 100) # Create command to calculate depth if required if dataRatio is None: ratioCommand = 'R=$(%s) && echo "Ratio: $R"' %( calcRatio(mpileup1, mpileup2)) dataRatio = '$R' else: ratioCommand = '' # Create copy number command copyCommand = [ javaPath, '-jar', varscanPath, 'copynumber', mpileup1, mpileup2, outPrefix, '--min-base-qual', str(minBaseQ), '--min-map-qual', str(minMapQ), '--min-coverage', str(minCov), '--min-segment-size', str(minSegSize), '--max-segment-size', str(maxSegSize), '--p-value', str(pValue), '--data-ratio', str(dataRatio) ] # Combine and return commands if ratioCommand: return('%s && %s' %(ratioCommand, ' '.join(copyCommand))) else: return(' '.join(copyCommand))
def somatic(
        mpileup1, mpileup2, outPrefix, purity = 0.5, minCovNormal = 8,
        minCovTumour = 6, minHetFreq = 0.1, minHomFreq = 0.75,
        normalPurity = 1.0, tumourPurity = 0.5, pValueHet = 0.99,
        pValueSomatic = 0.05, strandFilter = False, javaPath = 'java',
        varscanPath = 'varscan.jar'
    ):
    ''' Generate command to perform somatic variant calling using the
    varscan program.

    Args:
        mpileup1 (str): Normal sample mpileup file.
        mpileup2 (str): Tumour sample mpileup file.
        outPrefix (str): Prefix of the output files.
        purity (num): Validated but not passed to varscan -- see note.
        minCovNormal (int): Minimum normal-sample coverage.
        minCovTumour (int): Minimum tumour-sample coverage.
        minHetFreq (num): Minimum variant frequency to call heterozygote.
        minHomFreq (num): Minimum frequency to call homozygote.
        normalPurity (num): Estimated purity of normal sample.
        tumourPurity (num): Estimated purity of tumour sample.
        pValueHet (num): P-value threshold for heterozygote calling.
        pValueSomatic (num): P-value threshold for somatic calling.
        strandFilter (bool): Apply varscan's strand filter.
        javaPath (str): Path to java executable.
        varscanPath (str): Path to varscan jar file.

    Returns:
        str: The shell command to run varscan somatic.
    '''
    # Check arguments
    # NOTE(review): purity is validated but never used in the command --
    # confirm whether it should map to a varscan option.
    toolbox.check_var(purity, 'num', gt = 0, mx = 1)
    toolbox.check_var(minCovNormal, 'int', gt = 0)
    toolbox.check_var(minCovTumour, 'int', gt = 0)
    toolbox.check_var(minHetFreq, 'num', gt = 0, lt = 1)
    toolbox.check_var(minHomFreq, 'num', gt = 0, mx = 1)
    toolbox.check_var(normalPurity, 'num', gt = 0, mx = 1)
    # Fixed: normalPurity was previously checked twice while
    # tumourPurity was never validated
    toolbox.check_var(tumourPurity, 'num', gt = 0, mx = 1)
    toolbox.check_var(pValueHet, 'num', mn = 0, mx = 1)
    toolbox.check_var(pValueSomatic, 'num', mn = 0, mx = 1)
    toolbox.check_var(strandFilter, 'bool')
    toolbox.check_var(javaPath, 'file')
    toolbox.check_var(varscanPath, 'file')
    # Create command
    command = [
        javaPath, '-jar', varscanPath, 'somatic', mpileup1, mpileup2,
        outPrefix, '--min-coverage-normal', str(minCovNormal),
        '--min-coverage-tumor', str(minCovTumour), '--min-var-freq',
        str(minHetFreq), '--min-freq-for-hom', str(minHomFreq),
        '--normal-purity', str(normalPurity), '--tumor-purity',
        str(tumourPurity), '--p-value', str(pValueHet),
        '--somatic-p-value', str(pValueSomatic)
    ]
    # Add optional strand filter
    if strandFilter:
        command.extend(['--strand-filter', '1'])
    # Join and return command
    command = ' '.join(command)
    return(command)
def calculateVariantMetrics(variantList, bamList, sampleNames, annovarPath,
        buildver, database, tempprefix, minMapQ=20, minBaseQ=20,
        groupdel=False, altQualNormal=None, homo=True, complexity=True,
        fasta=None):
    ''' Function calculates metrics for variants across multiple samples.

    Args:
        variantList (list): A list of four element tuples that list the
            chrom, position, reference and variant.
        bamList (list): A list of BAM files from which to extract
            variant annotation. The first BAM is treated as the
            reference/normal sample.
        sampleNames (list): A list of sample names for each of the BAM
            files.
        annovarPath (str): Path to annovar executable.
        buildver (str): Genome build to use for annotation.
        database (str): Database to use for annotation.
        tempprefix (str): Prefix for annovar temporary files.
        minMapQ (int): Minimum mapping quality for counted bases.
        minBaseQ (int): Minimum base quality for counted bases.
        groupdel (bool): Passed through to base-call extraction.
        altQualNormal (int): Optional alternative base-quality cutoff
            applied to the normal sample only, reported as '_altfreq'.
        homo (bool): Whether to annotate indels for overlapping
            homopolymers (requires fasta).
        complexity (bool): Whether to annotate variants for complexity
            using soft-masking in the FASTA file.
        fasta (str): Full path to FASTA file. Required for homopolymer
            annotation.

    Returns:
        pandas.DataFrame: Per-variant counts, frequencies, Fisher exact
            p-values against the reference sample and annovar gene
            annotation, sorted by minimum p-value.
    '''
    # Check arguments
    toolbox.check_var(minMapQ, 'int', mn=0)
    toolbox.check_var(minBaseQ, 'int', mn=0)
    toolbox.check_var(groupdel, 'bool')
    # NOTE(review): check_var is applied to altQualNormal even when it
    # is None -- confirm toolbox.check_var accepts None for 'int'.
    toolbox.check_var(altQualNormal, 'int', mn=2)
    # Check supplied names (fixed typo in error message)
    if not isinstance(sampleNames, (list, tuple)):
        raise IOError('sampleNames must be a list or a tuple')
    if len(sampleNames) != len(bamList):
        raise IOError('Must be a sample name for each BAM')
    # Create output dataframe indexed by 'chrom:pos:ref:var' strings
    varnames = [':'.join(map(str, x)) for x in variantList]
    outputData = pd.DataFrame(index=varnames,
        columns=['chr', 'pos', 'ref', 'var', 'minp'])
    for x in varnames:
        chrom, position, ref, var = x.split(':')
        outputData.loc[x] = [chrom, int(position), ref, var, 1]
    # Add homopolymer annotation (parameter renamed from the mangled
    # 'h**o' identifier, which was a syntax error)
    if homo:
        homoData = homo_annotate(fasta, variantList, flank=100)
        outputData = pd.concat([outputData, homoData], axis=1)
    # Add sequence-complexity annotation
    if complexity:
        compData = comp_annotate(fasta, variantList)
        outputData = pd.concat([outputData, compData], axis=1)
    # Launch one count-extraction process per BAM, keyed by role
    processDict = {}
    for number, (name, bamFile) in enumerate(zip(sampleNames, bamList)):
        # Process reference sample
        if number == 0:
            # Create pipes and process
            pipeRecv, pipeSend = multiprocessing.Pipe(False)
            process = multiprocessing.Process(
                target=extractVariantCountsProcess,
                args=(variantList, bamFile, pipeSend, minMapQ, minBaseQ,
                    groupdel))
            process.start()
            pipeSend.close()
            # Store data
            processDict['reference'] = (name, pipeRecv, process)
            # Add extra process for reduced base quality in normal
            if altQualNormal:
                pipeRecv, pipeSend = multiprocessing.Pipe(False)
                process = multiprocessing.Process(
                    target=extractVariantCountsProcess,
                    args=(variantList, bamFile, pipeSend, minMapQ,
                        altQualNormal, groupdel))
                process.start()
                pipeSend.close()
                processDict['altref'] = (name, pipeRecv, process)
        # Process non-reference samples
        else:
            pipeRecv, pipeSend = multiprocessing.Pipe(False)
            process = multiprocessing.Process(
                target=extractVariantCountsProcess,
                args=(variantList, bamFile, pipeSend, minMapQ, minBaseQ,
                    groupdel))
            process.start()
            pipeSend.close()
            processDict[number] = (name, pipeRecv, process)
    # Collect data for the reference sample first
    name, pipe, process = processDict.pop('reference')
    sampleData = pipe.recv()
    pipe.close()
    process.join()
    # Add data for reference to output
    outputData[name + '_ref'] = sampleData['refcount']
    outputData[name + '_var'] = sampleData['varcount']
    outputData[name + '_freq'] = sampleData['varcount'] / (
        sampleData['refcount'] + sampleData['varcount'])
    outputData[name + '_mapq'] = sampleData['mapqual']
    # Keep reference columns for the Fisher tests below
    normRef = outputData[name + '_ref']
    normVar = outputData[name + '_var']
    normFreq = outputData[name + '_freq']
    # Collect data for the alternative-quality normal, if requested
    if 'altref' in processDict:
        name, pipe, process = processDict.pop('altref')
        sampleData = pipe.recv()
        pipe.close()
        process.join()
        outputData[name + '_altfreq'] = sampleData['varcount'] / (
            sampleData['refcount'] + sampleData['varcount'])
    # Collect variants for each remaining BAM
    for key in processDict:
        name, pipe, process = processDict[key]
        sampleData = pipe.recv()
        pipe.close()
        process.join()
        # Add data to output
        outputData[name + '_ref'] = sampleData['refcount']
        outputData[name + '_var'] = sampleData['varcount']
        outputData[name + '_freq'] = sampleData['varcount'] / (
            sampleData['refcount'] + sampleData['varcount'])
        outputData[name + '_mapq'] = sampleData['mapqual']
        # Calculate one-sided Fisher p-value in the direction of the
        # observed frequency change relative to the reference sample
        pvalue = []
        for freq, normal, sample in zip(
                zip(normFreq, outputData[name + '_freq']),
                zip(normRef, normVar),
                zip(outputData[name + '_ref'],
                    outputData[name + '_var'])):
            if freq[0] < freq[1]:
                pvalue.append(fisher_exact(
                    [normal, sample], alternative='greater')[1])
            elif freq[0] > freq[1]:
                pvalue.append(fisher_exact(
                    [normal, sample], alternative='less')[1])
            else:
                pvalue.append(fisher_exact(
                    [normal, sample], alternative='two-sided')[1])
        outputData[name + '_pvalue'] = pvalue
    # Calculate minimum p-value across samples
    pvalueIndex = [x.endswith('pvalue') for x in outputData.columns]
    minp = outputData.loc[:, pvalueIndex].min(1)
    outputData['minp'] = minp
    # Add annovar annotation, sort by minimum p-value and return
    geneAnno = annovar.geneAnno2DF(variantList=variantList,
        path=annovarPath, buildver=buildver, database=database,
        tempprefix=tempprefix)
    outputData = pd.concat([outputData, geneAnno], axis=1)
    outputData.sort_values('minp', inplace=True)
    return (outputData)
def extractVariantCountsProcess(variantList, bamFile, pipe, minMapQ=20,
        minBaseQ=20, groupdel=False):
    ''' Extract per-variant base counts from a BAM file and send them
    down a pipe; intended to run in a worker process.

    Args:
        variantList: tuples/lists of (chromosome, position, reference,
            variant), e.g. [('chr1', 1, 'A', 'T'), ('chr2', 100, 'C', 'G')].
        bamFile: Full path to BAM file.
        pipe: Writable multiprocessing pipe end; closed after sending.
        minMapQ: Minimum mapping quality of read to extract base.
        minBaseQ: Minimum base quality to extract base.
        groupdel: Passed through to base-call extraction.

    Sends a pandas dataframe with columns:
        refcount - Count of the reference calls.
        reffor - Count of reference reads on the forward strand.
        varcount - Count of the variant calls.
        varfor - Count of variant reads on the forward strand.
        mapqual - Mean mapping score of ALL reads spanning the position.
    '''
    # check arguments
    toolbox.check_var(minMapQ, 'int', mn=0)
    toolbox.check_var(minBaseQ, 'int', mn=0)
    toolbox.check_var(groupdel, 'bool')
    # Build output dataframe indexed by 'chrom:pos:ref:var' strings
    variantNames = [':'.join(map(str, x)) for x in variantList]
    outData = pd.DataFrame(
        columns=['refcount', 'reffor', 'varcount', 'varfor', 'mapqual'],
        index=variantNames)
    # Open bamFile and tally counts for each variant
    bam = pysam.AlignmentFile(bamFile)
    for name, (chrom, position, reference, variant) in zip(
            variantNames, variantList):
        # Extract base data at the variant position
        baseCounts, mapqual = extractPosition(
            openBam=bam, chrom=chrom, position=position, minMapQ=minMapQ,
            minBaseQ=minBaseQ, groupdel=groupdel)
        # Unpack (count, forward-count) pairs, defaulting to zero when
        # the allele was not observed
        refcount, reffor = baseCounts.get(reference, (0, 0))
        varcount, varfor = baseCounts.get(variant, (0, 0))
        outData.loc[name] = [refcount, reffor, varcount, varfor, mapqual]
    bam.close()
    # Send data down pipe and close
    pipe.send(outData)
    pipe.close()
--threads=<threads> Number of threads [default: 4] --singleend Only single-end sequencing performed --genomebam Generate genome bam --markdup Mark duplicates on genome bam --help Output this message """ # Import required modules import os from ngs_python.fastq import fastqFind, fastqAlign from general_python import docopt, toolbox, moab # Extract and process arguments args = docopt.docopt(__doc__, version='v1') args['--threads'] = int(args['--threads']) args['--forprob'] = float(args['--forprob']) toolbox.check_var(args['--forprob'], 'num', mn=0, mx=1) inDir, inPrefix = os.path.split(args['<inprefix>']) outDir = os.path.join(args['<outdir>'], args['<samplename>']) if not os.path.isdir(outDir): os.mkdir(outDir) outPrefix = os.path.join(outDir, args['<samplename>']) outLog = outPrefix + '.rsem.log' # Create job dictionary # Extract fastq files and generate output file names read1, read2 = fastqFind.findFastq(prefix=inPrefix, dirList=[inDir], pair=True, gzip=True) rsemCommand = fastqAlign.rsemBowtie2Align(index=args['<index>'], outPrefix=outPrefix,
# Import custom modules from ngs_python.fastq import fastqTrim, fastqQC, fastqAlign, fastqFind from ngs_python.bam import samtools, picard, bamQC from general_python import moab, docopt, toolbox # Print command print '%s\n' % (' '.join(sys.argv)) ############################################################################### ## Process command line arguments and create output directories ############################################################################### # Extract arguments args = docopt.docopt(__doc__, version='v1') # Extract sample prefix and name = args['<sampledata>'].split(',') args['prefix'], args['name'] = args['<sampledata>'].split(',') # Check supplied files toolbox.check_var(args['<gtf>'], 'file') toolbox.check_var(args['<rrna>'], 'file') # Extract fastq files and check if args['--singleend']: args['read1'] = fastqFind.findFastq(prefix=args['prefix'], dirList=args['<indir>'].split(','), pair=False) else: args['read1'], args['read2'] = fastqFind.findFastq( prefix=args['prefix'], dirList=args['<indir>'].split(','), pair=True) if len(args['read1']) != len(args['read2']): raise IOError('Unequal number of FASTQ files identified') if len(args['read1']) < 1: raise IOError('Insufficient number of FASTQ files identified') # Convert numerical arguments args['--threads'] = int(args['--threads'])
""" # Import required modules import os import re import numpy as np from ngs_python.structure import interactionMatrix from general_python import docopt, toolbox # Extract arguments args = docopt.docopt(__doc__,version = 'v1') # Check numerical arguments args['--threads'] = int(args['--threads']) if args['nobed']: args['<binsize>'] = int(args['<binsize>']) # Check input files toolbox.check_var(args['<infile>'], 'file') if args['bed']: toolbox.check_var(args['<bedfile>'], 'file') else: toolbox.check_var(args['<chrfile>'], 'file') # Extract and print parameters to create bins if args['bed']: binData = args['<bedfile>'] print '\nParameters:\n %s\n' %( 'bed file provided', ) else: binData = (args['<chrfile>'], args['<binsize>'], args['--equal']) print '\nParameters:\n %s\n %s\n' %( 'max bin size: %s' %(args['<binsize>']), 'bin size equal: %s' %(args['--equal'])
# Import custom modules from ngs_python.fastq import fastqTrim, fastqQC, fastqAlign, fastqFind from ngs_python.bam import samtools, picard, bamQC from general_python import moab, docopt, toolbox # Print command print '%s\n' %(' '.join(sys.argv)) ############################################################################### ## Process command line arguments and create output directories ############################################################################### # Extract arguments args = docopt.docopt(__doc__,version = 'v1') # Extract sample prefix and name = args['<sampledata>'].split(',') args['prefix'], args['name'] = args['<sampledata>'].split(',') # Check supplied files toolbox.check_var(args['<gtf>'], 'file') toolbox.check_var(args['<rrna>'], 'file') # Extract fastq files and check if args['--singleend']: args['read1'] = fastqFind.findFastq( prefix = args['prefix'], dirList = args['<indir>'].split(','), pair = False ) else: args['read1'], args['read2'] = fastqFind.findFastq( prefix = args['prefix'], dirList = args['<indir>'].split(','), pair = True ) if len(args['read1']) != len(args['read2']):
def filterVarscan(inFile, outFile, filterFile=None, minCovNormal=10,
                  minCovTumour=10, minFreqTumour=0.05, maxFreqNormal=1,
                  minVarTumour=2, maxPvalue=0.05, somatic=True, flank=25,
                  maxNeighbour=0):
    ''' Filter variants in a varscan somatic output file.

    Args:
        inFile (str): Input varscan variant file (tab delimited with a
            single header line).
        outFile (str): Output file for variants passing all filters.
        filterFile (str): Optional additional variant file whose
            positions also count towards the neighbour filter.
        minCovNormal (int): Minimum normal sample coverage.
        minCovTumour (int): Minimum tumour sample coverage.
        minFreqTumour (num): Minimum tumour variant frequency.
        maxFreqNormal (num): Maximum normal variant frequency.
        minVarTumour (int): Minimum tumour variant read count.
        maxPvalue (num): Maximum somatic p-value.
        somatic (bool): Only keep variants flagged 'Somatic'.
        flank (int): Distance either side of a variant searched for
            neighbouring variants.
        maxNeighbour (int): Maximum allowed neighbouring variants.

    Returns:
        collections.OrderedDict: Count of variants removed by each
            filter, plus totals read and passed.
    '''
    # Create counter of filtering outcomes
    logData = collections.OrderedDict([('Total', 0),
                                       ('Somatic status', 0),
                                       ('P-value', 0),
                                       ('Tumour coverage', 0),
                                       ('Tumour frequency', 0),
                                       ('Tumour count', 0),
                                       ('Normal coverage', 0),
                                       ('Normal frequency', 0),
                                       ('Neighbours', 0),
                                       ('Passed filters', 0)])
    # Check variables
    # NOTE(review): check_var is applied to filterFile even when it is
    # None -- confirm toolbox.check_var accepts None for 'file'.
    toolbox.check_var(inFile, 'file')
    toolbox.check_var(filterFile, 'file')
    toolbox.check_var(minCovNormal, 'int', mn=1)
    toolbox.check_var(minCovTumour, 'int', mn=1)
    toolbox.check_var(minFreqTumour, 'num', gt=0, mx=1)
    toolbox.check_var(maxFreqNormal, 'num', mn=0, mx=1)
    toolbox.check_var(minVarTumour, 'int', mn=1)
    toolbox.check_var(maxPvalue, 'num', gt=0)
    toolbox.check_var(somatic, 'bool')
    toolbox.check_var(flank, 'int', mn=0)
    toolbox.check_var(maxNeighbour, 'int', mn=0)
    # Create dictionary to store variant positions per chromosome
    varPos = {}
    # Extract coordinates for neighbour filtering from both files
    for varFile in [inFile, filterFile]:
        if varFile is None:
            continue
        with open(varFile) as varIn:
            # Skip header line (Python 2 file iterator)
            header = varIn.next()
            for line in varIn:
                chrom, pos = line.split('\t')[:2]
                if chrom in varPos:
                    varPos[chrom].append(int(pos))
                else:
                    varPos[chrom] = [int(pos)]
    # Sort positions so bisect can be used below
    for key in varPos:
        varPos[key].sort()
    # Open input and output files
    with open(inFile) as varin:
        with open(outFile, 'w') as varout:
            # Copy header straight through
            varout.write(varin.next())
            # Loop through input
            for line in varin:
                # Count and extract data
                logData['Total'] += 1
                varData = line.split('\t')
                # Check somatic status and p-value
                status = str(varData[12])
                pValue = float(varData[14])
                if somatic and status != 'Somatic':
                    logData['Somatic status'] += 1
                    continue
                if pValue > maxPvalue:
                    logData['P-value'] += 1
                    continue
                # Check coverage and frequency; columns 4/5 are normal
                # ref/var read counts, columns 8/9 are tumour ref/var
                covNormal = int(varData[4]) + int(varData[5])
                freqNormal = int(varData[5]) / float(covNormal)
                covTumour = int(varData[8]) + int(varData[9])
                freqTumour = int(varData[9]) / float(covTumour)
                varTumour = int(varData[9])
                if covTumour < minCovTumour:
                    logData['Tumour coverage'] += 1
                    continue
                if freqTumour < minFreqTumour:
                    logData['Tumour frequency'] += 1
                    continue
                if varTumour < minVarTumour:
                    logData['Tumour count'] += 1
                    continue
                if covNormal < minCovNormal:
                    logData['Normal coverage'] += 1
                    continue
                if freqNormal > maxFreqNormal:
                    logData['Normal frequency'] += 1
                    continue
                # Check flanking mutations: the variant's own position is
                # in varPos, hence the '- 1' when counting neighbours
                chrom = varData[0]
                start = int(varData[1]) - flank
                end = int(varData[1]) + flank
                startIndex = bisect.bisect_left(varPos[chrom], start)
                endIndex = bisect.bisect_right(varPos[chrom], end,
                                               lo=startIndex)
                neighbourCount = (endIndex - startIndex) - 1
                if neighbourCount > maxNeighbour:
                    logData['Neighbours'] += 1
                    continue
                # Write output line
                logData['Passed filters'] += 1
                varout.write(line)
    # Return log
    return (logData)
def filterSomatic(inFile, outFile, minCov=10, minReads=2, minStrands=1,
                  minAvgQ=10, minVarFreq=0.1, pValue=0.05, indelFile=None,
                  javaPath='java', varscanPath='varscan.jar'):
    ''' Build the shell command for varscan somaticFilter.

    1) minCov - Minimum read depth.
    2) minReads - Minimum supporting reads for a variant.
    3) minStrands - Minimum number of strands on which variant observed.
    4) minAvgQ - Minimum average base quality for variant-supporting reads.
    5) minVarFreq - Minimum variant allele frequency threshold.
    6) pValue - Default p-value threshold for calling variants.
    7) indelFile - File of indels for filtering nearby SNPs.
    8) outFile - Output file for filtered variants.
    '''
    # Validate numeric thresholds
    toolbox.check_var(minCov, 'int', mn=1)
    toolbox.check_var(minReads, 'int', mn=1)
    toolbox.check_var(minStrands, 'int', mn=1, mx=2)
    toolbox.check_var(minAvgQ, 'int', mn=2)
    toolbox.check_var(minVarFreq, 'num', gt=0, mx=1)
    toolbox.check_var(pValue, 'num', gt=0, mx=1)
    # Pair each option flag with its value, in command-line order
    options = [
        ('--min-coverage', minCov),
        ('--min-reads2', minReads),
        ('--min-strands2', minStrands),
        ('--min-avg-qual', minAvgQ),
        ('--min-var-freq', minVarFreq),
        ('--p-value', pValue),
    ]
    # Assemble the varscan somaticFilter invocation
    command = [javaPath, '-jar', varscanPath, 'somaticFilter', inFile]
    for flag, value in options:
        command.extend([flag, str(value)])
    command.extend(['--output-file', outFile])
    # Append indel file if supplied
    if indelFile:
        command.extend(['--indel-file', indelFile])
    # Return the joined command string
    return ' '.join(command)
--threads=<threads> Number of threads [default: 4] --singleend Only single-end sequencing performed --genomebam Generate genome bam --markdup Mark duplicates on genome bam --help Output this message """ # Import required modules import os from ngs_python.fastq import fastqFind, fastqAlign from general_python import docopt, toolbox, moab # Extract and process arguments args = docopt.docopt(__doc__,version = 'v1') args['--threads'] = int(args['--threads']) args['--forprob'] = float(args['--forprob']) toolbox.check_var(args['--forprob'], 'num', mn = 0, mx = 1) inDir, inPrefix = os.path.split(args['<inprefix>']) outDir = os.path.join(args['<outdir>'], args['<samplename>']) if not os.path.isdir(outDir): os.mkdir(outDir) outPrefix = os.path.join(outDir, args['<samplename>']) outLog = outPrefix + '.rsem.log' # Create job dictionary # Extract fastq files and generate output file names read1, read2 = fastqFind.findFastq(prefix = inPrefix, dirList = [inDir], pair = True, gzip = True) rsemCommand = fastqAlign.rsemBowtie2Align(index = args['<index>'], outPrefix = outPrefix, read1 = read1, read2 = read2, rsemPath = args['--rsem'], bowtie2Path = args['--bowtie2'], threads = args['--threads'], forProb = args['--forprob'],