Code example #1
def submit(self, verbose = False):
    # Check arguments
    toolbox.check_var(verbose, 'bool')
    # Create moab list
    moabList = []
    # Extract commands and parameters
    for commandNo in self.commandDict:
        # Unpack command and parameters
        command, processors, stdout, stderr, dependency = (
            self.commandDict[commandNo])
        # Create msub command and add node information
        msubCommand = ['/opt/moab/bin/msub', '-l']
        msubCommand.append('nodes=1:babs:ppn=%s' %(processors))
        # Add output information
        if stdout == stderr:
            msubCommand.extend(['-j', 'oe', '-o', stdout])
        else:
            msubCommand.extend(['-o', stdout, '-e', stderr])
        # Add dependencies
        dependList = []
        for d in dependency:
            dependList.append(moabList[d][0])
        if dependList:
            depend = 'x=depend:afterok:%s' %(':'.join(dependList))
            msubCommand.extend(['-W', depend])
        # Create output variable for function
        moabID = None
        # Try to submit the job up to ten times
        for _ in range(10):
            # Create msub process
            msubProcess = subprocess.Popen(
                msubCommand,
                stdin = subprocess.PIPE,
                stdout = subprocess.PIPE,
                stderr = subprocess.PIPE
            )
            # Submit command
            moab = msubProcess.communicate(input=command)[0]
            # Search for returned Moab ID
            moabMatch = re.match(r'^\s+(Moab\.\d+)\s+$', moab)
            # Check that Moab ID has been returned or wait and repeat
            if moabMatch:
                moabID = moabMatch.group(1)
                break
            else:
                time.sleep(10)
        # Store successfully submitted commands
        if moabID:
            moabList.append((moabID, command, processors, stdout, stderr,
                ' '.join(dependList)))
        else:
            raise IOError('Could not submit moab job')
        # Print commands if requested
        if verbose:
            print 'JOB ID: %s\nCOMMAND: %s\nPROCESSORS: %s\nSTDOUT: %s\n'\
                'STDERR: %s\nDEPENDENCIES: %s\n' %(moabID, command,
                processors, stdout, stderr, ' '.join(dependList))
    # Return the list of submitted jobs
    return(moabList)
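A minimal driver sketch for the method above. The class that owns submit() is not shown in this example, so the class name below is hypothetical; the structure of commandDict follows the unpacking at the top of the loop.

# Hypothetical usage: 'moabJobs' is an assumed class name; each commandDict
# entry is a (command, processors, stdout, stderr, dependencies) tuple.
scheduler = moabJobs()
scheduler.commandDict = {
    0: ('echo first', 1, 'job0.log', 'job0.log', []),
    1: ('echo second', 2, 'job1.out', 'job1.err', [0]),  # waits on job 0
}
jobs = scheduler.submit(verbose=True)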
Code example #2
File: pysamfunc.py Project: hjanime/ngs_python
def indel_homopolymer(sequence, position, ref, var, insertion):
    # Check arguments
    toolbox.check_var(sequence, 'str')
    toolbox.check_var(position, 'int')
    toolbox.check_var(ref, 'str')
    toolbox.check_var(var, 'str')
    toolbox.check_var(insertion, 'bool')
    # Convert sequences to uppercase
    sequence = sequence.upper()
    ref = ref.upper()
    var = var.upper()
    # Check that first base of indel is centre of reference sequence
    if sequence[position] != ref:
        raise ValueError(
            'Position must be first base of indel (1-based index)')
    # Check that deletions are contained within the reference
    if not insertion and len(sequence) - position - len(var) < -1:
        raise ValueError('Deletion must be contained within reference')
    # Find homopolymer for insertion
    if insertion:
        # Extract insertion sequence and find any sub monomers
        indelSeq = var[1:]
        indelMonomer, monomerNo = toolbox.find_monomer(indelSeq)
        # Add insertion to reference
        inserted = sequence[:position] + indelMonomer + sequence[position:]
        # Create regex to find homopolymer
        regex = '^.{0,%s}?((%s)+).{0,%s}?$' % (position, indelMonomer,
                                               len(sequence) - position)
        # Find homopolymer and extract length
        homo = re.match(regex, inserted).group(1)
        homoLengthRef = (len(homo) / len(indelMonomer)) - 1
        homoLengthVar = homoLengthRef + monomerNo
    # Find homopolymer for deletion
    else:
        # Extract deleted sequence
        indelSeq = sequence[position:][:len(var)]
        indelMonomer, monomerNo = toolbox.find_monomer(indelSeq)
        # Create regex to find homopolymer
        regex = '^.{0,%s}?((%s)+).{0,%s}?$' % (
            position, indelMonomer, len(sequence) - position - len(indelSeq))
        # Find homopolymer and extract length
        homo = re.match(regex, sequence).group(1)
        homoLengthRef = len(homo) / len(indelMonomer)
        homoLengthVar = homoLengthRef - monomerNo
    # Return data
    return (indelMonomer, homoLengthRef, homoLengthVar)
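A hypothetical worked call, assuming toolbox.find_monomer('AA') returns the repeated monomer and its repeat count, i.e. ('A', 2). Note that the position is used as a 0-based index by the sequence[position] == ref check, despite the 1-based wording in the error message.

# Insert 'AA' after the first base of the AAA run in ACGTAAAG (position 4),
# growing the homopolymer from 3 bases to 5.
monomer, refLen, varLen = indel_homopolymer(
    'ACGTAAAG', 4, 'A', 'AAA', insertion=True)
print monomer, refLen, varLen  # expected: A 3 5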
Code example #3
File: pysamfunc.py Project: hjanime/ngs_python
def extractPosition(openBam,
                    chrom,
                    position,
                    minMapQ=20,
                    minBaseQ=20,
                    groupdel=False):
    # check arguments
    toolbox.check_var(chrom, 'str')
    toolbox.check_var(position, 'int', mn=1)
    toolbox.check_var(minMapQ, 'int', mn=0)
    toolbox.check_var(minBaseQ, 'int', mn=0)
    toolbox.check_var(groupdel, 'bool')
    # Set variables for mapping
    mapQuality = []
    baseCounts = {}
    # Loop through reads covering the position
    for read in openBam.fetch(chrom, position - 1, position):
        # Store mapping quality and skip reads with low values
        mapQuality.append(read.mapping_quality)
        if mapQuality[-1] < minMapQ:
            continue
        # Extract base calls for read
        baseDict = baseCalls(read, groupdel)
        # Extract reads for base of interest or skip
        try:
            base, quality = baseDict[position - 1]
        except KeyError:
            continue
        # Skip bases of poor quality
        if quality < minBaseQ:
            continue
        # Add base to base dictionary
        if base in baseCounts:
            baseCounts[base][0] += 1
        else:
            baseCounts[base] = [1, 0]
        # Add forward strand count to base dictionary
        if not read.is_reverse:
            baseCounts[base][1] += 1
    # Calculate and return results
    if len(mapQuality) > 0:
        meanMap = float(sum(mapQuality)) / len(mapQuality)
    else:
        meanMap = 0
    return (baseCounts, meanMap)
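A minimal usage sketch; the BAM path is a placeholder, and the file must be coordinate-sorted and indexed for pysam's fetch to work:

import pysam
bam = pysam.AlignmentFile('sample.bam')  # placeholder path
baseCounts, meanMapQ = extractPosition(bam, 'chr1', 1000)
# baseCounts maps each base to [total count, forward-strand count]
for base, (count, forward) in baseCounts.items():
    print base, count, forward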
Code example #4
File: pysamfunc.py Project: hjanime/ngs_python
def extract_fasta(fasta, chrom, start, end):
    ''' Function to extract sequence from FASTA file using the pysam
    module.
    
    Args:
        fasta: string of full path to faidx indexed FASTA file or open
            pysam.FastaFile.
        chrom (str): name of chromosome.
        start (int): start of sequence to extract (1-based index).
        end (int): end of sequence to extract (1-based index).
        
    Returns:
        str: a string of the reference sequence.
    
    Raises:
        ValueError: If desired interval not contained on chromosome.
    
    '''
    # Check arguments
    toolbox.check_var(chrom, 'str')
    toolbox.check_var(start, 'int', mn=1)
    toolbox.check_var(end, 'int', mn=start)
    # Open FASTA if string supplied
    if isinstance(fasta, str):
        fasta = pysam.FastaFile(fasta)
    # Extract chromosome length and check end value
    chromLength = fasta.get_reference_length(chrom)
    if end > chromLength:
        raise ValueError('Interval extends beyond chromosome')
    # Extract and return sequence
    seq = fasta.fetch(chrom, start - 1, end)
    return (seq)
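Usage is direct; the FASTA path below is a placeholder and must have a faidx (.fai) index, as the docstring notes:

seq = extract_fasta('genome.fa', 'chr1', 100, 150)  # placeholder path
print len(seq)  # 51: the interval is 1-based and inclusive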
Code example #5
File: pysamfunc.py Project: hjanime/ngs_python
def extractPositionComplete(openBam, chrom, position, groupdel=False):
    '''
    Function to extract all mapped base information from
    chromosomal position
    '''
    # check arguments
    toolbox.check_var(chrom, 'str')
    toolbox.check_var(position, 'int', mn=1)
    toolbox.check_var(groupdel, 'bool')
    # Set variables for mapping
    baseCounts = {}
    # Loop through reads covering the position
    for read in openBam.fetch(chrom, position - 1, position):
        # Skip unmapped reads
        if read.is_unmapped:
            continue
        # Extract base calls for read
        baseDict = baseCalls(read, groupdel)
        # Extract reads for base of interest or skip
        try:
            base, baseQ = baseDict[position - 1]
        except KeyError:
            continue
        # Extract strand and mapping quality
        mapQ = read.mapping_quality
        strand = '-' if read.is_reverse else '+'
        # Add base to base dictionary
        if base in baseCounts:
            baseCounts[base].append((baseQ, mapQ, strand))
        else:
            baseCounts[base] = [(baseQ, mapQ, strand)]
    # Return data
    return (baseCounts)
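A usage sketch mirroring extractPosition above; here every passing read contributes a (baseQ, mapQ, strand) tuple instead of a count (the BAM path is a placeholder):

import pysam
bam = pysam.AlignmentFile('sample.bam')  # placeholder path
baseData = extractPositionComplete(bam, 'chr1', 1000)
for base, calls in baseData.items():
    print base, len(calls)  # each base and its read depth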
Code example #6
File: varscan.py Project: adam-rabinowitz/ngs_python
def filterVarscan(
        inFile, outFile, filterFile = None,  minCovNormal = 10,
        minCovTumour = 10, minFreqTumour = 0.05, maxFreqNormal = 1,
        minVarTumour = 2, maxPvalue = 0.05, somatic = True, flank = 25,
        maxNeighbour = 0
    ):
    # Create counter
    logData = collections.OrderedDict([
        ('Total', 0),
        ('Somatic status', 0),
        ('P-value', 0),
        ('Tumour coverage', 0),
        ('Tumour frequency', 0),
        ('Tumour count', 0),
        ('Normal coverage', 0),
        ('Normal frequency', 0),
        ('Neighbours', 0),
        ('Passed filters', 0)
    ])
    # Check variables
    toolbox.check_var(inFile, 'file')
    toolbox.check_var(filterFile, 'file')
    toolbox.check_var(minCovNormal, 'int', mn = 1)
    toolbox.check_var(minCovTumour, 'int', mn = 1)
    toolbox.check_var(minFreqTumour, 'num', gt = 0, mx = 1)
    toolbox.check_var(maxFreqNormal, 'num', mn = 0, mx = 1)
    toolbox.check_var(minVarTumour, 'int', mn = 1)
    toolbox.check_var(maxPvalue, 'num', gt = 0)
    toolbox.check_var(somatic, 'bool')
    toolbox.check_var(flank, 'int', mn = 0)
    toolbox.check_var(maxNeighbour, 'int', mn = 0)
    # Create dictionary to store variant positions
    varPos = {}
    # Extract coordinates for neighbour filtering
    for varFile in [inFile, filterFile]:
        if varFile is None:
            continue
        with open(varFile) as varIn:
            header = varIn.next()
            for line in varIn:
                chrom, pos = line.split('\t')[:2]
                if chrom in varPos:
                    varPos[chrom].append(int(pos))
                else:
                    varPos[chrom] = [int(pos)]
    # Sort data
    for key in varPos:
        varPos[key].sort()
    # Open input and output files
    with open(inFile) as varin:
        with open(outFile, 'w') as varout:
            # Write header
            varout.write(varin.next())
            # Loop through input
            for line in varin:
                # Count and extract data
                logData['Total'] += 1
                varData = line.split('\t')
                # Check somatic status and p-value
                status = str(varData[12])
                pValue = float(varData[14])
                if somatic and status != 'Somatic':
                    logData['Somatic status'] += 1
                    continue
                if pValue > maxPvalue:
                    logData['P-value'] += 1
                    continue
                # Check coverage and frequency
                covNormal = int(varData[4]) + int(varData[5])
                freqNormal = int(varData[5]) / float(covNormal)
                covTumour = int(varData[8]) + int(varData[9])
                freqTumour = int(varData[9]) / float(covTumour)
                varTumour = int(varData[9])
                if covTumour < minCovTumour:
                    logData['Tumour coverage'] += 1
                    continue
                if freqTumour < minFreqTumour:
                    logData['Tumour frequency'] += 1
                    continue
                if varTumour < minVarTumour:
                    logData['Tumour count'] += 1
                    continue
                if covNormal < minCovNormal:
                    logData['Normal coverage'] += 1
                    continue
                if freqNormal > maxFreqNormal:
                    logData['Normal frequency'] += 1
                    continue
                # Check flanking mutations
                chrom = varData[0]
                start = int(varData[1]) - flank
                end = int(varData[1]) + flank
                startIndex = bisect.bisect_left(varPos[chrom], start)
                endIndex = bisect.bisect_right(varPos[chrom], end, lo = startIndex)
                neighbourCount = (endIndex - startIndex) - 1
                if neighbourCount > maxNeighbour:
                    logData['Neighbours'] += 1
                    continue
                # Write output line
                logData['Passed filters'] += 1
                varout.write(line)
    # Return log
    return(logData)
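A hypothetical run over VarScan somatic output (tab-delimited with a single header line); the file names are placeholders:

log = filterVarscan('sample.snp', 'sample.filtered.snp', flank=25,
    maxNeighbour=0)
for name, count in log.items():
    print '%s: %s' %(name, count)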
Code example #7
File: varscan.py Project: adam-rabinowitz/ngs_python
def filterSomatic(
        inFile, outFile, minCov = 10, minReads = 2, minStrands = 1,
        minAvgQ = 10, minVarFreq = 0.1, pValue = 0.05, indelFile = None,
        javaPath = 'java', varscanPath = 'varscan.jar'
    ):
    '''
    Function to generate a command to run the VarScan somaticFilter tool.
    Arguments include the following:
    1)  minCov - Minimum read depth.
    2)  minReads - Minimum supporting reads for a variant.
    3)  minStrands - Minimum number of strands on which variant observed.
    4)  minAvgQ - Minimum average base quality for variant-supporting reads.
    5)  minVarFreq - Minimum variant allele frequency threshold.
    6)  pValue - Default p-value threshold for calling variants.
    7)  indelFile - File of indels for filtering nearby SNPs.
    8)  outFile - Output file for filtered variants.
    
    '''
    # Check numerical arguments
    toolbox.check_var(minCov, 'int', mn = 1)
    toolbox.check_var(minReads, 'int', mn = 1)
    toolbox.check_var(minStrands, 'int', mn = 1, mx = 2)
    toolbox.check_var(minAvgQ, 'int', mn = 2)
    toolbox.check_var(minVarFreq, 'num', gt = 0, mx = 1)
    toolbox.check_var(pValue, 'num', gt = 0, mx = 1)
    # Create command
    command = [javaPath, '-jar', varscanPath, 'somaticFilter', inFile,
        '--min-coverage', str(minCov), '--min-reads2', str(minReads),
        '--min-strands2', str(minStrands), '--min-avg-qual', str(minAvgQ),
        '--min-var-freq', str(minVarFreq), '--p-value', str(pValue),
        '--output-file', outFile]
    # Append indel file if supplied
    if indelFile:
        command.extend(['--indel-file', indelFile])
    # Return command
    command = ' '.join(command)
    return command
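The function only assembles a shell command string; one hypothetical way to execute it (all paths are placeholders):

import subprocess
cmd = filterSomatic('sample.snp', 'sample.somatic.snp',
    javaPath='/usr/bin/java', varscanPath='/opt/varscan/VarScan.jar')
subprocess.check_call(cmd, shell=True)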
Code example #8
File: varscan.py Project: adam-rabinowitz/ngs_python
def copynumber(
        mpileup1, mpileup2, outPrefix, minBaseQ = 20, minMapQ = 20,
        minCov = 20, minSegSize = 10, maxSegSize = 100, pValue = 0.01,
        dataRatio = None, javaPath = 'java', varscanPath = 'varscan.jar'
    ):
    ''' Function to generate a command to perform copynumber calling using
    the varscan program. Function takes the following arguments:

    1)  mpileup1 - The normal sample mpileup.
    2)  mpileup2 - The tumour sample mpileup.
    3)  outPrefix - The prefix of the output files.
    4)  minBaseQ - Minimum base quality for coverage.
    5)  minMapQ - Minimum read mapping quality for coverage.
    6)  minCov - Minimum coverage for copynumber segments.
    7)  minSegSize - Minimum segment size.
    8)  maxSegSize - Maximum segment size.
    9)  pValue - P-value for significant copynumber change-point.
    10) dataRatio - The normal/tumour input data ratio.

    '''
    # Check commands
    toolbox.check_var(minBaseQ, 'int', gt = 0)
    toolbox.check_var(minMapQ, 'int', gt = 0)
    toolbox.check_var(minCov, 'int', gt = 0)
    toolbox.check_var(minSegSize, 'int', gt = 0)
    toolbox.check_var(maxSegSize, 'int', mn = minSegSize)
    toolbox.check_var(pValue, 'num', mn = 0, mx = 1)
    toolbox.check_var(dataRatio, 'num', mn = 0.01, mx = 100)
    # Create command to calculate depth if required
    if dataRatio is None:
        ratioCommand = 'R=$(%s) && echo "Ratio: $R"' %(
            calcRatio(mpileup1, mpileup2))
        dataRatio = '$R'
    else:
        ratioCommand = ''
    # Create copy number command
    copyCommand = [
        javaPath, '-jar', varscanPath, 'copynumber', mpileup1, mpileup2,
        outPrefix, '--min-base-qual', str(minBaseQ), '--min-map-qual',
        str(minMapQ), '--min-coverage', str(minCov), '--min-segment-size',
        str(minSegSize), '--max-segment-size', str(maxSegSize), '--p-value',
        str(pValue), '--data-ratio', str(dataRatio)
    ]
    # Combine and return commands
    if ratioCommand:
        return('%s && %s' %(ratioCommand, ' '.join(copyCommand)))
    else:
        return(' '.join(copyCommand))
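A hypothetical call with placeholder paths; when dataRatio is omitted, the returned string first computes the ratio via calcRatio (defined elsewhere in this module) and substitutes it as $R:

cmd = copynumber('normal.mpileup', 'tumour.mpileup', 'sample',
    dataRatio=1.0, varscanPath='/opt/varscan/VarScan.jar')
print cmd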
Code example #9
File: varscan.py Project: adam-rabinowitz/ngs_python
def somatic(
        mpileup1, mpileup2, outPrefix, purity = 0.5, minCovNormal = 8,
        minCovTumour = 6, minHetFreq = 0.1, minHomFreq = 0.75,
        normalPurity = 1.0, tumourPurity = 0.5, pValueHet = 0.99,
        pValueSomatic = 0.05, strandFilter = False, javaPath = 'java',
        varscanPath = 'varscan.jar'
    ):
    # Check commands
    toolbox.check_var(purity, 'num', gt = 0, mx = 1)
    toolbox.check_var(minCovNormal, 'int', gt = 0)
    toolbox.check_var(minCovTumour, 'int', gt = 0)
    toolbox.check_var(minHetFreq, 'num', gt = 0, lt = 1)
    toolbox.check_var(minHomFreq, 'num', gt = 0, mx = 1)
    toolbox.check_var(normalPurity, 'num', gt = 0, mx = 1)
    toolbox.check_var(tumourPurity, 'num', gt = 0, mx = 1)
    toolbox.check_var(pValueHet, 'num', mn = 0, mx = 1)
    toolbox.check_var(pValueSomatic, 'num', mn = 0, mx = 1)
    toolbox.check_var(strandFilter, 'bool')
    toolbox.check_var(javaPath, 'file')
    toolbox.check_var(varscanPath, 'file')
    # Create command
    command = [
        javaPath, '-jar', varscanPath, 'somatic', mpileup1, mpileup2,
        outPrefix, '--min-coverage-normal', str(minCovNormal),
        '--min-coverage-tumor', str(minCovTumour), '--min-var-freq',
        str(minHetFreq), '--min-freq-for-hom', str(minHomFreq),
        '--normal-purity', str(normalPurity), '--tumor-purity',
        str(tumourPurity), '--p-value', str(pValueHet), '--somatic-p-value',
        str(pValueSomatic)
    ]
    if strandFilter:
        command.extend(['--strand-filter', '1'])
    # Join and return command
    command = ' '.join(command)
    return(command)
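A hypothetical call with placeholder paths; note that '--strand-filter 1' is appended only when strandFilter is True:

cmd = somatic('normal.mpileup', 'tumour.mpileup', 'sample',
    strandFilter=True, javaPath='/usr/bin/java',
    varscanPath='/opt/varscan/VarScan.jar')
print cmd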
Code example #10
File: pysamfunc.py Project: hjanime/ngs_python
def calculateVariantMetrics(variantList,
                            bamList,
                            sampleNames,
                            annovarPath,
                            buildver,
                            database,
                            tempprefix,
                            minMapQ=20,
                            minBaseQ=20,
                            groupdel=False,
                            altQualNormal=None,
                            homo=True,
                            complexity=True,
                            fasta=None):
    ''' Function calculates metrics for variants across multiple samples

    Args:
        variantList (list): A list of four element tuples that list the chrom
            position, reference and variant.
        bamList (list): A list of BAM files from which to extract variant
            annotation.
        sampleNames (list): A list of sample names for each of the BAM files.
        annovarPath (str): Path to annovar executable
        buildver (str): Genome build to use for annotation.
        database (str): Database to use for annotation.
        homo (bool): Whether to annotate indels for overlapping homopolymers.
        complexity (bool): Whether to annotate variants for complexity using
            soft-masking in FASTA file.
        fasta (str): Full path to FASTA file. Required for homopolymer
            annotation.
    
    '''
    # Check arguments
    toolbox.check_var(minMapQ, 'int', mn=0)
    toolbox.check_var(minBaseQ, 'int', mn=0)
    toolbox.check_var(groupdel, 'bool')
    toolbox.check_var(altQualNormal, 'int', mn=2)
    # Check supplied names
    if not isinstance(sampleNames, (list, tuple)):
        raise IOError('sampleNames must be a list or a tuple')
    if len(sampleNames) != len(bamList):
        raise IOError('There must be a sample name for each BAM')
    # Create output dataframe
    varnames = [':'.join(map(str, x)) for x in variantList]
    outputData = pd.DataFrame(index=varnames,
                              columns=['chr', 'pos', 'ref', 'var', 'minp'])
    for x in varnames:
        chrom, position, ref, var = x.split(':')
        outputData.loc[x] = [chrom, int(position), ref, var, 1]
    # Add homopolymer annotation
    if homo:
        homoData = homo_annotate(fasta, variantList, flank=100)
        outputData = pd.concat([outputData, homoData], axis=1)
    if complexity:
        compData = comp_annotate(fasta, variantList)
        outputData = pd.concat([outputData, compData], axis=1)
    # Create variables to store pipe and process data
    processDict = {}
    # Create process for each BAM
    for number, (name, bamFile) in enumerate(zip(sampleNames, bamList)):
        # Process reference sample
        if number == 0:
            # Create pipes and process
            pipeRecv, pipeSend = multiprocessing.Pipe(False)
            process = multiprocessing.Process(
                target=extractVariantCountsProcess,
                args=(variantList, bamFile, pipeSend, minMapQ, minBaseQ,
                      groupdel))
            process.start()
            pipeSend.close()
            # Store data
            processDict['reference'] = (name, pipeRecv, process)
            # Add extra process for reduced base quality in normal
            if altQualNormal:
                # Create pipes and process
                pipeRecv, pipeSend = multiprocessing.Pipe(False)
                process = multiprocessing.Process(
                    target=extractVariantCountsProcess,
                    args=(variantList, bamFile, pipeSend, minMapQ,
                          altQualNormal, groupdel))
                process.start()
                pipeSend.close()
                # Store data
                processDict['altref'] = (name, pipeRecv, process)
        # Process non-reference samples
        else:
            # Create pipes and process
            pipeRecv, pipeSend = multiprocessing.Pipe(False)
            process = multiprocessing.Process(
                target=extractVariantCountsProcess,
                args=(variantList, bamFile, pipeSend, minMapQ, minBaseQ,
                      groupdel))
            process.start()
            pipeSend.close()
            # Store data
            processDict[number] = (name, pipeRecv, process)
    # Extract data for reference
    name, pipe, process = processDict.pop('reference')
    sampleData = pipe.recv()
    pipe.close()
    process.join()
    # Add data for reference to output
    outputData[name + '_ref'] = sampleData['refcount']
    outputData[name + '_var'] = sampleData['varcount']
    outputData[name + '_freq'] = sampleData['varcount'] / (
        sampleData['refcount'] + sampleData['varcount'])
    outputData[name + '_mapq'] = sampleData['mapqual']
    # Store data for reference
    normRef = outputData[name + '_ref']
    normVar = outputData[name + '_var']
    normFreq = outputData[name + '_freq']
    # Extract data for alternative frequency, if computed
    if 'altref' in processDict:
        # Extract data for alternative frequency
        name, pipe, process = processDict.pop('altref')
        sampleData = pipe.recv()
        pipe.close()
        process.join()
        # Add data for alternative frequency to output
        outputData[name + '_altfreq'] = sampleData['varcount'] / (
            sampleData['refcount'] + sampleData['varcount'])
    # Extract variants for each BAM
    for key in processDict:
        # Extract data for reference
        name, pipe, process = processDict[key]
        sampleData = pipe.recv()
        pipe.close()
        process.join()
        # Add data to output
        outputData[name + '_ref'] = sampleData['refcount']
        outputData[name + '_var'] = sampleData['varcount']
        outputData[name + '_freq'] = sampleData['varcount'] / (
            sampleData['refcount'] + sampleData['varcount'])
        outputData[name + '_mapq'] = sampleData['mapqual']
        # Calculate pvalue
        pvalue = []
        for freq, normal, sample in zip(
                zip(normFreq, outputData[name + '_freq']),
                zip(normRef, normVar),
                zip(outputData[name + '_ref'], outputData[name + '_var'])):
            if freq[0] < freq[1]:
                pvalue.append(
                    fisher_exact([normal, sample], alternative='greater')[1])
            elif freq[0] > freq[1]:
                pvalue.append(
                    fisher_exact([normal, sample], alternative='less')[1])
            else:
                pvalue.append(
                    fisher_exact([normal, sample], alternative='two-sided')[1])
        # Rename tables columns and append to output
        outputData[name + '_pvalue'] = pvalue
    # Calculate minimum p-value and sort
    pvalueIndex = [x.endswith('pvalue') for x in outputData.columns]
    minp = outputData.loc[:, pvalueIndex].min(1)
    outputData['minp'] = minp
    # Add annovar annotation and concat to dataframe
    geneAnno = annovar.geneAnno2DF(variantList=variantList,
                                   path=annovarPath,
                                   buildver=buildver,
                                   database=database,
                                   tempprefix=tempprefix)
    outputData = pd.concat([outputData, geneAnno], axis=1)
    outputData.sort_values('minp', inplace=True)
    # Return data
    return (outputData)
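A hypothetical invocation; the BAM paths, annovar settings and FASTA are placeholders. The first BAM is treated as the reference (normal) sample against which the Fisher exact tests are run:

variants = [('chr1', 1000, 'A', 'T'), ('chr2', 5000, 'G', 'C')]
metrics = calculateVariantMetrics(
    variants, ['normal.bam', 'tumour.bam'], ['normal', 'tumour'],
    annovarPath='annotate_variation.pl', buildver='hg19',
    database='refGene', tempprefix='tmpvar', fasta='genome.fa')
print metrics[['minp', 'normal_freq', 'tumour_freq']].head()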
Code example #11
File: pysamfunc.py Project: hjanime/ngs_python
def extractVariantCountsProcess(variantList,
                                bamFile,
                                pipe,
                                minMapQ=20,
                                minBaseQ=20,
                                groupdel=False):
    ''' Function calculates the frequency at which specified nucleotides
    are found at specific chromosomal regions. Function takes 6 arguments:

    1)  variantList - a tuple/list of tuples/lists that contain four
        elements; the chromosome, position, reference, and variant e.g.
        [('chr1', 1, 'A', 'T'), ('chr2', 100, 'C', 'G')].
    2)  bamFile - Full path to BAM file.
    3)  pipe - Multiprocessing pipe down which the output dataframe is sent.
    4)  minMapQ - Minimum mapping quality of read to extract base.
    5)  minBaseQ - Minimum base quality to extract base.
    6)  groupdel - Passed to the baseCalls function to control how
        deletions are handled.

    Function sends down the pipe a pandas dataframe containing the
    following five columns:

    1)  refcount - Count of the reference calls.
    2)  reffor - Count of reference calls on the forward strand.
    3)  varcount - Count of the variant calls.
    4)  varfor - Count of variant calls on the forward strand.
    5)  mapqual - Mean mapping score of ALL reads spanning the position.

    '''
    # check arguments
    toolbox.check_var(minMapQ, 'int', mn=0)
    toolbox.check_var(minBaseQ, 'int', mn=0)
    toolbox.check_var(groupdel, 'bool')
    # Create output dataframe
    variantNames = [':'.join(map(str, x)) for x in variantList]
    outData = pd.DataFrame(
        columns=['refcount', 'reffor', 'varcount', 'varfor', 'mapqual'],
        index=variantNames)
    # Open bamFile
    bam = pysam.AlignmentFile(bamFile)
    # Loop through variants and extract counts
    for name, (chrom, position, reference,
               variant) in zip(variantNames, variantList):
        # Extract base data
        baseCounts, mapqual = extractPosition(openBam=bam,
                                              chrom=chrom,
                                              position=position,
                                              minMapQ=minMapQ,
                                              minBaseQ=minBaseQ,
                                              groupdel=groupdel)
        # Extract reference counts
        if reference in baseCounts:
            refcount, reffor = baseCounts[reference]
        else:
            refcount = 0
            reffor = 0
        # Extract variant frequency
        if variant in baseCounts:
            varcount, varfor = baseCounts[variant]
        else:
            varcount = 0
            varfor = 0
        # Add data to output list
        outData.loc[name] = [refcount, reffor, varcount, varfor, mapqual]
    # Close bam
    bam.close()
    # Send data down pipe and close
    pipe.send(outData)
    pipe.close()
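A minimal driver, mirroring how calculateVariantMetrics above launches this worker in a separate process and reads the dataframe back over a pipe (the BAM path is a placeholder):

import multiprocessing
variants = [('chr1', 1000, 'A', 'T')]
pipeRecv, pipeSend = multiprocessing.Pipe(False)
process = multiprocessing.Process(
    target=extractVariantCountsProcess,
    args=(variants, 'sample.bam', pipeSend))
process.start()
pipeSend.close()
countData = pipeRecv.recv()
pipeRecv.close()
process.join()
print countData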
Code example #12
File: rsemAlign.py Project: hjanime/ngs_python
    --threads=<threads>  Number of threads [default: 4]
    --singleend          Only single-end sequencing performed
    --genomebam          Generate genome bam
    --markdup            Mark duplicates on genome bam
    --help               Output this message
    
"""
# Import required modules
import os
from ngs_python.fastq import fastqFind, fastqAlign
from general_python import docopt, toolbox, moab
# Extract and process arguments
args = docopt.docopt(__doc__, version='v1')
args['--threads'] = int(args['--threads'])
args['--forprob'] = float(args['--forprob'])
toolbox.check_var(args['--forprob'], 'num', mn=0, mx=1)
inDir, inPrefix = os.path.split(args['<inprefix>'])
outDir = os.path.join(args['<outdir>'], args['<samplename>'])
if not os.path.isdir(outDir):
    os.mkdir(outDir)
outPrefix = os.path.join(outDir, args['<samplename>'])
outLog = outPrefix + '.rsem.log'
# Create job dictionary

# Extract fastq files and generate output file names
read1, read2 = fastqFind.findFastq(prefix=inPrefix,
                                   dirList=[inDir],
                                   pair=True,
                                   gzip=True)
rsemCommand = fastqAlign.rsemBowtie2Align(index=args['<index>'],
                                          outPrefix=outPrefix,
                                          read1=read1,
                                          read2=read2,
                                          rsemPath=args['--rsem'],
                                          bowtie2Path=args['--bowtie2'],
                                          threads=args['--threads'],
                                          forProb=args['--forprob'],
Code example #13
# Import required modules
import sys
# Import custom modules
from ngs_python.fastq import fastqTrim, fastqQC, fastqAlign, fastqFind
from ngs_python.bam import samtools, picard, bamQC
from general_python import moab, docopt, toolbox
# Print command
print '%s\n' % (' '.join(sys.argv))

###############################################################################
## Process command line arguments and create output directories
###############################################################################
# Extract arguments
args = docopt.docopt(__doc__, version='v1')
# Extract sample prefix and name
args['prefix'], args['name'] = args['<sampledata>'].split(',')
# Check supplied files
toolbox.check_var(args['<gtf>'], 'file')
toolbox.check_var(args['<rrna>'], 'file')
# Extract fastq files and check
if args['--singleend']:
    args['read1'] = fastqFind.findFastq(prefix=args['prefix'],
                                        dirList=args['<indir>'].split(','),
                                        pair=False)
else:
    args['read1'], args['read2'] = fastqFind.findFastq(
        prefix=args['prefix'], dirList=args['<indir>'].split(','), pair=True)
    if len(args['read1']) != len(args['read2']):
        raise IOError('Unequal number of FASTQ files identified')
if len(args['read1']) < 1:
    raise IOError('Insufficient number of FASTQ files identified')
# Convert numerical arguments
args['--threads'] = int(args['--threads'])
Code example #14
    
"""
# Import required modules
import os
import re
import numpy as np
from ngs_python.structure import interactionMatrix
from general_python import docopt, toolbox
# Extract arguments
args = docopt.docopt(__doc__, version='v1')
# Check numerical arguments
args['--threads'] = int(args['--threads'])
if args['nobed']:
    args['<binsize>'] = int(args['<binsize>'])
# Check input files
toolbox.check_var(args['<infile>'], 'file')
if args['bed']:
    toolbox.check_var(args['<bedfile>'], 'file')
else:
    toolbox.check_var(args['<chrfile>'], 'file')
# Extract and print parameters to create bins
if args['bed']:
    binData = args['<bedfile>']
    print '\nParameters:\n  %s\n' %(
        'bed file provided',
    )
else:
    binData = (args['<chrfile>'], args['<binsize>'], args['--equal'])
    print '\nParameters:\n  %s\n  %s\n' %(
        'max bin size: %s' %(args['<binsize>']),
        'bin size equal: %s' %(args['--equal'])