Ejemplo n.º 1
0
def alignPE2sam(command, reference, fastq_file, pair_file, sai_fastq_file, sai_pair_file,
                fastq_metadata, output_dir):
    '''
    Build a SAM file from a pair of .sai alignments (paired-end only).

    The command template (intended for a bwa sampe command line) is filled in
    and handed to runCommand.

    command        -- %-style template using the named fields meta, ref,
                      align, alignP, seq, pair and out.
    reference      -- value substituted for 'ref'.
    fastq_file     -- first read file; its basename keys fastq_metadata and
                      names the output file.
    pair_file      -- second read file, substituted for 'pair'.
    sai_fastq_file -- .sai alignment substituted for 'align'.
    sai_pair_file  -- .sai alignment substituted for 'alignP'.
    fastq_metadata -- mapping: fastq basename -> mapping holding 'sample',
                      'run_id', 'lane' and 'identifier'.
    output_dir     -- directory used to build the .sam path.

    Returns the .sam path. Exits through sys.exit when either alignment
    file does not carry the .sai extension.
    '''
    _, _, sai_ext = splitPath(sai_fastq_file)
    _, _, sai_pair_ext = splitPath(sai_pair_file)
    if not (sai_ext == '.sai' and sai_pair_ext == '.sai'):
        sys.exit('alignPE2sam: one .sai file %s or %s does not have .sai extension'
                 % (sai_fastq_file, sai_pair_file))

    fastq_basename = os.path.basename(fastq_file)
    sam_file = os.path.join(output_dir, os.path.splitext(fastq_basename)[0]) + '.sam'

    # Every read-group field comes from the metadata entry of the first read file.
    info = fastq_metadata[fastq_basename]
    sample = info['sample']
    run_id = info['run_id']
    lane = info['lane']
    identifier = info['identifier']

    # The library (LB) and read-group id (ID) carry the same composite label.
    group_label = '%s_%s_%s_Lane%s' % (identifier, sample, run_id, lane)
    readgroup_metadata = {'PL': 'ILLUMINA', 'SM': sample,
                          'LB': group_label, 'ID': group_label}

    substitutions = {
        'meta': make_metadata_string(readgroup_metadata),
        'ref': reference,
        'align': sai_fastq_file,
        'alignP': sai_pair_file,
        'seq': fastq_file,
        'pair': pair_file,
        'out': sam_file,
    }
    runCommand('bwa sampe alignment from fastq: %s' % sample, command % substitutions)

    return sam_file
Ejemplo n.º 2
0
def align_with_mem(command, threads, reference, fastq_file, pair_file, fastq_metadata, output_dir):
    '''
    Align two paired-end fastq files against a reference, producing a sam file.

    command        -- %-style template using the named fields threads, meta,
                      ref, seq, pair and out.
    threads        -- value substituted for 'threads'.
    reference      -- value substituted for 'ref'.
    fastq_file     -- first read file; its basename keys fastq_metadata and
                      names the output file.
    pair_file      -- second read file, substituted for 'pair'.
    fastq_metadata -- mapping: fastq basename -> mapping holding 'sample',
                      'run_id', 'lane' and 'identifier'.
    output_dir     -- directory used to build the .sam path.

    Returns the .sam path. Exits through sys.exit when either read file
    does not carry the .fastq extension.
    '''
    _, _, fastq_ext = splitPath(fastq_file)
    _, _, pair_ext = splitPath(pair_file)
    if not (fastq_ext == '.fastq' and pair_ext == '.fastq'):
        sys.exit('align: one of the fastq file %s or %s does not have .fastq extension'
                 % (fastq_file, pair_file))

    fastq_basename = os.path.basename(fastq_file)
    sam_file = os.path.join(output_dir, os.path.splitext(fastq_basename)[0]) + '.sam'

    # Every read-group field comes from the metadata entry of the first read file.
    info = fastq_metadata[fastq_basename]
    sample = info['sample']
    run_id = info['run_id']
    lane = info['lane']
    identifier = info['identifier']

    # The library (LB) and read-group id (ID) carry the same composite label.
    group_label = '%s_%s_%s_Lane%s' % (identifier, sample, run_id, lane)
    readgroup_metadata = {'PL': 'ILLUMINA', 'SM': sample,
                          'LB': group_label, 'ID': group_label}

    substitutions = {
        'threads': threads,
        'meta': make_metadata_string(readgroup_metadata),
        'ref': reference,
        'seq': fastq_file,
        'pair': pair_file,
        'out': sam_file,
    }
    runCommand('bwa mem alignment from fastq: %s' % sample, command % substitutions)

    return sam_file
Ejemplo n.º 3
0
def filterInputs():
    """
    Yield one parameter list per (chromosome, full pileup) combination.

    Each list is [fullPileup, output, varPileup, chromName, chromLength,
    logger], built from the module-level `chromosomes`, `fullPileups` and
    `logger`. Every list is also printed before it is yielded.
    """
    for chrom_name, chrom_length in chromosomes:
        for full_pileup in fullPileups:
            pileup_dir, seq_name, _ = splitPath(full_pileup)
            var_pileup = os.path.join(pileup_dir, seq_name + '.var.pileup')
            # NOTE(review): no separator is placed between the chromosome name
            # and 'var_filtered' -- confirm that this file name is intended.
            filtered_name = seq_name + '.' + chrom_name + 'var_filtered.pileup'
            filtered = os.path.join(pileup_dir, filtered_name)
            job = [full_pileup, filtered, var_pileup, chrom_name, chrom_length, logger]
            print(job)
            yield job
Ejemplo n.º 4
0
def convert2annovar(command, annovar_dir, vcf, output_dir):
    '''
    Convert a vcf file to Annovar input by running the supplied command.

    command     -- %-style template using the named fields out, vcf and
                   annovardir.
    annovar_dir -- value substituted for 'annovardir'.
    vcf         -- path of the .vcf file to convert.
    output_dir  -- directory used to build the output prefix.

    Returns '<out_prefix>.<base>.avinput', where base is the part of the vcf
    name before its first '.' and out_prefix is output_dir joined with base.
    Exits through sys.exit when vcf does not carry the .vcf extension.
    '''
    _, vcf_name, vcf_ext = splitPath(vcf)
    if vcf_ext != '.vcf':
        sys.exit('Converting to .annovar: vcf file %s does not have .vcf extension' % vcf)

    # Only the text before the first dot of the name is kept.
    base = vcf_name.split('.')[0]
    out_prefix = os.path.join(output_dir, base)

    substitutions = dict(out=out_prefix, vcf=vcf, annovardir=annovar_dir)
    runCommand('Coverting to .annovar format', command % substitutions)

    return '.'.join((out_prefix, base, 'avinput'))
Ejemplo n.º 5
0
def annotate(command, annovar_dir, annovar_file, output_dir):
    '''
    Annotate an .avinput file by running the supplied Annovar command.

    command      -- %-style template using the named fields out, annovarfile
                    and annovardir.
    annovar_dir  -- value substituted for 'annovardir'.
    annovar_file -- path of the .avinput file to annotate.
    output_dir   -- directory used to build the output prefix.

    Returns the output prefix (output_dir joined with the input's name).
    Exits through sys.exit when annovar_file lacks the .avinput extension.
    '''
    _, stem, extension = splitPath(annovar_file)
    if extension != '.avinput':
        sys.exit('Annotating vcf: vcf file %s does not have .avinput extension' % annovar_file)

    out_prefix = os.path.join(output_dir, stem)
    substitutions = dict(out=out_prefix, annovarfile=annovar_file, annovardir=annovar_dir)
    runCommand('Annotating with Annovar', command % substitutions)

    return out_prefix
Ejemplo n.º 6
0
def filterInputs():
    """
    Yield one parameter list per (chromosome, full pileup) combination.

    Each list is [fullPileup, output, varPileup, chromName, chromLength,
    logger], built from the module-level `chromosomes`, `fullPileups` and
    `logger`. Every list is also printed before it is yielded.
    """
    for chrom_name, chrom_length in chromosomes:
        for full_pileup in fullPileups:
            pileup_dir, seq_name, _ = splitPath(full_pileup)
            var_pileup = os.path.join(pileup_dir, seq_name + '.var.pileup')
            # NOTE(review): no separator is placed between the chromosome name
            # and 'var_filtered' -- confirm that this file name is intended.
            filtered_name = seq_name + '.' + chrom_name + 'var_filtered.pileup'
            filtered = os.path.join(pileup_dir, filtered_name)
            job = [full_pileup, filtered, var_pileup, chrom_name, chrom_length, logger]
            print(job)
            yield job
Ejemplo n.º 7
0
def summarize(command, annovar_dir, annovar_file, ver1000g, veresp, verdbsnp, genetype, buildver, output_dir):
    '''
    Summarize an .avinput file by running the supplied Annovar command.

    command      -- %-style template using the named fields out, annovarfile,
                    annovardir, ver1000g, veresp, verdbsnp, genetype and
                    buildver.
    annovar_dir  -- value substituted for 'annovardir'.
    annovar_file -- path of the .avinput file to summarize.
    ver1000g, veresp, verdbsnp, genetype, buildver
                 -- passed through to the template fields of the same name.
    output_dir   -- directory used to build the output prefix.

    Returns the output prefix (output_dir joined with the input's name).
    Exits through sys.exit when annovar_file lacks the .avinput extension.
    '''
    _, stem, extension = splitPath(annovar_file)
    if extension != '.avinput':
        sys.exit('Summarizing annotations: vcf file %s does not have .avinput extension' % annovar_file)

    out_prefix = os.path.join(output_dir, stem)
    substitutions = dict(
        out=out_prefix,
        annovarfile=annovar_file,
        annovardir=annovar_dir,
        ver1000g=ver1000g,
        veresp=veresp,
        verdbsnp=verdbsnp,
        genetype=genetype,
        buildver=buildver,
    )
    runCommand('Summarizing with Annovar', command % substitutions)

    return out_prefix
Ejemplo n.º 8
0
def align(command, threads, reference, sequence, fastq_metadata, output_dir):
    '''
    Align sequence reads to the reference genome (first bwa stage, bwa aln).

    command        -- %-style template using the named fields out, threads,
                      ref, seq and encodingflag.
    threads        -- converted with int() before substitution.
    reference      -- value substituted for 'ref'.
    sequence       -- path of the .fastq file to align.
    fastq_metadata -- accepted but not used by this function.
    output_dir     -- directory used to build the .sai path.

    Returns the .sai path. Exits through sys.exit when sequence does not
    carry the .fastq extension.
    '''
    _, stem, extension = splitPath(sequence)
    if extension != '.fastq':
        sys.exit('align: sequence file %s does not have .fastq extension' % sequence)

    alignment_file = os.path.join(output_dir, stem + '.sai')
    substitutions = {
        'out': alignment_file,
        'threads': int(threads),
        'ref': reference,
        'seq': sequence,
        # The encoding flag is always substituted as an empty string here.
        'encodingflag': '',
    }
    runCommand('Running Alignment', command % substitutions)

    return alignment_file
Ejemplo n.º 9
0
def base_qual_recal_tabulate(command, command_options, gatk_dir, reference, recal_file, alignment, output_dir):
    '''
    GATK TableRecalibration: recalibrate base quality scores using the
    output of CountCovariates.

    command         -- %-style template using the named fields jvmoptions,
                       out, recalfile, bam, gatkdir and ref.
    command_options -- indexable; element [1] is substituted for 'jvmoptions'.
    gatk_dir        -- value substituted for 'gatkdir'.
    reference       -- reference path without suffix; '.fasta' is appended.
    recal_file      -- value substituted for 'recalfile'.
    alignment       -- path of the .bam file to recalibrate.
    output_dir      -- directory used to build the output path.

    Returns the '<name>.recal.bam' path inside output_dir. Exits through
    sys.exit when alignment does not carry the .bam extension.
    '''
    _, stem, extension = splitPath(alignment)
    jvm_options = command_options[1]
    if extension != '.bam':
        sys.exit('table recalibration: alignment file %s does not have .bam extension' % alignment)

    recal_bam = os.path.join(output_dir, stem + '.recal.bam')
    substitutions = dict(
        jvmoptions=jvm_options,
        out=recal_bam,
        recalfile=recal_file,
        bam=alignment,
        gatkdir=gatk_dir,
        ref=reference + '.fasta',
    )
    runCommand('recalibrate base quality scores', command % substitutions)

    return recal_bam
Ejemplo n.º 10
0
def base_qual_recal_count(command, command_options, gatk_dir, reference, dbsnp, alignment, output_dir):
    '''
    GATK CountCovariates, first step of base quality score recalibration.

    command         -- %-style template using the named fields jvmoptions,
                       out, dbsnp, bam, gatkdir and ref.
    command_options -- indexable; element [1] is substituted for 'jvmoptions'.
    gatk_dir        -- value substituted for 'gatkdir'.
    reference       -- reference path without suffix; '.fasta' is appended.
    dbsnp           -- value substituted for 'dbsnp'.
    alignment       -- path of the .bam file to analyse.
    output_dir      -- directory used to build the output path.

    Returns the '<name>.recal_data.csv' path inside output_dir. Exits
    through sys.exit when alignment does not carry the .bam extension.
    '''
    _, stem, extension = splitPath(alignment)
    jvm_options = command_options[1]
    if extension != '.bam':
        sys.exit('count covariates: alignment file %s does not have .bam extension' % alignment)

    recal_file = os.path.join(output_dir, stem + '.recal_data.csv')
    substitutions = dict(
        jvmoptions=jvm_options,
        out=recal_file,
        dbsnp=dbsnp,
        bam=alignment,
        gatkdir=gatk_dir,
        ref=reference + '.fasta',
    )
    runCommand('count covariates for base quality score', command % substitutions)

    return recal_file
Ejemplo n.º 11
0
def realign_intervals(command, command_options, gatk_dir, reference, alignment, output_dir):
    """
    Run the GATK target-creation command that finds suspect intervals for
    realignment.

    command         -- %-style template using the named fields out, bam,
                       jvmoptions, gatkdir and ref.
    command_options -- indexable; element [1] is substituted for 'jvmoptions'.
    gatk_dir        -- value substituted for 'gatkdir'.
    reference       -- reference path without suffix; '.fasta' is appended.
    alignment       -- path of the input alignment; must end with
                       'marked.bam'.
    output_dir      -- directory used to build the interval file path.

    Returns the '<name>.bam.list' path inside output_dir. Exits through
    sys.exit when alignment does not end with 'marked.bam'.
    """
    _, name, _ = splitPath(alignment)
    jvm_options = command_options[1]
    # The guard requires the 'marked.bam' suffix, not merely a .bam extension,
    # so the message states the condition that is actually tested.
    if not alignment.endswith('marked.bam'):
        sys.exit('calculating realignment intervals: alignment file %s does not end with marked.bam' % alignment)
    interval_file = os.path.join(output_dir, name + '.bam.list')
    command = command % {'out': interval_file, 'bam': alignment, 'jvmoptions': jvm_options,
                         'gatkdir': gatk_dir, 'ref': reference + '.fasta'}
    runCommand('Calculating realignment intervals', command)

    return interval_file
Ejemplo n.º 12
0
def dedup(command, command_options, piccard_dir, alignment, output_dir):
    """
    Remove apparent duplicates using Picard MarkDuplicates.

    command         -- %-style template using the named fields out, bam,
                       jvmoptions, picarddir and log.
    command_options -- indexable; element [1] is substituted for 'jvmoptions'.
    piccard_dir     -- value substituted for 'picarddir'.
    alignment       -- path of the .bam file to process.
    output_dir      -- directory used to build the output path.

    Returns the '<name>.marked.bam' path inside output_dir. Exits through
    sys.exit when alignment does not carry the .bam extension.
    """
    _, stem, extension = splitPath(alignment)
    jvm_options = command_options[1]
    if extension != '.bam':
        sys.exit('mark pcr duplicates: alignment file %s does not have .bam extension' % alignment)

    marked_bam_file = os.path.join(output_dir, stem + '.marked.bam')
    substitutions = dict(
        out=marked_bam_file,
        bam=alignment,
        jvmoptions=jvm_options,
        picarddir=piccard_dir,
        # The 'log' field is always the fixed string 'metrics'.
        log='metrics',
    )
    runCommand('Marking PCR duplicates', command % substitutions)

    return marked_bam_file
Ejemplo n.º 13
0
def samS2bam(command, command_options, threads, alignment, output_dir):
    """
    Convert sam to bam and sort, using Samtools.

    command         -- %-style template using the named fields out, sam,
                       max_mem and threads.
    command_options -- substituted unchanged for 'max_mem'.
    threads         -- value substituted for 'threads'.
    alignment       -- path of the .sam file to convert.
    output_dir      -- directory used to build the output path.

    The template receives the output path WITHOUT extension; the returned
    path is that same prefix with '.bam' appended. Exits through sys.exit
    when alignment does not carry the .sam extension.
    """
    _, stem, extension = splitPath(alignment)
    if extension != '.sam':
        sys.exit('sam2Sbam: alignment file %s does not have .sam extension' % alignment)

    bam_prefix = os.path.join(output_dir, stem)
    substitutions = dict(out=bam_prefix, sam=alignment, max_mem=command_options, threads=threads)
    runCommand('Sam to Sorted Bam', command % substitutions)

    return bam_prefix + '.bam'
Ejemplo n.º 14
0
def samP2bam(command, command_options, piccard_dir, alignment, output_dir):
    """
    Convert sam to bam and sort, using Picard.

    command         -- %-style template using the named fields out, sam,
                       jvmoptions and picarddir.
    command_options -- indexable; element [1] is substituted for 'jvmoptions'.
    piccard_dir     -- value substituted for 'picarddir'.
    alignment       -- path of the .sam file to convert.
    output_dir      -- directory used to build the output path.

    Returns the '<name>.bam' path inside output_dir. Exits through sys.exit
    when alignment does not carry the .sam extension.
    """
    _, stem, extension = splitPath(alignment)
    jvm_options = command_options[1]
    if extension != '.sam':
        sys.exit('sam2bam: alignment file %s does not have .sam extension' % alignment)

    bam_file = os.path.join(output_dir, stem + '.bam')
    substitutions = dict(out=bam_file, sam=alignment, jvmoptions=jvm_options, picarddir=piccard_dir)
    runCommand('Sam to Sorted Bam', command % substitutions)

    return bam_file
Ejemplo n.º 15
0
def filter_snps(command, command_options, gatk_dir, reference, vcf, filter_expression, output_dir):
    '''
    Use GATK VariantFiltration to filter raw SNP calls.

    command           -- %-style template using the named fields jvmoptions,
                         out, vcf, gatkdir, ref and expression.
    command_options   -- indexable; element [1] is substituted for
                         'jvmoptions'.
    gatk_dir          -- value substituted for 'gatkdir'.
    reference         -- reference path without suffix; '.fasta' is appended.
    vcf               -- path of the .vcf file to filter.
    filter_expression -- value substituted for 'expression'.
    output_dir        -- directory used to build the output path.

    Returns the '<name>.filtered.vcf' path inside output_dir. Exits through
    sys.exit when vcf does not carry the .vcf extension.
    '''
    _, stem, extension = splitPath(vcf)
    jvm_options = command_options[1]
    if extension != '.vcf':
        sys.exit('filtering SNPs: vcf file %s does not have .vcf extension' % vcf)

    out_vcf = os.path.join(output_dir, stem + '.filtered.vcf')
    substitutions = dict(
        jvmoptions=jvm_options,
        out=out_vcf,
        vcf=vcf,
        gatkdir=gatk_dir,
        ref=reference + '.fasta',
        expression=filter_expression,
    )
    # NOTE(review): the label says 'Calling snps' although this stage filters
    # them -- presumably copied from call_snps; confirm before changing it.
    runCommand('Calling snps', command % substitutions)

    return out_vcf
Ejemplo n.º 16
0
def realign(command, command_options, gatk_dir, reference, alignment, intervals, output_dir):
    '''
    Run GATK IndelRealigner for local realignment, using intervals found by
    realign_intervals.

    command         -- %-style template using the named fields jvmoptions,
                       ref, out, bam, gatkdir and intervals.
    command_options -- indexable; element [1] is substituted for 'jvmoptions'.
    gatk_dir        -- value substituted for 'gatkdir'.
    reference       -- reference path without suffix; '.fasta' is appended.
    alignment       -- path of the .bam file to realign.
    intervals       -- path of the interval file; must end with 'bam.list'.
    output_dir      -- directory used to build the output path.

    Returns the '<name>.realigned.bam' path inside output_dir. Exits through
    sys.exit when intervals does not end with 'bam.list' or when alignment
    does not carry the .bam extension.
    '''
    _, name, ext = splitPath(alignment)
    jvm_options = command_options[1]
    # The two inputs are validated separately (intervals first, as before) so
    # that each failure names the file that is actually wrong.
    if not intervals.endswith('bam.list'):
        sys.exit('local realignment with intervals: intervals file %s does not end with bam.list' % intervals)
    if ext != '.bam':
        sys.exit('local realignment with intervals: alignment file %s does not have .bam extension' % alignment)
    realigned_bam = os.path.join(output_dir, name + '.realigned.bam')
    command = command % {'jvmoptions': jvm_options, 'ref': reference + '.fasta', 'out': realigned_bam,
                         'bam': alignment, 'gatkdir': gatk_dir, 'intervals': intervals}
    runCommand('Running local realignment around indels', command)

    return realigned_bam
Ejemplo n.º 17
0
def fix_mate(command, command_options, piccard_dir, alignment, output_dir):
    '''
    Fix mate information in paired end data using picard.

    command         -- %-style template using the named fields jvmoptions,
                       out, bam and picarddir.
    command_options -- indexable; element [1] is substituted for 'jvmoptions'.
    piccard_dir     -- value substituted for 'picarddir'.
    alignment       -- path of the .bam file to fix.
    output_dir      -- directory used to build the output path.

    Returns the '<name>.fixed.bam' path inside output_dir. Exits through
    sys.exit when alignment does not carry the .bam extension.
    '''
    _, stem, extension = splitPath(alignment)
    jvm_options = command_options[1]
    if extension != '.bam':
        sys.exit('mate information fix: alignment file %s does not have .bam extension' % alignment)

    fixed_bam = os.path.join(output_dir, stem + '.fixed.bam')
    substitutions = dict(jvmoptions=jvm_options, out=fixed_bam, bam=alignment, picarddir=piccard_dir)
    runCommand('Fixing Mate information', command % substitutions)

    return fixed_bam
Ejemplo n.º 18
0
def indexbam(command, alignment, output_dir):
    '''
    Index alignment file (.bam) by running the supplied Samtools command.

    command    -- %-style template using the named field 'bam'.
    alignment  -- path of the .bam file to index.
    output_dir -- directory used to build the returned index path.

    Returns output_dir joined with the alignment's name after replacing
    '.bam' by '.bai' in it. The formatted command is printed after it has
    run. Exits through sys.exit when alignment does not carry the .bam
    extension.
    '''
    _, name, ext = splitPath(alignment)
    if ext != '.bam':
        sys.exit('indexbam: alignment file %s does not have .bam extension' % alignment)
    command = command % {'bam': alignment}

    runCommand('Indexing alignment file', command)

    # NOTE(review): if splitPath already strips the extension from `name`,
    # this replace is a no-op and the returned path has no .bai suffix --
    # confirm against splitPath and the command template.
    index_file = os.path.join(output_dir, name.replace('.bam', '.bai'))

    # Parenthesised form: prints the same text under Python 2 and is also
    # valid Python 3, unlike the bare print statement.
    print(command)

    return index_file
Ejemplo n.º 19
0
def call_snps(command, command_options, threads, gatk_dir, reference, dbsnp, standard_emit_conf,
              standard_call_conf, dcov, alleles, alignment, output_dir):
    """
    Call SNPs from a .bam alignment by running the supplied GATK command.

    command            -- %-style template using the named fields jvmoptions,
                          out, dbsnp, alleles, threads, scf, sec, dcov, bam,
                          gatkdir and ref.
    command_options    -- indexable; element [1] is substituted for
                          'jvmoptions'.
    threads            -- value substituted for 'threads'.
    gatk_dir           -- value substituted for 'gatkdir'.
    reference          -- reference path without suffix; '.fasta' is appended.
    dbsnp              -- value substituted for 'dbsnp'.
    standard_emit_conf -- value substituted for 'sec'.
    standard_call_conf -- value substituted for 'scf'.
    dcov               -- value substituted for 'dcov'.
    alleles            -- value substituted for 'alleles'.
    alignment          -- path of the .bam file to call variants on.
    output_dir         -- directory used to build the output path.

    Returns the '<name>.vcf' path inside output_dir. Exits through sys.exit
    when alignment does not carry the .bam extension.
    """
    _, stem, extension = splitPath(alignment)
    jvm_options = command_options[1]
    if extension != '.bam':
        sys.exit('call snp : alignment file %s does not have .bam extension' % alignment)

    out_vcf = os.path.join(output_dir, stem + '.vcf')
    substitutions = dict(
        jvmoptions=jvm_options,
        out=out_vcf,
        dbsnp=dbsnp,
        alleles=alleles,
        threads=threads,
        scf=standard_call_conf,
        sec=standard_emit_conf,
        dcov=dcov,
        bam=alignment,
        gatkdir=gatk_dir,
        ref=reference + '.fasta',
    )
    runCommand('Calling snps', command % substitutions)

    return out_vcf
Ejemplo n.º 20
0
    outFile = os.path.join(prefix,name)
    runStage('sortBam', logger, options, bamFile, outFile)

# Convert BAM alignment to pileup format.
#@transform(sortBam, suffix('.bam'), '.full.pileup', logger)
@transform(sortBam, regex(r'(.+)\.sorted\.bam'), r'\1.full.pileup', logger)
def pileupFull(bamFile, output, logger):
    """
    Invoke the 'pileupFull' stage for one bam file.

    The decorator maps inputs matching '<stem>.sorted.bam' to
    '<stem>.full.pileup'. The stage receives the module-level `options` and
    `reference` together with the input and output paths.
    """
    stage_name = 'pileupFull'
    runStage(stage_name, logger, options, reference, bamFile, output)

# Call SNPs
#@transform(sortBam, suffix('.bam'), '.var.pileup', logger)
@transform(sortBam, regex(r'(.+)\.sorted\.bam'), r'\1.var.pileup', logger)
def callSNPs(bamFile, output, logger):
    """
    Invoke the 'callSNPs' stage for one bam file.

    The decorator maps inputs matching '<stem>.sorted.bam' to
    '<stem>.var.pileup'. The stage receives the module-level `options` and
    `reference` together with the input and output paths.
    """
    stage_name = 'callSNPs'
    runStage(stage_name, logger, options, reference, bamFile, output)

# Split the first entry of `sequences`; its `prefix` component is the
# directory searched for full pileups on the next line.
(prefix,seqName,ext) = splitPath(sequences[0])
# NOTE: the glob is evaluated when this statement runs, so `fullPileups`
# lists only the '*.full.pileup' files present in `prefix` at that moment.
fullPileups = glob.glob(os.path.join(prefix, '*.full.pileup'))

def filterInputs():
    """
    Yield one parameter list per (chromosome, full pileup) combination.

    Each list is [fullPileup, output, varPileup, chromName, chromLength,
    logger], built from the module-level `chromosomes`, `fullPileups` and
    `logger`. Every list is also printed before it is yielded.
    """
    for chrom_name, chrom_length in chromosomes:
        for full_pileup in fullPileups:
            pileup_dir, seq_name, _ = splitPath(full_pileup)
            var_pileup = os.path.join(pileup_dir, seq_name + '.var.pileup')
            # NOTE(review): no separator is placed between the chromosome name
            # and 'var_filtered' -- confirm that this file name is intended.
            filtered_name = seq_name + '.' + chrom_name + 'var_filtered.pileup'
            filtered = os.path.join(pileup_dir, filtered_name)
            job = [full_pileup, filtered, var_pileup, chrom_name, chrom_length, logger]
            print(job)
            yield job

@follows(pileupFull)
@follows(callSNPs)
@files(filterInputs)
def varFilter(fullPileup, output, varPileup, chromName, chromLength, logger):
Ejemplo n.º 21
0
def pileup(baiFile, output, logger):
    """
    Invoke the 'pileup' stage for the alignment that belongs to an index file.

    The alignment path is rebuilt from the directory and name components
    that splitPath returns for baiFile; the extension component is dropped.
    """
    directory, stem, _ = splitPath(baiFile)
    alignment_path = os.path.join(directory, stem)
    runStage('pileup', logger, options, reference, alignment_path, output)
Ejemplo n.º 22
0
def sortBam(bamFile, output, logger):
    """
    Invoke the 'sortBam' stage for one bam file.

    The stage is given the output path with its extension component (as
    split by splitPath) dropped, not `output` itself.
    """
    directory, stem, _ = splitPath(output)
    output_without_ext = os.path.join(directory, stem)
    runStage('sortBam', logger, options, bamFile, output_without_ext)
Ejemplo n.º 23
0
def pileup(baiFile, output, logger):
    """
    Invoke the 'pileup' stage for the alignment that belongs to an index file.

    The alignment path is rebuilt from the directory and name components
    that splitPath returns for baiFile; the extension component is dropped.
    """
    directory, stem, _ = splitPath(baiFile)
    alignment_path = os.path.join(directory, stem)
    runStage('pileup', logger, options, reference, alignment_path, output)
Ejemplo n.º 24
0
def sortBam(bamFile, output, logger):
    """
    Invoke the 'sortBam' stage for one bam file.

    The stage is given the output path with its extension component (as
    split by splitPath) dropped, not `output` itself.
    """
    directory, stem, _ = splitPath(output)
    output_without_ext = os.path.join(directory, stem)
    runStage('sortBam', logger, options, bamFile, output_without_ext)
Ejemplo n.º 25
0
# Convert BAM alignment to pileup format.
#@transform(sortBam, suffix('.bam'), '.full.pileup', logger)
@transform(sortBam, regex(r'(.+)\.sorted\.bam'), r'\1.full.pileup', logger)
def pileupFull(bamFile, output, logger):
    """
    Invoke the 'pileupFull' stage for one bam file.

    The decorator maps inputs matching '<stem>.sorted.bam' to
    '<stem>.full.pileup'. The stage receives the module-level `options` and
    `reference` together with the input and output paths.
    """
    stage_name = 'pileupFull'
    runStage(stage_name, logger, options, reference, bamFile, output)


# Call SNPs
#@transform(sortBam, suffix('.bam'), '.var.pileup', logger)
@transform(sortBam, regex(r'(.+)\.sorted\.bam'), r'\1.var.pileup', logger)
def callSNPs(bamFile, output, logger):
    """
    Invoke the 'callSNPs' stage for one bam file.

    The decorator maps inputs matching '<stem>.sorted.bam' to
    '<stem>.var.pileup'. The stage receives the module-level `options` and
    `reference` together with the input and output paths.
    """
    stage_name = 'callSNPs'
    runStage(stage_name, logger, options, reference, bamFile, output)


# Split the first entry of `sequences`; its `prefix` component is the
# directory searched for full pileups on the next line.
(prefix, seqName, ext) = splitPath(sequences[0])
# NOTE: the glob is evaluated when this statement runs, so `fullPileups`
# lists only the '*.full.pileup' files present in `prefix` at that moment.
fullPileups = glob.glob(os.path.join(prefix, '*.full.pileup'))


def filterInputs():
    for chromName, chromLength in chromosomes:
        for fullPileup in fullPileups:
            (prefix, seqName, ext) = splitPath(fullPileup)
            varPileup = os.path.join(prefix, seqName + '.var.pileup')
            output = os.path.join(
                prefix, seqName + '.' + chromName + 'var_filtered.pileup')
            print([
                fullPileup, output, varPileup, chromName, chromLength, logger
            ])
            yield ([
                fullPileup, output, varPileup, chromName, chromLength, logger