Example #1
0
def indexRefbwa(inputs, outputs):
    """
    Index reference file for use with BWA.

    inputs is used as the reference file and outputs as the flag file;
    both are handed to the 'indexReferenceBWA' stage.
    """
    reference, flag = inputs, outputs
    runStageCheck('indexReferenceBWA', flag, reference)
Example #2
0
def fastqc(inputs, outputs):
    """
    Run FastQC on each fastq file.

    inputs is the sequence file; outputs unpacks to (destination, flag
    file). The destination taken from outputs is not forwarded: the stage
    is given the module-level fastqc_dir instead.
    """
    reads = inputs
    _dest, flag = outputs
    runStageCheck('fastqc', flag, fastqc_dir, reads)
Example #3
0
def tumour_sam_to_bam(inputs, outputs):
    """
    Run the 'tumour_sam_to_bam' stage on the tumour SAM file.

    inputs unpacks to (sam file, success marker); outputs unpacks to
    (bam file, flag file). The BAM path handed to the stage is derived
    from the SAM path (everything before the last '.sam', plus '.bam');
    the bam entry of outputs is not used.

    Raises ValueError if the SAM path does not contain '.sam'.
    """
    samFile, _success = inputs
    _declared_bam, flagFile = outputs

    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    match = re.search(r'(.*)\.sam', samFile)
    if match is None:
        raise ValueError("expected a path containing '.sam', got: %s" % samFile)
    bamFile = match.group(1) + '.bam'
    runStageCheck('tumour_sam_to_bam', flagFile, samFile, bamFile)
Example #4
0
def simulate_variants(inputs, outputs):
    """
    Run the 'simulate_variants' stage.

    inputs unpacks to (variant fasta path, success marker); outputs unpacks
    to (fasta output, flag file). The variant fasta path is reduced to
    '<directory>/<name>' (the part matched before '.fa') before being handed
    to the stage together with the module-level javasim_libdir,
    javasim_bindir and ref_fasta. The fasta entry of outputs is not used.

    Raises ValueError if the path is not of the form '<dir>/<name>.fa'.
    """
    variant_fasta, _success = inputs
    _fasta_out, flagFile = outputs

    # Raw string: '\/' and '\.' in a plain literal are invalid escapes.
    match = re.search(r'(.*)\/([a-zA-Z0-9_\.]+)\.fa', variant_fasta)
    if match is None:
        raise ValueError(
            "expected a path like '<dir>/<name>.fa', got: %s" % variant_fasta)
    variant_fasta = '%s/%s' % (match.group(1), match.group(2))
    runStageCheck('simulate_variants', flagFile, javasim_libdir,
                  javasim_bindir, ref_fasta, variant_fasta)
Example #5
0
def tumour_sort_bam(inputs, outputs):
    """
    Run the 'tumour_sort_bam' stage on the tumour BAM file.

    inputs unpacks to (bam file, success marker); outputs unpacks to
    (sorted bam file, flag file). The second stage argument is derived from
    the input path (everything before the last '.bam', plus '.sorted.bam');
    the sorted-bam entry of outputs is not used.

    Raises ValueError if the input path does not contain '.bam'.
    """
    bamFile, _success = inputs
    _declared_sorted, flagFile = outputs

    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    match = re.search(r'(.*)\.bam', bamFile)
    if match is None:
        raise ValueError("expected a path containing '.bam', got: %s" % bamFile)
    sortBamPrefix = match.group(1) + '.sorted.bam'
    runStageCheck('tumour_sort_bam', flagFile, bamFile, sortBamPrefix)
Example #6
0
def fastqc_trimmed(inputs, outputs):
    """
    Run FastQC on each trimmed paired fastq file.

    inputs is the sequence file; outputs unpacks to (destination, flag
    file). The stage is given the module-level fastqc_dir rather than the
    destination from outputs.
    """
    trimmed_reads = inputs
    _dest, flag = outputs
    runStageCheck('fastqc', flag, fastqc_dir, trimmed_reads)
Example #7
0
def indexRefSamtools(inputs, outputs):
    """
    Index reference file for use with samtools.

    inputs is the reference file; outputs unpacks to (output, flag file).
    Only the flag file and the reference reach the 'indexReferenceSAM'
    stage.
    """
    reference = inputs
    _index_out, flag = outputs
    runStageCheck('indexReferenceSAM', flag, reference)
Example #8
0
def igvcountDedupedBams(inputs, outputs):
    """
    Use igvtools count to create a .tdf file for the deduped bam files, to
    improve viewing of the bam coverage in igv. Note that this actually
    goes from the fixMate-ed bams.

    inputs unpacks to (bam, success marker); outputs to (tdf file, flag file).
    """
    bam_path, _ok = inputs
    tdf_path, flag = outputs
    print("igvtools count on %s" % os.path.basename(bam_path))
    runStageCheck('igvcount', flag, bam_path, tdf_path)
Example #9
0
def vcfIndexIndels(inputs, outputs):
    """
    Use bgzip and tabix to prepare raw indels vcf for vcftools handling.

    inputs unpacks to (vcf, index, success marker); outputs to
    (zip file, tabix index, flag file). Only the flag file and the vcf are
    handed to the 'indexVCF' stage.
    """
    vcf_path, _vcf_idx, _ok = inputs
    _zipped, _tabix, flag = outputs
    print("bgzip and tabix (for vcftools) on %s" % vcf_path)
    runStageCheck('indexVCF', flag, vcf_path)
Example #10
0
def igvcountRecalibratedBams(inputs, outputs):
    """
    Use igvtools count to create a .tdf file for the recalibrated bam
    files, to improve viewing of the bam coverage in igv.

    inputs unpacks to (bam, success marker); outputs to (tdf file, flag file).
    """
    bam_path, _ok = inputs
    tdf_path, flag = outputs
    print("igvtools count on %s" % os.path.basename(bam_path))
    runStageCheck('igvcount', flag, bam_path, tdf_path)
Example #11
0
def igvcountDedupedBams(inputs, outputs):
    """
    Use igvtools count to create a .tdf file for the deduped bam files, to
    improve viewing of the bam coverage in igv. Note that this actually
    goes from the fixMate-ed bams.

    inputs unpacks to (bam, success marker); outputs to (tdf file, flag file).
    """
    source_bam, _done = inputs
    coverage_out, flag = outputs
    shown_name = os.path.basename(source_bam)
    print("igvtools count on %s" % shown_name)
    runStageCheck('igvcount', flag, source_bam, coverage_out)
Example #12
0
def indexRealignedBams(inputs, outputs):
    """
    Index the locally realigned bams using samtools.

    inputs unpacks to (bam, success marker); outputs to (output, flag file).
    Only the flag file and the bam are handed to the 'indexBam' stage.
    """
    bam_path, _ok = inputs
    _index_out, flag = outputs
    print("samtools index on %s" % os.path.basename(bam_path))
    runStageCheck('indexBam', flag, bam_path)
Example #13
0
def indexRealignedBams(inputs, outputs):
    """
    Index the locally realigned bams using samtools.

    Here inputs is the bam path itself (not a pair); outputs unpacks to
    (output, flag file). Only the flag file and the bam are handed to the
    'indexBam' stage.
    """
    bam_path = inputs
    _index_out, flag = outputs
    print("samtools index on %s" % os.path.basename(bam_path))
    runStageCheck('indexBam', flag, bam_path)
Example #14
0
def vcfIndexIndels(inputs, outputs):
    """
    Use bgzip and tabix to prepare raw indels vcf for vcftools handling.

    inputs unpacks to (vcf, index, success marker); outputs to
    (zip file, tabix index, flag file). Only the flag file and the vcf are
    handed to the 'indexVCF' stage.
    """
    raw_vcf, _raw_idx, _done = inputs
    _bgzipped, _tbi, flag = outputs
    message = "bgzip and tabix (for vcftools) on %s" % raw_vcf
    print(message)
    runStageCheck('indexVCF', flag, raw_vcf)
Example #15
0
def countRunBam(inputs, outputs):
    """
    Run samtools flagstat on the initial per-lane, per-run bam file.

    inputs unpacks to (bam, success marker); outputs to
    (flagstat output, flag file).
    """
    bam_path, _ok = inputs
    stats_out, flag = outputs
    print("Running samtools flagstat on %s" % bam_path)
    runStageCheck('flagstat', flag, bam_path, stats_out)
Example #16
0
def countDedupedBam(inputs, outputs):
    """
    Run samtools flagstat on the deduped bam file.

    inputs unpacks to (bam, success marker); outputs to
    (flagstat output, flag file).
    """
    bam_path, _ok = inputs
    stats_out, flag = outputs
    print("Running samtools flagstat on %s" % bam_path)
    runStageCheck('flagstat', flag, bam_path, stats_out)
Example #17
0
def cleanup(inputs, outputs):
    """
    Run the 'cleanup' stage for the tumour and normal base names.

    inputs unpacks to (bam index, success marker), neither of which is
    forwarded; outputs is the flag file. The base names are built from the
    module-level outdir, tumour_outname and normal_outname.
    """
    _bam_index, _ok = inputs
    flag = outputs

    tumour_prefix = '%s/%s' % (outdir, tumour_outname)
    normal_prefix = '%s/%s' % (outdir, normal_outname)

    runStageCheck('cleanup', flag, tumour_prefix, normal_prefix)
Example #18
0
def countRunBam(inputs, outputs):
    """
    Run samtools flagstat on the initial per-lane, per-run bam file.

    inputs unpacks to (bam, success marker); outputs to
    (flagstat output, flag file).
    """
    run_bam, _done = inputs
    flagstat_out, flag = outputs
    message = "Running samtools flagstat on %s" % run_bam
    print(message)
    runStageCheck('flagstat', flag, run_bam, flagstat_out)
Example #19
0
def countDedupedBam(inputs, outputs):
    """
    Run samtools flagstat on the deduped bam file.

    inputs unpacks to (bam, success marker); outputs to
    (flagstat output, flag file).
    """
    deduped_bam, _done = inputs
    flagstat_out, flag = outputs
    message = "Running samtools flagstat on %s" % deduped_bam
    print(message)
    runStageCheck('flagstat', flag, deduped_bam, flagstat_out)
Example #20
0
def igvcountRecalibratedBams(inputs, outputs):
    """
    Use igvtools count to create a .tdf file for the recalibrated bam
    files, to improve viewing of the bam coverage in igv.

    inputs unpacks to (bam, success marker); outputs to (tdf file, flag file).
    """
    source_bam, _done = inputs
    coverage_out, flag = outputs
    shown_name = os.path.basename(source_bam)
    print("igvtools count on %s" % shown_name)
    runStageCheck('igvcount', flag, source_bam, coverage_out)
Example #21
0
def indexMergedBams(inputs, outputs):
    """
    Index the merged bams using samtools.

    inputs unpacks to (bam, success marker); outputs to (output, flag file).
    Only the flag file and the bam are handed to the 'indexBam' stage.
    """
    merged_bam, _ok = inputs
    _index_out, flag = outputs
    print("samtools index on %s" % os.path.basename(merged_bam))
    runStageCheck('indexBam', flag, merged_bam)
Example #22
0
def indexDedupedBams(inputs, outputs):
    """
    Index the de-duplicated bams using samtools. Note that this actually
    goes from the fixMate-ed bams.

    inputs unpacks to (bam, success marker); outputs to (output, flag file).
    Only the flag file and the bam are handed to the 'indexBam' stage.
    """
    deduped_bam, _ok = inputs
    _index_out, flag = outputs
    print("samtools index on %s" % os.path.basename(deduped_bam))
    runStageCheck('indexBam', flag, deduped_bam)
Example #23
0
def samToBam(inputs, outputs):
    """
    Convert sam to bam and sort, using Picard.

    outputs unpacks to (sorted bam, flag file); inputs to
    (sam, success marker).
    """
    sorted_bam, flag = outputs
    sam_path, _ok = inputs
    print("converting to sorted bam: %s" % os.path.basename(sam_path))
    runStageCheck('samToSortedBam', flag, sam_path, sorted_bam)
Example #24
0
def indexDedupedBams(inputs, outputs):
    """
    Index the de-duplicated bams using samtools. Note that this actually
    goes from the fixMate-ed bams.

    inputs unpacks to (bam, success marker); outputs to (output, flag file).
    Only the flag file and the bam are handed to the 'indexBam' stage.
    """
    source_bam, _done = inputs
    _bai, flag = outputs
    shown_name = os.path.basename(source_bam)
    print("samtools index on %s" % shown_name)
    runStageCheck('indexBam', flag, source_bam)
Example #25
0
def indexBaseQualRecalBam(inputs, outputs):
    """
    Index a bam using samtools via the 'indexBam' stage.

    inputs unpacks to (bam, success marker); outputs to (output, flag file).
    Only the flag file and the bam are handed to the stage.
    """
    bam_path, _recal_ok = inputs
    _index_out, flag = outputs
    print("samtools index on %s" % bam_path)
    runStageCheck('indexBam', flag, bam_path)
Example #26
0
def samToBam(inputs, outputs):
    """
    Convert sam to bam and sort, using Picard.

    outputs unpacks to (sorted bam, flag file); inputs to
    (sam, success marker).
    """
    bam_out, flag = outputs
    sam_in, _done = inputs
    shown_name = os.path.basename(sam_in)
    print("converting to sorted bam: %s" % shown_name)
    runStageCheck('samToSortedBam', flag, sam_in, bam_out)
Example #27
0
def BWAmem(inputs, outputs):
    """
    Align sequence reads to the reference genome using bwa mem.

    inputs unpacks to the two read files; outputs to (alignment output,
    flag file). The reference is ref_files['bwa_reference'].
    """
    reads_1, reads_2 = inputs
    aligned_out, flag = outputs
    print("bwa mem on %s" % os.path.basename(reads_1))
    bwa_ref = ref_files['bwa_reference']
    runStageCheck('BWAmem', flag, bwa_ref, reads_1, reads_2, aligned_out)
Example #28
0
def cmh2gwas(inputs, outputs):
    """
    Convert the results of the CMH test to GWAS format for viewing in IGV.

    inputs is the CMH results file; outputs unpacks to (gwas file, flag file).
    """
    cmh_results = inputs
    gwas_out, flag = outputs
    # NOTE(review): the log file is built but not passed to the stage;
    # the mkLogFile call is kept in case it has side effects -- confirm.
    _log_file = mkLogFile(logDir, cmh_results, '.gwas.log')
    print("convert CMH results to GWAS: %s" % os.path.basename(cmh_results))
    runStageCheck('cmh2gwas', flag, cmh_results, gwas_out)
Example #29
0
def mpileuptosync(inputs, outputs):
    """
    Convert mpileup to sync format for use in Popoolation2.

    inputs is the mpileup file; outputs unpacks to (sync file, flag file).
    """
    pileup_in = inputs
    sync_out, flag = outputs
    # NOTE(review): the log file is built but not passed to the stage;
    # the mkLogFile call is kept in case it has side effects -- confirm.
    _log_file = mkLogFile(logDir, pileup_in, '.sync.log')
    print("convert from mpileup to sync format: %s"
          % os.path.basename(pileup_in))
    runStageCheck('mpileuptosync', flag, pileup_in, sync_out)
Example #30
0
def mpileup(inputs, outputs):
    """
    Mpileup of dedupped, realigned bams - using samtools.

    inputs must unpack to exactly ten bam files; outputs unpacks to
    (mpileup file, flag file). The reference is
    ref_files['masked_reference'].
    """
    (b0, b1, b2, b3, b4,
     b5, b6, b7, b8, b9) = inputs
    pileup_out, flag = outputs
    # NOTE(review): the log file is built but not passed to the stage;
    # the mkLogFile call is kept in case it has side effects -- confirm.
    _log_file = mkLogFile(logDir, b0, '.mpileup.log')
    print("Make mpileup using Samtools: %s and other 9 files"
          % os.path.basename(b0))
    runStageCheck('mpileup', flag, ref_files['masked_reference'],
                  b0, b1, b2, b3, b4, b5, b6, b7, b8, b9,
                  pileup_out)
Example #31
0
def selectHighQualVariants(inputs, outputs):
    """
    Select high quality variants from the initial FreeBayes variant calls
    to act as input for Base Quality Score Recalibration.

    inputs is the vcf; outputs unpacks to (filtered vcf, flag file). The
    reference is ref_files['fasta_reference'].
    """
    vcf_in = inputs
    vcf_out, flag = outputs
    # NOTE(review): the log file is built but not passed to the stage;
    # the mkLogFile call is kept in case it has side effects -- confirm.
    _log_file = mkLogFile(logDir, vcf_in, '.highqual.log')
    print("select high quality variants using GATK SelectVariants: %s"
          % os.path.basename(vcf_in))
    runStageCheck('selectHighQualVariants', flag,
                  ref_files['fasta_reference'], vcf_in, vcf_out)
Example #32
0
def freebayes1(inputs, outputs):
    """
    First run of Freebayes, i.e. before Base Quality Score Recalibration.

    inputs must unpack to exactly ten bam files; outputs unpacks to
    (vcf, flag file). The log name and the progress message are based on
    the second bam. The reference is ref_files['fasta_reference'].
    """
    (b0, b1, b2, b3, b4,
     b5, b6, b7, b8, b9) = inputs
    vcf_out, flag = outputs
    # NOTE(review): the log file is built but not passed to the stage;
    # the mkLogFile call is kept in case it has side effects -- confirm.
    _log_file = mkLogFile(logDir, b1, '.freebayes.log')
    print("call variants using FreeBayes: %s" % os.path.basename(b1))
    runStageCheck('freebayes1', flag,
                  b0, b1, b2, b3, b4, b5, b6, b7, b8, b9,
                  ref_files['fasta_reference'], vcf_out)
Example #33
0
def filterIndels(inputs, outputs):
    """
    Use GATK VariantFiltration to filter raw INDEL calls.

    inputs unpacks to (vcf, index, success marker); outputs to
    (filtered vcf, index, flag file). The reference is
    ref_files['fasta_reference'].
    """
    raw_vcf, _raw_idx, _ok = inputs
    filtered_vcf, _filtered_idx, flag = outputs
    log_path = mkLogFile(logDir, raw_vcf, '.filterIndels.log')
    print("filtering indels from %s" % raw_vcf)
    runStageCheck('filterIndels', flag, ref_files['fasta_reference'],
                  raw_vcf, log_path, filtered_vcf)
Example #34
0
def dedup(inputs, outputs):
    """
    Remove apparent duplicates from merged bams using Picard MarkDuplicates.

    inputs unpacks to (bam, success marker); outputs to
    (deduped bam, flag file).
    """
    merged_bam, _ok = inputs
    deduped_bam, flag = outputs
    log_path = mkLogFile(logDir, merged_bam, '.dedup.log')
    print("de-duping %s" % os.path.basename(merged_bam))
    runStageCheck('dedup', flag, merged_bam, log_path, deduped_bam)
def realignIntervals(inputs, outputs):
    """
    Run GATK RealignTargetCreator to find suspect intervals for realignment.

    inputs unpacks to (bam, success marker); outputs to
    (intervals file, flag file). The reference and the two known-indel
    resources are read from ref_files.
    """
    bam_path, _ok = inputs
    intervals_out, flag = outputs
    log_path = mkLogFile(logDir, bam_path, '.realignIntervals.log')
    print("calculating realignment intervals for %s"
          % os.path.basename(bam_path))
    runStageCheck('realignIntervals', flag,
                  ref_files['fasta_reference'],
                  bam_path,
                  ref_files['indels_realign_goldstandard'],
                  ref_files['indels_realign_1000G'],
                  log_path, intervals_out)
Example #36
0
def filterHapVcfs(inputs, outputs):
    """
    Use GATK VariantFiltration to filter raw sample HAP calls.

    inputs unpacks to (vcf, index, success marker); outputs to
    (filtered vcf, index, flag file). The reference is the module-level
    fasta_reference; the log file uses the '.filterSNPs.log' suffix.
    """
    raw_vcf, _raw_idx, _ok = inputs
    filtered_vcf, _filtered_idx, flag = outputs
    log_path = mkLogFile(logDir, raw_vcf, '.filterSNPs.log')
    runStageCheck('filterHapVcfs', flag, fasta_reference, raw_vcf,
                  log_path, filtered_vcf)
Example #37
0
def callVariantRecalibrator(inputs, outputs):
    """
    Run the 'callVariantRecalibrator' stage on a vcf.

    inputs unpacks to (vcf, index, success marker); outputs to
    (recal file, tranches file, R output, flag file). The reference is the
    module-level fasta_reference; the hapmap, omnimap, 1kghc and dbsnp
    resources are read from ref_files.
    """
    vcf_in, _vcf_idx, _ok = inputs
    recal_out, tranches_out, r_out, flag = outputs
    log_path = mkLogFile(logDir, vcf_in, '.VarRecal.log')
    print("VariantRecalibrator -> %s" % vcf_in)
    runStageCheck('callVariantRecalibrator', flag, fasta_reference, vcf_in,
                  ref_files['hapmap'], ref_files['omnimap'],
                  ref_files['1kghc'], ref_files['dbsnp'],
                  recal_out, tranches_out, r_out, log_path)
Example #38
0
def callHAP(inputs, outputs):
    """
    Use GATK HaplotypeCaller to call SNPs/Indels from recalibrated bams.

    inputs unpacks to (bam, success marker); outputs to
    (vcf, index, flag file). The reference is the module-level
    fasta_reference and dbsnp comes from ref_files.
    """
    bam_path, _ok = inputs
    vcf_out, _vcf_idx, flag = outputs
    log_path = mkLogFile(logDir, bam_path, '.callHAPs.log')
    runStageCheck('callHAP', flag, fasta_reference, bam_path,
                  ref_files['dbsnp'], log_path, vcf_out)
Example #39
0
def baseQualRecalCount(inputs, outputs):
    """
    GATK CountCovariates, first step of base quality score recalibration.

    inputs unpacks to (bam, success marker); outputs to (csv, flag file).
    The reference and dbsnp are read from ref_files.
    """
    bam_path, _ok = inputs
    csv_out, flag = outputs
    log_path = mkLogFile(logDir, bam_path, '.baseQualRecalCount.log')
    print("count covariates using GATK for base quality score recalibration: %s"
          % os.path.basename(bam_path))
    runStageCheck('baseQualRecalCount', flag, bam_path,
                  ref_files['fasta_reference'], ref_files['dbsnp'],
                  log_path, csv_out)
Example #40
0
def callIndels(inputs, outputs):
    """
    Use GATK UnifiedGenotyper to call indels from recalibrated bams.

    inputs unpacks to (bam, success marker); outputs to
    (vcf, index, flag file). The reference and dbsnp are read from
    ref_files.
    """
    bam_path, _ok = inputs
    vcf_out, _vcf_idx, flag = outputs
    log_path = mkLogFile(logDir, bam_path, '.callIndels.log')
    print("calling Indels from %s" % bam_path)
    runStageCheck('callIndels', flag, ref_files['fasta_reference'],
                  bam_path, ref_files['dbsnp'], log_path, vcf_out)
Example #41
0
def dedup(inputs, outputs):
    """
    Remove apparent duplicates from merged bams using Picard MarkDuplicates.

    inputs unpacks to (bam, success marker); outputs to
    (deduped bam, flag file).
    """
    source_bam, _done = inputs
    result_bam, flag = outputs
    log_path = mkLogFile(logDir, source_bam, '.dedup.log')
    shown_name = os.path.basename(source_bam)
    print("de-duping %s" % shown_name)
    runStageCheck('dedup', flag, source_bam, log_path, result_bam)
Example #42
0
def getEnsemblAnnotations(inputs, outputs):
    """
    Annotate vcf using ENSEMBL variant effect predictor.

    inputs unpacks to (vcf, index, success marker); outputs to
    (annotation output, flag file).
    """
    vcf_path, _vcf_idx, _ok = inputs
    annotated_out, flag = outputs
    log_path = mkLogFile(logDir, vcf_path, '.EnsemblAnnotation.log')
    print("Annotating %s with ENSEMBL variant effect predictor"
          % os.path.basename(vcf_path))
    runStageCheck('annotateEnsembl', flag, vcf_path, annotated_out, log_path)
Example #43
0
def leftAlign(inputs, outputs):
    """
    GATK LeftAlignIndels is a tool that takes a bam file and left-aligns any
    indels inside it.

    Stage command, as recorded by the original author:
    'command': "java -Xmx22g -jar " + GATK_HOME + "GenomeAnalysisTK.jar -allowPotentiallyMisencodedQuals -T LeftAlignIndels -I %input -R %ref -o %output"

    inputs unpacks to (bam, success marker); outputs to
    (left-aligned bam, flag file). After the stage runs, remove_GATK_bai is
    called on the output bam.
    """
    bam_path, _realign_ok = inputs
    aligned_bam, flag = outputs
    runStageCheck('leftalignindels', flag, bam_path, fasta_reference,
                  aligned_bam)
    remove_GATK_bai(aligned_bam)
Example #44
0
def realignIntervals(inputs, outputs):
    """
    Run GATK RealignTargetCreator to find suspect intervals for realignment.

    inputs must unpack to exactly 24 bam files; outputs unpacks to
    (intervals file, flag file). The log name and the progress message are
    based on the first bam. The reference is ref_files['fasta_reference'].
    """
    (b0, b1, b2, b3, b4, b5, b6, b7,
     b8, b9, b10, b11, b12, b13, b14, b15,
     b16, b17, b18, b19, b20, b21, b22, b23) = inputs
    intervals_out, flag = outputs
    log_path = mkLogFile(logDir, b0, '.realignIntervals.log')
    print("calculating realignment intervals for %s" % os.path.basename(b0))
    runStageCheck('realignIntervals', flag, ref_files['fasta_reference'],
                  b0, b1, b2, b3, b4, b5, b6, b7,
                  b8, b9, b10, b11, b12, b13, b14, b15,
                  b16, b17, b18, b19, b20, b21, b22, b23,
                  log_path, intervals_out)
Example #45
0
def generate_normal_reads(ref_fasta, outputs):
    """
    Run the 'generate_reads_simseq' stage for the normal sample.

    ref_fasta is handed to the stage as the source fasta; outputs unpacks to
    (read-1 fastq, read-2 fastq, flag file). The output base name is built
    from the module-level outdir and normal_outname, and the read count is
    the module-level normal_reads.
    """
    _fq_r1, _fq_r2, flag = outputs
    normal_base = '%s/%s' % (outdir, normal_outname)
    # Normal component assuming half the "tumour" is normal.
    # NOTE(review): this value is never used below -- the stage is given
    # normal_reads instead; confirm which one is intended.
    half_tumour = int(tumour_cov / 2)
    cov = norm_cov + half_tumour
    runStageCheck('generate_reads_simseq', flag, simseq_dir, javasim_libdir,
                  read_len, frag_len, frag_std, ref_fasta, normal_reads,
                  normal_base)
Example #46
0
def callIndels(inputs, outputs):
    """
    Use GATK UnifiedGenotyper to call indels from recalibrated bams.

    inputs unpacks to (bam, success marker); outputs to
    (vcf, index, flag file). The reference and dbsnp are read from
    ref_files.
    """
    recal_bam, _done = inputs
    indel_vcf, _indel_idx, flag = outputs
    log_path = mkLogFile(logDir, recal_bam, '.callIndels.log')
    print("calling Indels from %s" % recal_bam)
    reference = ref_files['fasta_reference']
    known_sites = ref_files['dbsnp']
    runStageCheck('callIndels', flag, reference, recal_bam, known_sites,
                  log_path, indel_vcf)
Example #47
0
def filterIndels(inputs, outputs):
    """
    Use GATK VariantFiltration to filter raw INDEL calls.

    inputs unpacks to (vcf, index, success marker); outputs to
    (filtered vcf, index, flag file). The reference is
    ref_files['fasta_reference'].
    """
    vcf_in, _idx_in, _done = inputs
    vcf_out, _idx_out, flag = outputs
    log_path = mkLogFile(logDir, vcf_in, '.filterIndels.log')
    print("filtering indels from %s" % vcf_in)
    reference = ref_files['fasta_reference']
    runStageCheck('filterIndels', flag, reference, vcf_in, log_path, vcf_out)
Example #48
0
def getEnsemblAnnotations(inputs, outputs):
    """
    Annotate vcf using ENSEMBL variant effect predictor.

    inputs unpacks to (vcf, index, success marker); outputs to
    (annotation output, flag file).
    """
    vcf_in, _idx_in, _done = inputs
    annotation_out, flag = outputs
    log_path = mkLogFile(logDir, vcf_in, '.EnsemblAnnotation.log')
    shown_name = os.path.basename(vcf_in)
    print("Annotating %s with ENSEMBL variant effect predictor" % shown_name)
    runStageCheck('annotateEnsembl', flag, vcf_in, annotation_out, log_path)
Example #49
0
def collateReadCounts(inputs, outputs):
    """
    Collate read counts from samtools flagstat output into a table.

    inputs is not used. The flag file is the last element of outputs; the
    input and output directories are effectively hard-coded to the
    module-level sambam_dir and results_dir.
    """
    source_dir, dest_dir = sambam_dir, results_dir
    flag = outputs[-1]
    print("Collating read counts")
    runStageCheck('collateReadcounts', flag, source_dir, dest_dir)
Example #50
0
def mergeBams(inputs, outputs):
    """
    Merge the sorted bams together for each sample.
    Picard should cope correctly if there is only one input.

    inputs is an iterable of (bam, success marker) pairs; outputs unpacks to
    (merged bam, flag file). The bams are handed to the stage as a single
    space-separated string of 'INPUT=<bam>' arguments.
    """
    bam_paths = []
    for bam_path, _ok in inputs:
        bam_paths.append(bam_path)
    merged_bam, flag = outputs
    input_args = ' '.join("INPUT=%s" % path for path in bam_paths)
    shown_names = ",".join(os.path.basename(path) for path in bam_paths)
    print("merging %s into %s" % (shown_names, os.path.basename(merged_bam)))
    runStageCheck('mergeBams', flag, input_args, merged_bam)
Example #51
0
def realignIntervals(inputs, outputs):
    """
    Run GATK RealignTargetCreator to find suspect intervals for realignment.

    inputs unpacks to (bam, success marker); outputs to
    (intervals file, flag file). The reference and the two known-indel
    resources are read from ref_files.
    """
    source_bam, _done = inputs
    intervals_path, flag = outputs
    log_path = mkLogFile(logDir, source_bam, '.realignIntervals.log')
    shown_name = os.path.basename(source_bam)
    print("calculating realignment intervals for %s" % shown_name)
    reference = ref_files['fasta_reference']
    gold_indels = ref_files['indels_realign_goldstandard']
    kg_indels = ref_files['indels_realign_1000G']
    runStageCheck('realignIntervals', flag, reference, source_bam,
                  gold_indels, kg_indels, log_path, intervals_path)
Example #52
0
def realign(inputs, outputs):
    """
    Run GATK IndelRealigner for local realignment, using intervals found by
    realignIntervals.

    inputs unpacks to ((intervals, success marker), (bam,)); outputs to
    (realigned bam, flag file). After the stage runs, remove_GATK_bai is
    called on the output bam.
    """
    (intervals_path, _ok), (bam_in,) = inputs
    bam_out, flag = outputs
    log_path = mkLogFile(logDir, bam_in, '.realign.log')
    print("realigning %s" % os.path.basename(bam_in))
    reference = ref_files['fasta_reference']
    runStageCheck('realign', flag, reference, bam_in, intervals_path,
                  log_path, bam_out)
    remove_GATK_bai(bam_out)
Example #53
0
def create_tumour_mixture(inputs, outputs):
    """
    Run the 'create_tumour_mixture' stage.

    inputs unpacks to (read-1 fastq, read-2 fastq, success marker); outputs
    to (mixture read-1, mixture read-2, flag file). Only the read-1 path is
    inspected: it must look like '<dir>/<name>_R1.fq' (or '_R2'), and the
    tumour and mixture base names are built from '<dir>/<name>'. The normal
    base name comes from the module-level outdir and normal_outname.

    Raises ValueError if the read-1 path does not have the expected form.
    """
    fastq_r1, _fastq_r2, _success = inputs
    _mix_r1, _mix_r2, flagFile = outputs

    match = re.search(r'(.*)\/([a-zA-Z0-9_\.]+)_R(1|2)\.fq', fastq_r1)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError(
            "expected a path like '<dir>/<name>_R1.fq', got: %s" % fastq_r1)
    directory, sample = match.group(1), match.group(2)
    tumour_base = '%s/%s' % (directory, sample)
    norm_base = '%s/%s' % (outdir, normal_outname)
    mix_base = '%s/%s_mixture' % (directory, sample)

    runStageCheck('create_tumour_mixture', flagFile, tumour_base, norm_base,
                  mix_base)
Example #54
0
def finalDepthOfCoverage(inputs, outputs):
    """
    Use GATK DepthOfCoverage to get coverage statistics.

    inputs unpacks to (bam, success marker). The flag file is the last
    element of outputs; the output base name is the first element of
    outputs with its extension removed.
    """
    bam_path, _ok = inputs
    flag = outputs[-1]
    base_name, _ext = os.path.splitext(outputs[0])
    print("calculating coverage statistics using GATK DepthOfCoverage on %s"
          % bam_path)
    runStageCheck('depthOfCoverage', flag, ref_files['fasta_reference'],
                  bam_path, base_name)
Example #55
0
def baseQualRecalCount(inputs, outputs):
    """
    GATK CountCovariates, first step of base quality score recalibration.

    inputs unpacks to (bam, success marker); outputs to (csv, flag file).
    The reference and dbsnp are read from ref_files.
    """
    source_bam, _done = inputs
    covariates_csv, flag = outputs
    log_path = mkLogFile(logDir, source_bam, '.baseQualRecalCount.log')
    shown_name = os.path.basename(source_bam)
    print("count covariates using GATK for base quality score recalibration: %s"
          % shown_name)
    reference = ref_files['fasta_reference']
    known_sites = ref_files['dbsnp']
    runStageCheck('baseQualRecalCount', flag, source_bam, reference,
                  known_sites, log_path, covariates_csv)
Example #56
0
def baseQualRecalTabulate(inputs, outputs):
    """
    GATK TableRecalibration: recalibrate base quality scores using the
    output of CountCovariates.

    inputs unpacks to ((csv, success marker), (bam,)); outputs to
    (recalibrated bam, flag file). After the stage runs, remove_GATK_bai is
    called on the output bam.
    """
    (covariates_csv, _ok), (bam_in,) = inputs
    bam_out, flag = outputs
    log_path = mkLogFile(logDir, bam_in, '.baseQualRecalTabulate.log')
    print("recalibrate base quality scores using GATK on %s"
          % os.path.basename(bam_in))
    reference = ref_files['fasta_reference']
    runStageCheck('baseQualRecalTabulate', flag, bam_in, reference,
                  covariates_csv, log_path, bam_out)
    remove_GATK_bai(bam_out)
Example #57
0
def align_tumour_reads(inputs, outputs):
    """
    Align the tumour reads with the aligner selected by the module-level
    `aligner` setting ('bowtie' or 'bwa').

    inputs unpacks to (read-1 fastq, read-2 fastq, success marker); outputs
    to (sam file, flag file). The SAM path handed to the stage is built from
    the module-level outdir and tumour_outname; the sam entry of outputs is
    not used.

    Raises ValueError if `aligner` is neither 'bowtie' nor 'bwa'.
    """
    fastq_r1, fastq_r2, _success = inputs
    _declared_sam, flagFile = outputs
    outSam = '%s/%s.sam' % (outdir, tumour_outname)
    if aligner == 'bowtie':
        runStageCheck('align_tumour_reads_bowtie', flagFile, threads,
                      ref_fasta, fastq_r1, fastq_r2, outSam, tumour_outname,
                      tumour_outname)
    elif aligner == 'bwa':
        # Read-group header line: the sample tag is 'SM', not '@SM' -- only
        # the record type ('@RG') carries the '@' prefix.
        tumour_rg = '"@RG\\tID:%s\\tSM:%s"' % (tumour_outname, tumour_outname)
        runStageCheck('align_tumour_reads_bwa', flagFile, threads, ref_fasta,
                      fastq_r1, fastq_r2, tumour_rg, outSam)
    else:
        raise ValueError('Invalid aligner specified!')
Example #58
0
def generate_tumour_reads(inputs, outputs):
    """
    Run the 'generate_reads_simseq' stage for the tumour sample.

    inputs unpacks to (variant fasta path, success marker); outputs to
    (read-1 fastq, read-2 fastq, flag file). The variant fasta path must
    look like '<dir>/<name>.fa_reference.fa'; the output base name is
    '<dir>/<name>'. The number of reads is derived from the size of the
    variant fasta as reported by `wc`, the module-level frag_len and half of
    tumour_cov.

    Raises ValueError if the variant fasta path does not have the expected
    form.
    """
    variant_fasta, _success = inputs
    _fastq_r1, _fastq_r2, flagFile = outputs

    # Raw string: '\/' and '\.' in a plain literal are invalid escapes.
    match = re.search(r'(.*)\/([a-zA-Z0-9_\.]+)\.fa_reference.fa',
                      variant_fasta)
    if match is None:
        raise ValueError(
            "expected a path like '<dir>/<name>.fa_reference.fa', got: %s"
            % variant_fasta)
    tumour_out = '%s/%s' % (match.group(1), match.group(2))
    cov = int(tumour_cov / 2)  # assume variants occur only on one chromosome

    # have to tweak tumour reads based on generated reference genome
    proc = subprocess.Popen(["wc", variant_fasta], stdout=subprocess.PIPE)
    # communicate() reads the output, closes the pipe and waits for wc, so
    # no child process or file descriptor is left dangling.
    wc_stdout, _unused = proc.communicate()
    wc_out = wc_stdout.split()
    # third wc field is the byte count;
    # 4 = number of characters in the fasta header
    tum_chrom_len = int(wc_out[2]) - 4
    tumour_reads = (tum_chrom_len / frag_len) * cov

    runStageCheck('generate_reads_simseq', flagFile, simseq_dir,
                  javasim_libdir, read_len, frag_len, frag_std, variant_fasta,
                  tumour_reads, tumour_out)
Example #59
0
def bwaPE(inputs, outputs):
    """
    Aligns two paired-end fastq files to a reference genome to produce a sam
    file.

    inputs holds the two fastq files (sorted before use); outputs unpacks to
    (sam file, flag file). Read-group metadata (sample, run id, lane) is
    looked up in fastq_metadata under the base name of the first fastq.
    """
    first_fq, second_fq = sorted(inputs)
    sam_out, flag = outputs
    first_name = os.path.basename(first_fq)
    sample = fastq_metadata[first_name]['sample']
    run_id = fastq_metadata[first_name]['run_id']
    lane = fastq_metadata[first_name]['lane']
    read_group = {
        'PL': 'ILLUMINA',
        'SM': sample,
        'ID': "%s_%s_Lane%d" % (sample, run_id, lane),
    }
    read_group_str = make_metadata_string(read_group)
    second_name = os.path.basename(second_fq)
    print("bwa-mem on %s and %s" % (os.path.basename(first_fq), second_name))
    runStageCheck('bwaMemPE', flag, read_group_str,
                  ref_files['bwa_reference'], first_fq, second_fq, sam_out)
Example #60
0
def variants_to_bed(inputs, outputs):
    """
    Run the 'variants_to_bed' stage.

    inputs unpacks to (variant fasta, success marker); outputs to
    (bed file, flag file).
    """
    fasta_in, _ok = inputs
    bed_out, flag = outputs
    runStageCheck('variants_to_bed', flag, fasta_in, bed_out)