Example #1
0
def sort(file_name, sorted_prefix=None):
    """ Sorts the bam file given by file_name with `samtools sort`.

    The sorted output is written to sorted_prefix + '.bam'. When
    sorted_prefix is None it defaults to file_name with a trailing '.bam'
    removed and '_sorted' appended. No index is built by this function.
    """
    if sorted_prefix is None:
        # Strip only a trailing '.bam'. str.replace would also delete '.bam'
        # from directory components and silently redirect the output.
        if file_name.endswith('.bam'):
            base_name = file_name[:-len('.bam')]
        else:
            base_name = file_name
        sorted_prefix = base_name + '_sorted'

    sorted_name = sorted_prefix + ".bam"
    log_subprocess.check_call(['samtools', 'sort', '-o', sorted_name, file_name])
Example #2
0
def output_as_tsv(vcf_path, out_path, output_gt_info=False):
    """ Outputs all of the information from the vcf file as one big tsv.

    Runs `vcf2tsv` on vcf_path and redirects its stdout to out_path, which
    is created or truncated. When output_gt_info is true, '-g' is passed
    to vcf2tsv as well.
    """
    cmd = ['vcf2tsv', vcf_path]
    if output_gt_info:
        cmd.append('-g')

    # The context manager closes the output file even if vcf2tsv fails.
    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(cmd, stdout=out_file)
Example #3
0
def output_setdiff_vcfs(vcf1_path, vcf2_path, genome, out_path):
    """ Outputs a VCF file which contains variants in the first but not the second VCF file.

    Runs `vcfintersect` with '-v' and writes its stdout to out_path. The
    reference fasta is looked up as FASTA_LOCATION + genome + '/' + genome + '.fa'.
    """
    ref_path = FASTA_LOCATION + genome + '/' + genome + '.fa'
    cmd = ['vcfintersect', vcf1_path, '-i', vcf2_path, '-v', '-r', ref_path]

    # The context manager closes the output file even if vcfintersect fails.
    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(cmd, stdout=out_file)
Example #4
0
def output_intersect_vcfs(vcf1_path, vcf2_path, genome, out_path):
    """ Outputs a vcf which is the intersection of the two given vcfs.

    Runs `vcfintersect` and writes its stdout to out_path. The reference
    fasta is looked up as FASTA_LOCATION + genome + '/' + genome + '.fa'.
    """
    ref_path = FASTA_LOCATION + genome + '/' + genome + '.fa'
    cmd = ['vcfintersect', vcf1_path, '-i', vcf2_path, '-r', ref_path]

    # The context manager closes the output file even if vcfintersect fails.
    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(cmd, stdout=out_file)
Example #5
0
def sort_tabix_gtf(input_gtf, output_gtf):
    ''' Sort, block-gzip and tabix-index a GTF.

    The input is streamed through `sort -k1,1 -k4,5n` and `bgzip -c` into
    output_gtf, which is then indexed with `tabix -p gff`. Inputs whose
    name ends in ".gz" are read with zcat, everything else with cat.
    NOTE: the sort discards the conventional ordering of
    genes/transcripts/exons/etc.
    NOTE(review): both commands run through the shell with the paths
    interpolated unquoted -- paths containing spaces or shell
    metacharacters will not work.'''
    if input_gtf.endswith(".gz"):
        reader = "zcat"
    else:
        reader = "cat"

    sort_and_compress = '{0} {1} | sort -k1,1 -k4,5n | bgzip -c > {2}'.format(
        reader, input_gtf, output_gtf)
    log_subprocess.check_call(sort_and_compress, shell=True)

    build_index = "tabix -p gff {0}".format(output_gtf)
    log_subprocess.check_call(build_index, shell=True)
Example #6
0
def output_restrict_location_vcf(vcf_path, bed_path, genome, out_path):
    """ Outputs a vcf restricted to the locations specified in the given bed file.

    Runs `vcfintersect -b bed_path` on vcf_path and writes its stdout to
    out_path. The command line is echoed to stdout before it runs. The
    reference fasta is looked up as FASTA_LOCATION + genome + '/' + genome + '.fa'.
    """
    ref_path = FASTA_LOCATION + genome + '/' + genome + '.fa'
    cmd = ['vcfintersect', '-b', bed_path, '-r', ref_path, vcf_path]

    # Single-argument call form prints the same text on Python 2 and 3.
    print(' '.join(cmd))

    # The context manager closes the output file even if vcfintersect fails.
    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(cmd, stdout=out_file)
Example #7
0
def sort_by_name(file_name, sorted_prefix=None):
    """ Sorts a bam file by the read name, for paired-end.

    When sorted_prefix is None it defaults to file_name with a trailing
    '.bam' removed and '_namesorted' appended. Returns a pysam.Samfile
    opened for reading on sorted_prefix + '.bam'.
    """
    if sorted_prefix is None:
        # Strip only a trailing '.bam'. str.replace would also delete '.bam'
        # from directory components and silently redirect the output.
        if file_name.endswith('.bam'):
            base_name = file_name[:-len('.bam')]
        else:
            base_name = file_name
        sorted_prefix = base_name + '_namesorted'

    sorted_name = sorted_prefix + '.bam'
    # NOTE -- need to update our internal samtools in order to use pysam.sort
    # NOTE(review): this is the legacy `samtools sort <in.bam> <out.prefix>`
    # form, which presumably relies on the bundled samtools being an old
    # release -- confirm before upgrading samtools.
    log_subprocess.check_call(['samtools', 'sort', '-n', file_name, sorted_prefix])

    return pysam.Samfile(sorted_name, 'rb')
Example #8
0
def concatenate(out_file_name, all_in_file_names):
    """ Concatenate a list of bam files into a final output file.

    Inputs for which bam_is_empty() is true are dropped first, because they
    cause `samtools cat` to generate a BAM with a premature end block.
    With two or more remaining inputs `samtools cat` is run; with exactly
    one, that file is copied; with none, the first of the original inputs
    is copied instead.
    """
    non_empty = [path for path in all_in_file_names if not bam_is_empty(path)]

    if not non_empty:
        # Every input is empty: any one of them is a valid result.
        shutil.copy(all_in_file_names[0], out_file_name)
        return

    if len(non_empty) == 1:
        # Nothing to concatenate, a plain copy is enough.
        shutil.copy(non_empty[0], out_file_name)
        return

    cat_cmd = ['samtools', 'cat', '-o', out_file_name] + non_empty
    log_subprocess.check_call(cat_cmd)
Example #9
0
def _merge_by_tag(output_bam, input_bams, tag, name=False, threads=1):
    """ Merge input_bams into output_bam with `samtools merge`.

    The input names are passed through a temporary file-of-filenames
    (output_bam + ".fofn", given to samtools with -b) because the merge
    call can fail if the total length of the command line gets too long.

    tag, when not None, is passed as `-t <tag>`; name adds `-n`; threads
    greater than 1 adds `-@ <threads - 1>`.

    Raises RuntimeError if the file-of-filenames already exists. The file
    is removed again whether or not the merge succeeds.
    """
    fofn = output_bam + ".fofn"
    if os.path.exists(fofn):
        raise RuntimeError("{} already exists".format(fofn))

    args = ["samtools", "merge", "-c", "-p", "-s", "0"]
    if threads > 1:
        # the -@ specifies additional threads
        args.extend(["-@", str(threads - 1)])
    if name:
        args.append("-n")
    if tag is not None:
        args.extend(["-t", str(tag)])
    args.extend(["-b", fofn, output_bam])

    try:
        with open(fofn, "w") as fh:
            fh.write("\n".join(input_bams))
        log_subprocess.check_call(args)
    finally:
        # Always clean up: a leftover fofn would make every retry fail with
        # the "already exists" error above.
        if os.path.exists(fofn):
            os.remove(fofn)
Example #10
0
def sort_unique_tabix_vcf(vcf):
    ''' Sort, uniqueify, non-destructively bgzip, and tabix a VCF.

    The VCF at path vcf is piped through vcfstreamsort and vcfuniq into a
    temporary file, which then overwrites vcf in place. Afterwards
    vcf + '.gz' is written with `bgzip -c` (the uncompressed file is
    kept) and indexed with `tabix -p vcf`.
    NOTE(review): every command runs through the shell with the path
    interpolated unquoted -- paths containing spaces or shell
    metacharacters will not work.'''
    # Remove only a trailing '.vcf'. str.rstrip('.vcf') strips any run of the
    # characters '.', 'v', 'c', 'f' and would eat into the base name
    # (e.g. 'pacific.vcf' -> 'pacifi').
    suffix = '.vcf'
    if vcf.endswith(suffix):
        base = vcf[:-len(suffix)]
    else:
        base = vcf
    tmp = base + '.tmp.sorted.unique.vcf'

    log_subprocess.check_call('cat {0} | vcfstreamsort | vcfuniq > {1}'.format(
        vcf, tmp),
                              shell=True)
    subprocess.check_call('cp {0} {1}'.format(tmp, vcf), shell=True)
    subprocess.check_call('rm -f {0}'.format(tmp), shell=True)
    log_subprocess.check_call("bgzip -c {0} > {0}.gz".format(vcf), shell=True)
    log_subprocess.check_call("tabix -p vcf {0}.gz".format(vcf), shell=True)
Example #11
0
def bwa_index_ref(ref_fasta):
    """ Builds the bwa index for a reference.

    ref_fasta is the path to the reference fasta. Runs
    `bwa index -a bwtsw` on it. The index only has to be built once per
    reference; the index files end up in the same directory as the
    reference.
    """
    index_cmd = ['bwa', 'index', '-a', 'bwtsw', ref_fasta]
    log_subprocess.check_call(index_cmd)
Example #12
0
def bwa_align_unpaired(ref_fasta,
                       read_fastq,
                       out_name,
                       algorithm='ALN',
                       max_hits=None,
                       read_group_header=None,
                       num_threads=24):
    """ Runs the bwa aligner on reads without using paired information.

    ref_fasta         -- path of the reference passed to bwa
    read_fastq        -- a single fastq path (must not be a list)
    out_name          -- path of the resulting bam; out_name + '.sam' (and
                         out_name + '.sai' for ALN) are used as temp files
    algorithm         -- 'MEM' (bwa mem) or 'ALN' (bwa aln + bwa samse)
    max_hits          -- if truthy, passed to `bwa samse -n`; ALN only
    read_group_header -- passed to `bwa mem -R`; defaults to
                         tk_bam.make_rg_header(). Not used by ALN.
    num_threads       -- passed to bwa via -t

    Raises NotSupportedException for any other algorithm. Temp files are
    removed only when every step succeeds.
    """
    assert not isinstance(read_fastq, list)

    if read_group_header is None:
        read_group_header = tk_bam.make_rg_header()

    if algorithm == 'MEM':
        # Temp file names
        sam_name = out_name + '.sam'

        # Context managers close the handles even if bwa fails.
        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call([
                'bwa', 'mem', '-t',
                str(num_threads), '-M', '-R', read_group_header, ref_fasta,
                read_fastq
            ],
                                      stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Remove temp files
        subprocess.check_call(['rm', sam_name])

    elif algorithm == 'ALN':
        # Temp file names
        sam_name = out_name + '.sam'
        index_name = out_name + '.sai'

        with open(index_name, 'w') as index_file:
            log_subprocess.check_call(
                ['bwa', 'aln', '-t',
                 str(num_threads), ref_fasta, read_fastq],
                stdout=index_file)

        samse_cmd = ['bwa', 'samse']
        if max_hits:
            samse_cmd.extend(['-n', str(max_hits)])
        samse_cmd.extend([ref_fasta, index_name, read_fastq])

        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call(samse_cmd, stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Remove temp files
        subprocess.check_call(['rm', index_name])
        subprocess.check_call(['rm', sam_name])
    else:
        raise NotSupportedException('Unsupported bwa algorithm: ' + algorithm)
Example #13
0
def bwa_align_paired(ref_fasta,
                     read_fastq,
                     out_name,
                     algorithm='ALN',
                     max_hits=None,
                     read_group_header=None,
                     num_threads=24):
    """Runs bwa paired-end aligner on reads using paired-end information.

    Algorithm choices are currently
    MEM: Maximal Exact Matching (better for longer reads)
    ALN: Better for shorter reads
    Haven't yet implemented BWA-SW

    ref_fasta         -- path of the reference passed to bwa
    read_fastq        -- path of an interleaved fastq (the reads of a pair
                         alternate). For MEM only, a list of exactly two
                         fastq paths is also accepted.
    out_name          -- path of the resulting bam. MEM also writes the
                         stderr of samtools to out_name + '_ERRORS'; ALN
                         uses out_name + '1.fastq', '2.fastq', '1.sai',
                         '2.sai' and '.sam' as temp files.
    max_hits          -- if truthy, passed to `bwa sampe -n`; ALN only
    read_group_header -- passed to `bwa mem -R`; defaults to
                         tk_bam.make_rg_header(). Not used by ALN.
    num_threads       -- passed to bwa via -t

    Raises NotSupportedException for any other algorithm, and
    subprocess.CalledProcessError if `bwa mem` exits non-zero.
    """
    if read_group_header is None:
        read_group_header = tk_bam.make_rg_header()

    if algorithm == 'MEM':
        if isinstance(read_fastq, list):
            assert (len(read_fastq) == 2)
            pairing_flags = []
            fastq_inputs = [read_fastq[0], read_fastq[1]]
        else:
            # A single input is passed with -p (interleaved pairs).
            pairing_flags = ['-p']
            fastq_inputs = [read_fastq]

        mem_cmd = (['bwa', 'mem'] + pairing_flags + [
            '-t', str(num_threads), '-M', '-R', read_group_header, ref_fasta
        ] + fastq_inputs)

        # stderr of samtools goes to a side file rather than the log.
        with open(out_name, 'w') as out_file, \
                open(out_name + '_ERRORS', 'w') as errors_file:
            ps = log_subprocess.Popen(mem_cmd, stdout=subprocess.PIPE)
            try:
                log_subprocess.check_call(['samtools', 'view', '-bSh', '-'],
                                          stdin=ps.stdout,
                                          stdout=out_file,
                                          stderr=errors_file)
            finally:
                # Drop our copy of the pipe and reap bwa so that it neither
                # lingers as a zombie nor blocks on a full pipe.
                ps.stdout.close()
                mem_status = ps.wait()

        # A failed bwa mem would otherwise leave a silently truncated bam.
        if mem_status != 0:
            raise subprocess.CalledProcessError(mem_status, mem_cmd)

    elif algorithm == 'ALN':
        # Temp file names
        temp_fastq_name1 = out_name + '1.fastq'
        temp_fastq_name2 = out_name + '2.fastq'
        index_name_1 = out_name + '1.sai'
        index_name_2 = out_name + '2.sai'
        sam_name = out_name + '.sam'

        # Create the temp non-interleaved files
        with open(read_fastq, 'r') as in_fastq, \
                open(temp_fastq_name1, 'w') as temp_fastq1, \
                open(temp_fastq_name2, 'w') as temp_fastq2:
            tk_fasta.uninterleave_fastq(in_fastq, temp_fastq1, temp_fastq2)

        # Create the bwa index files, one per mate
        mates = ((temp_fastq_name1, index_name_1),
                 (temp_fastq_name2, index_name_2))
        for fastq_name, index_name in mates:
            with open(index_name, 'w') as index_file:
                log_subprocess.check_call([
                    'bwa', 'aln', '-t',
                    str(num_threads), ref_fasta, fastq_name
                ],
                                          stdout=index_file)

        # Pair the two alignments into a SAM file
        sampe_cmd = ['bwa', 'sampe']
        if max_hits:
            sampe_cmd.extend(['-n', str(max_hits)])
        sampe_cmd.extend([
            ref_fasta, index_name_1, index_name_2, temp_fastq_name1,
            temp_fastq_name2
        ])
        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call(sampe_cmd, stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Clean up temporary files
        for temp_name in (temp_fastq_name1, temp_fastq_name2, index_name_1,
                          index_name_2, sam_name):
            subprocess.check_call(['rm', temp_name])
    else:
        raise NotSupportedException('Unsupported bwa algorithm: ' + algorithm)
Example #14
0
def remove_dups(in_name, out_name):
    """ Removes paired-end duplicates using samtools.

    Runs `samtools rmdup` with in_name as input and out_name as output.
    """
    rmdup_cmd = ['samtools', 'rmdup', in_name, out_name]
    log_subprocess.check_call(rmdup_cmd)
Example #15
0
def merge_by_name(out_file_name, in_file_names):
    """ Merge name-sorted bam files into a bam file sorted by name.

    Runs `samtools merge -n` with out_file_name as the output, followed by
    every entry of in_file_names as an input.
    """
    merge_cmd = ['samtools', 'merge', '-n', out_file_name] + list(in_file_names)
    log_subprocess.check_call(merge_cmd)
Example #16
0
def split_alt_alleles_vcf(vcf_path, out_path):
    """ Splits records with more than one ALT field into two.

    Runs `vcfbreakmulti` on vcf_path and writes its stdout to out_path,
    which is created or truncated.
    """
    # The context manager closes (and thereby flushes) the output file,
    # including when vcfbreakmulti fails.
    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(['vcfbreakmulti', vcf_path], stdout=out_file)
Example #17
0
def output_primitives_vcf(vcf_path, out_path):
    """ Decomposes all complex variants into SNP and indel primitives.

    Runs `vcfallelicprimitives` on vcf_path and writes its stdout to
    out_path, which is created or truncated.
    """
    # The context manager closes (and thereby flushes) the output file,
    # including when vcfallelicprimitives fails.
    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(['vcfallelicprimitives', vcf_path],
                                  stdout=out_file)