Example #1
def makeAggregate(cells, directory, suffix, output):
    """
    Create aggregate sample.

    Make an aggregate bam file from a list of cells, then sort and index
    the file for easy use in IGV. The suffix is required to prevent non
    0-padded numbers from matching the wrong files. Returns the final file name.

    Parameters
    ----------
    cells : list
        List of cell names to create aggregate from.
    directory : string
        Directory path with the bam files from each cell.
    suffix : string
        String matched at the end of the bam file name; used to add the file
        extension and to anchor it after the file number - this prevents
        cell_4 matching cell_4*.
    output : string
        String containing output file location.
    """
    import os
    import pysam
    from glob import glob
    cells = set(cells)
    fileList = []
    for cell in cells:
        fileList.append(glob(os.path.join(directory, "*" + cell + suffix))[0])
    pysam.cat("-o", output + ".bam", *fileList, catch_stdout=False)
    pysam.sort(output + ".bam", output + ".sorted", catch_stdout=False)
    pysam.index(output + ".sorted.bam", catch_stdout=False)

    return output + ".sorted.bam"
Example #2
def main():
    p = argparse.ArgumentParser()
    p.add_argument('input')
    p.add_argument('output')
    p.add_argument('-i', '--min-identity', default=0.90, help="""Remove reads
            which match reference with < [value] [default: %(default)s]""", type=float)
    p.add_argument('-b', '--blast-contaminants', action='append', default=[])
    a = p.parse_args()
    logging.basicConfig(level=logging.INFO, format='[%(name)s] %(message)s')
    logger = logging.getLogger('low_identity')

    blast_contaminants = set()
    for f in a.blast_contaminants:
        with open(f) as fp:
            blast_contaminants |= set(parse_blast_contaminants(fp))

    dropped = 0
    processed = 0
    with contextlib.closing(pysam.Samfile(a.input, 'rb')) as input_bam:
        with contextlib.closing(pysam.Samfile(a.output, 'wb', template=input_bam)) as output_bam:
            for read in input_bam:
                processed += 1

                if read.qname in blast_contaminants:
                    dropped += 1
                    continue

                pct_id = 1.0 - float(read.opt('NM')) / read.alen
                if pct_id < a.min_identity:
                    dropped += 1
                else:
                    output_bam.write(read)
    logger.info('Removed %d/%d [%0.2f%%]', dropped, processed, 100.0 * dropped / processed)
    pysam.index(a.output)
    logger.info("Indexed.")
Example #3
def processFiles(seqfile,threshold,width):
    # Need to keep this dictionary up-to-date with references you expect to see 
    gene_pos = {'1b_Con1_full_reference_seq':{'ns5b':{'nterm':7599,'cterm':9371,'seq':'TCGATGTCCTACACATGGACAGGCGCCCTGATCACGCCATGCGCTGCGGAGGAAACCAAGCTGCCCATCAATGCACTGAGCAACTCTTTGCTCCGTCACCACAACTTGGTCTATGCTACAACATCTCGCAGCGCAAGCCTGCGGCAGAAGAAGGTCACCTTTGACAGACTGCAGGTCCTGGACGACCACTACCGGGACGTGCTCAAGGAGATGAAGGCGAAGGCGTCCACAGTTAAGGCTAAACTTCTATCCGTGGAGGAAGCCTGTAAGCTGACGCCCCCACATTCGGCCAGATCTAAATTTGGCTATGGGGCAAAGGACGTCCGGAACCTATCCAGCAAGGCCGTTAACCACATCCGCTCCGTGTGGAAGGACTTGCTGGAAGACACTGAGACACCAATTGACACCACCATCATGGCAAAAAATGAGGTTTTCTGCGTCCAACCAGAGAAGGGGGGCCGCAAGCCAGCTCGCCTTATCGTATTCCCAGATTTGGGGGTTCGTGTGTGCGAGAAAATGGCCCTTTACGATGTGGTCTCCACCCTCCCTCAGGCCGTGATGGGCTCTTCATACGGATTCCAATACTCTCCTGGACAGCGGGTCGAGTTCCTGGTGAATGCCTGGAAAGCGAAGAAATGCCCTATGGGCTTCGCATATGACACCCGCTGTTTTGACTCAACGGTCACTGAGAATGACATCCGTGTTGAGGAGTCAATCTACCAATGTTGTGACTTGGCCCCCGAAGCCAGACAGGCCATAAGGTCGCTCACAGAGCGGCTTTACATCGGGGGCCCCCTGACTAATTCTAAAGGGCAGAACTGCGGCTATCGCCGGTGCCGCGCGAGCGGTGTACTGACGACCAGCTGCGGTAATACCCTCACATGTTACTTGAAGGCCGCTGCGGCCTGTCGAGCTGCGAAGCTCCAGGACTGCACGATGCTCGTATGCGGAGACGACCTTGTCGTTATCTGTGAAAGCGCGGGGACCCAAGAGGACGAGGCGAGCCTACGGGCCTTCACGGAGGCTATGACTAGATACTCTGCCCCCCCTGGGGACCCGCCCAAACCAGAATACGACTTGGAGTTGATAACATCATGCTCCTCCAATGTGTCAGTCGCGCACGATGCATCTGGCAAAAGGGTGTACTATCTCACCCGTGACCCCACCACCCCCCTTGCGCGGGCTGCGTGGGAGACAGCTAGACACACTCCAGTCAATTCCTGGCTAGGCAACATCATCATGTATGCGCCCACCTTGTGGGCAAGGATGATCCTGATGACTCATTTCTTCTCCATCCTTCTAGCTCAGGAACAACTTGAAAAAGCCCTAGATTGTCAGATCTACGGGGCCTGTTACTCCATTGAGCCACTTGACCTACCTCAGATCATTCAACGACTCCATGGCCTTAGCGCATTTTCACTCCATAGTTACTCTCCAGGTGAGATCAATAGGGTGGCTTCATGCCTCAGGAAACTTGGGGTACCGCCCTTGCGAGTCTGGAGACATCGGGCCAGAAGTGTCCGCGCTAGGCTACTGTCCCAGGGGGGGAGGGCTGCCACTTGTGGCAAGTACCTCTTCAACTGGGCAGTAAGGACCAAGCTCAAACTCACTCCAATCCCGGCTGCGTCCCAGTTGGATTTATCCAGCTGGTTCGTTGCTGGTTACAGCGGGGGAGACATATATCACAGCCTGTCTCGTGCCCGACCCCGCTGGTTCATGTGGTGCCTACTCCTACTTTCTGTAGGGGTAGGCATCTATCTACTCCCCAACCGA'}},
                '1a_H77_full_reference_seq':{'ns5b':{'nterm':7602,'cterm':9374, 'seq':'TCAATGTCTTATTCCTGGACAGGCGCACTCGTCACCCCGTGCGCTGCGGAAGAACAAAAACTGCCCATCAACGCACTGAGCAACTCGTTGCTACGCCATCACAATCTGGTGTATTCCACCACTTCACGCAGTGCTTGCCAAAGGCAGAAGAAAGTCACATTTGACAGACTGCAAGTTCTGGACAGCCATTACCAGGACGTGCTCAAGGAGGTCAAAGCAGCGGCGTCAAAAGTGAAGGCTAACTTGCTATCCGTAGAGGAAGCTTGCAGCCTGACGCCCCCACATTCAGCCAAATCCAAGTTTGGCTATGGGGCAAAAGACGTCCGTTGCCATGCCAGAAAGGCCGTAGCCCACATCAACTCCGTGTGGAAAGACCTTCTGGAAGACAGTGTAACACCAATAGACACTACCATCATGGCCAAGAACGAGGTTTTCTGCGTTCAGCCTGAGAAGGGGGGTCGTAAGCCAGCTCGTCTCATCGTGTTCCCCGACCTGGGCGTGCGCGTGTGCGAGAAGATGGCCCTGTACGACGTGGTTAGCAAGCTCCCCCTGGCCGTGATGGGAAGCTCCTACGGATTCCAATACTCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAAGTCCAAGAAGACCCCGATGGGGTTCTCGTATGATACCCGCTGTTTTGACTCCACAGTCACTGAGAGCGACATCCGTACGGAGGAGGCAATTTACCAATGTTGTGACCTGGACCCCCAAGCCCGCGTGGCCATCAAGTCCCTCACTGAGAGGCTTTATGTTGGGGGCCCTCTTACCAATTCAAGGGGGGAAAACTGCGGCTACCGCAGGTGCCGCGCGAGCGGCGTACTGACAACTAGCTGTGGTAACACCCTCACTTGCTACATCAAGGCCCGGGCAGCCTGTCGAGCCGCAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGACTTAGTCGTTATCTGTGAAAGTGCGGGGGTCCAGGAGGACGCGGCGAGCCTGAGAGCCTTCACGGAGGCTATGACCAGGTACTCCGCCCCCCCCGGGGACCCCCCACAACCAGAATACGACTTGGAGCTTATAACATCATGCTCCTCCAACGTGTCAGTCGCCCACGACGGCGCTGGAAAGAGGGTCTACTACCTTACCCGTGACCCTACAACCCCCCTCGCGAGAGCCGCGTGGGAGACAGCAAGACACACTCCAGTCAATTCCTGGCTAGGCAACATAATCATGTTTGCCCCCACACTGTGGGCGAGGATGATACTGATGACCCATTTCTTTAGCGTCCTCATAGCCAGGGATCAGCTTGAACAGGCTCTTAACTGTGAGATCTACGGAGCCTGCTACTCCATAGAACCACTGGATCTACCTCCAATCATTCAAAGACTCCATGGCCTCAGCGCATTTTCACTCCACAGTTACTCTCCAGGTGAAATCAATAGGGTGGCCGCATGCCTCAGAAAACTTGGGGTCCCGCCCTTGCGAGCTTGGAGACACCGGGCCCGGAGCGTCCGCGCTAGGCTTCTGTCCAGAGGAGGCAGGGCTGCCATATGTGGCAAGTACCTCTTCAACTGGGCAGTAAGAACAAAGCTCAAACTCACTCCAATAGCGGCCGCTGGCCGGCTGGACTTGTCCGGTTGGTTCACGGCTGGCTACAGCGGGGGAGACATTTATCACAGCGTGTCTCATGCCCGGCCCCGCTGGTTCTGGTTTTGCCTACTCCTGCTCGCTGCAGGGGTAGGCATCTACCTCCTCCCCAACCGA'}},
                'H77_genome':{'ns5b':{'nterm':7602,'cterm':9374,'seq':'TCAATGTCTTATTCCTGGACAGGCGCACTCGTCACCCCGTGCGCTGCGGAAGAACAAAAACTGCCCATCAACGCACTGAGCAACTCGTTGCTACGCCATCACAATCTGGTGTATTCCACCACTTCACGCAGTGCTTGCCAAAGGCAGAAGAAAGTCACATTTGACAGACTGCAAGTTCTGGACAGCCATTACCAGGACGTGCTCAAGGAGGTCAAAGCAGCGGCGTCAAAAGTGAAGGCTAACTTGCTATCCGTAGAGGAAGCTTGCAGCCTGACGCCCCCACATTCAGCCAAATCCAAGTTTGGCTATGGGGCAAAAGACGTCCGTTGCCATGCCAGAAAGGCCGTAGCCCACATCAACTCCGTGTGGAAAGACCTTCTGGAAGACAGTGTAACACCAATAGACACTACCATCATGGCCAAGAACGAGGTTTTCTGCGTTCAGCCTGAGAAGGGGGGTCGTAAGCCAGCTCGTCTCATCGTGTTCCCCGACCTGGGCGTGCGCGTGTGCGAGAAGATGGCCCTGTACGACGTGGTTAGCAAGCTCCCCCTGGCCGTGATGGGAAGCTCCTACGGATTCCAATACTCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAAGTCCAAGAAGACCCCGATGGGGTTCTCGTATGATACCCGCTGTTTTGACTCCACAGTCACTGAGAGCGACATCCGTACGGAGGAGGCAATTTACCAATGTTGTGACCTGGACCCCCAAGCCCGCGTGGCCATCAAGTCCCTCACTGAGAGGCTTTATGTTGGGGGCCCTCTTACCAATTCAAGGGGGGAAAACTGCGGCTACCGCAGGTGCCGCGCGAGCGGCGTACTGACAACTAGCTGTGGTAACACCCTCACTTGCTACATCAAGGCCCGGGCAGCCTGTCGAGCCGCAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGACTTAGTCGTTATCTGTGAAAGTGCGGGGGTCCAGGAGGACGCGGCGAGCCTGAGAGCCTTCACGGAGGCTATGACCAGGTACTCCGCCCCCCCCGGGGACCCCCCACAACCAGAATACGACTTGGAGCTTATAACATCATGCTCCTCCAACGTGTCAGTCGCCCACGACGGCGCTGGAAAGAGGGTCTACTACCTTACCCGTGACCCTACAACCCCCCTCGCGAGAGCCGCGTGGGAGACAGCAAGACACACTCCAGTCAATTCCTGGCTAGGCAACATAATCATGTTTGCCCCCACACTGTGGGCGAGGATGATACTGATGACCCATTTCTTTAGCGTCCTCATAGCCAGGGATCAGCTTGAACAGGCTCTTAACTGTGAGATCTACGGAGCCTGCTACTCCATAGAACCACTGGATCTACCTCCAATCATTCAAAGACTCCATGGCCTCAGCGCATTTTCACTCCACAGTTACTCTCCAGGTGAAATCAATAGGGTGGCCGCATGCCTCAGAAAACTTGGGGTCCCGCCCTTGCGAGCTTGGAGACACCGGGCCCGGAGCGTCCGCGCTAGGCTTCTGTCCAGAGGAGGCAGGGCTGCCATATGTGGCAAGTACCTCTTCAACTGGGCAGTAAGAACAAAGCTCAAACTCACTCCAATAGCGGCCGCTGGCCGGCTGGACTTGTCCGGTTGGTTCACGGCTGGCTACAGCGGGGGAGACATTTATCACAGCGTGTCTCATGCCCGGCCCCGCTGGTTCTGGTTTTGCCTACTCCTGCTCGCTGCAGGGGTAGGCATCTACCTCCTCCCCAACCGA'}},
                'JFH-1_genome':{'ns5b':{'nterm':7666,'cterm':9443,'seq':'CTCCATGTCATACTCCTGGACCGGGGCTCTAATAACTCCCTGTAGCCCCGAAGAGGAAAAGTTGCCAATCAACCCTTTGAGTAACTCGCTGTTGCGATACCATAACAAGGTGTACTGTACAACATCAAAGAGCGCCTCACAGAGGGCTAAAAAGGTAACTTTTGACAGGACGCAAGTGCTCGACGCCCATTATGACTCAGTCTTAAAGGACATCAAGCTAGCGGCTTCCAAGGTCAGCGCAAGGCTCCTCACCTTGGAGGAGGCGTGCCAGTTGACTCCACCCCATTCTGCAAGATCCAAGTATGGATTCGGGGCCAAGGAGGTCCGCAGCTTGTCCGGGAGGGCCGTTAACCACATCAAGTCCGTGTGGAAGGACCTCCTGGAAGACCCACAAACACCAATTCCCACAACCATCATGGCCAAAAATGAGGTGTTCTGCGTGGACCCCGCCAAGGGGGGTAAGAAACCAGCTCGCCTCATCGTTTACCCTGACCTCGGCGTCCGGGTCTGCGAGAAAATGGCCCTCTATGACATTACACAAAAGCTTCCTCAGGCGGTAATGGGAGCTTCCTATGGCTTCCAGTACTCCCCTGCCCAACGGGTGGAGTATCTCTTGAAAGCATGGGCGGAAAAGAAGGACCCCATGGGTTTTTCGTATGATACCCGATGCTTCGACTCAACCGTCACTGAGAGAGACATCAGGACCGAGGAGTCCATATACCAGGCCTGCTCCCTGCCCGAGGAGGCCCGCACTGCCATACACTCGCTGACTGAGAGACTTTACGTAGGAGGGCCCATGTTCAACAGCAAGGGTCAAACCTGCGGTTACAGACGTTGCCGCGCCAGCGGGGTGCTAACCACTAGCATGGGTAACACCATCACATGCTATGTGAAAGCCCTAGCGGCCTGCAAGGCTGCGGGGATAGTTGCGCCCACAATGCTGGTATGCGGCGATGACCTAGTAGTCATCTCAGAAAGCCAGGGGACTGAGGAGGACGAGCGGAACCTGAGAGCCTTCACGGAGGCCATGACCAGGTACTCTGCCCCTCCTGGTGATCCCCCCAGACCGGAATATGACCTGGAGCTAATAACATCCTGTTCCTCAAATGTGTCTGTGGCGTTGGGCCCGCGGGGCCGCCGCAGATACTACCTGACCAGAGACCCAACCACTCCACTCGCCCGGGCTGCCTGGGAAACAGTTAGACACTCCCCTATCAATTCATGGCTGGGAAACATCATCCAGTATGCTCCAACCATATGGGTTCGCATGGTCCTAATGACACACTTCTTCTCCATTCTCATGGTCCAAGACACCCTGGACCAGAACCTCAACTTTGAGATGTATGGATCAGTATACTCCGTGAATCCTTTGGACCTTCCAGCCATAATTGAGAGGTTACACGGGCTTGACGCCTTTTCTATGCACACATACTCTCACCACGAACTGACGCGGGTGGCTTCAGCCCTCAGAAAACTTGGGGCGCCACCCCTCAGGGTGTGGAAGAGTCGGGCTCGCGCAGTCAGGGCGTCCCTCATCTCCCGTGGAGGGAAAGCGGCCGTTTGCGGCCGATATCTCTTCAATTGGGCGGTGAAGACCAAGCTCAAACTCACTCCATTGCCGGAGGCGCGCCTACTGGACTTATCCAGTTGGTTCACCGTCGGCGCCGGCGGGGGCGACATTTTTCACAGCGTGTCGCGCGCCCGACCCCGCTCATTACTCTTCGGCCTACTCCTACTTTTCGTAGGGGTAGGCCTCTTCCTACTCCCCGCTCGGTAGA'}}}

    cpus = multiprocessing.cpu_count()
    local_path = os.getcwd()
    print "Beginning multiprocess indel QC with ",cpus," cpu's ...."
    
    ps.index(seqfile)
    bam = ps.Samfile(seqfile,'rb')
    outFASTQfile = open(seqfile+".indel_corrected.fastq",'w')
    ref = bam.references[0]
    read_pool = bam.fetch(bam.references[0], gene_pos[ref]['ns5b']['nterm'],gene_pos[ref]['ns5b']['cterm'])

    jobs = []
    for read in read_pool:
        p = multiprocessing.Process(target=assignWork, args=(read,gene_pos[ref]['ns5b']['seq'],local_path))
        jobs.append(p)
        p.start()
    for j in jobs:
        j.join()



    print "QC complete\n\n"
Example #4
    def run(self):
        AbstractAnalysis.run(self) #Call base method to do some logging
        localBamFile = os.path.join(self.getLocalTempDir(), "mapping.bam")
        localSortedBamFile = os.path.join(self.getLocalTempDir(), "mapping.sorted")

        samToBamFile(self.samFile, localBamFile)
        pysam.sort(localBamFile, localSortedBamFile)
        pysam.index(localSortedBamFile + ".bam")
        pysam.faidx(self.referenceFastaFile)
        
        file_header = self.readFastqFile.split(".fastq")[0].split("/")[-1] +  "_" + self.referenceFastaFile.split(".fa")[0].split("/")[-1]
        consensus_vcf = os.path.join(self.outputDir, file_header + "_Consensus.vcf")
        consensus_fastq = os.path.join(self.outputDir, file_header + "_Consensus.fastq")

        system("samtools mpileup -Q 0 -uf %s %s | bcftools view -cg - > %s" \
                % (self.referenceFastaFile, localSortedBamFile + ".bam", consensus_vcf))
        system("vcfutils.pl vcf2fq %s > %s" % (consensus_vcf, consensus_fastq))
        system("rm -rf %s" % (self.referenceFastaFile + ".fai"))
        
        formatted_consensus_fastq = os.path.join(self.getLocalTempDir(), "Consensus.fastq")
        
        formatConsensusFastq(consensus_fastq, formatted_consensus_fastq)
        system("mv %s %s" % (formatted_consensus_fastq, consensus_fastq))
        
        self.finish()
Example #5
    def align_to_bam_file(self, reference_fasta_path, query_fasta_path, output_bam_path, multiple=False, assert_record=None):

        logging.debug('LastzRunner: running on reference %s and query %s' %
                     (reference_fasta_path, query_fasta_path))
        output_sam_path = os.path.abspath(
            os.path.expandvars(output_bam_path.replace('.bam', '.sam')))
        output_bam_unsorted_path = os.path.abspath(
            os.path.expandvars(output_bam_path + '.unsorted'))

        logging.debug(
            'LastzRunner: aligning with output in temporary sam file %s' %
            output_sam_path)
        with open(output_sam_path, 'w') as output_sam_handler:
            for line in self._align(reference_fasta_path, query_fasta_path, multiple):
                output_sam_handler.write(line)

        logging.debug(
            'LastzRunner: transforming sam into unsorted bam file %s' %
            output_bam_unsorted_path)
        input_sam_handler = pysam.Samfile(output_sam_path, "r")
        output_bam_file = pysam.Samfile(
            output_bam_unsorted_path, "wb", template=input_sam_handler)

        logging.debug(
            'LastzRunner: copying from sam file to bam file')
        for s in input_sam_handler:
            output_bam_file.write(s)
        output_bam_file.close()

        logging.debug('LastzRunner: sorting and indexing bam file %s' %
                      output_bam_path)
        pysam.sort(output_bam_unsorted_path,
                   output_bam_path.replace('.bam', ''))

        pysam.index(output_bam_path)
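Several of these examples call pysam.sort(in_bam, out_prefix), the legacy wrapper around samtools 0.1.x that appends ".bam" to the output prefix. Current pysam instead passes samtools-style arguments straight through (as Example #33 below does). A small sketch of the equivalent modern calls, with placeholder file names:

import pysam

# Modern pysam: samtools-style arguments, explicit output name via -o.
pysam.sort("-o", "aligned.sorted.bam", "aligned.unsorted.bam")
pysam.index("aligned.sorted.bam")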
Example #6
def generate_bam_index(auxiliary_file_store_item_uuid, datafile_path):
    """
    Generate a bam_index file and associate it with the auxiliary
    FileStoreItem from our generate_auxiliary_file task
    :param auxiliary_file_store_item_uuid: uuid of FileStoreItem to generate
    auxiliary file for
    :type auxiliary_file_store_item_uuid: string
    :param datafile_path: Full path on disk to the datafile that we want to
    generate a bam index file for
    :type datafile_path: string
    """

    # Try and fetch the bam_index FileExtension
    # NOTE: we are not handling the usual errors for the orm .get() calls below
    # because we want the task from which this function is called within to
    # fail if we can't get what we want http://bit.ly/1KSbazM
    bam_index_file_extension = FileExtension.objects.get(name="bai").name
    auxiliary_file_store_item = FileStoreItem.objects.get(
        uuid=auxiliary_file_store_item_uuid)

    # Leverage pysam library to generate bam index file
    # FIXME: This should be refactored once we don't have a need for
    # Standalone IGV because this is creating a bam_index file in the same
    # directory as its bam file
    pysam.index(bytes(datafile_path))

    # Map source field of FileStoreItem to path of newly created bam index file
    auxiliary_file_store_item.source = "{}.{}".format(
        datafile_path, bam_index_file_extension)

    auxiliary_file_store_item.set_filetype(bam_index_file_extension)
    auxiliary_file_store_item.save()

    # Symlink the newly created bam index datafile
    auxiliary_file_store_item.symlink_datafile()
Example #7
def saveReads(dataHub, nameExtra=None):
    if dataHub.args.save_reads:
        logging.info("* Saving relevant reads *")
        for i, sample in enumerate(dataHub):
            outbam_path = dataHub.args.save_reads
            if not outbam_path.endswith(".bam"):
                outbam_path += ".bam"

            if len(dataHub.samples) > 1:
                logging.debug("Using i = {}".format(i))
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(i))

            if nameExtra is not None:
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(nameExtra))

            logging.info("  Outpath: {}".format(outbam_path))

            # write out just the reads we're interested in, for later use
            bam_small = pysam.Samfile(outbam_path, "wb", template=sample.bam)
            for read in sample.reads:
                bam_small.write(read)

            for read in sample.readStatistics.reads:
                bam_small.write(read)

            bam_small.close()
            sorted_path = outbam_path.replace(".bam", ".sorted")
            pysam.sort(outbam_path, sorted_path)
            pysam.index(sorted_path+".bam")
Example #8
 def populate(self, sam_file_name, minimum_alignment_score):
     if self.contig == "":
         raise RuntimeError("contig must be set before reading a bam file")
     if self.contig[0]==">":
         current_contig_to_analyse = self.contig.lstrip('>') #Necessary because there is no ">" in the bam file...
     else:
         current_contig_to_analyse = self.contig
     sys.stderr.write("Loading file %s\n" %sam_file_name)
     samfile = pysam.Samfile(sam_file_name, 'rb')
     if not samfile._hasIndex(): #if no index, we must build it
         samfile.close()
         sys.stderr.write("Building index for %s\n" % sam_file_name)
         pysam.index(sam_file_name)
         samfile = pysam.Samfile(sam_file_name, 'rb')
     if self.position-3 < 0:
         sys.stderr.write("%s position %s. I have problem computing this position\n" % (self.contig, self.position))
     for pileup_data in samfile.pileup(current_contig_to_analyse, max([0,self.position-3]), self.position+1):
         #print(str(self.position-3)+" "+str(pileup_data.pos)+" "+str(self.position+1))
         if self.position-3 <= pileup_data.pos <= self.position+1:
             #print('in')
             for pileup_read in pileup_data.pileups:
                 if not pileup_read.alignment.qname in self.reads:
                     self.reads[pileup_read.alignment.qname] = {}
                 if ord(pileup_read.alignment.qual[pileup_read.qpos])-33 > minimum_alignment_score:
                         self.reads[pileup_read.alignment.qname][int(pileup_data.pos+1)] = \
                             pileup_read.alignment.seq[pileup_read.qpos] #using biological position, not python.
     samfile.close()
Example #9
def tophat_map(gtf, out_dir, prefix, fastq, thread, bw=False, scale=False,
               gtf_flag=1):
    '''
    1. Map reads with TopHat2
    2. Extract unmapped reads
    3. Create BigWig file if needed
    '''
    # tophat2 mapping
    print('Map reads with TopHat2...')
    tophat_cmd = 'tophat2 -g 1 --microexon-search -m 2 '
    if gtf_flag:
        tophat_cmd += '-G %s ' % gtf
    tophat_cmd += '-p %s -o %s ' % (thread, out_dir + '/tophat')
    tophat_cmd += '%s/bowtie2_index/%s ' % (out_dir, prefix) + ','.join(fastq)
    tophat_cmd += ' 2> %s/tophat.log' % out_dir
    print('TopHat2 mapping command:')
    print(tophat_cmd)
    return_code = os.system(tophat_cmd) >> 8
    if return_code:
        sys.exit('Error: cannot map reads with TopHat2!')
    # extract unmapped reads
    print('Extract unmapped reads...')
    unmapped_bam = pybedtools.BedTool('%s/tophat/unmapped.bam' % out_dir)
    unmapped_bam.bam_to_fastq(fq='%s/tophat/unmapped.fastq' % out_dir)
    # create Bigwig file if needed
    if bw and which('bedGraphToBigWig') is not None:
        print('Create BigWig file...')
        map_bam_fname = '%s/tophat/accepted_hits.bam' % out_dir
        # index bam if not exist
        if not os.path.isfile(map_bam_fname + '.bai'):
            pysam.index(map_bam_fname)
        map_bam = pysam.AlignmentFile(map_bam_fname, 'rb')
        # extract chrom size file
        chrom_size_fname = '%s/tophat/chrom.size' % out_dir
        with open(chrom_size_fname, 'w') as chrom_size_f:
            for seq in map_bam.header['SQ']:
                chrom_size_f.write('%s\t%s\n' % (seq['SN'], seq['LN']))
        if scale:  # scale to HPB
            mapped_reads = map_bam.mapped
            for read in map_bam:
                read_length = read.query_length
                break
            s = 1000000000.0 / mapped_reads / read_length
        else:
            s = 1
        map_bam = pybedtools.BedTool(map_bam_fname)
        bedgraph_fname = '%s/tophat/accepted_hits.bg' % out_dir
        with open(bedgraph_fname, 'w') as bedgraph_f:
            for line in map_bam.genome_coverage(bg=True, g=chrom_size_fname,
                                                scale=s, split=True):
                value = str(int(float(line[3]) + 0.5))
                bedgraph_f.write('\t'.join(line[:3]) + '\t%s\n' % value)
        bigwig_fname = '%s/tophat/accepted_hits.bw' % out_dir
        return_code = os.system('bedGraphToBigWig %s %s %s' %
                                (bedgraph_fname, chrom_size_fname,
                                 bigwig_fname)) >> 8
        if return_code:
            sys.exit('Error: cannot convert bedGraph to BigWig!')
    else:
        print('Could not find bedGraphToBigWig, skipping this step!')
Example #10
def splitByStrand(bamfile, pe):
    
    bam_prefix = bamfile.split(".bam")[0]
    
    if pe:
        flags = [('-f 0x40 -F 0x10', 'plus'), ('-f 0x40 -F 0x20', 'minus')]
    else:
        flags = [('-F 0x10', 'plus'), ('-f 0x10', 'minus')]
    # split the flag string so each samtools option is its own argument, and
    # keep the output BAM name as the last element of each command list
    cmd_args = [['samtools', 'view', '-b'] + flag[0].split() +
                [bamfile, bam_prefix + "_" + flag[1] + ".bam"] for flag in flags]
    
    for cmd_arg in cmd_args:
        print cmd_arg
        if os.path.exists(cmd_arg[-1]):
            continue
        outfile = open(cmd_arg[-1], 'w')
        p = Popen(cmd_arg[:-1], stdout=outfile)
        p.wait()
        outfile.close()
        pysam.index(cmd_arg[-1])
    
    # Return split BAM names
    return([cmd_arg[-1] for cmd_arg in cmd_args])
Example #11
def indexed_bam(bam_filename):
    import pysam
    if not os.path.exists(bam_filename + ".bai"):
        pysam.index(bam_filename)
    sam_reader = pysam.Samfile(bam_filename, "rb")
    yield sam_reader
    sam_reader.close()
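The yield above suggests this helper is meant to be wrapped with contextlib.contextmanager (the decorator is presumably just above the excerpt). A self-contained sketch of the same pattern, with a hypothetical BAM path in the usage comment:

import contextlib
import os
import pysam

@contextlib.contextmanager
def indexed_bam(bam_filename):
    # Build the .bai on demand, hand out an open reader, close it afterwards.
    if not os.path.exists(bam_filename + ".bai"):
        pysam.index(bam_filename)
    sam_reader = pysam.Samfile(bam_filename, "rb")
    try:
        yield sam_reader
    finally:
        sam_reader.close()

# Hypothetical usage:
# with indexed_bam("sample.bam") as bam:
#     for read in bam.fetch("chr1", 0, 10000):
#         pass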
Example #12
def convert_sam_to_bam():
    """
    This method should take a newly create .sam file from alignment and
        - convert it to .bam
        - sort .bam
        - index .bam
    """
    ids = generate_ids()
    for id in ids:
        start_time = time()
        print 'converting: %s'%id
        base_path = os.path.join(SAMPLE_DIR, id)
        sam_path = os.path.join(base_path, id+'-bwape.sam')
        bam_path = os.path.join(base_path, id+'-bwape.bam')

        bam_content = pysam.view('-bS', sam_path)
        bam_file = open(bam_path, 'w+')
        bam_file.writelines(bam_content)
        bam_file.close()

        pysam.sort(bam_path, bam_path+'_sorted')
        pysam.index(bam_path+'_sorted.bam')

        # sorting creates <name>_sorted.bam; move it back to <name> and move
        # its index to <name>.bai so samtools/IGV can find it
        bam_call = "mv {0} {1}".format(bam_path+'_sorted.bam', bam_path)
        index_call = "mv {0} {1}".format(bam_path+'_sorted.bam.bai',
                                         bam_path+'.bai')
        subprocess.call(bam_call, shell=True)
        subprocess.call(index_call, shell=True)
        end_time = time()
        print 'completed: %.3fs'%(end_time-start_time)
Example #13
def sort_bam(in_bam, sort_fn, to_include=None):
    out_file = "%s-ksort%s" % os.path.splitext(in_bam)
    index_file = "%s.bai" % in_bam
    if not os.path.exists(index_file):
        pysam.index(in_bam)

    orig = pysam.Samfile(in_bam, "rb")
    chroms = [(c["SN"], c) for c in orig.header["SQ"]]
    new_chroms = chroms[:]
    if to_include:
        new_chroms = [(c, x) for (c, x) in new_chroms if c in to_include]
    new_chroms.sort(sort_fn)
    remapper = _id_remapper(chroms, new_chroms)
    new_header = orig.header
    new_header["SQ"] = [h for (_, h) in new_chroms]

    new = pysam.Samfile(out_file, "wb", header=new_header)
    for (chrom, _) in new_chroms:
        for read in orig.fetch(chrom):
            write = True
            read.rname = remapper[read.rname]
            try:
                read.mrnm = remapper[read.mrnm]
            # read pair is on a chromosome we are not using
            except KeyError:
                assert to_include is not None
                write = False
            if write:
                new.write(read)
Example #14
def extractRegion(bamfile,start,stop,output,exact):
    pysam.index(bamfile)                # must create a .bai index for any bam file to be read or fetch won't work
    bam = pysam.Samfile(bamfile,'rb')   # and must be done before bamfile is opened
    ref = bam.references[0]             # Get name of reference reads aligned to in bam
    outfile = open(bamfile+".extracted."+output,'w')

    # Get the reads in region of interest
    read_pool = bam.fetch(bam.references[0], start,stop)
    
    # Process reads
    for read in read_pool:
        if exact != '':
            if read.pos <= start and read.aend >= stop:
                if output == 'fastq':
                    outfile.write(writeFastQ(read))
                elif output =='fasta':
                    outfile.write(writeFastA(read))
        else:
            if output == 'fastq':
                outfile.write(writeFastQ(read))
            elif output == 'fasta':
                outfile.write(writeFastA(read))
        

    outfile.close()
    return
Example #15
def extractRegion(bamfile,start,stop,output):
    pysam.index(bamfile)                # must create a .bai index for any bam file to be read or fetch won't work
    bam = pysam.Samfile(bamfile,'rb')   # and must be done before bamfile is opened
    ref = bam.references[0]             # Get name of reference reads aligned to in bam
    outfile = open(bamfile+".extracted."+output,'w')

    # Get the reads in region of interest
    read_pool = bam.fetch(bam.references[0], start,stop)
    
    # Process reads
    for read in read_pool:
        if read.is_reverse == True:                     # all reverse reads in a bam file have been reverse 
            seq = Seq(read.query)                       # complemented already so they need to be reverse 
            rc = seq.reverse_complement().tostring()    # complemented again, along with the quality scores
            rq = reverseString(read.qqual)              # to write correctly to the fastq
            if output == 'fastq':
                outfile.write("@"+read.qname+"\n"+rc+"\n+\n"+rq+"\n")
            elif output == 'fasta':
                outfile.write('>'+read.qname+'\n'+rc+'\n')
        else:
            if output == 'fastq':
                outfile.write("@"+read.qname+"\n"+read.query+"\n+\n"+read.qqual+"\n")
            elif output == 'fasta':
                outfile.write('>'+read.qname+'\n'+read.query+'\n')

    outfile.close()
    return
Example #16
def read_directions_count(bam_file):
    """
    get the read direction counts from a bam file

    @args bam_file: binary file format for storing sequencing read information
    @type bam_file: str
    """

    ## indexing the input bam file
    if not os.path.exists(bam_file + ".bai"):
        pysam.index(bam_file) 

    reverse_cnt = 0 
    forward_cnt = 0

    bam_fh = pysam.Samfile(bam_file, "rb") 

    for read in bam_fh.fetch():
        if read.is_proper_pair and read.is_read1:
            if read.is_reverse:
                reverse_cnt += 1
            else:
                forward_cnt += 1

    bam_fh.close() 
    return {'forward_reads_count': forward_cnt, 'reverse_reads_count': reverse_cnt} 
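A minimal usage sketch for read_directions_count; the BAM file name is a placeholder.

# Hypothetical call on a coordinate-sorted BAM (the function indexes it if needed).
counts = read_directions_count("aligned_reads.bam")
print("forward: %d reverse: %d" % (counts['forward_reads_count'],
                                   counts['reverse_reads_count']))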
Example #17
def check_bam(bam, p, make_new_index=False):
    """
    Sort and index bam file
    returns dictionary of chromosome names and lengths
    """
    # check if sorted
    test_head = pysam.AlignmentFile(bam, 'rb')
    chrom_sizes = {}
    p = str(p)
    for i in test_head.header['SQ']:
        chrom_sizes[i['SN']] = int(i['LN'])
    try:
        test_head.header['HD']['SO']
    except KeyError:
        print '  sorting bam file'
        pysam.sort('-@', p, bam, 'sorted.temp')
        os.remove(bam)
        os.rename('sorted.temp.bam', bam)
    else:
        if test_head.header['HD']['SO'] == 'coordinate':
            pass
        else:
            print '  sorting bam file'
            pysam.sort('-@', p, bam, 'sorted.temp')
            os.remove(bam)
            os.rename('sorted.temp.bam', bam)
    test_head.close()
    # check if indexed
    if '{}.bai'.format(bam) in os.listdir('.') and make_new_index is False:
        pass
    else:
        print '  indexing bam file'
        pysam.index(bam)
    return chrom_sizes
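A short usage sketch for check_bam; the BAM path and thread count below are placeholders.

# Hypothetical call: sorts/indexes sample.bam in place if needed and
# returns a dict of {chromosome name: length}.
chrom_sizes = check_bam('sample.bam', 4)
for name, length in chrom_sizes.items():
    print name, length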
Example #18
def main(**args):
    dbname = args["genomedatabase"]
    samplefilename = args["samplefile"]
    sampledata = samplefile(args["samplefile"])
    expname = args["expname"]
    trackdir = expname+"/trackhub"
    scriptdir = os.path.dirname(os.path.realpath(sys.argv[0]))+"/"

    if not os.path.exists(trackdir):
        os.makedirs(trackdir)
    allsamples = sampledata.getsamples()
    for currsample in allsamples:
        currbam = sampledata.getbam(currsample)
        genomebam = currsample+"-genome.bam"
        convertbam(dbname, currbam, genomebam, scriptdir, force = True)
    
    faidxjob = subprocess.Popen("samtools faidx "+dbname+"-tRNAgenome.fa",shell = True)
    faidxjob.wait()
    for currrep in sampledata.allreplicates():
        repsamples = sampledata.getrepsamples(currrep)
        samtoolsmerge(list(curr+"-genome.bam" for curr in repsamples), currrep+"-mergegenome.bam", True)
        pysam.index(currrep+"-mergegenome.bam")
        makebigwigs(currrep+"-mergegenome.bam", currrep, dbname+"-tRNAgenome.fa.fai",trackdir)
    
    createtrackdb(sampledata.allreplicates(),expname)
Example #19
def sort_by_position(bam_file, dir):

    ## get the file prefix
    prefix = ""
    prefix_match = re.match(r"(.*).bam", bam_file)

    try:
        prefix = prefix_match.group(1)
    except:
        print "Existing: Invalid bam file -i %s" %(bam_file)
        sys.exit(2)
        

    # sort the bam file
    bam_input = dir + bam_file
    sort_bam = dir +  prefix + "_sorted"
    pysam.sort(bam_input, sort_bam)
    sort_bam = sort_bam + ".bam"
    
    # index the sort bam file
    pysam.index(sort_bam)

    print ""
    print "Writing Sorted Bam File : %s" %(sort_bam)
    print "Writing Index Sorted Bam File : %s.bai" %(sort_bam)
    
    return sort_bam
Example #20
def sort_output(outPrefix):
    '''Sorts the output file by read coordinate'''
    pysam.sort(outPrefix+'.originalSort.bam', outPrefix + '.coordSort')    
    #os.remove(outPrefix+'.originalSort.tmp.bam')
    
    ## Build the bam index for output    
    pysam.index(outPrefix + '.coordSort.bam')
Example #21
def map_paired_reads(pe1_path, pe2_path, genome_path, output_path):
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")

    null = open("/dev/null")
    subprocess.check_call([ "bwa", "index", "-p", genome_db, genome_path ], stderr=null)
    with open(pe1_output, "w") as pe1_file:
        subprocess.check_call([ "bwa", "aln", genome_db, pe1_path ], stdout=pe1_file, stderr=null)

    with open(pe2_output, "w") as pe2_file:
        subprocess.check_call([ "bwa", "aln", genome_db, pe2_path ], stdout=pe2_file, stderr=null)

    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call([ "bwa", "sampe",
                                "-r", "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1",
                                genome_db,
                                pe1_output, pe2_output,
                                pe1_path, pe2_path ], stdout=bwa_file, stderr=null)

    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
    pysam.index(output_path + '.bam')
Example #22
def bwa_mem(pe1_path, pe2_path, genome_path, threads, output_path):
    print 'Aligning with bwa mem'
    start = time()
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")
    stderr_file = open(output_path+'.bwa.1','w')

    #null = open("/dev/null")
    subprocess.check_call([ "bwa", "index", "-p", genome_db, genome_path ], stderr=stderr_file)
    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call([ "bwa", "mem", "-t", threads,
                                genome_db, pe1_path, pe2_path ],
                              stdout=bwa_file,
                              stderr=stderr_file)

    elapsed = time() - start
    print 'Time elapsed for bwa mem: ', elapsed
    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
    pysam.index(output_path + '.bam')
   
    shutil.rmtree(work_dir)
Example #23
def bamFile(tmpdir_factory):
    header = { 'HD': {'VN': '1.0'}, 'SQ': [{'LN': 1000, 'SN': 'ref'}] }
    p=tmpdir_factory.mktemp('test').join('test.bam')
    outFile=pysam.AlignmentFile(str(p),"wb",header=header)
    a = pysam.AlignedSegment()
    a.query_name = "read3"
    a.query_sequence="GGGGAAAAAT"
    a.reference_start = 28
    a.reference_id = 0
    a.mapping_quality = 20
    a.cigar = ((0,10), )
    #a.query_qualities = pysam.qualitystring_to_array("((((((((((")
    a.flag=16
    outFile.write(a)
    a.query_name = "read2"
    a.reference_start = 32
    a.query_sequence="AAAAATTTTT"
    a.flag=0
    outFile.write(a)
    a.query_name = "read1"
    a.query_sequence="TTAAAAACCCCCGGC"
    #a.query_qualities = pysam.qualitystring_to_array("(((((((((((((")
    a.cigar = ((5,5),(4,2),(0,10), (2,2),(0,1),(1,1),(0,1))
    outFile.write(a)
    outFile.close()
    pysam.index(str(p))
    return(p)
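A hedged sketch of how the bamFile fixture above might be consumed in a pytest test; the test name and assertion are illustrative, not part of the original suite.

import pysam

def test_bamFile_fixture(bamFile):
    # The fixture returns a py.path.local pointing at a small indexed BAM
    # containing three reads aligned to the 'ref' contig.
    with pysam.AlignmentFile(str(bamFile), "rb") as bam:
        reads = list(bam.fetch("ref"))
    assert len(reads) == 3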
Example #24
def run_cufflinks(org_db, num_cpus=4):
    """
    run cufflinks program on mapped reads 
    """

    try:
        subprocess.call(["cufflinks"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `Cufflinks` binary is in your $PATH")
    
    org_name = org_db['short_name'] 
    print("preparing for cufflinks run for organism %s" % org_name)

    min_intron_length = 20
    min_isoform_frac = 0.25
    max_intron_length = org_db['max_intron_len']
    result_dir = org_db['read_assembly_dir']

    bam_file = "%s/%s_Aligned_mmr_sortbyCoord.bam" % (org_db['read_map_dir'], org_name)
    if not os.path.isfile(bam_file):
        sys.stdout.write("failed to fetch sorted mmr BAM file for organism: %s, trying to get the mmr file...\n" % org_name)
        bam_file = "%s/%s_Aligned_mmr.bam" % (org_db['read_map_dir'], org_name)
        if not os.path.isfile(bam_file):
            exit("error: failed to fetch mmr BAM file for organism %s" % org_name)
        
        ## sorting, indexing the bam file 
        file_prefix, ext = os.path.splitext(bam_file)
        sorted_bam = "%s_sortbyCoord" % file_prefix

        sys.stdout.write("trying to sort based by the coordinates with output prefix as: %s\n" % sorted_bam)
        if not os.path.isfile("%s.bam" % sorted_bam):
            pysam.sort(bam_file, sorted_bam)
            
        bam_file = "%s.bam" % sorted_bam

    print('using bam file from %s' % bam_file)
    if not os.path.exists(bam_file + ".bai"):
        pysam.index(bam_file) 

    ## always use quiet mode to avoid problems with storing log output.
    cli_cuff = "cufflinks -q --no-update-check \
        -F %.2f \
        -I %d \
        --min-intron-length %d \
        --library-type fr-unstranded \
        -p %d \
        -o %s \
        %s" % (min_isoform_frac, max_intron_length, min_intron_length, num_cpus, result_dir, bam_file)
  
    sys.stdout.write('\trun cufflinks as: %s \n' % cli_cuff)
    try:
        os.chdir(result_dir)
        process = subprocess.Popen(cli_cuff, shell=True) 
        returncode = process.wait()

        if returncode !=0:
            raise Exception, "Exit status return code = %i" % returncode

    except Exception, e:
        print 'Error running cufflinks.\n%s' %  str( e )
Example #25
def buildSimpleNormalizedBAM(infiles, outfile, nreads):
    '''normalize a bam file to given number of counts
       by random sampling
    '''
    infile, countfile = infiles

    pysam_in = pysam.Samfile(infile, "rb")

    fh = IOTools.openFile(countfile, "r")
    readcount = int(fh.read())
    fh.close()

    threshold = float(nreads) / float(readcount)

    pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)

    # iterate over mapped reads thinning by the threshold
    ninput, noutput = 0, 0
    for read in pysam_in.fetch():
        ninput += 1
        if random.random() <= threshold:
            pysam_out.write(read)
            noutput += 1

    pysam_in.close()
    pysam_out.close()
    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, nreads))
Example #26
    def addReadGroupSet(self, datasetName, filePath, moveMode):
        """
        Add a read group set to the repo
        """
        # move the bam file
        self._check()
        self._checkDataset(datasetName)
        self._checkFile(filePath, self.bamExtension)
        fileName = os.path.basename(filePath)
        readGroupSetName = filenameWithoutExtension(
            fileName, self.bamExtension)
        destPath = os.path.join(
            self._repoPath, self.datasetsDirName, datasetName,
            self.readsDirName, fileName)
        self._assertPathEmpty(destPath, inRepo=True)
        self._moveFile(filePath, destPath, moveMode)

        # move the index file if it exists, otherwise do indexing
        indexPath = os.path.join(
            os.path.split(filePath)[0],
            readGroupSetName + self.bamIndexExtension)
        indexMessage = ""
        if os.path.exists(indexPath):
            dstDir = os.path.split(destPath)[0]
            self._moveFile(
                indexPath,
                os.path.join(dstDir, os.path.basename(indexPath)),
                moveMode)
        else:
            pysam.index(destPath.encode('utf-8'))
            indexMessage = " (and indexed)"

        # finish
        self._repoEmit("ReadGroupSet '{}' added to dataset '{}'{}".format(
            fileName, datasetName, indexMessage))
Example #27
def bwa_sampe(pe1_path, pe2_path, genome_path, output_path):
    print 'Aligning with bwa aln/sampe'
    start = time()
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")

    null = open("/dev/null")
    subprocess.check_call([ "bwa", "index", "-p", genome_db, genome_path ], stderr=null)
    with open(pe1_output, "w") as pe1_file:
        subprocess.check_call([ "bwa", "aln", genome_db, pe1_path ], stdout=pe1_file, stderr=null)

    with open(pe2_output, "w") as pe2_file:
        subprocess.check_call([ "bwa", "aln", genome_db, pe2_path ], stdout=pe2_file, stderr=null)

    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call([ "bwa", "sampe",
                                genome_db,
                                pe1_output, pe2_output,
                                pe1_path, pe2_path ], stdout=bwa_file, stderr=null)

    elapsed = time() - start
    print 'Time elapsed for bwa aln/sampe: ', elapsed

    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
    pysam.index(output_path + '.bam')
Example #28
def map_paired_reads(pe1_path, pe2_path, genome_path, output_path, args):
    work_dir = tempfile.mkdtemp( )
    genome_db = os.path.join( work_dir, "genome" )
    pe1_output = os.path.join( work_dir, "pe1.sai" )
    pe2_output = os.path.join( work_dir, "pe2.sai" )
    bwa_output = os.path.join( work_dir, "output.sam" )
    
    null = open( "/dev/null" ) #open("/tmp/bwa_out")#
    subprocess.check_call( [ "bwa", "index", "-p", genome_db, genome_path ], stderr = null )
    with open( pe1_output, "w" ) as pe1_file:
        subprocess.check_call( [ "bwa", "aln", genome_db, pe1_path ], stdout = pe1_file, stderr = null )
    
    with open( pe2_output, "w" ) as pe2_file:
        subprocess.check_call( [ "bwa", "aln", genome_db, pe2_path ], stdout = pe2_file, stderr = null )
    
    with open( bwa_output, "w" ) as bwa_file:
        subprocess.check_call( [ "bwa", "sampe",
                                "-r", "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1",
                                genome_db,
                                pe1_output, pe2_output,
                                pe1_path, pe2_path ], stdout = bwa_file, stderr = null )
 


    if args.sam:
        shutil.move(bwa_output ,output_path+'.sam')
        #os.rename(bwa_output ,output_path+'.sam')
    else:
        sam_to_bam( bwa_output, bwa_output + ".bam" )
        if args.sort:
            # coordinate sort the file
            pysam.sort( bwa_output + ".bam", output_path )
            pysam.index(output_path+'.bam')
        else:
            shutil.move(bwa_output +".bam",output_path+'.bam')
Example #29
 def convert(self):
     # set flags
     if self.inputFileFormat == AlignmentFileConstants.SAM:
         inputFlags = "r"
     elif self.inputFileFormat == AlignmentFileConstants.BAM:
         inputFlags = "rb"
     if self.outputFileFormat == AlignmentFileConstants.SAM:
         outputFlags = "wh"
     elif self.outputFileFormat == AlignmentFileConstants.BAM:
         outputFlags = "wb"
     # open files
     inputFile = pysam.AlignmentFile(
         self.args.inputFile, inputFlags)
     outputFile = pysam.AlignmentFile(
         self.args.outputFile, outputFlags, header=inputFile.header)
     outputFilePath = outputFile.filename
     log("Creating alignment file '{}'".format(outputFilePath))
     # write new file
     for _ in xrange(self.args.numLines):
         alignedSegment = inputFile.next()
         outputFile.write(alignedSegment)
     # clean up
     inputFile.close()
     outputFile.close()
     # create index file
     if (not self.args.skipIndexing and
             self.outputFileFormat == AlignmentFileConstants.BAM):
         indexFilePath = "{}.{}".format(
             outputFilePath, AlignmentFileConstants.BAI.lower())
         log("Creating index file '{}'".format(indexFilePath))
         pysam.index(outputFilePath)
Example #30
 def _generate_empty_bam_file(self, sam_path, bam_path_prefix):
     samfile = pysam.Samfile(sam_path, "r")
     bamfile = pysam.Samfile(
         "%s.bam" % bam_path_prefix, "wb", header=samfile.header)
     bamfile.close()
     samfile.close()
     pysam.index("%s.bam" % bam_path_prefix)
Example #31
 def build_index(self):
     msg = "Building index %s" % self.baifn
     print(msg)
     pysam.index(self.bamfn)
Example #32
def main():
    parser = argparse.ArgumentParser(
        description='Count reads mapped to genes and exons')

    parser.add_argument('annotation', type=str, help='annotation file')
    parser.add_argument('-b',
                        '--inp',
                        type=str,
                        nargs='+',
                        help='input bam files')
    parser.add_argument('-o', '--out', type=str, help='output file')
    parser.add_argument('-n',
                        '--nproc',
                        type=int,
                        default=1,
                        help='number of processes')
    parser.add_argument('-l',
                        '--names',
                        type=str,
                        nargs='+',
                        help='input track names',
                        required=False,
                        default=[])
    parser.add_argument("-s", '--strand', help='strand', choices=['+', '-'])

    # group1 = parser.add_mutually_exclusive_group()
    # group1.add_argument('-e', action="store_true",
    #                help='count exon/intron hits')
    # group1.add_argument('-t', action="store_true",
    #                help='count transcript hits')
    # group1.add_argument('-g', action="store_true",
    #                help='count gene hits')
    #
    # group2 = parser.add_mutually_exclusive_group()
    # group2.add_argument('-i', action="store_true",
    #                help='only introns')
    # group2.add_argument('-e', action="store_true",
    #                help='only exons')
    # group2.add_argument('-t', action="store_true",
    #                help='all (begin-end)')

    parser.add_argument(
        '-f',
        '--feature',
        choices=('genes', 'transcripts', 'exons', 'introns', 'exonsnoagg'),
        required=True,
        help='Output one value per gene/transcript/exon/intron')
    parser.add_argument(
        '-i',
        '--include',
        choices=('all', 'exons', 'introns'),
        default='exons',
        help='For genes count only reads mapped to exons/introns/both.  \
                              Has no effect if feature==exon or intron ')

    args = parser.parse_args()

    if args.nproc < 1: args.nproc = 1

    if len(args.inp) > len(args.names):
        args.names += args.inp[len(args.names):]
    print "Counting samples named:", args.names

    # try to open before running
    filea = open(args.annotation, "r")
    filea.close()
    fileo = open(args.out, "w+")
    filesi = []
    for bam in args.inp:
        f = open(bam)
        f.close()
        name = args.names.pop(0)
        fileo.write("\t%s" % name)
        if not os.path.exists(bam + ".bai"):
            print "Indexing bam file %s" % bam
            pysam.index(bam)
        filesi += [(bam, name)]
    fileo.write("\n")

    counttype = (args.feature, args.include)
    count(args.annotation, filesi, fileo, args.strand, counttype, args.nproc)

    fileo.close()
Example #33
def main():
    parser = argparse.ArgumentParser(
        description=
        'Bams need to be filtered by UMI and MAPQ before we do anything...')
    parser.add_argument('infile',
                        metavar='INFILE',
                        help='bam of a single cell')
    parser.add_argument('outfile',
                        metavar='OUTFILE',
                        help='bam filtered by UMI and MAPQ')
    parser.add_argument(
        'outfilesorted',
        metavar='OUTFILE_SORTED',
        help='bam filtered by UMI and MAPQ, sorted and indexed')
    parser.add_argument('--quiet',
                        '-q',
                        action='store_true',
                        help='Suppress some print statements')
    parser.add_argument('--logfile',
                        '-l',
                        metavar='LOGFILE',
                        default=None,
                        help='Write arguments to logfile')
    parser.add_argument(
        '--mapq_thres',
        '-m',
        type=int,
        default=30,
        help='Minimum mapping quality. Default 30. bwa output is 0-60?')
    parser.add_argument(
        '--umi_window',
        '-w',
        type=int,
        default=1000,
        help=
        'When checking UMI, throw out read if things have been read within umi_window basepairs away. Default 1000 to handle paired end?'
    )
    parser.add_argument(
        '--umi_indx',
        '-i',
        type=int,
        default=16,
        help=
        'umi indx to get from header. Assumes header format of semi-colon separated followed by colon separated of 2 terms'
    )
    parser.add_argument('--bc_indx',
                        '-I',
                        type=int,
                        default=13,
                        help='bc indx to get from header')
    parser.add_argument('--dumpfile',
                        '-d',
                        metavar='DUMPBAM',
                        default=None,
                        help='bam output of UMI duplicates')
    parser.add_argument('--no_prefix',
                        '-p',
                        action='store_true',
                        help='If set, then remove chr from chromosome names')
    parser.add_argument('--ignore_truncation',
                        action='store_true',
                        help='If set, then ignore truncation')
    parser.add_argument('--add_prefix',
                        action='store_true',
                        help='Add chr to chromosome names')
    parser.add_argument('--debug', action='store_true', help='Debug')
    parser.add_argument('--genome',
                        '-g',
                        metavar='mm or hs',
                        default='mm',
                        help='Genome is mm or hs to define chromosomes')
    args = parser.parse_args()

    # store command line arguments for reproducibility
    CMD_INPUTS = ' '.join(['python'] + sys.argv)  # easy printing later
    # store argparse inputs for reproducibility / debugging purposes
    args_dic = vars(args)
    # ARG_INPUTS = ['%s=%s' % (key, val) for key, val in args_dic.iteritems()]  # for python2
    ARG_INPUTS = ['%s=%s' % (key, val)
                  for key, val in args_dic.items()]  # for python3
    ARG_INPUTS = ' '.join(ARG_INPUTS)

    # chromos = [''.join(['chr', str(i + 1)]) for i in range(20)] + ['chrX', 'chrY', 'chrM']
    if not args.no_prefix:
        chromos = [''.join(['chr', str(i + 1)])
                   for i in range(22)] + ['chrX', 'chrY', 'chrM']
    else:
        chromos = [''.join(['', str(i + 1)])
                   for i in range(22)] + ['X', 'Y', 'M']

    chromos_set = set(chromos)
    bad_chromos = set()
    print(chromos)
    umi_dic = {}  # UMIs are
    umi_dic_pos = {}  # UMIs positions
    umi_dic_bin = {}  # UMIs by bins, by chromosome
    badreads = 0
    badreadsumi = 0
    goodcounts = 0
    badchromo = 0
    with pysam.AlignmentFile(args.infile,
                             "rb",
                             ignore_truncation=args.ignore_truncation) as bamf:
        if args.add_prefix:
            bamf.references = [''.join(['chr', x]) for x in bamf.references]
        with pysam.AlignmentFile(args.outfile, "wb", template=bamf) as outbam:
            if args.dumpfile is not None:
                dumpbam = pysam.AlignmentFile(args.dumpfile,
                                              "wb",
                                              template=bamf)
            for totalcounts, read in enumerate(bamf):
                if read.mapping_quality < args.mapq_thres:
                    # throw out bad reads
                    badreads += 1
                    continue
                # get UMI-Barcode
                # umibc = get_umibc(read)
                umibc = get_umibc_longheader(read)
                if (args.debug):
                    print(umibc)
                    input("press any key")
                chromo = read.reference_name
                if chromo not in chromos_set:
                    # throw out bad chromo
                    bad_chromos.add(chromo)
                    badchromo += 1
                    continue
                pos = read.reference_start  # left most pos, 0-based
                coord = ':'.join([chromo, str(pos)])
                # get bin within 1kb
                bin = coord_to_bin(pos)
                end = get_end(
                    read
                )  # Positive or Negative depends on fragment from paired end
                if umibc not in umi_dic:
                    # initialize R1 and R2
                    umi_dic[umibc] = {
                        'R1': 0,
                        'R2': 0
                    }  # keep track of reads, they are also indexes!
                    # umi_dic_pos[umibc] = {'R1': [], 'R2' : []}  # keep track of positions
                    umi_dic_bin[umibc] = {
                        'R1': {},
                        'R2': {}
                    }  # track bins for UMI counting
                    for c in chromos:
                        for end in ['R1', 'R2']:
                            umi_dic_bin[umibc][end][c] = set()
                else:
                    # update umi_dics
                    # if umibc counted in same bin in same end (R1 or R2) then it's bad
                    if bin in umi_dic_bin[umibc][get_end(read)][chromo]:
                        # umi_dic_pos[umibc][get_end(read)].append(coord)  # record the bad read
                        umi_dic[umibc][get_end(
                            read)] += 1  # record duplicate reads
                        # already in bin, then don't add it
                        badreadsumi += 1
                        if args.dumpfile is not None:
                            dumpbam.write(read)
                        continue
                # reads here are unique (within a window) and high quality, write them to outbam and move on
                # umi_dic_pos[umibc][get_end(read)].append(coord)  # only append if it's a bad read
                umi_dic_bin[umibc][get_end(read)][chromo].add(bin)
                if (args.debug):
                    print("Good read, adding to dictionary")
                goodcounts += 1
                outbam.write(read)
            if args.dumpfile is not None:
                dumpbam.close()
    # sort and index bam
    pysam.sort('-o', args.outfilesorted, args.outfile)
    pysam.index(args.outfilesorted)
    # remove temporarily file
    os.remove(args.outfile)

    # Print arguments supplied by user. Ideally as log file because of the \n
    if not args.quiet:
        if args.logfile is not None:
            sys.stdout = open(args.logfile, "w+")
        print(datetime.datetime.now().strftime('Code output on %c'))
        print('\n')
        print('Command line inputs:')
        print('\n')
        print(CMD_INPUTS)
        print('\n')
        print('Argparse variables:')
        print('\n')
        print(ARG_INPUTS)
        print('\n')
        # list total counts
        print("Reads from these chromosomes thrown out: %s" % bad_chromos)
        print("Total counts: %s" % totalcounts)
        print('\n')
        print("High quality, unique counts: %s" % goodcounts)
        print('\n')
        print("Bad quality counts: %s" % badreads)
        print('\n')
        # write duplicate UMI counts as a table
        for umibc in umi_dic:
            if umi_dic[umibc]['R1'] > 1 and umi_dic[umibc]['R2'] > 1:
                # print('%s\t%s\t%s\t%s\%s\n' %\
                #         (umibc, umi_dic[umibc]['R1'],
                #             umi_dic[umibc]['R2'],
                #             ','.join(umi_dic_pos[umibc]['R1']),
                #             ','.join(umi_dic_pos[umibc]['R2'])))
                print('%s\t%s\t%s\n' %\
                        (umibc, umi_dic[umibc]['R1'],
                            umi_dic[umibc]['R2']))
Example #34
def vcf_from_fasta(args):
    """Entry point for calling variants by consensus sequence alignment."""
    logger = medaka.common.get_named_logger('CONS2VCF')

    with pysam.FastaFile(args.ref_fasta) as fasta:
        ref_seqs = {name: fasta.fetch(name) for name in fasta.references}
        contig_lengths = dict(zip(fasta.references, fasta.lengths))
        total_bp = sum(fasta.lengths)
        ref_contigs = fasta.references
        h = pysam.AlignmentHeader().from_references(fasta.references,
                                                    fasta.lengths)

    if args.bam is not None:
        alns = pysam.AlignmentFile(args.bam)
        out_bam = None
    else:
        out_bam = pysam.AlignmentFile(args.out_prefix + '.bam', 'wb', header=h)
        if args.regions is not None:
            contigs = [r.ref_name for r in args.regions]
        else:
            contigs = None
        alns = edlib_chunked_align_fastas(args.consensus,
                                          args.ref_fasta,
                                          contigs,
                                          chunk_size=args.chunk_size,
                                          pad=args.pad,
                                          mode=args.mode,
                                          header=h)
    vcf_fp = args.out_prefix + '.vcf'
    trees = collections.defaultdict(intervaltree.IntervalTree)
    t_log = now()
    log_interval = 5
    msg = 'Processed {:.2%} of reference.'
    bp_done = collections.Counter()

    header_contigs = [
        '{},length={}'.format(c, contig_lengths[c]) for c in ref_contigs
    ]
    meta_info = [
        medaka.vcf.MetaInfo('FORMAT', 'GT', 1, 'String', 'Medaka genotype.')
    ]
    with medaka.vcf.VCFWriter(vcf_fp,
                              contigs=header_contigs,
                              meta_info=meta_info) as writer:
        for aln in alns:
            # reference_start is 0 based, reference_end points to one past
            # the last aligned residue, i.e. same as bed file
            ref = aln.reference_name
            rstart, rend = aln.reference_start, aln.reference_end
            if trees[ref].overlaps(rstart, rend) and args.bam is not None:
                # We expect edlib alignments to overlap by 1 match so only
                # apply this check for a user-provided bam.
                logger.warning(
                    ('WARNING: alignment {}:{}-{} overlaps another ' +
                     'alignment, which could cause overlapping variants.' +
                     '\nCheck output bam and vcf for details.').format(
                         ref, rstart, rend))
            trees[ref].add(intervaltree.Interval(rstart, rend))
            for v in yield_variants_from_aln(aln, ref_seqs[ref]):
                if 'N' in v.ref:
                    continue
                writer.write_variant(v)
                if now() - t_log > log_interval:
                    done = bp_done[ref] + v.pos - rstart
                    logger.info(msg.format(done / total_bp))
                    t_log = now()
            bp_done[ref] += rend - rstart
            if out_bam is not None:
                out_bam.write(aln)

    if out_bam is not None:
        out_bam.close()
        pysam.index(out_bam.filename)

    bed_fp = args.out_prefix + '_coverage.bed'
    gap_bed_fp = args.out_prefix + '_coverage_gaps.bed'
    for tree in trees.values():
        # strict=False to merge abutting alignments.
        tree.merge_overlaps(strict=False)
    medaka.common.write_intervaltrees_to_bed(trees, bed_fp)
    gap_trees = medaka.common.complement_intervaltrees(trees, contig_lengths)
    medaka.common.write_intervaltrees_to_bed(gap_trees, gap_bed_fp)
    # loop over contigs for which we have alignments checking for gaps
    for contig in trees:
        if len(gap_trees[contig]):
            logger.info(('WARNING: There are alignment gaps for ref contig' +
                         ' {}, see bed files for details.').format(contig))
    if len(ref_contigs) != len(trees):
        logger.info('WARNING: Some contigs have no alignments, see bed files' +
                    ' for details.')
    # bp_done calculated above does not take account of overlapping alignments
    # hence recalculate here based on merged alignment intervals.
    aligned_bp = sum((i.length() for tree in trees.values() for i in tree))
    msg = 'Alignments spanned {:%} of the reference.'
    logger.info(msg.format(aligned_bp / total_bp))
    msg = 'Check bed files {} and {} for alignment coverage and gaps.'
    logger.info(msg.format(bed_fp, gap_bed_fp))
    logger.info('All done. VCF written to {}.'.format(vcf_fp))
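The coverage bookkeeping in this entry point leans on intervaltree; a minimal, self-contained sketch of the merge-then-sum step used to report the aligned fraction (coordinates are made up for illustration):

import intervaltree

tree = intervaltree.IntervalTree()
tree.add(intervaltree.Interval(0, 1000))
tree.add(intervaltree.Interval(900, 2500))   # overlaps the first interval
tree.add(intervaltree.Interval(2500, 3000))  # abuts the second interval
tree.merge_overlaps(strict=False)            # strict=False also merges abutting intervals
aligned_bp = sum(i.length() for i in tree)
print(aligned_bp)                            # 3000 for these made-up intervals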
Beispiel #35
0
def index_bam(bam_fpath):
    'It indexes a bam file'
    pysam.index(bam_fpath)
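A hedged variant of the same idea with a guard so the index is only rebuilt when missing; pysam.index also accepts an explicit output path, mirroring samtools index in.bam out.bai:

import os
import pysam

def index_bam_if_missing(bam_fpath):
    'Index a coordinate-sorted BAM only if no .bai is present (sketch).'
    bai_fpath = bam_fpath + '.bai'
    if not os.path.isfile(bai_fpath):
        pysam.index(bam_fpath, bai_fpath)
    return bai_fpath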
Beispiel #36
0
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [
        binom.isf(1e-7, F_gc[x], 1.0 /
                  N_gc[x]) if F_gc[x] > 0 and N_gc[x] > 0 else 1
        for x in range(len(F_gc))
    ]

    global_vars['max_dup_gc'] = max_dup_gc

    bit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print "applying correction"
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print "genome partition size for multiprocessing: {}".format(chunkSize)
    print "using region {}".format(args.region)
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(bit.index.keys(), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.iteritems()])
    print chrNameBitToBam, chrNameBamToBit
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in xrange(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print "no sequence information for "
                "chromosome {} in 2bit file".format(chrom)
                print "Reads in this chromosome will be skipped"
                continue
            length = min(size, i + chunkSize)
            mp_args.append(
                (chrom, chrNameBamToBit[chrom], i, length, bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} "
                  "number of tasks".format(args.numberOfProcessors,
                                           len(mp_args)))

            res = pool.map_async(writeCorrectedSam_wrapper,
                                 mp_args).get(9999999)
        else:
            res = map(writeCorrectedSam_wrapper, mp_args)

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print "concatenating (sorted) intermediate BAMs"
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print "indexing BAM"
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):

        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        if len(mp_args) > 1 and args.numberOfProcessors > 1:

            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = map(writeCorrected_wrapper, mp_args)

        # concatenate intermediary bedgraph files
        _temp_bg_file = open(_temp_bg_file_name, 'w')
        for tempFileName in res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                shutil.copyfileobj(open(tempFileName, 'rb'), _temp_bg_file)
                os.remove(tempFileName)
        _temp_bg_file.close()
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)

        else:
            chromSizes = [(x, bit[x].size) for x in bit.keys()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            os.remove(_temp_bg_file_name)
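The max_dup_gc cutoff computed above can be illustrated in isolation: for a GC bin with F_gc[x] fragments spread over N_gc[x] eligible positions, the inverse survival function of the binomial gives the read depth per position that would be exceeded by chance with probability below 1e-7. A small sketch with made-up counts:

from scipy.stats import binom

f_gc = 50000      # illustrative fragment count for one GC bin
n_gc = 2000000    # illustrative number of genomic positions with that GC content
# maximum reads tolerated at a single position before extra reads are treated as duplicates
max_dup = binom.isf(1e-7, f_gc, 1.0 / n_gc)
print(max_dup)    # a small threshold (a handful of reads for these made-up numbers)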
Beispiel #37
0
import os
import sys
import re
import itertools
import numpy as np
import matplotlib.pyplot as plt
from Bio import SeqIO
import pysam
import runCmd

if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO

if not os.path.exists(args.bam + ".bai"):
    pysam.index(args.bam, args.bam + ".bai")
bamfile = pysam.AlignmentFile(args.bam, "rb")
topLen = pysam.AlignmentFile(args.outbam, "wb", template=bamfile)


lengths = []
for read in bamfile.fetch():
    lengths.append(read.infer_query_length())

lengths = np.sort(np.array(lengths))
# print a histogram of lengths
length = len(lengths)
for i in range(1, 11):
    posStart = (i - 1) * length / 10
    posEnd = i * length / 10 - 1
    print(
Beispiel #38
0
def main(mode, input, output, name, ncores, bowtie2_index, cluster, jobs,
         peaks_file, by_rgid, peak_width, keep_duplicates, max_javamem,
         trash_mito, reference_genome, very_sensitive, clipl, clipr, py_trim,
         keep_temp_files, skip_fastqc, overwrite, bedtools_genome,
         blacklist_file, tss_file, macs2_genome_size, bs_genome, bedtools_path,
         bowtie2_path, java_path, macs2_path, samtools_path, r_path):
    """
	proatac: a toolkit for PROcessing ATAC-seq data. \n
	Caleb Lareau, Buenrostro Lab. \n
	modes = ['bulk', 'check', 'counts', 'indexSplit', 'single', 'summitsToPeaks', 'support']\n
	See http://proatac.readthedocs.io for more details.
	"""

    __version__ = get_distribution('proatac').version
    script_dir = os.path.dirname(os.path.realpath(__file__))

    click.echo(gettime() + "Starting proatac pipeline v%s" % __version__)

    # Determine which genomes are available
    rawsg = os.popen('ls ' + script_dir +
                     "/anno/bedtools/*.sizes").read().strip().split("\n")
    supported_genomes = [
        x.replace(script_dir + "/anno/bedtools/chrom_",
                  "").replace(".sizes", "") for x in rawsg
    ]

    if (mode == "support"):
        click.echo(gettime() +
                   "List of built-in genomes supported in proatac:")
        click.echo(gettime() + str(supported_genomes))
        sys.exit(
            gettime() +
            'Specify one of these genomes or provide your own files (see documentation).'
        )

    # Take a collection of summits files and return a consensus set of peaks
    if (mode == 'summitsToPeaks'):
        click.echo(gettime() + "Starting inference of peaks from summits.")

        # Need chromosome sizes and blacklist
        bedtoolsGenomeFile, blacklistFile = getBfiles(bedtools_genome,
                                                      blacklist_file,
                                                      reference_genome,
                                                      script_dir,
                                                      supported_genomes)

        # Figure out which samples to process
        bedFiles = os.popen("ls " + input.rstrip("/") +
                            "/*summits.bed*").read().strip().split("\n")
        if (len(bedFiles) < 1):
            sys.exist("No summit *summits.bed* files found; QUITTING")
        else:
            click.echo(gettime() + "Calling peaks from these samples:")
            click.echo(gettime() + str(bedFiles))

        # Verify dependencies
        R = get_software_path('R', r_path)
        check_R_packages(['data.table', 'GenomicRanges', 'tools'], R)

        # Execute software
        make_folder(output)
        summitRcall = " ".join([
            R + "script", script_dir + "/bin/R/summitsToCleanPeaks.R",
            ",".join(bedFiles), peak_width, blacklistFile, bedtoolsGenomeFile,
            str(999999999),
            str(0.01), output, name
        ])
        os.system(summitRcall)
        click.echo(gettime() + "Completed peak inference from summit files.")
        sys.exit()

    # TO DO:
    # Make a mode to handle split-pool data
    if (mode == 'indexSplit'):
        sys.exit("Mode does not actually work yet")

    # Make a counts table from user-supplied peaks and bam files
    if (mode == 'counts'):
        click.echo(
            gettime() +
            "Attempting to assemble counts table from user-specified input.")

        # Verify dependencies
        R = get_software_path('R', r_path)
        check_R_packages(['chromVAR', 'SummarizedExperiment', 'tools'], R)

        # Make sure that there are samples to process / there is a peak file
        bamfiles = os.popen("ls " + input.rstrip("/") +
                            "/*.bam").read().strip().split("\n")
        if (len(bamfiles) < 1):
            sys.exit(
                "No sample *.bam files found in user-specified input; QUITTING"
            )
        else:
            click.echo(gettime() + "Making a counts table from these samples:")
            click.echo(gettime() + str(bamfiles))
        if (os.path.isfile(peaks_file)):
            click.echo(gettime() + "Found peaks file: " + peaks_file)

        # Execute software
        make_folder(output)
        countsRcall = " ".join([
            R + "script", script_dir + "/bin/R/makeCountsTable.R", input,
            peaks_file,
            str(by_rgid), output, name
        ])
        os.system(countsRcall)
        click.echo(gettime() + "Completed peak inference from summit files.")
        sys.exit()

    # Last minute changes
    if (very_sensitive):
        very_sensitive = "--very-sensitive "
    else:
        very_sensitive = ""

    p = proatacProject(
        script_dir, supported_genomes, mode, input, output, name, ncores,
        bowtie2_index, cluster, jobs, peak_width, keep_duplicates, max_javamem,
        trash_mito, reference_genome, very_sensitive, clipl, clipr, py_trim,
        keep_temp_files, skip_fastqc, overwrite, bedtools_genome,
        blacklist_file, tss_file, macs2_genome_size, bs_genome, bedtools_path,
        bowtie2_path, java_path, macs2_path, samtools_path, r_path)

    if (mode == "check"):
        click.echo(gettime() + "Dependencies and user-reported file paths OK")
        click.echo(
            "\nproatac will process the following samples / files with bulk / single specified: \n"
        )
        print("Sample", "Fastq1", "Fastq2")
        for x in range(len(p.samples)):
            print(p.samples[x], p.fastq1[x], p.fastq2[x])
        click.echo(
            "\nIf this table doesn't look right, consider specifying a manually created sample input table (see documentation).\n"
        )
        sys.exit(gettime() + "Successful check complete; QUITTING.")

    # Single or bulk processing
    if (mode == "single" or mode == "bulk"):

        # Potentially submit jobs to cluster
        if (ncores == "detect"):
            ncores = str(available_cpu_count())
        else:
            ncores = str(ncores)

        snakeclust = ""
        njobs = int(jobs)
        if (njobs > 0 and cluster != ""):
            snakeclust = " --jobs " + jobs + " --cluster '" + cluster + "' "
            click.echo(
                gettime() +
                "Recognized flags to process jobs on a computing cluster.")

        # Make output folders
        of = output
        logs = of + "/logs"
        fin = of + "/final"
        trim = of + "/01_trimmed"
        aligned = of + "/02_aligned_reads"
        processed = of + "/03_processed_reads"
        qc = of + "/04_qc"

        folders = [
            of, logs, fin, trim, aligned, processed, qc,
            of + "/.internal/parseltongue", of + "/.internal/samples",
            logs + "/bowtie2", logs + "/trim", logs + "/macs2",
            of + "/03_processed_reads/temp", fin + "/plots"
        ]

        mkfolderout = [make_folder(x) for x in folders]

        make_folder(logs + "/picard")
        make_folder(logs + "/picard/inserts")
        make_folder(logs + "/tss")
        make_folder(logs + "/samples")
        make_folder(of + "/mito")

        if not keep_duplicates:
            make_folder(logs + "/picard/markdups")
        if not skip_fastqc:
            make_folder(logs + "/fastqc")

        if (mode == "bulk"):
            make_folder(of + "/final/bams")
            make_folder(of + "/final/summits")
            make_folder(of + "/04_qc/macs2_each")
        if (mode == "single"):
            make_folder(of + "/03_processed_reads/bams")

        # Create internal README files
        if not os.path.exists(of + "/.internal/README"):
            with open(of + "/.internal/README", 'w') as outfile:
                outfile.write(
                    "This folder creates important (small) intermediate; don't modify it.\n\n"
                )
        if not os.path.exists(of + "/.internal/parseltongue/README"):
            with open(of + "/.internal/parseltongue/README", 'w') as outfile:
                outfile.write(
                    "This folder creates intermediate output to be interpreted by Snakemake; don't modify it.\n\n"
                )
        if not os.path.exists(of + "/.internal/samples/README"):
            with open(of + "/.internal" + "/samples" + "/README",
                      'w') as outfile:
                outfile.write(
                    "This folder creates samples to be interpreted by Snakemake; don't modify it.\n\n"
                )

        # Create promoter file:
        ptss = of + "/.internal/promoter.tss.bed"
        if not os.path.exists(ptss):
            os.system('''awk '{print $1"\t"$2-2000"\t"$3+2000"\t"$4}' ''' +
                      p.tssFile + " > " + ptss)

        # Set up sample bam plain text file
        for i in range(len(p.samples)):
            with open(
                    of + "/.internal/samples/" + p.samples[i] + ".fastqs.txt",
                    'w') as outfile:
                outfile.write(p.fastq1[i] + "\t" + p.fastq2[i])

        y_s = of + "/.internal/parseltongue/proatac.object.yaml"
        with open(y_s, 'w') as yaml_file:
            yaml.dump(dict(p),
                      yaml_file,
                      default_flow_style=False,
                      Dumper=yaml.RoundTripDumper)

        snakecmd_scatter = 'snakemake' + snakeclust + ' --snakefile ' + script_dir + '/bin/snake/Snakefile.proatac.scatter --cores ' + ncores + ' --config cfp="' + y_s + '" '
        os.system(snakecmd_scatter)

        if (mode == 'single'):

            # Merge into one .bam file:
            finalmergedbam = fin + "/" + p.name + ".merged.bam"
            if not os.path.isfile(finalmergedbam):
                os.system(p.samtools + " merge " + finalmergedbam + " " + of +
                          "/03_processed_reads/bams/*.bam")
                pysam.index(finalmergedbam)

        snakecmd_gather = 'snakemake --snakefile ' + script_dir + '/bin/snake/Snakefile.proatac.gather --cores ' + ncores + ' --config cfp="' + y_s + '" '
        os.system(snakecmd_gather)

        if keep_temp_files:
            click.echo(
                gettime() +
                "Temporary files not deleted since --keep-temp-files was specified."
            )
        else:
            if (mode == "bulk" or mode == "single"):
                byefolder = of

            shutil.rmtree(byefolder + "/.internal")
            shutil.rmtree(byefolder + "/01_trimmed")
            shutil.rmtree(byefolder + "/02_aligned_reads")
            shutil.rmtree(byefolder + "/03_processed_reads")
            shutil.rmtree(byefolder + "/04_qc")
            if (trash_mito):
                shutil.rmtree(byefolder + "/mito")

            click.echo(gettime() + "Intermediate files successfully removed.")

    click.echo(gettime() + "Complete.")
Beispiel #39
0
def filter_bam_multihits(filename,
                         max_tags,
                         max_hits,
                         out_dir,
                         read_tagger_method,
                         omit_detail=False):
    """Pre-processing function for cleaning up the input bam file.
	Args:
	Returns:
	"""
    # logging the parameter values
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    msg = 'Params:\n'
    for i in args:
        msg += "%s = %s \n" % (i, values[i])
    logger.info(msg)
    read_tagger = lambda x: read_tagger_collection(x,
                                                   method=read_tagger_method)
    logger.info('filtering input bam')

    in_bam = pysam.AlignmentFile(filename, 'rb')
    # unique read bam
    ubam_fn = os.path.join(out_dir, 'unique.bam')
    sorted_ubam_fn = os.path.join(out_dir, 'unique.sorted.bam')
    ubam = pysam.AlignmentFile(ubam_fn, 'wb', template=in_bam)
    unique_counter = 0

    # multi-read bam
    mbam_fn = os.path.join(out_dir, 'multi.bam')
    sorted_mbam_fn = os.path.join(out_dir, 'multi.sorted.bam')
    mbam = pysam.AlignmentFile(mbam_fn, 'wb', template=in_bam)
    mread_set = set()

    # do not omit sequences if to filter max_tags
    if max_tags > 0:
        omit_detail = False

    # splitting unique and multi- reads
    # and add the read taggers we need
    if not \
     (os.path.isfile( os.path.join(out_dir,'unique.sorted.bam') ) and \
     os.path.isfile( os.path.join(out_dir,'multi.sorted.bam')) ):

        #for read in tqdm(in_bam):
        counter = 0
        for read in in_bam:
            # poor man's progress bar
            counter += 1
            if not counter % 10**6:
                logger.debug('tagged %i alignments' % counter)
            read_tag = read_tagger(read)
            ## skip reads with unassigned tagger
            if read_tag == -1:
                continue
            read.tags += [('RT', read_tag)]  ## add the tag

            ## omit the details in read sequence and quality
            ## recommended for larger bam because this
            ## can save some memory/storage for large bams
            if omit_detail:
                read.query_sequence = '*'
                read.query_qualities = '0'

            if read.is_secondary or (read.has_tag('NH')
                                     and read.opt("NH") > 1):
                try:
                    if read.opt("NH") < max_hits:
                        mbam.write(read)
                        mread_set.add(read.qname)
                except KeyError:
                    #print read
                    raise Exception('%s: missing NH tag when is_secondary=%s' %
                                    (read.qname, read.is_secondary))
            else:
                ubam.write(read)
                unique_counter += 1

        ubam.close()
        mbam.close()

        # sorting
        pysam.sort('-m', '4G', '-@', '3', '-T',
                   os.path.dirname(sorted_ubam_fn), '-o', sorted_ubam_fn,
                   ubam_fn)
        os.remove(ubam_fn)
        pysam.sort('-m', '4G', '-@', '3', '-T',
                   os.path.dirname(sorted_mbam_fn), '-o', sorted_mbam_fn,
                   mbam_fn)
        os.remove(mbam_fn)
        pysam.index(sorted_ubam_fn)
        pysam.index(sorted_mbam_fn)

        # log the statistics
        multi_counter = len(mread_set)
        logger.info(
          'Unique reads = %s;  ' % unique_counter + \
          'Multi reads = %s (%.2f %%)' % \
          ( multi_counter, float(multi_counter)/(multi_counter+unique_counter)*100 )
         )
    else:
        logger.info(
            'found previously sorted tag-bam; checking whether collapsing is needed.')

    # filter redundant tags if turned on
    if max_tags > 0:
        logger.info('collapsing unique')
        filter_bam_maxtags(
            os.path.join(out_dir, 'unique.sorted.collapsed.bam'),
            os.path.join(out_dir, 'unique.sorted.bam'), max_tags)
        logger.info('collapsing multi')
        filter_bam_maxtags(os.path.join(out_dir, 'multi.sorted.collapsed.bam'),
                           os.path.join(out_dir, 'multi.sorted.bam'), max_tags)

    in_bam.close()
    return
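The unique/multi split above hinges on the secondary-alignment flag and the NH tag; stripped down to just that decision, the core loop is roughly as follows (file names are placeholders):

import pysam

with pysam.AlignmentFile('input.bam', 'rb') as in_bam, \
        pysam.AlignmentFile('unique.bam', 'wb', template=in_bam) as ubam, \
        pysam.AlignmentFile('multi.bam', 'wb', template=in_bam) as mbam:
    for read in in_bam:
        nh = read.get_tag('NH') if read.has_tag('NH') else 1
        if read.is_secondary or nh > 1:
            mbam.write(read)   # multi-mapping alignment
        else:
            ubam.write(read)   # uniquely mapped alignment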
Beispiel #40
0
def main():
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--input-file",
                      action="store",
                      type="string",
                      dest="input_file",
                      help="Input alignment file in SAM or BAM format")
    parser.add_option("-r",
                      "--refgene",
                      action="store",
                      type="string",
                      dest="refgene_bed",
                      help="Reference gene model in bed fomat.")
    parser.add_option(
        "-s",
        "--sample-size",
        action="store",
        type="int",
        dest="sample_size",
        default=200000,
        help="Number of reads sampled from SAM/BAM file. default=%default")
    parser.add_option(
        "-q",
        "--mapq",
        action="store",
        type="int",
        dest="map_qual",
        default=30,
        help=
        "Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default"
    )
    parser.add_option("-o",
                      "--out",
                      action="store",
                      type="string",
                      dest="output_file",
                      default="infer_result",
                      help=" default=%infer_result")

    (options, args) = parser.parse_args()

    if not (options.input_file and options.refgene_bed):
        parser.print_help()
        print('\n\n' + __doc__, file=sys.stderr)
        sys.exit(0)
    for f in (options.input_file, options.refgene_bed):
        if not os.path.exists(f):
            print('\n\n' + f + " does NOT exist." + '\n', file=sys.stderr)
            sys.exit(0)
    if options.sample_size < 1000:
        print("Warn: Sample Size too small to give a accurate estimation",
              file=sys.stderr)
    pysam.index(options.input_file)
    obj = SAM.ParseBAM(options.input_file)
    (protocol, sp1, sp2,
     other) = obj.configure_experiment(refbed=options.refgene_bed,
                                       sample_size=options.sample_size,
                                       q_cut=options.map_qual)
    if other < 0: other = 0.0
    file_object = open(options.output_file + ".txt", "w")
    if protocol == "PairEnd":
        file_object.write("This is Paired End Data\n")
        file_object.write("Fraction of reads failed to determine: %.4f" %
                          other + "\n")
        file_object.write(
            "Fraction of reads explained by \"1++,1--,2+-,2-+\": %.4f" % sp1 +
            "\n")
        file_object.write(
            "Fraction of reads explained by \"1+-,1-+,2++,2--\": %.4f" % sp2 +
            "\n")
        if sp1 > 2 * sp2:
            file_object.write(
                "\nExperiment is likely \"1++,1--,2+-,2-+\" (HTSeq.count --forward)\n"
            )
        if sp2 > 2 * sp1:
            file_object.write(
                "\nExperiment is likely \"1+-,1-+,2++,2--\" (HTSeq.count --reverse)\n"
            )

    elif protocol == "SingleEnd":
        file_object.write("This is Single End Data\n")
        file_object.write("Fraction of reads failed to determine: %.4f" %
                          other + "\n")
        file_object.write("Fraction of reads explained by \"++,--\": %.4f" %
                          sp1 + "\n")
        file_object.write("Fraction of reads explained by \"+-,-+\": %.4f" %
                          sp2 + "\n")
        if sp1 > 2 * sp2:
            file_object.write(
                "\nExperiment is likely \"++,--\" (HTSeq.count --forward)\n")
        if sp2 > 2 * sp1:
            file_object.write(
                "\nExperiment is likely \"+-,-+\" (HTSeq.count --reverse)\n")

    else:
        file_object.write("Unknown Data type\n")
    #print mesg
    file_object.close()
Beispiel #41
0
 def __init__(self,
              input_file: str = None,
              genome_database: str = None,
              output_prefix: str = None,
              ignore_overlap: bool = True,
              text_output: bool = False,
              remove_ccgg: bool = False,
              min_read_depth: int = 10,
              max_read_depth: int = 8000,
              threads: int = 1,
              verbose: bool = True,
              min_base_quality: int = 10,
              min_mapping_quality: int = 10,
              ATCGmap: bool = False,
              cg_only: bool = True,
              ignore_orphans: bool = False,
              bedgraph_output: bool = False):
     assert isinstance(input_file, str), 'Path to input file not valid'
     assert isinstance(text_output, bool), 'Not valid bool'
     assert isinstance(threads,
                       int), 'Threads must be specified with integer'
     if output_prefix:
         assert isinstance(output_prefix, str)
     self.input_file = input_file
     try:
         self.input_bam = pysam.Samfile(input_file,
                                        'rb',
                                        require_index=True)
     except IOError:
         print('Generating Index File')
         pysam.index(input_file)
         self.input_bam = pysam.Samfile(input_file,
                                        'rb',
                                        require_index=True)
     self.text_output = text_output
     self.output_prefix = output_prefix
     self.threads = threads
     self.call_methylation_kwargs = dict(
         input_file=input_file,
         genome_database=genome_database,
         ignore_overlap=ignore_overlap,
         ignore_orphans=ignore_orphans,
         remove_ccgg=remove_ccgg,
         max_read_depth=max_read_depth,
         min_base_quality=min_base_quality,
         min_mapping_quality=min_mapping_quality,
         cg_only=cg_only)
     self.min_read_depth = min_read_depth
     self.ATCGmap = ATCGmap
     self.bedgraph_output = bedgraph_output
     self.methylation_calling = True
     self.contigs = self.get_contigs
     self.completed_contigs = None
     self.return_queue = None
     self.pool = None
     self.verbose = verbose
     self.output_objects = self.get_output_objects
     self.methylation_stats = {
         'CG_meth': 0,
         'CG_all': 0,
         'CH_meth': 0,
         'CH_all': 0
     }
Beispiel #42
0
temp_bam1 = outputbam.replace(".qc.bam",
                              ".temp1.bam").replace("/temp/ready_bam/",
                                                    "/temp/temp_bam/")
prefixSM = outdir + "/temp/sparse_matrices/" + sample
outputdepth = outdir + "/qc/depth/" + sample + ".depth.txt"

# 1) Filter bam files
pycall = " ".join([
    python, filtclip_py, inputbam, filtlog, mito_genome, proper_paired, NHmax,
    NMmax
]) + " > " + temp_bam0
os.system(pycall)

# 2) Sort the filtered bam file
pysam.sort("-o", temp_bam1, temp_bam0)
pysam.index(temp_bam1)

# See if we have UMIs
if (umi_barcode != "" and len(umi_barcode) == 2):
    umi_extra = " BARCODE_TAG=" + umi_barcode
else:
    umi_extra = ""

# 3) (Optional) Remove duplicates
if (remove_duplicates == "True"):
    mdc_long = picardCall + " I=" + temp_bam1 + " O=" + outputbam + " M=" + rmlog + " REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=SILENT QUIET=true VERBOSITY=ERROR USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" + umi_extra
    proc = subprocess.Popen(mdc_long,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            shell=True)
    out, err = proc.communicate()  # Read data from stdout and stderr
Beispiel #43
0
def generate_index_if_needed(filepath):
    index_file = os.path.abspath(filepath) + '.bai'
    if not os.path.isfile(index_file):
        # Index file doesn't exist; generate it
        pysam.index(filepath, index_file)
    return True
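A short usage sketch pairing that helper with a random-access query, since AlignmentFile.fetch(contig, start, stop) needs the index to exist (the path and region are hypothetical):

import pysam

bam_path = '/data/sample.sorted.bam'   # hypothetical path
generate_index_if_needed(bam_path)
with pysam.AlignmentFile(bam_path, 'rb') as bam:
    n_reads = sum(1 for _ in bam.fetch('chr1', 10000, 20000))
print(n_reads, 'reads overlap the region')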
Beispiel #44
0
def main():

    parser = argparse.ArgumentParser(description='Run MuTect2.')

    ### Required
    parser.add_argument('mutect2', help='Path to MuTect2.')
    parser.add_argument('ref_genome', help='Path to the reference genome.')
    parser.add_argument('input_bam', help='Input BAM to process.')
    parser.add_argument('output_vcf', help='Output VCF.')
    parser.add_argument('log_file', help='Logging goes here.')

    ### Arguments specifying input resource files.
    parser.add_argument(
        '--intervals',
        help=
        'File of genomic coordinate intervals to call variants against of form <chrom:start-stop>.'
    )
    parser.add_argument(
        '--interval_str',
        help=
        'Genomic coordinate intervals to call variants against of form <chrom:start-stop>.'
    )
    parser.add_argument(
        '--panel_of_normals',
        help='Panel of normals to compare tumor calls against.')
    parser.add_argument('--dbsnp', help='dbSNP file')
    parser.add_argument('--cosmic', help='VCF file of COSMIC sites')

    ### Arguments specifying integer parameter values.
    parser.add_argument('--interval_padding',
                        help='Amount of padding in bp to add to each interval')
    # parser.add_argument('--stand_call_conf', help='The minimum phred-scaled confidence threshold at which variants should be called')
    # parser.add_argument('--stand_emit_conf', help='The minimum phred-scaled confidence threshold at which variants should be emitted')
    parser.add_argument(
        '--min_base_quality_score',
        help='Minimum base quality required to consider a base for calling')
    parser.add_argument('--initial_tumor_lod',
                        help='Initial LOD threshold for calling tumor variant')
    parser.add_argument(
        '--initial_normal_lod',
        help='Initial LOD threshold for calling normal variant')
    parser.add_argument('--tumor_lod',
                        help='LOD threshold for calling tumor variant')
    parser.add_argument('--normal_lod',
                        help='LOD threshold for calling normal non-germline')
    parser.add_argument(
        '--dbsnp_normal_lod',
        help='LOD threshold for calling normal non-variant at dbsnp sites')
    parser.add_argument(
        '--active_region_in',
        help='Use this interval list file as the active regions to process')
    ### Arguments not unique to MuTect2.
    parser.add_argument('--min_pruning',
                        help='Minimum support to not prune paths in the graph')
    parser.add_argument(
        '--min_dangling_branch_length',
        help='Minimum length of a dangling branch to attempt recovery')
    ## JHL Added 12/03/16
    parser.add_argument(
        '--kmerSize', help='Kmer size to use in the read threading assembler')
    parser.add_argument(
        '--downsampling_type',
        help=
        'Type of read downsampling to employ at a given locus [NONE, ALL_READS, BY_SAMPLE]'
    )
    parser.add_argument('--max_alternate_alleles',
                        help='Maximum number of alternate alleles to genotype')
    parser.add_argument(
        '--dontUseSoftClippedBases',
        action='store_true',
        help='If specified, we will not analyze soft clipped bases in the reads'
    )

    args = parser.parse_args()

    logging.basicConfig(filename=args.log_file, level=logging.INFO)
    work_bam = "input.bam"
    work_bai = "input.bam.bai"

    logging.info("Symlink " + args.input_bam + " with " + work_bam)

    if os.path.isfile(work_bam):
        os.remove(work_bam)
    if os.path.isfile(work_bai):
        os.remove(work_bai)

    os.symlink(args.input_bam, work_bam)
    logging.info("Indexing " + work_bam)
    pysam.index(work_bam)

    if args.intervals:
        logging.info("Including interval list in working directory.")
        shutil.copyfile(args.intervals, "intervals.interval_list")

    logging.info("Building command in preparation for invocation.")
    cmd = build_cmd(args)
    proc = cmd_caller(cmd)
Beispiel #45
0
def tophat_map(gtf, out_dir, prefix, fastq, thread, bw=False, scale=False,
               gtf_flag=1):
    '''
    1. Map reads with TopHat2
    2. Extract unmapped reads
    3. Create BigWig file if needed
    '''
    # tophat2 mapping
    print('Map reads with TopHat2...')
    tophat_cmd = 'tophat2 -g 1 --microexon-search -m 2 '
    if gtf_flag:
        tophat_cmd += '-G %s ' % gtf
    tophat_cmd += '-p %s -o %s ' % (thread, out_dir + '/tophat')
    tophat_cmd += '%s/bowtie2_index/%s ' % (out_dir, prefix) + fastq
    tophat_cmd += ' 2> %s/tophat.log' % out_dir
    print('TopHat2 mapping command:')
    print(tophat_cmd)
    return_code = os.system(tophat_cmd) >> 8
    if return_code:
        sys.exit('Error: cannot map reads with TopHat2!')
    # extract unmapped reads
    print('Extract unmapped reads...')
    unmapped_bam = pybedtools.BedTool('%s/tophat/unmapped.bam' % out_dir)
    unmapped_bam.bam_to_fastq(fq='%s/tophat/unmapped.fastq' % out_dir)
    # create Bigwig file if needed
    if bw:
        if which('bedGraphToBigWig') is not None:
            print('Create BigWig file...')
            map_bam_fname = '%s/tophat/accepted_hits.bam' % out_dir
            # index bam if not exist
            if not os.path.isfile(map_bam_fname + '.bai'):
                pysam.index(map_bam_fname)
            map_bam = pysam.AlignmentFile(map_bam_fname, 'rb')
            # extract chrom size file
            chrom_size_fname = '%s/tophat/chrom.size' % out_dir
            with open(chrom_size_fname, 'w') as chrom_size_f:
                for seq in map_bam.header['SQ']:
                    chrom_size_f.write('%s\t%s\n' % (seq['SN'], seq['LN']))
            if scale:  # scale to HPB
                mapped_reads = map_bam.mapped
                for read in map_bam:
                    read_length = read.query_length
                    break
                s = 1000000000.0 / mapped_reads / read_length
            else:
                s = 1
            map_bam = pybedtools.BedTool(map_bam_fname)
            bedgraph_fname = '%s/tophat/accepted_hits.bg' % out_dir
            with open(bedgraph_fname, 'w') as bedgraph_f:
                for line in map_bam.genome_coverage(bg=True,
                                                    g=chrom_size_fname,
                                                    scale=s, split=True):
                    value = str(int(float(line[3]) + 0.5))
                    bedgraph_f.write('\t'.join(line[:3]) + '\t%s\n' % value)
            bigwig_fname = '%s/tophat/accepted_hits.bw' % out_dir
            return_code = os.system('bedGraphToBigWig %s %s %s' %
                                    (bedgraph_fname, chrom_size_fname,
                                     bigwig_fname)) >> 8
            if return_code:
                sys.exit('Error: cannot convert bedGraph to BigWig!')
        else:
            print('Could not find bedGraphToBigWig, so skip this step!')
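The HPB scaling used in the BigWig branch above boils down to a single factor derived from the mapped-read count and the read length; as a compact check with illustrative numbers:

# scale factor so coverage is expressed per billion sequenced bases
mapped_reads = 25000000   # illustrative mapped-read count
read_length = 100         # illustrative read length
scale = 1000000000.0 / mapped_reads / read_length
print(scale)              # 0.4 for these numbers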
Beispiel #46
0
def getsamplecoverage(currsample, sampledata, genelist, geneseqs, maxmismatches=None, minextend=None):
    
    currbam = sampledata.getbam(currsample)
    allcoverages = dict()
    multaminocoverages = dict()
    multaccoverages = dict()
    multtrnacoverages = dict()
    uniquecoverages = dict()
    uniquegenomecoverages = dict()
    multigenomecoverages = dict()
    #print >>sys.stderr, trnalist
    readmismatches = dict()
    
    adeninemismatches = dict()
    thyminemismatches = dict()
    cytosinemismatches = dict()
    guanosinemismatches = dict()
    readstarts = dict()
    readends = dict()
    readskips = dict()      
    
    trimreadcoverage =  dict()
    trimreadmismatches =  dict()
    
    readcounts = dict()
    
    skipped = 0
    total = 0
    
    try:
        #print >>sys.stderr, currbam
        if not os.path.isfile(currbam+".bai"):
            pysam.index(""+currbam)
        bamfile = pysam.Samfile(""+currbam, "rb" )  
    except IOError as strerror:
        print >>sys.stderr, strerror
        sys.exit()
        
    for i, currfeat in enumerate(genelist):
        #if currfeat.name != "FEATURE399_minus_145255":
        #    continue
        allcoverages[currfeat.name] = readcoverage(genelist[i])
        uniquegenomecoverages[currfeat.name] = readcoverage(genelist[i])
        multigenomecoverages[currfeat.name] = readcoverage(genelist[i])
        
        readstarts[currfeat.name] = readcoverage(genelist[i])
        readends[currfeat.name] = readcoverage(genelist[i])
        readmismatches[currfeat.name] = readcoverage(genelist[i])
        adeninemismatches[currfeat.name] =   readcoverage(genelist[i])
        thyminemismatches[currfeat.name] =   readcoverage(genelist[i])
        cytosinemismatches[currfeat.name] =  readcoverage(genelist[i])
        guanosinemismatches[currfeat.name] = readcoverage(genelist[i])
        readskips[currfeat.name] = readcoverage(genelist[i])
        
        trimreadcoverage[currfeat.name] =  readcoverage(genelist[i])
        trimreadmismatches[currfeat.name] =  readcoverage(genelist[i])
        readcounts[currfeat.name] = 0

        #print >>sys.stderr, trnalist[i]
        for currread in getbam(bamfile, genelist[i]):
            

            if maxmismatches is not None and currread.getmismatches() > maxmismatches:
                continue
            #print >>sys.stderr, "||**||"+str(currread.getmismatches())
            if genelist[i].coverage(currread) > 10 and genelist[i].strand == currread.strand:
                
                if minextend is not None and not (currread.start + minextend <= genelist[i].start or currread.end - minextend >= genelist[i].end):
                    continue
                total += 1
                trnaname = genelist[i].name
                readstart = currread.getfirst(1)
                readend = currread.getlast(1)
                readcounts[trnaname] += 1
                allcoverages[genelist[i].name].addread(currread)
                readstarts[trnaname].addread(readstart)
                readends[trnaname].addread(readend )

                if currread.issinglemapped():
                    uniquegenomecoverages[genelist[i].name].addread(currread)
                else:
                    multigenomecoverages[genelist[i].name].addread(currread)
                
                currseq = currread.getseq()
                geneseq = geneseqs[genelist[i].name]
                
                
                #if currread.start < currfeat.start  or currread.end < currfeat.end  or genelist[i].strand == "+":  #and currread.end > currfeat.end
                    #pass
                
                if genelist[i].strand == "+":
                    genestart = max([0,currread.start - currfeat.start])
                    geneend = genestart+cigarreflength(currread.getcigar())  - max([0,currfeat.start - currread.start])
                    #geneend = min([genestart+cigarreflength(currread.getcigar()) + 1])  #minimum of either the 
                else:
                    genestart = max([0,currfeat.end - currread.end])
                    geneend = genestart+cigarreflength(currread.getcigar())  - max([0,currread.end - currfeat.end])
                    #geneend = min([genestart+len(currseq) + 1 ,genestart+ cigarreadlength(currread.getcigar()) - 1 ])
                refseq = geneseq[genestart:geneend]
                

                

                #print >>sys.stderr, genestart
                #print >>sys.stderr, geneend
                #print >>sys.stderr, currread.getcigar()
                #
                #print >>sys.stderr, cigarreadlength(currread.getcigar())
                #print >>sys.stderr, len(currseq)
                #print >>sys.stderr, refseq
                
                if genelist[i].strand == "+":
                    readstart = max([0,currfeat.start - currread.start])
                    
                    #readend = readstart + cigarreflength(currread.getcigar())
                    readend = min([readstart +len(refseq)  ,readstart + cigarreflength(currread.getcigar())])
                else:
                    readstart = max([0,currread.end - currfeat.end])
                    #readend = readstart + cigarreflength(currread.getcigar())
                    readend = min([readstart +len(refseq)  ,readstart + cigarreflength(currread.getcigar())])
                
                #print >>sys.stderr, readcov
                readcov = list(cigarrefcoverage(currread.getcigar()))
                if genelist[i].strand == "-":
                    readcov = list(cigarrefcoverage(reversed(currread.getcigar())))
                    #readcov = reversed(readcov)
                    pass
                
                alignseq = "".join(currseq[sum(readcov[0:i])] if readcov[i] > 0 else "-" for i in range(cigarreflength(currread.getcigar())))
                alignseq = alignseq[readstart:readend]
                #print >>sys.stderr, alignseq
                #
                #refseq = ""
                #if genestart < 0:
                #    refset = refseq + ("-"*(-genestart))
                #    genestart = 0
                #
                #if geneend >= len(geneseqs[currfeat.name]):
                #    
                #    geneend = len(geneseqs[currfeat.name]) - 1
                #refseq = refseq + geneseqs[currfeat.name][genestart:geneend]
                
                
                #if currread.name == "NB501427:156:H2F7MAFXY:3:21609:5222:18520":
                #if len(refseq) != 0:
                
                #print >>sys.stderr, currread.getcigar()
                #print >>sys.stderr, currfeat.start
                #print >>sys.stderr, currread.start 
                #print >>sys.stderr, genestart
                #print >>sys.stderr, currfeat.name
                #print >>sys.stderr, currfeat.length()
                #print >>sys.stderr, alignseq
                #print >>sys.stderr, refseq    
                    
                #if cigarreflength(currread.getcigar()) < cigarreadlength(currread.getcigar()):
                #    print >>sys.stderr, currread.name
                #    print >>sys.stderr, currread.getcigar()
                #    print >>sys.stderr, "".join(str(curr) for curr in readcov)
                #    print >>sys.stderr, alignseq 
                #    print >>sys.stderr,refseq
                #    print >>sys.stderr, currseq
                #if len(refseq) != len(currseq):
                #    print >>sys.stderr, currread.name
                #    print >>sys.stderr,refseq
                #    print >>sys.stderr, currseq
                skipends = True
                
                
                #need to check mismatches later
                #if  currread.name == "NB501427:404:HJ3WGAFX2:3:11602:17144:17931":
                
              
                #if refseq != alignseq and len(currread.getcigar()) > 1 and genelist[i].strand == "-":# or cigarreflength(currread.getcigar()) != len(refseq):
                #    #skipped += 1
                #    print >>sys.stderr, currread.name
                #    print >>sys.stderr, currread.getcigar() 
                #    print >>sys.stderr, genelist[i].strand
                #    print >>sys.stderr,currseq
                #    print >>sys.stderr, "gene: "+str(currfeat.start)+"-"+str(currfeat.end)
                #    print >>sys.stderr, "read:" +str(currread.start)+"-"+str(currread.end)
                #    print >>sys.stderr,  "gene: "+str(genestart)+"-"+str(geneend)
                #    print >>sys.stderr,  "read: " +str(readstart)+"-"+str(readend)
                #    print >>sys.stderr,refseq
                #    print >>sys.stderr, alignseq
                #    pass
                #    #continue
                #else:
                #    #continue
                #    pass
                for currpos in range(len(refseq)): #30
                    currgenomepos = currread.start + currpos
                    if currfeat.strand == "-":
                        currgenomepos = currread.end - currpos
                    
                    if currpos < 0 or currpos >= len(refseq):
                        #print >>sys.stderr, currread.name
                        #print >>sys.stderr, currpos
                        #print >>sys.stderr, len(refseq)
                        #print >>sys.stderr, cigarreflength(currread.getcigar())
                        #
                        #print >>sys.stderr,refseq
                        #print >>sys.stderr, currseq
                        pass
  
                    if skipends:
                        if currpos < 3 or currpos > len(refseq) - 3 :
                            #continue
                            pass
                    currbase = alignseq[currpos]


                    refbase = refseq[currpos]
                    if refbase not in gapchars:

                        if refbase != currbase:
                            #if (currpos + currread.start) - readmismatches[trnaname].region.start < 0:
                            #    print >>sys.stderr, "before start: "+str(currpos)+"+"+str(currread.start) +"-"+str(readmismatches[trnaname].region.start)
                            #    #base - self.region.start
                            #print >>sys.stderr, alignseq
                            #print >>sys.stderr, refseq  
                            #print >>sys.stderr, ("-"*currpos)+"*"+("-"*(len(refseq)-currpos - 1))
                            #print >>sys.stderr, str(currread.start + currpos)
                            readmismatches[trnaname].addbase(currgenomepos)
                        #allcoverages[trnaname].addbase(currread.start + currpos)

                        #allcoverages[genelist[i].name].addbase(currread.start + currpos)
                        #if currpos > 3:
                        #    trimreadcoverage[trnaname].addbase(currread.start + currpos)
                        #    if refbase != currbase:
                        #        trimreadmismatches[trnaname].addbase(currread.start + currpos)
                        if currbase == "-":
                            readskips[trnaname].addbase(currgenomepos)
                        if currbase == "A":
                            adeninemismatches[trnaname].addbase(currgenomepos)
                        elif currbase == "T":
                            thyminemismatches[trnaname].addbase(currgenomepos)
                        elif currbase == "C":
                            cytosinemismatches[trnaname].addbase(currgenomepos)
                        elif currbase == "G":
                            guanosinemismatches[trnaname].addbase(currgenomepos)
    #print >>sys.stderr, currsample+":" +str(skipped)+"/"+str(total)+":"+str(((1.*skipped)/total))
    return coverageinfo( readcounts, allcoverages,readstarts, readends,multaminocoverages, multaccoverages, multtrnacoverages,uniquecoverages, uniquegenomecoverages,multigenomecoverages, readmismatches,adeninemismatches,thyminemismatches,cytosinemismatches, guanosinemismatches,readskips,trimmismatches = trimreadmismatches, trimcoverage = trimreadcoverage  )
Beispiel #47
0
def PE_Mod(PE, strand, path_bam, output_path):
    if PE == 'True':
        #print "\nProcessing PE bam file...may take some time"
        base_name = os.path.basename(path_bam)
        out_name = output_path + "/" + base_name.split(
            ".bam")[0] + ".sorted_PE.bam"
        infile = pysam.AlignmentFile(path_bam, "rb")
        sam_strng = subprocess.getstatusoutput('samtools')
        temp = sam_strng[1].split("Version:")[1][:7]
        sam_version = int(''.join(list(filter(str.isdigit, temp))))
        if sam_version == 119:
            outfile = pysam.AlignmentFile(
                output_path + "/" + base_name.split(".bam")[0] + "_temp.bam",
                "wb",
                template=infile)
        else:
            outfile = pysam.AlignmentFile(
                output_path + "/" + base_name.split(".bam")[0] + "_temp.bam",
                "wb",
                add_sam_header=True,
                template=infile)

        for read in infile:
            Flag = read.flag
            if strand == 'second':
                if (Flag == 145 or Flag == 147 or Flag == 153 or Flag == 97
                        or Flag == 99 or Flag == 73):
                    if Flag < 100:
                        read.flag = 145
                    read.set_tag("XS", '-', replace=True)
                    outfile.write(read)
                if (Flag == 161 or Flag == 163 or Flag == 81 or Flag == 83
                        or Flag == 89 or Flag == 137):
                    if Flag < 100:
                        read.flag = 161
                    read.set_tag("XS", '+', replace=True)
                    outfile.write(read)
            if strand == 'first':
                if (Flag == 145 or Flag == 147 or Flag == 153 or Flag == 97
                        or Flag == 99 or Flag == 73):
                    if Flag > 100:
                        read.flag = 97
                    read.set_tag("XS", '+', replace=True)
                    outfile.write(read)
                if (Flag == 161 or Flag == 163 or Flag == 81 or Flag == 83
                        or Flag == 89 or Flag == 137):
                    if Flag > 100:
                        read.flag = 81
                    read.set_tag("XS", '-', replace=True)
                    outfile.write(read)

        infile.close()
        outfile.close()

        #print "Sorting PE bam file"
        if sam_version == 119:
            pysam.sort(
                "-o", out_name,
                output_path + "/" + base_name.split(".bam")[0] + "_temp.bam")
        else:
            pysam.sort(
                "-O", "BAM", "-T", output_path + "/" +
                base_name.split(".bam")[0] + "_temp.sorted", "-o", out_name,
                output_path + "/" + base_name.split(".bam")[0] + "_temp.bam")

        pysam.index(out_name)
        os.remove(output_path + "/" + base_name.split(".bam")[0] + "_temp.bam")

    if PE == 'False':
        out_name = path_bam

    return out_name
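The literal FLAG comparisons above encode which mate a read is and whether it is reverse-complemented; the same decision can be written with pysam's flag properties, as in this hedged sketch (edge cases such as unpaired reads are not handled here, unlike the explicit flag table):

def xs_for_read(read, strand='second'):
    'Sketch: choose the XS strand tag from mate and orientation flags.'
    if strand == 'second':
        # transcript strand follows read 2's orientation
        plus = (read.is_read2 and not read.is_reverse) or (read.is_read1 and read.is_reverse)
    else:
        # strand == 'first': transcript strand follows read 1's orientation
        plus = (read.is_read1 and not read.is_reverse) or (read.is_read2 and read.is_reverse)
    return '+' if plus else '-'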
Beispiel #48
0
def main(inputs,
         output,
         bam_file,
         strand_specific,
         library,
         protocol,
         median_fragment_size,
         stdev_fragment_size,
         read_length,
         reference_genome,
         annotations,
         masking,
         aligner_reference,
         start_time=int(time.time()),
         **kwargs):
    """
    Args:
        inputs (list): list of input files containing the breakpoint pairs
        output (str): path to the output directory
        bam_file (str): path the bam file
        strand_specific (bool): flag to indicate the input bam is using a strand specific protocol
        median_fragment_size (int): the median fragment size
        stdev_fragment_size (int): the standard deviation in fragment size
        read_length (int): read length
        reference_genome (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_reference_genome`
        annotations (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_reference_genes`
        masking (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_masking_regions`
        aligner_reference (:class:`~mavis.annotate.file_io.ReferenceFile`): path to the aligner reference file (e.g 2bit file for blat)
    """
    mkdirp(output)
    # check the files exist early to avoid waiting for errors
    if protocol == PROTOCOL.TRANS:
        annotations.load()
    reference_genome.load()
    masking.load()

    validation_settings = {}
    validation_settings.update(DEFAULTS.items())
    validation_settings.update(
        {k: v
         for k, v in kwargs.items() if k in DEFAULTS})
    validation_settings = MavisNamespace(**validation_settings)

    raw_evidence_bam = os.path.join(output, 'raw_evidence.bam')
    contig_bam = os.path.join(output, 'contigs.bam')
    evidence_bed = os.path.join(output, 'evidence.bed')

    passed_output_file = os.path.join(output, PASS_FILENAME)
    passed_bed_file = os.path.join(output, 'validation-passed.bed')
    failed_output_file = os.path.join(output, 'validation-failed.tab')
    contig_aligner_fa = os.path.join(output, 'contigs.fa')
    if validation_settings.aligner == SUPPORTED_ALIGNER.BLAT:
        contig_aligner_output = os.path.join(output, 'contigs.blat_out.pslx')
        contig_aligner_log = os.path.join(output, 'contigs.blat.log')
    elif validation_settings.aligner == SUPPORTED_ALIGNER.BWA_MEM:
        contig_aligner_output = os.path.join(output, 'contigs.bwa_mem.sam')
        contig_aligner_log = os.path.join(output, 'contigs.bwa_mem.log')
    else:
        raise NotImplementedError('unsupported aligner',
                                  validation_settings.aligner)
    igv_batch_file = os.path.join(output, 'igv.batch')
    input_bam_cache = BamCache(bam_file, strand_specific)

    bpps = read_inputs(
        inputs,
        add_default={
            COLUMNS.cluster_id: None,
            COLUMNS.stranded: False
        },
        add={
            COLUMNS.protocol: protocol,
            COLUMNS.library: library
        },
        expand_strand=False,
        expand_orient=True,
        cast={COLUMNS.cluster_id: lambda x: str(uuid()) if not x else x})
    evidence_clusters = []
    for bpp in bpps:
        if bpp.data[COLUMNS.protocol] == PROTOCOL.GENOME:
            try:
                evidence = GenomeEvidence(
                    bpp.break1,
                    bpp.break2,
                    input_bam_cache,
                    reference_genome.content,
                    opposing_strands=bpp.opposing_strands,
                    stranded=bpp.stranded,
                    untemplated_seq=bpp.untemplated_seq,
                    data=bpp.data,
                    stdev_fragment_size=stdev_fragment_size,
                    read_length=read_length,
                    median_fragment_size=median_fragment_size,
                    **dict(validation_settings.items()))
                evidence_clusters.append(evidence)
            except ValueError as err:
                warnings.warn(
                    'Dropping breakpoint pair ({}) as bad input {}'.format(
                        str(bpp), str(err)))
        elif bpp.data[COLUMNS.protocol] == PROTOCOL.TRANS:
            try:
                evidence = TranscriptomeEvidence(
                    annotations.content,
                    bpp.break1,
                    bpp.break2,
                    input_bam_cache,
                    reference_genome.content,
                    opposing_strands=bpp.opposing_strands,
                    stranded=bpp.stranded,
                    untemplated_seq=bpp.untemplated_seq,
                    data=bpp.data,
                    stdev_fragment_size=stdev_fragment_size,
                    read_length=read_length,
                    median_fragment_size=median_fragment_size,
                    **dict(validation_settings.items()))
                evidence_clusters.append(evidence)
            except ValueError as err:
                warnings.warn('Dropping ({}) as bad input {}'.format(
                    str(bpp), str(err)))
        else:
            raise ValueError('protocol error', bpp.data[COLUMNS.protocol])

    extended_masks = {}
    for chrom, masks in masking.content.items(
    ):  # extend masking by read length
        extended_masks[chrom] = []
        for mask in masks:
            extended_masks[chrom].append(
                BioInterval(chrom,
                            mask.start - read_length,
                            mask.end + read_length,
                            name=mask.name))

    evidence_clusters, filtered_evidence_clusters = filter_on_overlap(
        evidence_clusters, extended_masks)
    contig_sequences = {}
    for i, evidence in enumerate(evidence_clusters):
        LOG()
        LOG('({} of {})'.format(i + 1, len(evidence_clusters)),
            'gathered evidence for:',
            evidence.cluster_id,
            '' if COLUMNS.tracking_id not in evidence.data else
            '(tracking_id: {})'.format(evidence.tracking_id),
            time_stamp=True)
        LOG(evidence, time_stamp=False)
        LOG('possible event type(s):',
            BreakpointPair.classify(evidence),
            time_stamp=False)
        LOG('outer window regions:  {}:{}-{}  {}:{}-{}'.format(
            evidence.break1.chr, evidence.outer_window1[0],
            evidence.outer_window1[1], evidence.break2.chr,
            evidence.outer_window2[0], evidence.outer_window2[1]),
            time_stamp=False)
        LOG('inner window regions:  {}:{}-{}  {}:{}-{}'.format(
            evidence.break1.chr, evidence.inner_window1[0],
            evidence.inner_window1[1], evidence.break2.chr,
            evidence.inner_window2[0], evidence.inner_window2[1]),
            time_stamp=False)
        evidence.load_evidence(log=LOG)
        LOG('flanking pairs: {};'.format(len(evidence.flanking_pairs)),
            'split reads: {}, {};'.format(
                *[len(a) for a in evidence.split_reads]),
            'half-mapped reads: {}, {};'.format(
                *[len(a) for a in evidence.half_mapped]),
            'spanning-reads: {};'.format(len(evidence.spanning_reads)),
            'compatible flanking pairs:',
            len(evidence.compatible_flanking_pairs),
            time_stamp=False)
        evidence.assemble_contig(log=LOG)
        LOG('assembled {} contigs'.format(len(evidence.contigs)),
            time_stamp=False)
        for contig in evidence.contigs:
            name = 'seq-{}'.format(
                hashlib.md5(contig.seq.encode('utf-8')).hexdigest())
            LOG('>',
                name,
                '(size={}; reads={:.0f}; coverage={:.2f})'.format(
                    len(contig.seq), contig.remap_score(),
                    contig.remap_coverage()),
                time_stamp=False)
            LOG(contig.seq[:140], time_stamp=False)
            contig_sequences[name] = contig.seq

    LOG('will output:', contig_aligner_fa, contig_aligner_output)
    raw_contig_alignments = align_sequences(
        contig_sequences,
        input_bam_cache,
        reference_genome=reference_genome.content,
        aligner_fa_input_file=contig_aligner_fa,
        aligner_output_file=contig_aligner_output,
        clean_files=validation_settings.clean_aligner_files,
        aligner=kwargs.get('aligner', validation_settings.aligner),
        aligner_reference=aligner_reference.name[0],
        aligner_output_log=contig_aligner_log,
        blat_min_identity=kwargs.get('blat_min_identity',
                                     validation_settings.blat_min_identity),
        blat_limit_top_aln=kwargs.get('blat_limit_top_aln',
                                      validation_settings.blat_limit_top_aln),
        log=LOG)
    for evidence in evidence_clusters:
        select_contig_alignments(evidence, raw_contig_alignments)
    LOG('alignment complete', time_stamp=True)
    event_calls = []
    total_pass = 0
    write_bed_file(
        evidence_bed,
        itertools.chain.from_iterable(
            [e.get_bed_repesentation() for e in evidence_clusters]))
    validation_counts = {}
    for index, evidence in enumerate(evidence_clusters):
        LOG()
        LOG('({} of {}) calling events for: {} {} (tracking_id: {})'.format(
            index + 1, len(evidence_clusters), evidence.cluster_id,
            evidence.putative_event_types(), evidence.tracking_id),
            time_stamp=True)
        LOG('source:', evidence)
        calls = []
        failure_comment = None
        try:
            calls = call_events(evidence)
            event_calls.extend(calls)
        except UserWarning as err:
            LOG('warning: error in calling events', repr(err))
            failure_comment = str(err)

        if not calls:
            failure_comment = [
                'zero events were called'
            ] if failure_comment is None else failure_comment
            evidence.data[COLUMNS.filter_comment] = failure_comment
            filtered_evidence_clusters.append(evidence)
        else:
            total_pass += 1

        LOG('called {} event(s)'.format(len(calls)), time_stamp=True)
        for call in calls:
            LOG(call)
            if call.call_method == CALL_METHOD.CONTIG:
                LOG('\t{} {} [{}] contig_alignment_score: {}, contig_alignment_mq: {} contig_alignment_rank: {}'
                    .format(call.event_type, call.call_method,
                            call.contig_alignment.query_name,
                            round(call.contig_alignment.score(), 2),
                            tuple(call.contig_alignment.mapping_quality()),
                            tuple(call.contig_alignment.alignment_rank())))
                LOG('\talignment:', call.contig_alignment.alignment_id())
            elif call.contig_alignment:
                LOG(
                    '\t{} {} alignment:'.format(call.event_type,
                                                call.call_method),
                    call.contig_alignment.alignment_id())
            else:
                LOG('\t{} {}'.format(call.event_type, call.call_method),
                    time_stamp=False)
            validation_counts[call.cluster_id] = validation_counts.get(
                call.cluster_id, 0) + 1
            call.data[COLUMNS.validation_id] = '{}-v{}'.format(
                call.cluster_id, validation_counts[call.cluster_id])
            LOG('\tremapped reads: {}; spanning reads: {}; split reads: [{} ({}), {} ({}), {}]'
                ', flanking pairs: {}{}'.format(
                    0 if not call.contig else len(call.contig.input_reads),
                    len(call.spanning_reads),
                    len(call.break1_split_read_names()),
                    len(call.break1_split_read_names(tgt=True)),
                    len(call.break2_split_read_names()),
                    len(call.break2_split_read_names(tgt=True)),
                    len(call.linking_split_read_names()),
                    len(call.flanking_pairs),
                    '' if not call.has_compatible else '(' +
                    str(len(call.compatible_flanking_pairs)) + ')'))

    # write the output validated clusters (split by type and contig)
    for i, call in enumerate(event_calls):
        b1_homseq = None
        b2_homseq = None
        try:
            b1_homseq, b2_homseq = call.breakpoint_sequence_homology(
                reference_genome.content)
        except AttributeError:
            pass
        call.data.update({
            COLUMNS.break1_homologous_seq: b1_homseq,
            COLUMNS.break2_homologous_seq: b2_homseq,
        })
    LOG('{} putative calls resulted in {} events with 1 or more event call'.
        format(len(evidence_clusters), total_pass),
        time_stamp=True)
    output_tabbed_file(event_calls, passed_output_file)
    output_tabbed_file(filtered_evidence_clusters, failed_output_file)
    write_bed_file(
        passed_bed_file,
        itertools.chain.from_iterable(
            [e.get_bed_repesentation() for e in event_calls]))

    if validation_settings.write_evidence_files:
        with pysam.AlignmentFile(contig_bam, 'wb',
                                 template=input_bam_cache.fh) as fh:
            LOG('writing:', contig_bam, time_stamp=True)
            for evidence in evidence_clusters:
                for contig in evidence.contigs:
                    for aln in contig.alignments:
                        aln.read1.cigar = _cigar.convert_for_igv(
                            aln.read1.cigar)
                        fh.write(aln.read1)
                        if aln.read2:
                            aln.read2.cigar = _cigar.convert_for_igv(
                                aln.read2.cigar)
                            fh.write(aln.read2)

        # write the evidence
        with pysam.AlignmentFile(raw_evidence_bam,
                                 'wb',
                                 template=input_bam_cache.fh) as fh:
            LOG('writing:', raw_evidence_bam, time_stamp=True)
            reads = set()
            for evidence in evidence_clusters:
                reads.update(evidence.supporting_reads())
            for read in reads:
                read.cigar = _cigar.convert_for_igv(read.cigar)
                fh.write(read)
        # now sort the contig bam
        sort = re.sub(r'.bam$', '.sorted.bam', contig_bam)
        LOG('sorting the bam file:', contig_bam, time_stamp=True)
        pysam.sort('-o', sort, contig_bam)
        contig_bam = sort
        LOG('indexing the sorted bam:', contig_bam)
        pysam.index(contig_bam)

        # then sort the evidence bam file
        sort = re.sub(r'.bam$', '.sorted.bam', raw_evidence_bam)
        LOG('sorting the bam file:', raw_evidence_bam, time_stamp=True)
        pysam.sort('-o', sort, raw_evidence_bam)
        raw_evidence_bam = sort
        LOG('indexing the sorted bam:', raw_evidence_bam)
        pysam.index(raw_evidence_bam)

        # write the igv batch file
        with open(igv_batch_file, 'w') as fh:
            LOG('writing:', igv_batch_file, time_stamp=True)

            fh.write('load {} name="{}"\n'.format(passed_bed_file,
                                                  'passed events'))
            fh.write('load {} name="{}"\n'.format(contig_bam,
                                                  'aligned contigs'))
            fh.write('load {} name="{}"\n'.format(evidence_bed,
                                                  'evidence windows'))
            fh.write('load {} name="{}"\n'.format(raw_evidence_bam,
                                                  'raw evidence'))
            fh.write('load {} name="{} {} input"\n'.format(
                bam_file, library, protocol))
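
The IGV batch file written above ends up with one "load" line per track. For illustration only, with placeholder paths (the real paths depend on the output directory, library, and protocol):

load validation-passed.bed name="passed events"
load contigs.sorted.bam name="aligned contigs"
load evidence.bed name="evidence windows"
load raw_evidence.sorted.bam name="raw evidence"
load /path/to/input.bam name="<library> <protocol> input"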
Example #49
0
				clustered = False
				for cluster  in clusters:
					if chr1 == cluster.chr1 and chr2 == cluster.chr2 and abs(pos1 - cluster.pos1) < rlen and abs(pos2 - cluster.pos2) < isize:
						cluster.pos1 = (pos1 + cluster.pos1)/2
						cluster.pos2 = (pos2 + cluster.pos2)/2
						cluster.number += 1
						clustered = True
						break
				if not clustered:
					cluster = Cluster(chr1, pos1, chr2, pos2, 1)
					clusters.append(cluster)
				
			
	discordant_bam.close()	
	pysam.sort("discordant.bam", out_dir+"/discordant")
	pysam.index("discordant.bam")

	supp_cluster_num = 0
	for cluster in clusters:
		if cluster.number > 2:
			print >> sys.stderr, "Cluster:", cluster.tostring()
			supp_cluster_num += 1
	#--- output dna supp fusion only ---#
	#if supp_cluster_num:
	#	print "Fusion:", line.strip(), supp_cluster_num

	#--- output all fusions ---#
	fusion.dnasupp = supp_cluster_num
	print fusion.tostring()
	#print "Fusion:", line.strip(), supp_cluster_num
	#print >> sys.stderr, script_dir + "/cluster2.sh " + out_dir + "/discordant.bam header " + out_dir
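
The Cluster class used in the clustering loop above is not part of this excerpt. A minimal sketch that is consistent with how it is constructed and accessed here (attribute names taken from the code above, everything else assumed):

class Cluster(object):
    # Hypothetical reconstruction; only the attributes referenced above are modelled.
    def __init__(self, chr1, pos1, chr2, pos2, number):
        self.chr1 = chr1      # chromosome of the first mate
        self.pos1 = pos1      # running average position of the first mate
        self.chr2 = chr2      # chromosome of the second mate
        self.pos2 = pos2      # running average position of the second mate
        self.number = number  # number of discordant pairs supporting the cluster

    def tostring(self):
        return "%s:%s %s:%s n=%s" % (self.chr1, self.pos1, self.chr2, self.pos2, self.number)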
def ngscat(bamfilenames, originalbedfilename, outdir, reference=None, saturation=False, nthreads=2, extend=None, depthlist='auto', coveragethresholds=[1,5,10,20,30],
		   onefeature=None, tmpdir=None):
	
	global TMP
	
	if(tmpdir != None):
		if(os.path.isdir(tmpdir) or os.path.islink(tmpdir)):
			TMP = tmpdir
		else:
			print 'ERROR: temporary directory '+tmpdir+' does not exist.'
			print '	Exiting'
			sys.exit(1)
		
	if(not (os.path.isdir(outdir)  or os.path.islink(outdir))):
		print 'WARNING: '+outdir+' does not exist. Creating directory.'
		os.mkdir(outdir)

	if(not (os.path.isdir(outdir+'/data') or os.path.islink(outdir+'/data'))):		
		print 'Creating '+outdir+'/data'
		os.mkdir(outdir+'/data')

	if(not (os.path.isdir(outdir+'/img') or os.path.islink(outdir+'/img'))):
		print 'Creating '+outdir+'/img'
		os.mkdir(outdir+'/img')
		
	sortedbams = []
	for bamfilename in bamfilenames:
		filelink = TMP+'/'+os.path.basename(bamfilename)
		try:
			os.symlink(bamfilename, filelink)
		except OSError:
			print 'WARNING: when trying to create a symbolic link at the temporary directory pointing to '+bamfilename+', a file named '+filelink+' was already found.'
			print '	Probably the temporary and origin directories are the same. The only problem this could cause is that the new index overwrites an existing one.'
			print '	Continue (y/n)?'
			
			goahead = raw_input()
			if(goahead=='n' or goahead=='N'):
				print 'Exiting...'
				sys.exit(1)
			elif(goahead != 'y' and goahead != 'Y'):
				print 'Unknown choice '+goahead
				print 'Exiting...'
				sys.exit(1)
				
			if(os.path.dirname(bamfilename) != os.path.dirname(TMP+'/')):
				os.remove(filelink)
				os.symlink(bamfilename, filelink)
		
		print 'Indexing...'
		pysam.index(filelink)
		print '	Done.'
			
		if(not bam_file.bam_file(filelink).issorted()):
			print 'WARNING: '+bamfilename+' is not sorted'
			print 'Sorting...'
			pid = str(time.time())
			newsortedbam = TMP+'/'+pid+'.sorted'
			sortedbams.append(newsortedbam+'.bam')
			pysam.sort(filelink, newsortedbam)
			print 'Indexing...'
			pysam.index(sortedbams[-1])		
			
			print '	Done.'			
		else:
			sortedbams.append(filelink)						
	

	if(saturation and depthlist=='auto'):
		maxdepth = max([bam_file.bam_file(bamfilename).nreads() for bamfilename in sortedbams])
		depthlist = numpy.arange(maxdepth/5.0, maxdepth+(maxdepth/5.0)-1, maxdepth/5.0)
		depthlist = depthlist/1000000.0
		
		
	legend = [os.path.basename(bamfilename) for bamfilename in bamfilenames]
	executiongranted = multiprocessing.Semaphore(nthreads)

	if(extend != None): 
		bedfilename = TMP+'/'+originalbedfilename.replace('.bed','.'+pid+'.extended.bed')
		bed_file.bed_file(originalbedfilename).extendnoref(extend,bedfilename)
	else:
		bedfilename = originalbedfilename
			
	if(onefeature==None or onefeature != 'saturation' or onefeature != 'specificity'):			
		Pcoveragebeds,coveragefiles = launch_coveragebed(sortedbams, bedfilename, legend, outdir, executiongranted)
			
	if((saturation and onefeature==None) or onefeature=='saturation'):					
		Psaturation,coverage_saturation_status,saturationslopes = launch_coverage_saturation(sortedbams, bedfilename, depthlist, legend, outdir+'/data/', executiongranted)
	else:
		coverage_saturation_status = None
		saturationslopes = None
		

	if(onefeature==None or onefeature=='specificity'):
		Ponoff_reads,onoff_status,onduplicates,offduplicates,duplicates_status,enrichment,percontarget = launch_onoff_reads(sortedbams, bedfilename, legend, outdir+'/data/', executiongranted)	

	for i in range(len(Pcoveragebeds)):
		Pcoveragebeds[i].join()
		Pcoveragebeds[i].terminate()

	if(onefeature==None or onefeature=='specificity'):
		Poffclusters = launch_offclusters(glob.glob(outdir+'/data/*.bed'), bedfilename, executiongranted)	

	if(onefeature==None or onefeature=='coveragefreq'):
		Pcoveragedistribution,coveragedistribution_status,meancoverage = launch_coverage_distribution(coveragefiles, outdir+'/data/', legend, executiongranted)	

	if(onefeature==None or onefeature=='percbases'):
		Pcoveredpositions,coveredpositions_status,coveredbases = launch_covered_positions(coveragefiles, coveragethresholds, outdir+'/data/', legend, executiongranted)

	if(onefeature==None or onefeature=='coveragedistr'):
		Pcoveragethroughtarget,throughtarget_status,lowcovbases = launch_coverage_through_target(coveragefiles, outdir+'/data/', legend, executiongranted)

	if(len(coveragefiles)>1 and (onefeature==None or onefeature=='coveragecorr')):
		Pcoveragecorr,coveragecorr_status,corr = launch_coveragecorr(coveragefiles, outdir+'/data/coveragecorr.png', legend, executiongranted)
	else:
		coveragecorr_status = None
		corr = None

	if(onefeature==None or onefeature=='coveragestd'):	
		Pcoveragestd,coveragestd_status,coveragestd = launch_coverage_std(coveragefiles, outdir+'/data/', legend, executiongranted)

	if((reference != None and onefeature==None) or onefeature=='gcbias'):
		Pgcbias = []
		for i,coveragefile in enumerate(coveragefiles):
			onePgcbias,gcbias_status = launch_gcbias(coveragefile, bedfilename, reference, outdir+'/data/gcbias'+str(i)+'.png', legend[i], executiongranted)
			Pgcbias.append(onePgcbias)
		for onePgcbias in Pgcbias:
			onePgcbias.join()
			onePgcbias.terminate()
	else:
		gcbias_status = None
		
	# LAUNCH BASIC STATS

	if((saturation and onefeature==None) or onefeature=='saturation'):
		Psaturation.join()
		Psaturation.terminate()		

	if(onefeature==None or onefeature=='coveragefreq'):
		Pcoveragedistribution.join()
		Pcoveragedistribution.terminate()
		
	if(onefeature==None or onefeature=='percbases'):
		Pcoveredpositions.join()
		Pcoveredpositions.terminate()
	
	if(onefeature==None or onefeature=='coveragedistr'):		
		Pcoveragethroughtarget.join()
		Pcoveragethroughtarget.terminate()

	if(len(coveragefiles)>1 and (onefeature==None or onefeature=='coveragecorr')):
		Pcoveragecorr.join()
		Pcoveragecorr.terminate()
		
	if(onefeature==None or onefeature=='coveragestd'):
		Pcoveragestd.join()
		Pcoveragestd.terminate()
		
	if(onefeature==None or onefeature=='specificity'):
		Ponoff_reads.join()
		Ponoff_reads.terminate()
	
		Poffclusters.join()
		Poffclusters.terminate()
		



		
#	if(onefeature==None or onefeature<>'saturation'):
#		for coveragefile in coveragefiles:
#			os.remove(coveragefile)
	
	if(onefeature==None):
		generate_report(bamfilenames,sortedbams,originalbedfilename,outdir,coveredpositions_status,coveredbases,coverage_saturation_status,saturationslopes,
						onoff_status,
						duplicates_status,onduplicates,offduplicates,coveragedistribution_status,meancoverage,
						coveragecorr_status,corr,throughtarget_status,lowcovbases,coveragestd_status,coveragestd,gcbias_status,enrichment,percontarget,
						reference,nthreads,depthlist,
						coveragethresholds)
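
Each analysis above is launched as a separate process but must acquire the shared executiongranted semaphore before doing real work, so at most nthreads features run at once. A minimal, self-contained sketch of that pattern (worker name and payload are placeholders, not ngscat code):

import multiprocessing

def worker(executiongranted, label):
    executiongranted.acquire()      # block until one of the nthreads slots is free
    try:
        print('running ' + label)   # stands in for the actual analysis
    finally:
        executiongranted.release()

if __name__ == '__main__':
    executiongranted = multiprocessing.Semaphore(2)  # e.g. nthreads=2
    procs = [multiprocessing.Process(target=worker, args=(executiongranted, 'feature%d' % i))
             for i in range(5)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()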
Example #51
0
def parseJunctionEntries(bam_dir,
                         multi=False,
                         Species=None,
                         ReferenceDir=None):
    global bam_file
    global splicesite_db
    global IndicatedSpecies
    global ExonReference
    IndicatedSpecies = Species
    ExonReference = ReferenceDir
    bam_file = bam_dir
    splicesite_db = {}
    chromosomes_found = {}

    start = time.time()
    try:
        import collections
        junction_db = collections.OrderedDict()
    except Exception:
        try:
            import ordereddict
            junction_db = ordereddict.OrderedDict()
        except Exception:
            junction_db = {}
    original_junction_db = copy.deepcopy(junction_db)

    bamf = pysam.Samfile(bam_dir, "rb")
    ### Is there are indexed .bai for the BAM? Check.
    try:
        for entry in bamf.fetch():
            codes = map(lambda x: x[0], entry.cigar)
            break
    except Exception:
        ### Make BAM Index
        if multi == False:
            print 'Building BAM index file for', bam_dir
        bam_dir = str(bam_dir)
        #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
        pysam.index(bam_dir)
        bamf = pysam.Samfile(bam_dir, "rb")

    chromosome = False
    barcode_pairs = {}
    bam_reads = 0
    count = 0
    jid = 1
    prior_jc_start = 0
    import Bio
    from Bio.Seq import Seq
    l1 = None
    l2 = None
    o = open(string.replace(bam_dir, '.bam', '.export2.txt'), "w")
    spacer = 'TGGT'
    for entry in bamf.fetch():
        #if entry.query_name == 'M03558:141:GW181002:1:2103:13361:6440':
        if spacer in entry.seq:
            if entry.seq.index(spacer) == 14:
                viral_barcode = entry.seq[:48]
                try:
                    mate = bamf.mate(entry)
                    mate_seq = Seq(mate.seq)
                    cell_barcode = str(mate_seq.reverse_complement())[:16]
                    if (viral_barcode, cell_barcode) not in barcode_pairs:
                        o.write(viral_barcode + '\t' + cell_barcode + '\n')
                    barcode_pairs[viral_barcode, cell_barcode] = []
                    if 'ATAGCGGGAACATGTGGTCATGGTACTGACGTTGACACGTACGTCATA' == viral_barcode:
                        print entry.query_name, cell_barcode, mate_seq
                except:
                    pass
                #print viral_barcode, mate.seq;sys.exit()

        count += 1
        #if count==100: sys.exit()
    bamf.close()
    o.close()
def main(mappings_sorted_bam, canonical_chr, mappings_sorted_bai=None):
    #
    # SECTION: Download inputs
    # --------------------------------------------------------------------------
    # mappings_sorted_bam and mappings_sorted_bai are passed to the main function
    # as parameters for our job. mappings_sorted_bam and mappings_sorted_bai are
    # dictionary objects with key=dnanexus_link and value=<file-id>.
    #
    # We handle file objects from the platform by first creating a DXFile handler.
    # Then performing dxpy.download_dxfile.
    #
    # If index file is not supplied *.bai index will be created with pysam.index
    #
    # DXFIle.name attribute is converted to ASCII since Pysam does not handle Unicode strings.
    #
    print(mappings_sorted_bai)
    print(mappings_sorted_bam)

    mappings_sorted_bam = dxpy.DXFile(mappings_sorted_bam)
    sorted_bam_name = mappings_sorted_bam.name
    dxpy.download_dxfile(mappings_sorted_bam.get_id(),
                         sorted_bam_name)
    ascii_bam_name = unicodedata.normalize(  # Pysam requires ASCII not Unicode string.
        'NFKD', sorted_bam_name).encode('ascii', 'ignore').decode('ascii')

    if mappings_sorted_bai is not None:
        mappings_sorted_bai = dxpy.DXFile(mappings_sorted_bai)
        dxpy.download_dxfile(mappings_sorted_bai.get_id(),
                             mappings_sorted_bai.name)
    else:
        pysam.index(ascii_bam_name)

    #
    # SECTION: Get chromosomes regions
    # --------------------------------------------------------------
    # Generate Pysam Alignmentfile object.
    #
    # Obtain regions to count.

    mappings_obj = pysam.AlignmentFile(ascii_bam_name, "rb")
    regions = get_chr(mappings_obj, canonical_chr)

    #
    # SECTION: Perform basic pysam count.
    # --------------------------------------------------------------
    # Iterate over regions and sum results of pysam.count().

    total_count = 0
    count_filename = "{bam_prefix}_counts.txt".format(
        bam_prefix=ascii_bam_name[:-4])

    with open(count_filename, "w") as f:
        for region in regions:
            temp_count = mappings_obj.count(region=region)
            f.write("{region_name}: {counts}\n".format(
                region_name=region, counts=temp_count))
            total_count += temp_count

        f.write("Total reads: {sum_counts}".format(sum_counts=total_count))

    #
    # SECTION:Output
    # ----------------------------------------------------------------------------
    # Upload generated count file as counts_txt output specified in the dxapp.json

    counts_txt = dxpy.upload_local_file(count_filename)
    output = {}
    output["counts_txt"] = dxpy.dxlink(counts_txt)

    return output
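
get_chr is not included in this applet excerpt. A plausible sketch, assuming canonical_chr is a list of chromosome names to keep, is to intersect it with the reference names in the BAM header and return them as region strings for pysam's count():

def get_chr(mappings_obj, canonical_chr):
    # Hypothetical helper: keep only header references that are canonical chromosomes.
    wanted = set(canonical_chr)
    return [name for name in mappings_obj.references if name in wanted]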
Example #53
0
        with pysam.AlignmentFile(options.input,
                                 in_mode) as INPUT, pysam.AlignmentFile(
                                     options.output, out_mode,
                                     header=header) as OUTPUT:
            header_dict = {}
            n_header = 0
            for header in OUTPUT.header["SQ"]:
                header_dict[header['SN']] = n_header
                n_header += 1
            # print len(header_dict)
            for segment in INPUT:
                segment_output = map_to_genome(segment)
                if segment_output:
                    OUTPUT.write(segment_output)
    sys.stderr.write(
        "[%s]Finished.\n  Total: %d\n  Lifted: %d\n  Unlifted: %d\n\n" %
        (strftime("%Y-%m-%d %H:%M:%S",
                  time.localtime()), total, lifted, unlifted))

    if options.sort == True:
        sys.stderr.write("[%s]Sorting bam...\n" %
                         strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        pysam.sort("-o", options.output.replace(".bam", ".sorted.bam"),
                   options.output)
        if options.index == True:
            sys.stderr.write("[%s]Indexing bam...\n" %
                             strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            pysam.index(options.output.replace(".bam", ".sorted.bam"))
        if options.no_del_bam == False:
            os.remove(options.output)
Example #54
0
    bam_writers[len_to_chunk[span]].write(rec)
    add_cigar_span(ref, span)
logger.write("Finished iterating through bam file.\n")

# Close the bam files
logger.write("\nClosing bam reader...\n")
bam_reader.close()
logger.write("\nClosing the bam writers...\n")
for bam_writer in bam_writers.values():
    bam_writer.close()

# Index the chunk bam files
logger.write("\nIndexing the chunk bam files...\n")
for chunk in len_chunks:
    logger.write("Chunk %s\n" % list(chunk))
    pysam.index(chunk_to_bam(chunk))

# Write bedgraph coverage file for original bam file
bedtool = pybedtools.BedTool(bam_file)
cov = bedtool.genome_coverage(bg=True, split=True)
cov.saveas("%s.bedgraph" % out_bedgraph_prefix)

# Write bedgraph coverage files for each chunk bam file for easier display in IGV
logger.write("\nWriting bedgraph coverage files...\n")


def chunk_to_bedgraph(chunk):
    return "%s_%s_%s.bedgraph" % (out_bedgraph_prefix, chunk[0], chunk[1])


for chunk in len_chunks:
def rewrite_bam(ref_mapping, args):
    """split bam file using pysam"""
    bam_watson_handle = pysam.AlignmentFile(args.watson)
    bam_crick_handle = pysam.AlignmentFile(args.crick)
    out_handles = {}
    #create new template
    header = bam_watson_handle.header
    header['SQ'] = [{
        'LN': ref_mapping['gene']['length'],
        'SN': 'gene'
    }, {
        'LN': ref_mapping['non_gene']['length'],
        'SN': 'non_gene'
    }]
    contig_index = {'gene': 0, 'non_gene': 1}
    for item in bam_watson_handle.header['RG']:
        watson_path = os.path.join(args.output_dir,
                                   '%s.watson.tmp' % item['SM'])
        crick_path = os.path.join(args.output_dir, '%s.crick.tmp' % item['SM'])
        watson_handle = pysam.AlignmentFile(watson_path, "wb", header=header)
        crick_handle = pysam.AlignmentFile(crick_path, "wb", header=header)
        out_handles[item['SM']] = {
            'watson': watson_handle,
            'crick': crick_handle
        }
    i = 0
    print 'start splitting Watson reads'
    for read in bam_watson_handle:
        i += 1
        if not i % 1000000:
            print 'processed %s reads' % i
        sample = '_'.join(dict(read.tags)['RG'].split('_')[2:])
        handle = out_handles[sample]['watson']
        #change read parameters depending on contig
        try:
            contig_name, contig_pos, contig_len = ref_mapping[
                read.reference_name]
        except KeyError:
            print '%s not found, continue nevertheless' % read.reference_name
            continue
        read.rname = contig_index[contig_name]
        read.mrnm = contig_index[contig_name]
        read.pos += contig_pos
        if read.is_paired:
            if read.is_proper_pair:
                read.pnext += contig_pos
        handle.write(read)
    for subdict in out_handles.values():
        subdict['watson'].close()
    i = 0
    print 'start splitting Crick reads'
    for read in bam_crick_handle:
        i += 1
        if not i % 1000000:
            print 'processed %s reads' % i
        sample = '_'.join(dict(read.tags)['RG'].split('_')[2:])
        handle = out_handles[sample]['crick']
        # change read parameters depending on contig
        try:
            contig_name, contig_pos, contig_len = ref_mapping[
                read.reference_name]
        except KeyError:
            print '%s not found, continue nevertheless' % read.reference_name
            continue
        read.rname = contig_index[contig_name]
        read.mrnm = contig_index[contig_name]
        read.pos += contig_pos
        if read.is_paired:
            if read.is_proper_pair:
                read.pnext += contig_pos
        handle.write(read)
    for subdict in out_handles.values():
        subdict['crick'].close()
    if not os.path.exists(os.path.join(args.output_dir, 'bam')):
        os.mkdir(os.path.join(args.output_dir, 'bam'))
    for item in bam_watson_handle.header['RG']:
        watson_tmp = os.path.join(args.output_dir,
                                  '%s.watson.tmp' % item['SM'])
        watson_tmp2 = os.path.join(args.output_dir,
                                   '%s.watson.tmp2' % item['SM'])
        watson_path = os.path.join(args.output_dir, 'bam',
                                   '%s.watson.bam' % item['SM'])
        crick_tmp = os.path.join(args.output_dir, '%s.crick.tmp' % item['SM'])
        crick_tmp2 = os.path.join(args.output_dir,
                                  '%s.crick.tmp2' % item['SM'])
        crick_path = os.path.join(args.output_dir, 'bam',
                                  '%s.crick.bam' % item['SM'])
        pysam.sort(watson_tmp, '-o', watson_tmp2)
        pysam.sort(crick_tmp, '-o', crick_tmp2)
        os.system('samtools calmd -b  %s %s > %s 2>/dev/null' %
                  (watson_tmp2, os.path.join(args.output_dir,
                                             'ref.fa'), watson_path))
        os.system(
            'samtools calmd -b  %s %s > %s 2>/dev/null' %
            (crick_tmp2, os.path.join(args.output_dir, 'ref.fa'), crick_path))
        pysam.index(watson_path)
        pysam.index(crick_path)
    os.system('rm %s/*.tmp*' % (args.output_dir))
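
The calmd step above is run through os.system, which ignores non-zero exit codes. A sketch of the same call through subprocess (same file layout assumed) that raises on failure instead:

import os
import subprocess

def run_calmd(in_bam, ref_fa, out_bam):
    # samtools calmd -b <in_bam> <ref_fa> > <out_bam>, stderr discarded as in the original
    with open(out_bam, 'wb') as out, open(os.devnull, 'wb') as devnull:
        subprocess.check_call(['samtools', 'calmd', '-b', in_bam, ref_fa],
                              stdout=out, stderr=devnull)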
    if ext == '.bam':
        os.remove(sfile)

    # close all open files and get sam/bam names
    for key, ofile in ofiles.items():
        fname = ofile.name
        ofile.close()
        ofiles[key] = [fname]
        ofiles[key].append(os.path.splitext(ofiles[key][0])[0] + '.bam')
        ofiles[key].append(os.path.splitext(ofiles[key][0])[0] + '_sorted.bam')

    # convert each sam file to bam
    for key, ofile in ofiles.items():
        pysam.view('-Sb', ofile[0], '-o', ofile[1], catch_stdout=False)
        pysam.sort(ofile[1], '-o', ofile[2], catch_stdout=False)
        pysam.index(ofile[2], catch_stdout=False)

        # write track to custom tracks file
        temp = write_track(key, os.path.basename(ofile[2]), colors_dict, sfile,
                           url)
        tfile.write(temp + '\n')

        # remove sam file and unsorted bam file
        os.remove(ofile[0])
        os.remove(ofile[1])

    # check if public directory exists
    if os.path.isdir(pubdir):
        # copy sorted bam and bai files to public directory
        for _, ofile in ofiles.items():
            # sorted bam
Example #57
0
def indexed_bam(bam_file, config):
    if not os.path.exists(bam_file + ".bai"):
        pysam.index(bam_file)
    sam_reader = pysam.Samfile(bam_file, "rb")
    yield sam_reader
    sam_reader.close()
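
The yield followed by close() suggests indexed_bam is meant to be used as a context manager; the decorator is not visible in this excerpt. A sketch of the same function with contextlib.contextmanager applied, plus usage (the decorator and the usage are assumptions):

import contextlib
import os
import pysam

@contextlib.contextmanager
def indexed_bam(bam_file, config):
    # index on demand, hand the open reader to the caller, close it on exit
    if not os.path.exists(bam_file + ".bai"):
        pysam.index(bam_file)
    sam_reader = pysam.Samfile(bam_file, "rb")
    yield sam_reader
    sam_reader.close()

# usage:
# with indexed_bam("sample.bam", config={}) as sam_reader:
#     for read in sam_reader.fetch():
#         pass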
Example #58
0
    def get_metagene_tag_count(self, bam, bam_path, transDF, file):
        '''
        Extract tags from bam files
        :return:
        '''
        index_bam = compare_bam_bai_creationtime(bam_path)
        if index_bam:
            try:
                print('Reindexing bam as bai is older', bam_path)
                pysam.index(bam_path)
            except:
                raise RuntimeError("Error in Bam indexing", bam_path)
        sample_bam = pysam.Samfile(bam_path, "rb")
        total_mapped = sample_bam.mapped
        file.write(bam+'\t'+str(total_mapped)+'\n')
        distribution_df = pd.DataFrame()
        distribution_df_norm = pd.DataFrame()
        #print(transDF.head())
        for ind, row in transDF.iterrows():  # reading peaksdf
            strand = row['strand']
            list_sample = []
            list_sample_norm = []
            Chr = str(row['chr'])
            start = row['start']
            stop = row['stop']
            interval = math.ceil((stop-start)/100.0)

            # 500bp upstream in 10 bins
            hstart = start - (interval*10)
            hstop = hstart + interval
            if start > 0:
                for i in range(0, 10):  # Please set based on distance on one side = s*distance/50
                    seqcount = sample_bam.count(Chr, hstart, hstop)
                    list_sample.append(seqcount)    # count real
                    list_sample_norm.append((seqcount*(5.*10**6)/total_mapped))    # Normalized count per million
                    hstart = hstop
                    hstop = hstart + interval  # divide peaks into length of 50 bp

            # gene body tag retrieval
            start = start
            stop = start + interval
            if start > 0:
                for i in range(0, 100):  # Please set based on distance on one side = s*distance/50
                    seqcount = sample_bam.count(Chr, start, stop)
                    list_sample.append(seqcount)    # count real
                    list_sample_norm.append((seqcount*(5.*10**6)/total_mapped))    # Normalized count per million
                    start = stop
                    stop = start + interval  # divide peaks into length of 50 bp

            # 500bp downstream in 10 bins
            tstart = stop
            tstop = tstart + interval
            if start > 0:
                for i in range(0, 10):  # Please set based on distance on one side = s*distance/50
                    seqcount = sample_bam.count(Chr, tstart, tstop)
                    list_sample.append(seqcount)    # count real
                    list_sample_norm.append((seqcount*(5.*10**6)/total_mapped))    # Normalized count per million
                    tstart = tstop
                    tstop = tstart + interval  # divide peaks into length of 50 bp

            # additional normalization based on permutation test
            if bam in self.external_sample_norm_factor.keys():
                list_sample_norm = [x*self.external_sample_norm_factor.get(bam) for x in list_sample_norm]

            if (strand == 1) or (strand == '+'):
                distribution_df = distribution_df.append(pd.Series(list_sample), ignore_index=True)
                distribution_df_norm = distribution_df_norm.append(pd.Series(list_sample_norm), ignore_index=True)

            elif (strand == -1) or (strand == '-'):
                distribution_df = distribution_df.append(pd.Series(list_sample[::-1]), ignore_index=True)
                distribution_df_norm = distribution_df_norm.append(pd.Series(list_sample_norm[::-1]), ignore_index=True)

            else:
                print('Problem with gene strand information:', row['chr'], '-', row['start'])
        sample_bam.close()  # closing bam file
        return distribution_df, distribution_df_norm
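
compare_bam_bai_creationtime, called at the top of this method, is defined elsewhere. A minimal sketch consistent with how it is used here, assuming it returns True when the .bai index is missing or older than the BAM (i.e. reindexing is needed):

import os

def compare_bam_bai_creationtime(bam_path):
    # Hypothetical helper: True if the BAM should be (re)indexed.
    bai_path = bam_path + '.bai'
    if not os.path.exists(bai_path):
        return True
    return os.path.getmtime(bai_path) < os.path.getmtime(bam_path)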
Example #59
0
def main():
	# --------------------------------------------------------
	# PART 0: 	Parse input
	# --------------------------------------------------------
	parser = argparse.ArgumentParser(description='Extract and package reads within region')
	parser.add_argument('-v', '--verbose', action="store_true", default=False, required=False, dest="verbose", help="Use for verbose output with info on progress.")
	parser.add_argument('-b', '--bam', action="store", required=True, dest="bam", help="Sorted bam file created by aligning reads to the draft genome (refer to reads.sorted.bam in Nanopolish README).")
	parser.add_argument('-r', '--reads', action="store", dest="fa_filename", help="Fasta, fastq, fasta.gz, or fastq.gz file (refer to reads.fa in Nanopolish README)")
	parser.add_argument('-g', '--genome',  action="store", required=True, dest="draft_ga", help="Draft genome assembly (refer to draft.fa in Nanopolish README).")
	parser.add_argument('-w', '--window', action="store", required=True, dest="draft_ga_coords", help="Draft genome assembly coordinates wrapped in quotes ex. \"tig000001:10000-20000\".")
	parser.add_argument('-o', '--output_prefix', action="store", required=False, default="reads_subset", dest="output_prefix", help="Output prefix for tar.gz file and log file.")
	args = parser.parse_args()
	
	# Check to see if user used verbose option
	global verbose
	if args.verbose:
		verbose = True

	# Infer readdb file from fasta/q file
	readdb = args.fa_filename + ".index.readdb"

	custom_print( "===================================================" )
	custom_print( "Extract reads that align to given region" )
	custom_print( "Package all necessary files to reproduce error" )
	custom_print( "===================================================" )

	# --------------------------------------------------------
	# PART 1: 	Validate input
	# --------------------------------------------------------
	custom_print( "[ Input ]" )
	custom_print( "[+] Extracting from draft genome assembly coords: " + args.draft_ga_coords )
	custom_print( "[+] BAM file (reads.fa aligned to draft.fa): " + args.bam )
	custom_print( "[+] Readdb file: " + readdb )
	custom_print( "[+] Draft genome assembly (draft.fa): " + args.draft_ga )
	custom_print( "[+] FASTA/Q file (reads.fa): " + args.fa_filename )
	custom_print( "[+] Output prefix: " + args.output_prefix ) 

	custom_print( "[ Input check ]" )
	files = list()
	files.append(args.bam)
	files.append(readdb)
	files.append(args.fa_filename)
	files.append(args.draft_ga)
	draft_ga_fai = args.draft_ga + ".fai"
	files.append(draft_ga_fai)

	for i in files:
		if not os.path.exists(i) or not os.path.getsize(i) > 0 or not os.access(i, os.R_OK):
			print( "Expecting " + i + ". But does not exist, is empty or is not readable." )
			sys.exit(1)

	custom_print( "[ Validated input ] All input files exist, are not-empty, and are readable." )

	# --------------------------------------------------------
	# PART 2: 	Reassign input argument values	
	# --------------------------------------------------------
	# o = old/original, ga = genome assembly, fa = fasta/q file
	# coords = coordinates, op = output
	o_bam = args.bam
	o_readdb = readdb
	o_fa = args.fa_filename
	op = args.output_prefix
	draft_ga_coords = args.draft_ga_coords

	# --------------------------------------------------------
	# PART 3: 	With user input ref coords, extract all 
	#		aligned reads within these coordinates, 
	#		store read_ids, and fast5 files.
	# --------------------------------------------------------
	custom_print( "[ Extracting info on reads aligned to region ] \t" + draft_ga_coords )
	samfile = pysam.AlignmentFile(o_bam, "rb")
	region_read_ids = list()
	region_num_reads = 0

	# get all read ids of reads that are aligned to region in draft assembly
	for read in samfile.fetch(region=draft_ga_coords):
		id = read.query_name
		# add to list if not already in list
		if not id in region_read_ids:
			# store read id in list
			region_read_ids.append(id)
			# count number of reads that were aligned to the given region
			region_num_reads+=1

	# --------------------------------------------------------
	# PART 4:   Parse readdb file and find path to fast5 files
	# 		associated with each read that aligned to region
	# --------------------------------------------------------
	# readdb file has 2 columns: one indicating read_id and another indicating the fast5 file the read came from
	# each row represents a read
	custom_print( "[ Reading readdb file ]" )
	region_fast5_files = dict()
	with open (o_readdb, "r") as file:
		for line in file:
			l = line.split("\t")
			read_id = l.pop(0)
			if read_id in region_read_ids:
				fast5_file = l.pop(0)
				region_fast5_files[str(read_id)] = fast5_file.rstrip()

	# --------------------------------------------------------
	# PART 5:   Make a region BAM and BAI file
	# --------------------------------------------------------
	new_bam = "reads.bam"
	custom_print( "[ Writing to a new BAM file ] \t" + new_bam )
	region_reads = pysam.view("-b", o_bam, draft_ga_coords, "-o", new_bam, catch_stdout=False)
	
	new_bam_index = new_bam + ".bai"
	custom_print( "[ Writing to a new BAI file ] \t" + new_bam_index )
	pysam.index(new_bam, new_bam_index)

	# --------------------------------------------------------
	# PART 6: 	With user input ref coords, extract all 
	#		aligned	reads within these coordinates 
	#		and make new FASTA file
	# --------------------------------------------------------
	# detect type of sequences file then handle accordingly
	file_type = detect_fa_filetype(o_fa)
	new_fa = "reads.fasta"
	custom_print( "[ Writing to a new fasta file ]\t" +  new_fa )
	with open (new_fa, "w") as fout:
		if ".gz" in file_type:
			with gzip.open(o_fa, "rt") as fin:
				if "fasta.gz" in file_type:
					for record in SeqIO.parse(fin, "fasta"):
						if record.id in region_read_ids:
							fout.write(">" + record.id + "\n")
							fout.write(str(record.seq) + "\n")
				elif "fastq.gz" in file_type:
					for record in SeqIO.parse(fin, "fastq"):
						if record.id in region_read_ids:
							fout.write(">" + record.id + "\n")
							fout.write(str(record.seq) + "\n")
		else:
			with open(o_fa, "rt") as fin:
				if "fasta" in file_type:
					for record in SeqIO.parse(fin, "fasta"):
						if record.id in region_read_ids:
							fout.write(">" + record.id + "\n")
							fout.write(str(record.seq) + "\n")
				elif "fastq" in file_type:
					for record in SeqIO.parse(fin, "fastq"):
						if record.id in region_read_ids:
							fout.write(">" + record.id + "\n")
							fout.write(str(record.seq) + "\n")

	# --------------------------------------------------------
	# PART 7: 	Let's get to tarring
	# --------------------------------------------------------
	# While tarring, we need to fix the directory structure
	# such that the original path to files are not saved.
	# For each fast5 file we need to extract the basename,
	# and save it in tar such that we save only the basename,
	# and not the whole path from the original source.
	tar_filename = op + ".tar.gz"
	archive = tarfile.open(tar_filename, "w:gz")
	custom_print( "[ Creating a tar.gz file ] \t" + tar_filename )
	custom_print( "[+] FAST5 files: " + op + "/fast5_files/<FAST5 file(s)>" )
	for r in region_fast5_files.keys():
		read_id = r
		f5 = region_fast5_files[r]

		# get basename of fast5 file
		f5_basename = extract_basename(f5)
		an = op + "/fast5_files/" + f5_basename
		archive.add(f5, arcname=an)

	# --------------------------------------------------------
	# PART 8:	Add new files to tar
	# 			new fasta, new bam, and new bai with reads 
	#			in the region given only
	# --------------------------------------------------------
	an = op + "/" + new_fa
	archive.add(new_fa, arcname=an)
	custom_print( "[+] New FASTA: " + an )
	
	an_new_bam = op + "/" + new_bam
	archive.add(new_bam, arcname=an_new_bam)
	custom_print( "[+] New BAM: " + an_new_bam )

	an_new_bam_index = op + "/" + new_bam_index
	archive.add(new_bam_index, arcname=an_new_bam_index)
	custom_print( "[+] New BAI: " + an_new_bam_index )

	# --------------------------------------------------------
	# PART 9:	Add original draft genome assembly file
	#			and the index file
	# --------------------------------------------------------
	an_draft_ga = op + "/draft.fa"
	archive.add(args.draft_ga, arcname=an_draft_ga)
	custom_print( "[+] Original draft ga: " + an_draft_ga )

	an_draft_ga_fai = op + "/draft.fa.fai"
	archive.add(draft_ga_fai, arcname=an_draft_ga_fai)
	custom_print( "[+] Original draft ga index: " + an_draft_ga_fai )

	# --------------------------------------------------------
	# PART 10: 	Check the number of reads in all new files
	# --------------------------------------------------------
	custom_print( "[ Output check ] " )
	# check the length of bam file
	num_reads_bam = region_num_reads
	num_reads_fasta = int(float(file_length(new_fa))/2.0)
	num_fast5_files = len(region_fast5_files)
	values = list()
	values.append(num_reads_bam)
	values.append(num_reads_fasta)
	custom_print( "[+] Num reads in new BAM: \t" + str(num_reads_bam) )
	custom_print( "[+] Num reads in new FASTA: \t" + str(num_reads_fasta) )
	custom_print( "[+] Num files in fast5_files/: \t" + str(num_fast5_files))
	if not all( v == num_fast5_files for v in values ):
		print( "[!] WARNING: The number of reads in the new bam, new fasta, and num of fast5 files tarred are not equal..." )
	else:
		custom_print( "[ Validated output ] Number of reads in the new bam, new fasta, and num of fast5 files tarred are equal!" )

	# --------------------------------------------------------
	# FINAL: 	Output log if verbose flag not used
	# --------------------------------------------------------
	global log
	logfile = op + ".log"
	with open (logfile, "w") as lfile:
		for s in log:
			lfile.write(s + "\n")
	an_logfile = op + "/" + logfile
	custom_print( "[ Log file ] " +  an_logfile )
	custom_print( "[ Tar file ] " + str(tar_filename) )
	custom_print( "[ Finished ] " )
	archive.add(logfile, arcname=an_logfile)
	archive.close()
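
custom_print, file_length and extract_basename are helpers defined elsewhere in this script. Plausible sketches consistent with how they are called above; these are assumptions, not the original definitions:

import os

def custom_print(s):
    # print only in verbose mode, but always keep the message for the log file
    global log, verbose
    log.append(s)
    if verbose:
        print(s)

def file_length(filename):
    # number of lines in a file; the new FASTA written above has 2 lines per record
    with open(filename) as fh:
        return sum(1 for _ in fh)

def extract_basename(path):
    # basename of a fast5 file path, used to flatten the directory layout in the tar
    return os.path.basename(path)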
Example #60
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--methods",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("filter", "keep-first-base", "set-nh",
                               "set-sequence", "strip-sequence",
                               "strip-quality", "unstrip",
                               "unset-unmapped-mapq"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method",
                      dest="strip_method",
                      type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      action="append",
                      type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file",
                      dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam",
                      dest="output_sam",
                      action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace",
                      dest="inplace",
                      action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--first-fastq-file",
                      "-1",
                      dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file",
                      "-2",
                      dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores  [%default]")

    parser.set_defaults(
        methods=[],
        output_sam=False,
        reference_bam=None,
        filter_methods=[],
        strip_method="all",
        force=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    bamfiles = []

    if options.stdin != sys.stdin:
        bamfiles.append(options.stdin.name)

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            if options.output_sam:
                pysam_out = pysam.Samfile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.Samfile("-", "wb", template=pysam_in)
        else:
            if IOTools.isEmpty(bamfile):
                E.warn('skipping empty file %s' % bamfile)
                continue
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(pysam_in,
                                    pysam_out,
                                    pysam_ref,
                                    remove_nonunique="unique"
                                    in options.filter_methods,
                                    remove_unique="non-unique"
                                    in options.filter_methods,
                                    remove_contigs=None,
                                    remove_unmapped="mapped"
                                    in options.filter_methods,
                                    remove_mismatches=remove_mismatches,
                                    colour_mismatches=colour_mismatches)

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start
            pre_check_f = lambda x: None

            if "unset-unmapped-mapq" in options.methods:

                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read

                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:

                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read

                it = set_sequence(it)

            if "strip-sequence" in options.methods or "strip-quality" in \
               options.methods:

                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    if reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:

                def buildReadDictionary(filename):
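                    """Return a dict mapping read name to (sequence, quality)
                    from a fastq file; raise ValueError on duplicate names."""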
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "duplicate read %s - cannot unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
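                    """Restore sequence/quality from the single-end fastq dictionary."""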
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
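                    """Restore sequence/quality for each mate from its fastq dictionary."""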
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = _bam2bam.SetNH(it)

            # keep only the first base of each read by changing the
            # cigarstring to '1M'; for reads mapping to the reverse
            # strand, the pos is moved to aend - 1.
            # Needs to be refactored to make it more general
            # (last base, midpoint, ...)
            if "keep_first_base" in options.methods:

                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read

                it = keep_first_base(it)

            # read the first read and check whether processing should
            # continue; this is only possible when not reading from stdin.
            # Refactoring: use a cache to also allow a pre-check on
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('processing continues despite failed pre-check: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        continue

            # continue processing till end
            for read in it:
                pysam_out.write(read)

            pysam_in.close()
            pysam_out.close()

        if options.inplace:
            # restore timestamps and file permissions from the original file
            # Note: owner and group are currently not updated.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()