Example 1
def miraligner(args):
    """
    Realign BAM hits to miRBAse to get better accuracy and annotation
    """
    hairpin, mirna = _download_mirbase(args)
    precursors = _read_precursor(args.hairpin, args.sps)
    matures = _read_mature(args.mirna, args.sps)
    gtf = _read_gtf(args.gtf)
    out_dts = []
    for bam_fn in args.files:
        sample = op.splitext(op.basename(bam_fn))[0]
        if bam_fn.endswith("bam") or bam_fn.endswith("sam"):
            logger.info("Reading %s" % bam_fn)
            bam_fn = _sam_to_bam(bam_fn)
            bam_sort_by_n = op.splitext(bam_fn)[0] + "_sort"
            pysam.sort("-n", bam_fn, bam_sort_by_n)
            reads = _read_bam(bam_sort_by_n + ".bam", precursors)
        elif bam_fn.endswith("fasta") or bam_fn.endswith("fa") or bam_fn.endswith("fastq"):
            out_file = op.join(args.out, sample + ".premirna")
            bam_fn = _filter_seqs(bam_fn)
            if args.miraligner:
                _cmd_miraligner(bam_fn, out_file, args.sps, args.hairpin)
                reads = _read_miraligner(out_file)
            else:
                if bam_fn.endswith("fastq"):
                    bam_fn = _convert_to_fasta(bam_fn)
                logger.info("Aligning %s" % bam_fn)
                if not file_exists(out_file):
                    pyMatch.Miraligner(hairpin, bam_fn, out_file, 1, 4)
                reads = _read_pyMatch(out_file, precursors)
        else:
            raise ValueError("Format not recognized.")

        if not args.miraligner:
            reads = _annotate(reads, matures, precursors)
        out_file = op.join(args.out, sample + ".mirna")
        out_file, dt, dt_pre = _tab_output(reads, out_file, sample)
        try:
            vcf_file = op.join(args.out, sample + ".vcf")
            if not file_exists(vcf_file):
            # if True:
                create_vcf(dt_pre, matures, gtf, vcf_file)
            try:
                import vcf
                vcf.Reader(filename=vcf_file)
            except Exception as e:
                logger.warning(e.__doc__)
                logger.warning(e.message)
        except Exception as e:
            # traceback.print_exc()
            logger.warning(e.__doc__)
            logger.warning(e.message)
        if isinstance(dt, pd.DataFrame):
            out_dts.append(dt)

    if out_dts:
        _create_counts(out_dts, args.out)
        # _summarize(out_dts)
    else:
        print "No files analyzed!"
Example 2
    def align_to_bam_file(self, reference_fasta_path, query_fasta_path, output_bam_path, multiple=False, assert_record=None):

        logging.debug('LastzRunner: running on reference %s and query %s' %
                     (reference_fasta_path, query_fasta_path))
        output_sam_path = os.path.abspath(
            os.path.expandvars(output_bam_path.replace('.bam', '.sam')))
        output_bam_unsorted_path = os.path.abspath(
            os.path.expandvars(output_bam_path + '.unsorted'))

        logging.debug(
            'LastzRunner: aligning with output in temporary sam file %s' %
            output_sam_path)
        with open(output_sam_path, 'w') as output_sam_handler:
            for line in self._align(reference_fasta_path, query_fasta_path, multiple):
                output_sam_handler.write(line)

        logging.debug(
            'LastzRunner: transforming sam into unsorted bam file %s' %
            output_bam_unsorted_path)
        input_sam_handler = pysam.Samfile(output_sam_path, "r")
        output_bam_file = pysam.Samfile(
            output_bam_unsorted_path, "wb", template=input_sam_handler)

        logging.debug(
            'LastzRunner: copying from sam file to bam file')
        for s in input_sam_handler:
            output_bam_file.write(s)
        output_bam_file.close()

        logging.debug('LastzRunner: sorting and indexing bam file %s' %
                      output_bam_path)
        pysam.sort(output_bam_unsorted_path,
                   output_bam_path.replace('.bam', ''))

        pysam.index(output_bam_path)
Example 3
def convertSortAlign(output_filename):
    # Pregenerate file names for all the intermediate steps (output_filename is the output of the Bowtie2 alignment)
    # Note that the file extension is not always given depending on the input conventions of the tool being called
    sam_filename=output_filename+'.sam'
    bam_filename=output_filename+'.bam'
    sorted_filename_input=output_filename+'_sorted'
    sorted_filename_output=output_filename+'_sorted.bam'
    
    # convert sam to bam
    print 'Converting {0} to {1} . . .'.format(sam_filename,bam_filename)
    try:
        SamtoBam(sam_filename,bam_filename)
    except Exception as ex:
        print "Error converting sam to bam ({0}): {1}".format(ex.errno, ex.strerror)
        return False   
    
    # sort
    print 'Sorting {0} -> {1}'.format(bam_filename,sorted_filename_output)
    try:
        pysam.sort(bam_filename,sorted_filename_input)
    except Exception as ex:
        print "Error sorting bam file ({0}): {1}".format(ex.errno, ex.strerror)
        return False   
    
    # index
    print 'Indexing {0} . . .'.format(sorted_filename_output)
    try:
        pysam.index(sorted_filename_output)
    except Exception as ex:
        print "Error indexing bam file ({0}): {1}".format(ex.errno, ex.strerror)
        return False   
    
    print
    print 'Done'
    return True
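For comparison, the same convert/sort/index pipeline written against a current pysam (samtools >= 1.3) could look like the sketch below. `SamtoBam` above is project-specific, so the sketch uses `pysam.view` instead; the `prefix` argument and file names are placeholders.

import pysam

def convert_sort_index(prefix):
    """SAM -> coordinate-sorted, indexed BAM (modern pysam sketch)."""
    sam_filename = prefix + '.sam'
    bam_filename = prefix + '.bam'
    sorted_filename = prefix + '_sorted.bam'

    pysam.view('-b', '-o', bam_filename, sam_filename, catch_stdout=False)  # SAM -> BAM
    pysam.sort('-o', sorted_filename, bam_filename)                         # coordinate sort
    pysam.index(sorted_filename)                                            # writes .bai
    return sorted_filename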
Example 4
def sort_by_position(bam_file, dir):

    ## get the file prefix
    prefix = ""
    prefix_match = re.match(r"(.*).bam", bam_file)

    try:
        prefix = prefix_match.group(1)
    except:
        print "Existing: Invalid bam file -i %s" %(bam_file)
        sys.exit(2)
        

    # sort the bam file
    bam_input = dir + bam_file
    sort_bam = dir +  prefix + "_sorted"
    pysam.sort(bam_input, sort_bam)
    sort_bam = sort_bam + ".bam"
    
    # index the sort bam file
    pysam.index(sort_bam)

    print ""
    print "Writing Sorted Bam File : %s" %(sort_bam)
    print "Writing Index Sorted Bam File : %s.bai" %(sort_bam)
    
    return sort_bam
Example 5
def map_paired_reads(pe1_path, pe2_path, genome_path, output_path, args):
    work_dir = tempfile.mkdtemp( )
    genome_db = os.path.join( work_dir, "genome" )
    pe1_output = os.path.join( work_dir, "pe1.sai" )
    pe2_output = os.path.join( work_dir, "pe2.sai" )
    bwa_output = os.path.join( work_dir, "output.sam" )
    
    null = open( "/dev/null" ) #open("/tmp/bwa_out")#
    subprocess.check_call( [ "bwa", "index", "-p", genome_db, genome_path ], stderr = null )
    with open( pe1_output, "w" ) as pe1_file:
        subprocess.check_call( [ "bwa", "aln", genome_db, pe1_path ], stdout = pe1_file, stderr = null )
    
    with open( pe2_output, "w" ) as pe2_file:
        subprocess.check_call( [ "bwa", "aln", genome_db, pe2_path ], stdout = pe2_file, stderr = null )
    
    with open( bwa_output, "w" ) as bwa_file:
        subprocess.check_call( [ "bwa", "sampe",
                                "-r", "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1",
                                genome_db,
                                pe1_output, pe2_output,
                                pe1_path, pe2_path ], stdout = bwa_file, stderr = null )
 


    if args.sam:
        shutil.move(bwa_output ,output_path+'.sam')
        #os.rename(bwa_output ,output_path+'.sam')
    else:
        sam_to_bam( bwa_output, bwa_output + ".bam" )
        if args.sort:
            # coordinate sort the file
            pysam.sort( bwa_output + ".bam", output_path )
            pysam.index(output_path+'.bam')
        else:
            shutil.move(bwa_output +".bam",output_path+'.bam')
Example 6
def disciple(bam_fname, bam_hdr, rg_id, long_qname_table, cigar_v2, in_queue):
  """Create a BAM file from the FASTQ lines fed to it via in_queue

  :param bam_fname:
  :param bam_hdr:
  :param rg_id:
  :param long_qname_table:
  :param cigar_v2:
  :param in_queue:
  :return:
  """
  logger.debug('Writing to {} ...'.format(bam_fname))
  t0 = time.time()
  fp = pysam.AlignmentFile(bam_fname, 'wb', header=bam_hdr)
  ref_dict = {k['SN']: n for n, k in enumerate(bam_hdr['SQ'])}
  cnt = 0
  for cnt, (qname, read_data) in enumerate(iter(in_queue.get, __process_stop_code__)):
    write_perfect_reads(qname, rg_id, long_qname_table, ref_dict, read_data, cigar_v2, fp)
  fp.close()
  t1 = time.time()
  logger.debug('... {}: {} reads in {:0.2f}s ({:0.2f} t/s)'.format(bam_fname, cnt, t1 - t0, cnt/(t1 - t0)))

  logger.debug('Sorting {} -> {}'.format(bam_fname, bam_fname + '.sorted'))
  t0 = time.time()
  pysam.sort('-m', '1G', '-o', bam_fname + '.sorted', bam_fname)
  os.remove(bam_fname)
  t1 = time.time()
  logger.debug('... {:0.2f}s'.format(t1 - t0))

  logger.debug('Shutting down thread for {}'.format(bam_fname))
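The `'-m', '1G'` pair above caps the memory each sorting thread may use before samtools spills to temporary files. A hedged sketch of the sort options that are commonly combined with the modern `-o` form (all paths are placeholders):

import pysam

pysam.sort(
    "-n",                  # sort by read name instead of coordinate
    "-m", "1G",            # memory per sorting thread before spilling to disk
    "-@", "4",             # additional worker threads
    "-T", "/tmp/sorttmp",  # prefix for temporary files
    "-o", "out.sorted.bam",
    "in.bam",
)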
Example 7
def sort_output(outPrefix):
    '''Sorts the output file by read coordinate'''
    pysam.sort(outPrefix+'.originalSort.bam', outPrefix + '.coordSort')    
    #os.remove(outPrefix+'.originalSort.tmp.bam')
    
    ## Build the bam index for output    
    pysam.index(outPrefix + '.coordSort.bam')
Example 8
File: scatac.py Project: mfiers/rat
def makeAggregate(cells, directory, suffix, output):
    """
    Create aggregate sample.

    Make an aggregate bam file from a list of cells, sorts and indexes
    the file for easy use in IGV. Suffix is required to prevent non
    0-padded numbers matching the wrong files. Return final file name.

    Parameters
    ----------
    cells : list
        List of cell names to create aggregate from.
    directory : string
        Directory path with the bam files from each cell.
    suffix : string
        String to match the end of the bam file, use to add file extension
        and to anchor the extension after file numbers - this will prevent
        cell_4 matching cell_4*.
    output : string
        String containing output file location.
    """
    from glob import glob
    cells = set(cells)
    fileList = []
    for cell in cells:
        fileList.append(glob(os.path.join(directory, "*" + cell + suffix))[0])
    pysam.cat("-o", output + ".bam", *fileList, catch_stdout=False)
    pysam.sort(output + ".bam", output + ".sorted", catch_stdout=False)
    pysam.index(output + ".sorted.bam", catch_stdout=False)

    return output + ".sorted.bam"
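Here `pysam.sort(output + ".bam", output + ".sorted", ...)` again relies on the pre-1.3 prefix convention. Under a current pysam the aggregate step could be written as the sketch below (same list-of-files input; names are placeholders):

import pysam

def make_aggregate(file_list, output):
    """Concatenate per-cell BAMs, then coordinate-sort and index the result (sketch)."""
    pysam.cat("-o", output + ".bam", *file_list, catch_stdout=False)
    pysam.sort("-o", output + ".sorted.bam", output + ".bam")
    pysam.index(output + ".sorted.bam")
    return output + ".sorted.bam"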
Example 9
    def run(self):
        AbstractAnalysis.run(self) #Call base method to do some logging
        localBamFile = os.path.join(self.getLocalTempDir(), "mapping.bam")
        localSortedBamFile = os.path.join(self.getLocalTempDir(), "mapping.sorted")

        samToBamFile(self.samFile, localBamFile)
        pysam.sort(localBamFile, localSortedBamFile)
        pysam.index(localSortedBamFile + ".bam")
        pysam.faidx(self.referenceFastaFile)
        
        file_header = self.readFastqFile.split(".fastq")[0].split("/")[-1] +  "_" + self.referenceFastaFile.split(".fa")[0].split("/")[-1]
        consensus_vcf = os.path.join(self.outputDir, file_header + "_Consensus.vcf")
        consensus_fastq = os.path.join(self.outputDir, file_header + "_Consensus.fastq")

        system("samtools mpileup -Q 0 -uf %s %s | bcftools view -cg - > %s" \
                % (self.referenceFastaFile, localSortedBamFile + ".bam", consensus_vcf))
        system("vcfutils.pl vcf2fq %s > %s" % (consensus_vcf, consensus_fastq))
        system("rm -rf %s" % (self.referenceFastaFile + ".fai"))
        
        formatted_consensus_fastq = os.path.join(self.getLocalTempDir(), "Consensus.fastq")
        
        formatConsensusFastq(consensus_fastq, formatted_consensus_fastq)
        system("mv %s %s" % (formatted_consensus_fastq, consensus_fastq))
        
        self.finish()
Example 10
def main(infile, snp_dir, max_window=MAX_WINDOW_DEFAULT,
         is_paired_end=False, is_sorted=False):
    name_split = infile.split(".")
    
    if len(name_split) > 1:
        pref = ".".join(name_split[:-1])
    else:
        pref = name_split[0]
    
    if not is_sorted:
        pysam.sort(infile, pref + ".sort")
        infile = pref + ".sort"
        sort_file_name = pref + ".sort.bam"
    else:
        sort_file_name = infile

    keep_file_name = pref + ".keep.bam"
    remap_name = pref + ".to.remap.bam"
    remap_num_name = pref + ".to.remap.num.gz"

    if is_paired_end:
        fastq_names = [pref + ".remap.fq1.gz",
                       pref + ".remap.fq2.gz"]
    else:
        fastq_names = [pref + ".remap.fq.gz"]

    bam_data = BamScanner(is_paired_end, max_window, 
                          sort_file_name, keep_file_name, remap_name, 
                          remap_num_name, fastq_names, snp_dir)
    bam_data.run()
Example 11
def bwa_mem(pe1_path, pe2_path, genome_path, threads, output_path):
    print 'Aligning with bwa mem'
    start = time()
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")
    stderr_file = open(output_path+'.bwa.1','w')

    #null = open("/dev/null")
    subprocess.check_call([ "bwa", "index", "-p", genome_db, genome_path ], stderr=stderr_file)
    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call([ "bwa", "mem", "-t", threads,
                                genome_db, pe1_path, pe2_path ],
                              stdout=bwa_file,
                              stderr=stderr_file)

    elapsed = time() - start
    print 'Time elapsed for bwa mem: ', elapsed
    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
    pysam.index(output_path + '.bam')
   
    shutil.rmtree(work_dir)
Example 12
def map_paired_reads(pe1_path, pe2_path, genome_path, output_path):
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")

    null = open("/dev/null")
    subprocess.check_call([ "bwa", "index", "-p", genome_db, genome_path ], stderr=null)
    with open(pe1_output, "w") as pe1_file:
        subprocess.check_call([ "bwa", "aln", genome_db, pe1_path ], stdout=pe1_file, stderr=null)

    with open(pe2_output, "w") as pe2_file:
        subprocess.check_call([ "bwa", "aln", genome_db, pe2_path ], stdout=pe2_file, stderr=null)

    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call([ "bwa", "sampe",
                                "-r", "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1",
                                genome_db,
                                pe1_output, pe2_output,
                                pe1_path, pe2_path ], stdout=bwa_file, stderr=null)

    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
    pysam.index(output_path + '.bam')
Example 13
def saveReads(dataHub, nameExtra=None):
    if dataHub.args.save_reads:
        logging.info("* Saving relevant reads *")
        for i, sample in enumerate(dataHub):
            outbam_path = dataHub.args.save_reads
            if not outbam_path.endswith(".bam"):
                outbam_path += ".bam"

            if len(dataHub.samples) > 1:
                logging.debug("Using i = {}".format(i))
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(i))

            if nameExtra is not None:
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(nameExtra))

            logging.info("  Outpath: {}".format(outbam_path))

            # print out just the reads we're interested for use later
            bam_small = pysam.Samfile(outbam_path, "wb", template=sample.bam)
            for read in sample.reads:
                bam_small.write(read)

            for read in sample.readStatistics.reads:
                bam_small.write(read)

            bam_small.close()
            sorted_path = outbam_path.replace(".bam", ".sorted")
            pysam.sort(outbam_path, sorted_path)
            pysam.index(sorted_path+".bam")
Example 14
def bwa_sampe(pe1_path, pe2_path, genome_path, output_path):
    print 'Aligning with bwa aln/sampe'
    start = time()
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")

    null = open("/dev/null")
    subprocess.check_call([ "bwa", "index", "-p", genome_db, genome_path ], stderr=null)
    with open(pe1_output, "w") as pe1_file:
        subprocess.check_call([ "bwa", "aln", genome_db, pe1_path ], stdout=pe1_file, stderr=null)

    with open(pe2_output, "w") as pe2_file:
        subprocess.check_call([ "bwa", "aln", genome_db, pe2_path ], stdout=pe2_file, stderr=null)

    with open(bwa_output, "w") as bwa_file:
        subprocess.check_call([ "bwa", "sampe",
                                genome_db,
                                pe1_output, pe2_output,
                                pe1_path, pe2_path ], stdout=bwa_file, stderr=null)

    elapsed = time() - start
    print 'Time elapsed for bwa aln/sampe: ', elapsed

    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
    pysam.index(output_path + '.bam')
Example 15
File: tepy.py Project: timoast/TEpy
def check_bam(bam, p, make_new_index=False):
    """
    Sort and index bam file
    returns dictionary of chromosome names and lengths
    """
    # check if sorted
    test_head = pysam.AlignmentFile(bam, 'rb')
    chrom_sizes = {}
    p = str(p)
    for i in test_head.header['SQ']:
        chrom_sizes[i['SN']] = int(i['LN'])
    try:
        test_head.header['HD']['SO']
    except KeyError:
        print '  sorting bam file'
        pysam.sort('-@', p, bam, 'sorted.temp')
        os.remove(bam)
        os.rename('sorted.temp.bam', bam)
    else:
        if test_head.header['HD']['SO'] == 'coordinate':
            pass
        else:
            print '  sorting bam file'
            pysam.sort('-@', p, bam, 'sorted.temp')
            os.remove(bam)
            os.rename('sorted.temp.bam', bam)
    test_head.close()
    # check if indexed
    if '{}.bai'.format(bam) in os.listdir('.') and make_new_index is False:
        pass
    else:
        print '  indexing bam file'
        pysam.index(bam)
    return chrom_sizes
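The header check above (`header['HD']['SO']`) is the usual way to avoid re-sorting an already coordinate-sorted BAM. A compact sketch of the same idea with the modern sort API, assuming a recent pysam where the header object exposes `to_dict()`; the temporary file name is a placeholder:

import os
import pysam

def ensure_coordinate_sorted(bam, threads=1):
    """Sort `bam` in place unless its header already reports SO:coordinate (sketch)."""
    with pysam.AlignmentFile(bam, 'rb') as fh:
        header = fh.header.to_dict()
    if header.get('HD', {}).get('SO') != 'coordinate':
        tmp = bam + '.sorting.tmp.bam'
        pysam.sort('-@', str(threads), '-o', tmp, bam)
        os.replace(tmp, bam)          # replace the unsorted original
    if not os.path.exists(bam + '.bai'):
        pysam.index(bam)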
Example 16
def run_cufflinks(org_db, num_cpus=4):
    """
    run cufflinks program on mapped reads 
    """

    try:
        subprocess.call(["cufflinks"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `Cufflinks` binary is in your $PATH")
    
    org_name = org_db['short_name'] 
    print("preparing for cufflinks run for organism %s" % org_name)

    min_intron_length = 20
    min_isoform_frac = 0.25
    max_intron_length = org_db['max_intron_len']
    result_dir = org_db['read_assembly_dir']

    bam_file = "%s/%s_Aligned_mmr_sortbyCoord.bam" % (org_db['read_map_dir'], org_name)
    if not os.path.isfile(bam_file):
        sys.stdout.write("failed to fetch sorted mmr BAM file for organism: %s, trying to get the mmr file...\n" % org_name)
        bam_file = "%s/%s_Aligned_mmr.bam" % (org_db['read_map_dir'], org_name)
        if not os.path.isfile(bam_file):
            exit("error: failed to fetch mmr BAM file for organism %s" % org_name)
        
        ## sorting, indexing the bam file 
        file_prefix, ext = os.path.splitext(bam_file)
        sorted_bam = "%s_sortbyCoord" % file_prefix

        sys.stdout.write("trying to sort based by the coordinates with output prefix as: %s\n" % sorted_bam)
        if not os.path.isfile("%s.bam" % sorted_bam):
            pysam.sort(bam_file, sorted_bam)
            
        bam_file = "%s.bam" % sorted_bam

    print('using bam file from %s' % bam_file)
    if not os.path.exists(bam_file + ".bai"):
        pysam.index(bam_file) 

    ## always use quiet mode to avoid problems with storing log output.
    cli_cuff = "cufflinks -q --no-update-check \
        -F %.2f \
        -I %d \
        --min-intron-length %d \
        --library-type fr-unstranded \
        -p %d \
        -o %s \
        %s" % (min_isoform_frac, max_intron_length, min_intron_length, num_cpus, result_dir, bam_file)
  
    sys.stdout.write('\trun cufflinks as: %s \n' % cli_cuff)
    try:
        os.chdir(result_dir)
        process = subprocess.Popen(cli_cuff, shell=True) 
        returncode = process.wait()

        if returncode !=0:
            raise Exception, "Exit status return code = %i" % returncode

    except Exception, e:
        print 'Error running cufflinks.\n%s' %  str( e )
Example 17
def convert_sam_to_bam():
    """
    This method should take a newly create .sam file from alignment and
        - convert it to .bam
        - sort .bam
        - index .bam
    """
    ids = generate_ids()
    for id in ids:
        start_time = time()
        print 'converting: %s'%id
        base_path = os.path.join(SAMPLE_DIR, id)
        sam_path = os.path.join(base_path, id+'-bwape.sam')
        bam_path = os.path.join(base_path, id+'-bwape.bam')

        bam_content = pysam.view('-bS', sam_path)
        bam_file = open(bam_path, 'w+')
        bam_file.writelines(bam_content)
        bam_file.close()

        pysam.sort(bam_path, bam_path+'_sorted')
        pysam.index(bam_path+'_sorted.bam')

        # indexing creates file.bam.bam. Move it to file.bam
        bam_call = "mv {0} {1}".format(bam_path+'_sorted.bam', bam_path)
        index_call = "mv {0} {1}".format(bam_path+'_sorted.bam.bai',
                                         bam_path+'.bam.bai')
        subprocess.call(bam_call, shell=True)
        subprocess.call(index_call, shell=True)
        end_time = time()
        print 'completed: %.3fs'%(end_time-start_time)
Example 18
def main():
    # Read options, args.
    parser = optparse.OptionParser()
    (options, args) = parser.parse_args()
    input_fname, output_fname = args
    slots = os.getenv('GALAXY_SLOTS', 1)
    pysam.sort("-@%s" % slots, '-o', output_fname, '-O', 'bam', '-T', '.', input_fname)
Example 19
def extend_bam(bam, type, reheader, size=0):
    
    bam_prefix = bam.split(".bam")[0]
    bam_file = pysam.Samfile(bam, 'rb')
    tmp_name = bam_prefix + ".bed"
    tmp_bed = open(tmp_name, 'w')
    size_name = str(size)
    if size == 0 : size_name = "insert"
    
    out_name = "_".join([bam_prefix, type, size_name]) + ".bam"
    out_bam = open(out_name, 'w')
    #pdb.set_trace()
    ## Convert BAM to temporary BED
    try:
        print "BAM to BED..."
        if type=="extend":
            bamToFragmentBed(bam_file, tmp_bed, size)
        elif type=="dyad":
            trimToDyad(bam_file, tmp_bed, size)
    except:
        print "BAM to BED conversion failed."
        print ">> " + ":".join(sys.exc_info()[1])
        tmp_bed.close()
        out_bam.close()
        os.remove(tmp_name)
        return
    else:
        print "BAM to BED conversion successful."
        tmp_bed.close()
        #out_bam.close()
    
    ## Convert tmp bed to bam
    bedToBam(tmp_name, out_name)
    
    ## Replace header
    if reheader:
        cmd_args1 = ['samtools', 'view', '-h', bam]
        cmd_args2 = ['samtools', 'reheader', '-', out_name]
        tmp_name = bam_prefix + "_tmp"
        tmp = open(tmp_name, 'w')
        try:
            print "Reheader..."
            p1 = Popen(cmd_args1, stdout=PIPE)
            p2 = Popen(cmd_args2, stdin=p1.stdout, stdout=tmp)
            p2.wait()
        except:
            print "Failed reheader"
            tmp.close()
            os.remove(tmp_name)
            return
        else:
            
            #os.remove(bam)
            tmp.close()
            #os.rename(tmp_name, out_name)
    print "Sorting..."    
    pysam.sort(out_name, out_name + "_sort")
    os.rename(out_name + "_sort.bam", out_name)
    pysam.index(out_name)
Example 20
def run_mmr(org_name, read_map_dir, threads=3):
    """
    a pythonic wrapper for multiple mapper resolution program

    @args org_name: Organism name, example case A_thaliana 
    @type org_name: str 
    @args read_map_dir: directory where the STAR bam (aligned reads) file located
    @type read_map_dir: str 
    @args threads: number of threads to use for the run (default: 3)
    @type threads: int  
    """
    import pysam

    try:
        subprocess.call(["mmr"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `mmr` binary is in your $PATH")

    ## mmr works well with bam file sorted by read id
    bam_file = "%s/%s_Aligned.sortedByName.out.bam" % (read_map_dir, org_name)
    if not os.path.isfile(bam_file):
        sys.stdout.write(
            "warning: failed to fetch read id sorted BAM file for organism: %s, trying to get the raw alignment file\n"
            % org_name
        )

        bam_file = "%s/%s_Aligned.out.bam" % (read_map_dir, org_name)  ## unsorted bam file from STAR output
        if not os.path.isfile(bam_file):
            exit("error: failed to fetch STAR read alignment file for %s %s\n" % (org_name, bam_file))

        ## sorting bam file
        sorted_bam = "%s/%s_Aligned.sortedByName.out" % (read_map_dir, org_name)
        if not os.path.isfile("%s.bam" % sorted_bam):
            sys.stdout.write("trying to sort based by read id with output prefix as: %s\n" % sorted_bam)
            pysam.sort("-n", bam_file, sorted_bam)

        bam_file = "%s.bam" % sorted_bam

    sys.stdout.write("using bam file from %s\n" % bam_file)
    outFile = "%s/%s_Aligned_mmr.bam" % (read_map_dir, org_name)

    iterations = 3
    ## provide a bam file sorted by read id
    cli_mmr = "module load gcc; mmr -b -p -V -t %d -I %d -o %s %s" % (threads, iterations, outFile, bam_file)

    try:
        sys.stdout.write("\trun MMR as: %s \n" % cli_mmr)
        ## changing the working dir to run mmr
        os.chdir(read_map_dir)

        process = subprocess.Popen(cli_mmr, shell=True)
        returncode = process.wait()

        if returncode != 0:
            raise Exception, "Exit status return code = %i" % returncode

        sys.stdout.write("MMR run finished. result file stored at %s\n" % outFile)
    except Exception, e:
        exit("Error running MMR.\n%s" % str(e))
Example 21
    def run(self):

        # Phase 1 - Detection of BarCode
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting BarCode Analysis \n" % (self.get_time(),))
        
        self.bc.simple_approach()
        sys.stderr.write("[%s] Analyzed BarCodes \n" % (self.get_time(),))
        
        self.bc.write_barcodes(self.barcodes)
        sys.stderr.write("[%s] Wrote BarCodes\n" % (self.get_time(),))

        # Phase 2 - Rewrite BAM
        sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))
        
        self.bc.load_barcodes(self.barcodes)
        sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),))

        self.bc.bam.reset()
        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        pysam.sort("-n", self.rewritten_bam, self.rewritten_sorted_bam.replace(".bam", ""))
        sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))
        
        # Phase 3 - Build Consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        sys.stderr.write("[%s] Starting Consensus Building\n" % (self.get_time(),))
        
        self.consensus.build()
        sys.stderr.write("[%s] Built and Calculated Consensus\n" % (self.get_time(),))
        
        self.consensus.infer_consensus(self.consensus_reference)
        sys.stderr.write("[%s] Inferred Consensus\n" % (self.get_time(),))

        # Phase 4 - Call Variants and Haplotypes
        self.consensus.output_consensus_genomes(self.consensus_genomes)
        sys.stderr.write("[%s] Output Consensus Genomes\n" % (self.get_time(),))

        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        sys.stderr.write("[%s] Output Haplotype Distribution\n" % (self.get_time(),))
        
        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence,
                               self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        sys.stderr.write("[%s] Output VCF\n" % (self.get_time(),))

        # Phase 5 - Summary Statistics and Chain Files
        f_out = open(self.out, "w")
        self.consensus.output_consensus_coverage(f_out)
        self.ovcf.output_variants_distribution(f_out)
        self.bc.output_reads_in_barcode_distribution(f_out)
        f_out.close()
        sys.stderr.write("[%s] Output Summary Statistics\n" % (self.get_time(),))

        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref,
                                 self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        sys.stderr.write("[%s] Output Chain File\n" % (self.get_time(),))
Example 22
def sort_bam(bamfilename_sorted, bamfilename_unsorted=None):
    '''Sort BAM file'''
    import pysam

    if bamfilename_unsorted is None:
        bamfilename_unsorted = bamfilename_sorted[:-11]+'.bam'

    pysam.sort(bamfilename_unsorted, bamfilename_sorted[:-4])
Example 23
def main():
    parser=argparse.ArgumentParser()
    parser.add_argument("-p", action='store_true', dest='is_paired_end', 
                        default=False, help=('Indicates that reads are '
                                             'paired-end (default is single).'))
    parser.add_argument("-s", action='store_true', dest='is_sorted', 
                        default=False, help=('Indicates that the input bam file'
                                             ' is coordinate sorted (default '
                                             'is False).'))
    mdefault = 100000
    mhelp = ('Changes the maximum window to search for SNPs.  The default is '
             '{:,} base pairs.  Reads or read pairs that span more than this '
             'distance (usually due to splice junctions) will be thrown out. '
             'Increasing this window allows for longer junctions, but may '
             'increase run time and memory requirements.'.format(mdefault))
    parser.add_argument("-m", action='store', dest='max_window', type=int, 
                        default=mdefault, help=mhelp)
    parser.add_argument("infile", action='store', help=("Coordinate sorted bam "
                        "file."))
    snp_dir_help = ('Directory containing the SNPs segregating within the '
                    'sample in question (which need to be checked for '
                    'mappability issues).  This directory should contain '
                    'sorted files of SNPs separated by chromosome and named: '
                    'chr<#>.snps.txt.gz. These files should contain 3 columns: '
                    'position RefAllele AltAllele')
    parser.add_argument("snp_dir", action='store', help=snp_dir_help)
    
    options = parser.parse_args()
    infile = options.infile
    snp_dir = options.snp_dir
    name_split = infile.split(".")
    
    if len(name_split) > 1:
        pref = ".".join(name_split[:-1])
    else:
        pref = name_split[0]
    
    if not options.is_sorted:
        pysam.sort(infile, pref + ".sort")
        infile = pref + ".sort"
        sort_file_name = pref + ".sort.bam"
    else:
        sort_file_name = infile

    keep_file_name = pref + ".keep.bam"
    remap_name = pref + ".to.remap.bam"
    remap_num_name = pref + ".to.remap.num.gz"

    if options.is_paired_end:
        fastq_names = [pref + ".remap.fq1.gz",
                       pref + ".remap.fq2.gz"]
    else:
        fastq_names = [pref + ".remap.fq.gz"]

    bam_data = BamScanner(options.is_paired_end, options.max_window, 
                          sort_file_name, keep_file_name, remap_name, 
                          remap_num_name, fastq_names, snp_dir)
    bam_data.run()
Example 24
 def sort(self):
     msg = "Sorting %s" % self.bamfn
     print(msg)
     tempfn_stem = os.path.join(self.basedir, temp_filename())
     print self.bamfn, tempfn_stem
     pysam.sort(self.bamfn, tempfn_stem)
     tempfn_glob = glob.glob(tempfn_stem + '*')
     assert len(tempfn_glob) == 1, "Unexpected number of temporary output files: %r" % tempfn_glob
     tempfn = tempfn_glob[0]
     # rename our sorted bamfn 
     os.rename(tempfn, self.bamfn)
Example 25
    def sort_and_rewrite_bam(self):
        
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))
        
        self.bc.load_barcodes(self.barcodes)
        sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),))

        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        pysam.sort("-n", self.rewritten_bam, self.rewritten_sorted_bam.replace(".bam", ""))
        sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))
Example 26
File: prepare.py Project: soh-i/Ivy
 def __sort(self):
     if not os.path.isfile(bamfile):
         try:
             pysam.sort(self.samfile, self.samfile + 'sorted')
             sort_log = pysam.sort.getMessage()
             return True
         except:
             raise RuntimeError()
     else:
         print "already sorted"
         return False
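`pysam.sort.getMessage()` belongs to the old dispatcher interface. In current pysam a failing samtools call raises `pysam.SamtoolsError` carrying samtools' stderr, which can be handled directly; a minimal sketch, assuming a recent pysam and placeholder file names:

import pysam

def sort_or_report(in_bam, out_bam):
    """Sort a BAM, reporting samtools' own error text on failure (sketch)."""
    try:
        pysam.sort("-o", out_bam, in_bam)
        return True
    except pysam.SamtoolsError as err:   # raised when samtools exits non-zero
        print("samtools sort failed: %s" % err)
        return False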
Example 27
def bowtie_align(b_path,read,ref,s_path,bowtie2,numOfThreads,nOffrate,reftype,recovering):
    # b_path: bowtie path;
    # s_path: samtools path;
    # bowtie2: logic, true/false
    # offrate is not used due to bowtie2 bug
    
    sam=read.split("/")[-1].split(".")[0]+".sam"

    hasFile = False

    if recovering and os.path.isfile("sort_" + read.split("/")[-1].split(".")[0] + ".bam"):
        # old file exists
        try:
            align = pysam.Samfile("sort_" + read.split("/")[-1].split(".")[0] + ".bam", "rb")
            hasFile = True
        except:
            hasFile = False

    if (not recovering) or (not hasFile):
        
        print >> sys.stderr, 'Start mapping.'

        if read.split(".")[-1] in ["fa","fasta"]:   # allow fasta and fastq for read
            foption=" -f"
        else:
            foption=""

        if ref.split(".")[-1] in ["fa","fasta"]:
            base=ref.split("/")[-1].split(".")[0]
            os.system("rm "+read.split("/")[-1].strip()+".log")
            os.system(b_path+"-build "+ref+" "+base+" >> "+read+".log 2>&1")
            if not bowtie2:
                os.system(b_path+ foption+" -a --best --strata -n 1 -l 15 -e 200 -p " + str(numOfThreads) + " -S "+base+" "+read+" "+sam+" >> "+read.split("/")[-1]+".log 2>&1")
            else:
                os.system(b_path+ " -x "+base+foption+" -U "+read+ " -p " + str(numOfThreads) + " -i S,1,0.50 0R 3 -L 15 -D 20 -t " + ("-a " if reftype != "genome" else "") + "-S "+sam+" >> "+read.split("/")[-1]+".log 2>&1")
        else:
            os.system("rm "+read.split("/")[-1].strip()+".log")
            if not bowtie2:
                os.system(b_path+ foption+" -a --best --strata -n 1 -l 15 -e 200 -p " + str(numOfThreads) + " -S "+ref+" "+read+" "+sam+" >> "+read.split("/")[-1]+".log 2>&1")
            else:
                os.system(b_path+ " -x "+ref+foption+" -U "+read + " -p " + str(numOfThreads) + " -L 15 -D 20 -t " + ("-a " if reftype != "genome" else "") + "-S "+sam+" >> "+read.split("/")[-1]+".log 2>&1")
        bam=read.split("/")[-1].split(".")[0]+".bam"
        os.system(s_path+ " view -Sb -o "+bam +" "+sam)
        os.system("rm "+sam)
        pysam.sort("-n",bam,"temp")
        align=pysam.Samfile("temp.bam","rb")
        os.system("rm temp.bam")
        os.system(s_path+ " sort "+bam+ " "+"sort_"+read.split("/")[-1].split(".")[0])
        os.system("rm "+bam)
        print >> sys.stderr, 'Mapping completed.'
    else:
        print >> sys.stderr, 'Old file exists, recovery in process.'
    return align
Example 28
def SAM_to_BAM(samfile_name, bamfile_name):
    '''Converts a SAM file into an ordered and indexed BAM file.'''
    unsortedbamfile_name = samfile_name[:-4] + "_unsorted.bam"

    bamfile = open(unsortedbamfile_name, "wb")
    bamfile.write(pysam.view("-b", "-S", samfile_name))
    bamfile.close()

    if bamfile_name.endswith(".bam"):
        bamfile_name = bamfile_name[:-4]
    pysam.sort(unsortedbamfile_name, bamfile_name)
    pysam.index(bamfile_name + ".bam")
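`pysam.view(...)` returns the captured standard output as a Python string, so writing it to a file opened in binary mode (as above) only behaves as intended on Python 2. With a current pysam the capture can be skipped entirely by letting samtools write the file itself, e.g. (placeholder names):

import pysam

# Let samtools write the BAM directly instead of round-tripping it through Python.
pysam.view("-b", "-o", "reads_unsorted.bam", "reads.sam", catch_stdout=False)
pysam.sort("-o", "reads.bam", "reads_unsorted.bam")
pysam.index("reads.bam")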
Example 29
def main():
    args = parser.parse_args()
    samfile = pysam.Samfile(args.bam, 'rb')
    junctionreads = pysam.Samfile(args.out_bam, 'wb', template=samfile)
    id_tag = args.group_on
    chosen_feature = args.feature
    if args.cufflinks:
        gff = GFFReader(args.gff, preset='cufflinks')
    else:
        gff = GFFReader(args.gff, tag_map={'ID': id_tag, 'Parent': 'Parent'})
    written = set([])
    for feature_name, feature in gff.get_features():
        try:
            children = feature.children
        except AttributeError:
            continue
        if len(children) > 1:
            starts = dict([(j.start, j) for i,v in children.iteritems() for j in v.parts()])
            if len(starts) > 1:
                parts = [(v.seqid, v.start, v.end) for i,v in starts.iteritems()]
                parts.sort(key=lambda x: x[1])
                for ri, read in enumerate(parts[:-1]):
                    read2 = parts[ri+1]
                    reads = set([])
                    reads2 = set([])
                    read_dict = {}
                    try:
                        for i in samfile.fetch(read[0], int(read[2])-1, read[2]):
                            if not i.overlap(int(read[2])-1, int(read[2])) or i.qname in written:
                                continue
                            reads.add(i.qname)
                            read_dict[i.qname] = i
                            # if not i.mate_is_unmapped:
                            #     mate = samfile.mate(i)
                            #     reads.add(mate.qname)
                            #     read_dict[mate.qname] = mate
                        for i in samfile.fetch(read2[0], read2[1], int(read2[1])+1):
                            if not i.overlap(int(read2[2])-1, int(read2[2])) or i.qname in written:
                                continue
                            reads2.add(i.qname)
                            read_dict[i.qname] = i
                            # if not i.mate_is_unmapped:
                            #     mate = samfile.mate(i)
                            #     reads2.add(mate.qname)
                            #     read_dict[mate.qname] = mate
                        for i in reads&reads2:
                            written.add(i)
                            junctionreads.write(read_dict[i])
                    except ValueError:
                        continue
    pysam.sort(args.out_bam, '%s_sort'%args.out_bam)
    pysam.index('%s_sort.bam'%args.out_bam)
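The `samfile.fetch(reference, start, end)` calls above require the input BAM to be coordinate-sorted and indexed; the junction-read output is then sorted and indexed at the end with the legacy prefix-style call. The same final step with the modern API would be a sketch like the following (placeholder name, mirroring the `"%s_sort"` naming above):

import pysam

out_bam = "junction_reads.bam"                 # unsorted output written during the scan
pysam.sort("-o", "%s_sort.bam" % out_bam, out_bam)
pysam.index("%s_sort.bam" % out_bam)           # enables fetch() on the sorted result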
Example 30
 def run(self):
     AbstractAnalysis.run(self) #Call base method to do some logging
     emptyQual = False
     for entry in samIterator(pysam.Samfile(self.samFile, "r")):
         if entry.qual is None:
             emptyQual = True
     if emptyQual is False:
         localBamFile = os.path.join(self.getLocalTempDir(), "mapping.bam")
         localSortedBamFile = os.path.join(self.getLocalTempDir(), "mapping.sorted")
         samToBamFile(self.samFile, localBamFile)
         pysam.sort(localBamFile, localSortedBamFile)
         system("qualimap bamqc -bam %s -outdir %s" % (localSortedBamFile + ".bam", self.outputDir))
     self.finish()
Example 31
def map_reads(pe1_path, pe2_path, genome_path, output_path):
    work_dir = tempfile.mkdtemp()
    genome_db = os.path.join(work_dir, "genome")
    pe1_output = os.path.join(work_dir, "pe1.sai")
    pe2_output = os.path.join(work_dir, "pe2.sai")
    bwa_output = os.path.join(work_dir, "output.sam")

    subprocess.call(["bwa", "index", "-p", genome_db, genome_path])
    with open(pe1_output, "w") as pe1_file:
        subprocess.call(["bwa", "aln", genome_db, pe1_path], stdout=pe1_file)

    with open(pe2_output, "w") as pe2_file:
        subprocess.call(["bwa", "aln", genome_db, pe2_path], stdout=pe2_file)

    with open(bwa_output, "w") as bwa_file:
        subprocess.call([
            "bwa", "sampe", "-r",
            "@RG\tID:ILLUMINA\tSM:48_2\tPL:ILLUMINA\tLB:LIB1", genome_db,
            pe1_output, pe2_output, pe1_path, pe2_path
        ],
                        stdout=bwa_file)

    sam_to_bam(bwa_output, bwa_output + ".bam")
    pysam.sort(bwa_output + ".bam", output_path)
Example 32
def bowtie2(bam_trimmed: str, fasta: str, sam_aligned: str) -> str:
	"""Sort BAM using Picard then align to reference using bowtie2"""
	if not sam_aligned.endswith("_aligned.sam"):
		log.error("sam_aligned must end in '_aligned.sam'.")
	bam_sorted = bam_trimmed.replace("_trimmed.bam", "_sorted.bam")
	pysam.sort("-n", "-o", bam_sorted, bam_trimmed) # Sort by QNAME
	pysam.index(bam_sorted)

	index_prefix = os.path.basename(fasta.split('.')[0])
	index_prefix = os.path.join(os.path.dirname(sam_aligned), index_prefix)
	command = f"bowtie2-build -f {fasta} --threads 8 {index_prefix}"
	log.sep()
	execute(command)
	command = \
		f"""
		bowtie2 -a --very-sensitive-local --norc \
			-x {index_prefix} \
			-p 8 \
			-b {bam_sorted} \
			-S {sam_aligned} \
		"""
	execute(command)
	log.sep()
	return sam_aligned
Example 33
def _compress_bam(bam_input, bam_output, ref_fname, regions=None, threads=1):
    """Compress a bam into run length encoding (RLE).

    :param bam_input: str, name of the bam file to be compressed
    :param bam_output: str, name of the bam to be produced
    :param ref_fname: str, reference filename, used to produce bam_input
    :param regions: list, genomic regions to be extracted
    :param threads: int, number of workers to be used

    :returns: None
    """
    regions = medaka.common.get_regions(bam_input, regions)
    ref_fasta = pysam.FastaFile(ref_fname)

    with pysam.AlignmentFile(bam_input, 'r') as alignments_bam:
        tmp_output = '{}.tmp'.format(bam_output)
        with pysam.AlignmentFile(tmp_output,
                                 'wb',
                                 header=alignments_bam.header) as output:
            for region in regions:
                bam_current = alignments_bam.fetch(reference=region.ref_name,
                                                   start=region.start,
                                                   end=region.end)
                ref_sequence = ref_fasta.fetch(region.ref_name)
                ref_rle = RLEConverter(ref_sequence)
                func = functools.partial(_compress_alignment, ref_rle=ref_rle)
                with concurrent.futures.ThreadPoolExecutor(
                        max_workers=threads) as executor:
                    for chunk in medaka.common.grouper(bam_current, 100):
                        for new_alignment in executor.map(func, chunk):
                            if new_alignment is not None:
                                output.write(new_alignment)

        pysam.sort("-o", bam_output, tmp_output)
        os.remove(tmp_output)
        pysam.index(bam_output)
Example 34
def bam_diff(f1, f2, T_TEST_DIR):
    basename, ext = os.path.splitext(os.path.basename(f1))

    f1sorted = T_TEST_DIR + basename + '.f1.sorted.bam'
    f2sorted = T_TEST_DIR + basename + '.f2.sorted.bam'

    pysam.sort(f1, '-n', '-o', f1sorted)
    pysam.sort(f2, '-n', '-o', f2sorted)

    f1sam = T_TEST_DIR + basename + '.f1.sam'
    f2sam = T_TEST_DIR + basename + '.f2.sam'

    fhq = open(f1sam, "w")
    fhq.write(pysam.view('-h', f1sorted))
    fhq.close()

    fhq = open(f2sam, "w")
    fhq.write(pysam.view('-h', f2sorted))
    fhq.close()

    subprocess.Popen([
        'sed', '-i', '-r',
        's@(SA:[^\\t]+)\\t(LB:[^\\t]+)\t(RG:[^\\t]+)@\\3\\t\\1\\t\\2@', f2sam
    ],
                     stdout=subprocess.PIPE).stdout.read()

    subprocess.Popen(['sed', '-i', '-r', 's@\\tFI:i:[0-9]+@@', f1sam],
                     stdout=subprocess.PIPE).stdout.read()
    subprocess.Popen(['sed', '-i', '-r', 's@\\tFI:i:[0-9]+@@', f2sam],
                     stdout=subprocess.PIPE).stdout.read()

    # one time only
    # subprocess.Popen(['sed', '-i' , '-r', 's@\\tSA:Z:[^\\t]+@@', f1sam], stdout=subprocess.PIPE).stdout.read()
    # subprocess.Popen(['sed', '-i' , '-r', 's@\\tSA:Z:[^\\t]+@@', f2sam], stdout=subprocess.PIPE).stdout.read()

    return filecmp.cmp(f1sam, f2sam), f1sam, f2sam
Example 35
def preprocess_sam(sam_files, datasets, tmp_dir, n_threads = 0):
    """ Copy and rename the provided SAM/BAM file(s), merge them, and index.
        This is necessary in order to use Pybedtools commands on the reads.
        The renaming is necessary in order to label the reads according to
        their dataset."""

    # Create the tmp dir
    os.system("mkdir -p %s " % (tmp_dir))

    # Copy and rename SAM files with dataset names to ensure correct RG tags
    renamed_sams = []
    for sam, dataset in zip(sam_files, datasets):
        suffix = "." + sam.split(".")[-1]
        if suffix == ".sam":
            bam_copy = tmp_dir + dataset + "_unsorted.bam"
            convert_to_bam(sam, bam_copy)
            sam = bam_copy
        sorted_bam = tmp_dir + dataset + ".bam"
        pysam.sort("-@", str(n_threads), "-o", sorted_bam, sam)
        renamed_sams.append(sorted_bam)

    merged_bam = tmp_dir + "merged.bam"
    merge_args = [merged_bam] + renamed_sams + ["-f", "-r", "-@", str(n_threads)]
    # index_args = [merged_bam, "-@", str(n_threads)]

    # Merge datasets and use -r option to include a read group tag
    try:
        pysam.merge(*merge_args)
        pysam.index(merged_bam)
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] Merged input SAM/BAM files" % (ts))
    except:
        raise RuntimeError(("Problem merging and indexing SAM/BAM files. "
                            "Check your file paths and make sure that all "
                            "files have headers."))
    return merged_bam
Example 36
def bam_merge(bam_ins, bam_out):
    """
    merge multiple bam files
    input: list of bam files
    input: out.bam
    """
    # check input files
    bam_flag = []
    for b in bam_ins:
        if not os.path.exists(b) is True:
            bam_flag.append(b)
    if len(bam_flag) > 0:
        sys.exit('BAM files not exists:' + '\n'.join(bam_flag))
    # check output file
    if os.path.exists(bam_out) is True:
        pass
        # sys.exit('BAM exists:' + bam_out)
    else:
        # merge
        pysam.merge('-f', bam_out + '.unsorted.bam',
                    *bam_ins)  # overwrite output BAM
        pysam.sort('-o', bam_out, bam_out + '.unsorted.bam')
        pysam.index(bam_out)
        os.remove(bam_out + '.unsorted.bam')
Example 37
def main(opts):
    """
    Main function open samfile, collect the correct header and write output as bam file.
    A sorted Bamfile is also outputted.
    Function also flags the XA marked regions in the samfile,
    which can then be excluded in the variant calling.
    """

    samfile = pysam.Samfile(opts.sam_file, 'r')
    new_header = samfile.header.copy()

    final_header = complete_header(opts.read_group_tags, samfile, new_header)

    # "wb" here means write as bam file.
    if not opts.output.endswith(".bam"):
        opts.output = opts.output + ".bam"
    writer_piet = pysam.AlignmentFile(opts.output, "wb", header=final_header)

    for read in samfile:
        tags = dict(read.tags)
        if "XA" in tags:
            read.is_qcfail = True
        writer_piet.write(read)

    writer_piet.close()
    samfile.close()

    print("Bam file created, continuing to sort bam file....")

    path_and_name = os.path.split(opts.output)
    path_and_name = list(path_and_name)

    sorted_bam_out = os.path.join(path_and_name[0],
                                  "sorted_" + path_and_name[1])
    pysam.sort("-o", sorted_bam_out, opts.output)
    print("Done, Sorted bam file created")
Example 38
def indexBamFile():  ## indexing bam files to use pysam
    logging.debug("indexing BAM File function..")
    bamFile = 0
    ## currently not supporting bam file input
    for rr in range(0, len(sample_1)):  ## for each replicate of sample_1
        rTempFolder = s1rPath + str(rr + 1)
        bam_fn = ''
        if bamFile == 0:  ## we know the location of the bam file
            bam_fn = rTempFolder + '/Aligned.sortedByCoord.out.bam'
        else:  ## bam file is provided
            bam_fn = sample_1[rr]

        if LooseVersion(
                pysam.version.__samtools_version__) < LooseVersion('1.3'):
            pysam.sort(bam_fn, rTempFolder + '/aligned.sorted')
            ## it will make aligned.sorted.bam file
            pysam.index(rTempFolder + '/aligned.sorted.bam')
            ## it will make aligned.sorted.bam.bai file
        else:
            pysam.sort(bam_fn, '-o', rTempFolder + '/aligned.sorted.bam')
            ## it will make aligned.sorted.bam file
            pysam.index(rTempFolder + '/aligned.sorted.bam')
            ## it will make aligned.sorted.bam.bai file

    for rr in range(0, len(sample_2)):  ## for each replicate of sample_2
        rTempFolder = s2rPath + str(rr + 1)
        bam_fn = ''
        if bamFile == 0:  ## we know the location of the bam file
            bam_fn = rTempFolder + '/Aligned.sortedByCoord.out.bam'
        else:  ## bam file is provided
            bam_fn = sample_2[rr]

        if LooseVersion(
                pysam.version.__samtools_version__) < LooseVersion('1.3'):
            pysam.sort(bam_fn, rTempFolder + '/aligned.sorted')
            ## it will make aligned.sorted.bam file
            pysam.index(rTempFolder + '/aligned.sorted.bam')
            ## it will make aligned.sorted.bam.bai file
        else:
            pysam.sort(bam_fn, '-o', rTempFolder + '/aligned.sorted.bam')
            ## it will make aligned.sorted.bam file
            pysam.index(rTempFolder + '/aligned.sorted.bam')
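The version check above spells out the usual compatibility shim: pysam exposes the bundled samtools version as `pysam.version.__samtools_version__`, and the sort calling convention changed at samtools 1.3. A hedged helper that hides the branch (function name and paths are illustrative):

from distutils.version import LooseVersion
import pysam

def sort_bam_compat(in_bam, out_bam):
    """Coordinate-sort `in_bam` into `out_bam` with either samtools API (sketch)."""
    if LooseVersion(pysam.version.__samtools_version__) < LooseVersion('1.3'):
        # Old API: second argument is an output prefix, ".bam" is appended.
        prefix = out_bam[:-4] if out_bam.endswith('.bam') else out_bam
        pysam.sort(in_bam, prefix)
    else:
        pysam.sort('-o', out_bam, in_bam)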
Example 39
    def merge_bam(self, data_dir, project_id, final_id, run_ids=[]):
        """
        Merge together all the bams in a directory and sort to create the final
        bam ready to be filtered
        
        If run_ids is blank then the function looks for all bam files in the
        data_dir
        """
        out_bam_file = data_dir + project_id + '/' + final_id + '.bam'

        if len(run_ids) == 0:
            bam_files = [
                f for f in listdir(data_dir + project_id)
                if f.endswith(("sai"))
            ]
        else:
            bam_files = [f + ".bam" for f in run_ids]

        bam_sort_files = []
        bam_merge_files = []
        for bam in bam_files:
            bam_loc = data_dir + project_id + '/' + bam
            bam_sort_files.append(bam_loc)
            bam_merge_files.append(bam_loc)

        for bam_sort_file in bam_sort_files:
            print bam_sort_file
            pysam.sort("-o", str(bam_sort_file), str(bam_sort_file))

        if len(bam_sort_files) == 1:
            pysam.sort("-o", str(out_bam_file), str(bam_sort_files[0]))
        else:
            pysam.merge(out_bam_file, *bam_merge_files)
            pysam.sort("-o", str(out_bam_file), "-T",
                       str(out_bam_file) + ".bam_sort", str(out_bam_file))

        pysam.index(str(out_bam_file))
Example 40
        RG_idx = 1
        for chrom in sorted(panel_regions):
            for curr_area in panel_regions[chrom]:
                new_header["RG"].append({
                    "ID": str(RG_idx),
                    args.RG_tag: curr_area.name
                })
                RG_id_by_source[curr_area.name] = str(RG_idx)
                RG_idx += 1
        # Parse reads
        with pysam.AlignmentFile(tmp_aln, "wb", header=new_header) as FH_out:
            if args.single_mode:
                log_data = processSingleReads(FH_in, panel_regions,
                                              RG_id_by_source, args)
            else:
                log_data = processPairedReads(FH_in, panel_regions,
                                              RG_id_by_source, args)

    # Sort output file
    pysam.sort("-o", args.output_aln, tmp_aln)
    pysam.index(args.output_aln)
    os.remove(tmp_aln)

    # Write summary
    if args.output_summary is not None:
        if args.summary_format == "json":
            writeJSONSummary(args.output_summary, log_data)
        else:
            writeTSVSummary(args.output_summary, log_data)
    log.info("End of job")
Example 41
def run_bowtie2(
      reads_fwd,
      reads_rev,
      ref_fa,
      out_prefix,
      threads=1,
      max_insert=1000,
      sort=False,
      bowtie2='bowtie2',
      bowtie2_preset='very-sensitive-local',
      bowtie2_version=None,
      verbose=False,
      verbose_filehandle=sys.stdout,
      remove_both_unmapped=False,
      clean_index=True,
    ):

    ref_is_indexed = True
    for ext in bowtie2_index_extensions:
        if not os.path.exists(ref_fa + '.' + ext):
            ref_is_indexed = False
            break

    clean_files = []

    if ref_is_indexed:
        if verbose:
            print('Bowtie2 index files found (', ref_fa, '.*.bt2) so no need to index', sep='', file=verbose_filehandle)
        map_index = ref_fa
    else:
        map_index = out_prefix + '.map_index'
        bowtie2_index(ref_fa, map_index, bowtie2=bowtie2, verbose=verbose, verbose_filehandle=verbose_filehandle)

        if clean_index:
            clean_files = [map_index + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]

    final_bam = out_prefix + '.bam'
    if sort:
        intermediate_bam = out_prefix + '.unsorted.bam'
    else:
        intermediate_bam = final_bam

    map_cmd = [
        bowtie2,
        '--threads', str(threads),
        '--reorder',
        '--' + bowtie2_preset,
        '-X', str(max_insert),
        '-x', map_index,
        '-1', reads_fwd,
        '-2', reads_rev,
    ]

    if LooseVersion(bowtie2_version) >= LooseVersion('2.3.1'):
        map_cmd.append('--score-min G,1,10')

    # We use gawk instead of awk here as we need bitwise comparisons
    # and these are not available via awk on Mac OSX.
    if remove_both_unmapped:
        map_cmd.append(r''' | gawk ' !(and($2,4)) || !(and($2,8)) ' ''')

    tmp_sam_file = out_prefix + '.unsorted.sam'
    map_cmd.append(' > ' + tmp_sam_file)
    map_cmd = ' '.join(map_cmd)

    common.syscall(map_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)

    if verbose:
        print('Converting', tmp_sam_file, '->', intermediate_bam, file=verbose_filehandle)
    infile = pysam.AlignmentFile(tmp_sam_file, "r")
    outfile = pysam.AlignmentFile(intermediate_bam, "wb", template=infile)
    for x in infile:
        outfile.write(x)
    infile.close()
    outfile.close()
    os.unlink(tmp_sam_file)

    if sort:
        if verbose:
            print('Sorting', intermediate_bam, '->', final_bam, file=verbose_filehandle)
        pysam.sort('-o', final_bam, '-O', 'BAM', intermediate_bam)
        if verbose:
            print('Indexing', final_bam, file=verbose_filehandle)
        pysam.index(final_bam)
        clean_files.append(intermediate_bam)

    for fname in clean_files:
        os.unlink(fname)
Example 42
        # Write to bam
        outf.write(read)

reader.close()

del ra
import gc
gc.collect()

# Remove bam file to save space
#os.remove(in_bam)

# Sort tagged file
print('Sorting bam file...')
sorted_file = prefix + '_Aligned.tagged.sorted.bam'
pysam.sort("-o", sorted_file, '--threads', str(no_cores), out_bam)
# Remove unsorted file
#os.remove(out_bam)

# Index
args = f'samtools index {sorted_file}'
with subprocess.Popen(args, shell=True) as p:
    out, err = p.communicate()

# RNA velocity
print('Computing RNA velocity...')
out_dir = sys.argv[2]  #f'{prefix}/velocity'
args = f'velocyto run -b /data/peer/chanj3/SCPC_transformation/ref/737K-august-2016.txt \
-o {out_dir} -@ {no_cores} -v {sorted_file} /data/peer/chanj3/SCPC_transformation/ref/annotations.gtf'

with subprocess.Popen(args, shell=True) as p:
Example 43
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=usage,
                            description=globals()["__doc__"])
    group = U.OptionGroup(parser, "dedup-specific options")

    group.add_option("--output-stats",
                     dest="stats",
                     type="string",
                     default=False,
                     help="Specify location to output stats")

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options, group=False)

    if options.random_seed:
        np.random.seed(options.random_seed)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = options.stdout.name
        options.stdout.close()
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename(dir=options.tmpdir)
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    if options.stats and options.ignore_umi:
        raise ValueError("'--output-stats' and '--ignore-umi' options"
                         " cannot be used together")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    if options.paired:
        outfile = sam_methods.TwoPassPairWriter(infile, outfile)

    nInput, nOutput, input_reads, output_reads = 0, 0, 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)

    else:
        if options.per_contig and options.gene_transcript_map:
            metacontig2contig = sam_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = sam_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch()

    # set up ReadCluster functor with methods specific to
    # specified options.method
    processor = network.ReadDeduplicator(options.method)

    bundle_iterator = sam_methods.get_bundles(
        options, metacontig_contig=metacontig2contig)

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = umi_methods.random_read_generator(
            infile.filename,
            chrom=options.chrom,
            barcode_getter=bundle_iterator.barcode_getter)

    for bundle, key, status in bundle_iterator(inreads):

        nInput += sum([bundle[umi]["count"] for umi in bundle])
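        # progress reporting: step the counters up in fixed 100k/1M increments
        # so that exactly one log line is emitted each time a threshold is crossed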

        while nOutput >= output_reads + 100000:
            output_reads += 100000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        if options.stats:
            # generate pre-dedup stats
            average_distance = umi_methods.get_average_umi_distance(
                bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = umi_methods.get_average_umi_distance(
                random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                nOutput += 1
                outfile.write(bundle[umi]["read"])

        else:

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts = processor(bundle=bundle,
                                                threshold=options.threshold)

            for read in reads:
                outfile.write(read)
                nOutput += 1

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [
                    bundle_iterator.barcode_getter(x)[0] for x in reads
                ]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = umi_methods.get_average_umi_distance(
                    post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = umi_methods.get_average_umi_distance(
                    random_umis)
                post_cluster_stats_null.append(average_distance_null)

    outfile.close()

    if not options.no_sort_output:
        # sort the output
        pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
        os.unlink(out_name)  # delete the tempfile

    if options.stats:

        # generate the stats dataframe
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # tally the counts per umi per position
        pre_counts = collections.Counter(stats_pre_df["counts"])
        post_counts = collections.Counter(stats_post_df["counts"])
        counts_index = list(
            set(pre_counts.keys()).union(set(post_counts.keys())))
        counts_index.sort()
        with U.openFile(options.stats + "_per_umi_per_position.tsv",
                        "w") as outf:
            outf.write("counts\tinstances_pre\tinstances_post\n")
            for count in counts_index:
                values = (count, pre_counts[count], post_counts[count])
                outf.write("\t".join(map(str, values)) + "\n")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        agg_df = agg_df.fillna(0).astype(int)

        agg_df.index = [x.decode() for x in agg_df.index]
        agg_df.index.name = 'UMI'
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)
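        # binning note: cluster_bins runs from -1 to max_ed + 1; with
        # right=True, a distance of exactly -1 (positions with a single UMI)
        # lands in bin 0, which is relabelled "Single_UMI" further down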

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame(
            {
                "unique":
                tallyCounts(pre_cluster_binned, max_ed),
                "unique_null":
                tallyCounts(pre_cluster_null_binned, max_ed),
                options.method:
                tallyCounts(post_cluster_binned, max_ed),
                "%s_null" % options.method:
                tallyCounts(post_cluster_null_binned, max_ed),
                "edit_distance":
                cluster_bins
            },
            columns=[
                "unique", "unique_null", options.method,
                "%s_null" % options.method, "edit_distance"
            ])

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join([
        "%s: %s" % (x[0], x[1])
        for x in bundle_iterator.read_events.most_common()
    ]))

    U.info("Number of reads out: %i" % nOutput)

    if not options.ignore_umi:  # otherwise processor has not been used
        U.info("Total number of positions deduplicated: %i" %
               processor.UMIClusterer.positions)
        if processor.UMIClusterer.positions > 0:
            U.info("Mean number of unique UMIs per position: %.2f" %
                   (float(processor.UMIClusterer.total_umis_per_position) /
                    processor.UMIClusterer.positions))
            U.info("Max. number of unique UMIs per position: %i" %
                   processor.UMIClusterer.max_umis_per_position)
        else:
            U.warn("The BAM did not contain any valid "
                   "reads/read pairs for deduplication")

    U.Stop()
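
# A minimal, self-contained sketch (not part of the example above, paths are
# placeholders) contrasting the two pysam.sort calling styles that appear in
# these examples: the current samtools-style "-o <out>" form used here, and the
# legacy output-prefix form used by some of the older snippets below.
import pysam

def sort_and_index(in_bam, out_bam, by_name=False):
    """Sort a BAM by coordinate (or by name) with pysam and index it."""
    if by_name:
        pysam.sort("-n", "-o", out_bam, in_bam)   # name sort: no .bai possible
    else:
        pysam.sort("-o", out_bam, in_bam)         # current API: explicit -o
        # legacy pysam/samtools-0.1.x instead took an output *prefix*:
        #   pysam.sort(in_bam, out_bam[:-4])
        pysam.index(out_bam)                      # .bai needs coordinate order

# sort_and_index("example.bam", "example.sorted.bam")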
Esempio n. 44
0
def premap_stampy(data_folder,
                  adaID,
                  VERBOSE=0,
                  threads=1,
                  summary=True,
                  maxreads=-1,
                  subsrate=0.05,
                  gapopen=40,
                  gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' +
                      input_filenames[0])

    # parallelize if requested
    if threads == 1:
        call_list = [
            stampy_bin,
            '--overwrite',
            '-g',
            get_reference_premap_index_filename(data_folder, adaID, ext=False),
            '-h',
            get_reference_premap_hash_filename(data_folder, adaID, ext=False),
            '-o',
            get_premapped_filename(data_folder, adaID, type='sam'),
            '--insertsize=450',
            '--insertsd=100',
            '--substitutionrate=' + str(subsrate),
            '--gapopen=' + str(gapopen),
            '--gapextend=' + str(gapextend),
        ]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(
            get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: '+\
                        get_premapped_filename(data_folder, adaID, type='bam')+'\n')

    else:

        # Multithreading works as follows: call qsub + stampy, monitor the process
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [
            get_premapped_filename(data_folder,
                                   adaID,
                                   type='bam',
                                   part=(j + 1)) for j in xrange(threads)
        ]

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map call
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N', adaID + ' p' + str(j + 1), '-l',
                'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem,
                stampy_bin, '--overwrite', '-g',
                get_reference_premap_index_filename(
                    data_folder, adaID, ext=False), '-h',
                get_reference_premap_hash_filename(
                    data_folder, adaID, ext=False), '-o',
                get_premapped_filename(
                    data_folder, adaID, type='sam', part=(j + 1)),
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--insertsize=450', '--insertsd=100', '--substitutionrate=' +
                str(subsrate), '--gapopen=' + str(gapopen),
                '--gapextend=' + str(gapextend), '-M'
            ] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
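            # qstat prints two header lines before the job rows, so fewer than
            # three lines means no jobs are left; otherwise keep only the job
            # ID (first column) of each row for the membership test below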
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                               adaID+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # No need to wait again: the conversion we just ran already
                    # took longer than the polling interval
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(data_folder,
                                                 adaID,
                                                 type='bam',
                                                 unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (so that downstream pair iteration sees
        # mates as adjacent records)
        # NOTE: we pass the output name without its extension and omit the -f
        # option because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(data_folder,
                                                        adaID,
                                                        type='bam',
                                                        unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
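        # legacy samtools-0.1.x style call: the final argument is an output
        # *prefix* (hence the stripped ".bam"); with current pysam this would
        # read pysam.sort('-n', '-o', output_filename_sorted, output_filename)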
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(data_folder,
                                                 adaID,
                                                 type='sam',
                                                 part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')
Esempio n. 45
0
def run_chimerascan(runconfig):
    """
    main function for running the chimerascan pipeline
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" %
                 (runconfig_xml_file))
    fh = open(runconfig_xml_file, "w")
    print >> fh, xmlstring
    fh.close()
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        mask_biotypes.update(
            [line.strip() for line in open(runconfig.mask_biotypes_file)])
        logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        mask_rnames.update(
            [line.strip() for line in open(runconfig.mask_rnames_file)])
        logging.info("\tread references: %s" % (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir,
                                       config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir,
                                               config.MAX_MULTIMAPPING_FILE)
    max_transcriptome_hits = int(
        open(max_transcriptome_hits_file).next().strip())
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Process and inspect the FASTQ files, performing several alterations
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    #
    converted_fastq_files = [
        os.path.join(tmp_dir, fq) for fq in config.CONVERTED_FASTQ_FILES
    ]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(
        up_to_date(cfq, fq)
        for cfq, fq in zip(converted_fastq_files, runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = \
            os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files,
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
            if retcode != config.JOB_SUCCESS:
                logging.error("%s step failed" % (msg))
                return config.JOB_ERROR
        except Exception as e:
            logging.info("Cleaning up after error %s" % (str(e)))
            for fq in converted_fastq_files:
                if os.path.isfile(fq):
                    os.remove(fq)
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir,
                                          config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = os.path.join(
        tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    if (all(
            up_to_date(transcriptome_bam_file, fq)
            for fq in converted_fastq_files) and all(
                up_to_date(a, b)
                for a, b in zip(transcriptome_unaligned_fastq_files,
                                converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(
            transcriptome_index=transcriptome_index,
            genome_index=genome_index,
            transcript_file=transcript_file,
            fastq_files=converted_fastq_files,
            unaligned_path=transcriptome_unaligned_path,
            bam_file=transcriptome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_transcriptome_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(transcriptome_bam_file):
                os.remove(transcriptome_bam_file)
            for f in transcriptome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = os.path.join(
        runconfig.output_dir, config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = os.path.splitext(
            sorted_transcriptome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file,
                   sorted_aligned_bam_prefix)
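        # legacy prefix-style sort; "-m 1000000000" limits the sort memory to
        # roughly 1 GB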
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = sorted_transcriptome_bam_file + ".bai"
    if (up_to_date(sorted_transcriptome_bam_index_file,
                   sorted_transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        isize_dist = InsertSizeDistribution.from_file(
            open(isize_dist_file, "r"))
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        isize_dist = InsertSizeDistribution.from_genome_bam(
            bamfh,
            transcripts,
            min_isize=min_fragment_length,
            max_isize=runconfig.max_fragment_length,
            max_samples=config.ISIZE_MAX_SAMPLES)
        bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically.  Using mean=%d "
                            "stdev=%f instead" %
                            (runconfig.isize_mean, runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(
                runconfig.isize_mean,
                runconfig.isize_stdev,
                min_isize=runconfig.min_fragment_length,
                max_isize=runconfig.max_fragment_length,
                samples=config.ISIZE_MAX_SAMPLES)
        isize_dist.to_file(open(isize_dist_file, "w"))
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))
    # choose a segment length to optimize mapping
    optimal_isize = isize_dist.isize_at_percentile(
        DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" %
                  (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" %
                  (optimal_isize, optimal_segment_length))
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug(
        "\tAfter adjusting for min %d and read length %d, final segment length is %d"
        % (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length))
    if runconfig.segment_length is not None:
        logging.debug(
            "\tOverriding auto segment length and using segment length of %d" %
            (runconfig.segment_length))
        segment_length = runconfig.segment_length
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files)
            and all(
                up_to_date(a, b) for a, b in zip(genome_unaligned_fastq_files,
                                                 converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(
            index=genome_index,
            fastq_files=transcriptome_unaligned_fastq_files,
            unaligned_path=genome_unaligned_path,
            bam_file=genome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(genome_bam_file):
                os.remove(genome_bam_file)
            for f in genome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    if (all(
            up_to_date(realigned_bam_file, fq)
            for fq in genome_unaligned_fastq_files)
            and up_to_date(realigned_bam_file, isize_dist_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(index=transcriptome_index,
                                      transcript_file=transcript_file,
                                      fastq_files=genome_unaligned_fastq_files,
                                      bam_file=realigned_bam_file,
                                      log_file=realigned_log_file,
                                      tmp_dir=tmp_dir,
                                      segment_length=segment_length,
                                      max_hits=max_transcriptome_hits,
                                      num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            if os.path.exists(realigned_bam_file):
                os.remove(realigned_bam_file)
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file, unresolved_bam_file)
    msg = "Classifying concordant and discordant read pairs"
    if (all(up_to_date(f, realigned_bam_file) for f in output_files)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(
            transcripts=transcripts,
            input_bam_file=realigned_bam_file,
            paired_bam_file=paired_bam_file,
            discordant_bam_file=discordant_bam_file,
            unpaired_bam_file=unpaired_bam_file,
            unmapped_bam_file=unmapped_bam_file,
            multimap_bam_file=multimap_bam_file,
            unresolved_bam_file=unresolved_bam_file,
            max_isize=runconfig.max_fragment_length,
            max_multihits=runconfig.max_multihits,
            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = os.path.join(
        tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if (up_to_date(discordant_genome_bam_file, discordant_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        discordant_genome_sam_file = os.path.join(
            tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index,
            transcripts,
            input_file=discordant_bam_file,
            output_file=discordant_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_sam_file):
                os.remove(discordant_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file,
                             discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_bam_file):
                os.remove(discordant_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(discordant_genome_sam_file):
            os.remove(discordant_genome_sam_file)
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if (up_to_date(sorted_discordant_genome_bam_file,
                   discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai"
    if (up_to_date(sorted_discordant_bam_index_file,
                   sorted_discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = os.path.join(tmp_dir,
                                            config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if (up_to_date(unpaired_genome_bam_file, unpaired_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        unpaired_genome_sam_file = os.path.join(
            tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index,
                                          transcripts,
                                          input_file=unpaired_bam_file,
                                          output_file=unpaired_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_sam_file):
                os.remove(unpaired_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file,
                             unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_bam_file):
                os.remove(unpaired_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(unpaired_genome_sam_file):
            os.remove(unpaired_genome_sam_file)
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if (up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai"
    if (up_to_date(sorted_unpaired_bam_index_file,
                   sorted_unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = \
        os.path.join(runconfig.output_dir,
                     config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    skip = True
    for input_file in input_files:
        for output_file in output_files:
            skip = skip and up_to_date(output_file, input_file)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = cluster_discordant_reads(
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            concordant_bam_file=sorted_transcriptome_bam_file,
            output_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_file=cluster_file,
            cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
    #
    # Pair discordant clusters
    #
    cluster_pair_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file, )
    if up_to_date(cluster_pair_file,
                  sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = pair_discordant_clusters(
            discordant_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_pair_file=cluster_pair_file,
            tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file, cluster_shelve_file,
                   cluster_pair_file)
    output_files = (breakpoint_bam_file, )
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = realign_across_breakpoints(
            index_dir=runconfig.index_dir,
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            breakpoint_bam_file=breakpoint_bam_file,
            log_dir=log_dir,
            tmp_dir=tmp_dir,
            num_processors=runconfig.num_processors,
            local_anchor_length=runconfig.local_anchor_length,
            local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = os.path.join(
        tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file, cluster_shelve_file, cluster_pair_file)
    output_files = (spanning_bam_file, spanning_cluster_pair_file)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            bam_file=breakpoint_bam_file,
            output_sam_file=spanning_sam_file,
            output_cluster_pair_file=spanning_cluster_pair_file,
            local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(spanning_bam_file):
                os.remove(spanning_bam_file)
            return config.JOB_ERROR
        if os.path.exists(spanning_sam_file):
            os.remove(spanning_sam_file)
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(runconfig.output_dir,
                                            config.SORTED_SPANNING_BAM_FILE)
    if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    #
    unfiltered_chimera_bedpe_file = os.path.join(
        runconfig.output_dir, config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (
        unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file)
            and up_to_date(unfiltered_chimera_bedpe_file,
                           cluster_shelve_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(transcripts,
                               cluster_shelve_file=cluster_shelve_file,
                               cluster_pair_file=spanning_cluster_pair_file,
                               read_name_file=read_name_file,
                               output_file=unfiltered_chimera_bedpe_file,
                               annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unfiltered_chimera_bedpe_file):
                os.remove(unfiltered_chimera_bedpe_file)
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                      config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = filter_chimeras(
            input_file=unfiltered_chimera_bedpe_file,
            output_file=chimera_bedpe_file,
            filter_num_frags=runconfig.filter_num_frags,
            filter_allele_fraction=runconfig.filter_allele_fraction,
            mask_biotypes=mask_biotypes,
            mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(chimera_bedpe_file):
                os.remove(chimera_bedpe_file)
    #
    # Cleanup
    #
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run.")
    return config.JOB_SUCCESS
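
# The pipeline above skips finished steps make-style via an up_to_date(target,
# source) helper defined elsewhere in chimerascan. A plausible minimal version
# (an assumption for illustration, not the project's actual code) just compares
# file existence and modification times:
import os

def up_to_date_sketch(target, source):
    """Return True if `target` exists and is at least as new as `source`."""
    if not (os.path.exists(target) and os.path.exists(source)):
        return False
    return os.path.getmtime(target) >= os.path.getmtime(source)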
Esempio n. 46
0
        # os.remove("hisat2.G2A.sam")

    unique_num = C2T_num + G2A_num
    total_reads = unique_num + multimapper_num + unmapped_num
    sys.stderr.write("[%s]Completed successfully:\n" %
                     strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    sys.stderr.write(" Total reads: %d\n" % total_reads)
    sys.stderr.write(" Unique mapping: %d (%.3f%%)\n" %
                     (unique_num, 100 * unique_num / (total_reads + 0.0)))
    sys.stderr.write("   C2T: %d (%.2f%%)\n" % (C2T_num, 100 * C2T_num /
                                                (total_reads + 0.0)))
    sys.stderr.write("   G2A: %d (%.2f%%)\n" % (G2A_num, 100 * G2A_num /
                                                (total_reads + 0.0)))
    sys.stderr.write(" Multiple mapping: %d (%.3f%%)\n" %
                     (multimapper_num, 100 * multimapper_num /
                      (total_reads + 0.0)))
    sys.stderr.write(" Unmapped: %d (%.3f%%)\n" %
                     (unmapped_num, 100 * unmapped_num / (total_reads + 0.0)))
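    # the "+ 0.0" terms force float division under Python 2, where dividing two
    # ints would otherwise truncate these percentages to 0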

    if options.sorted_bam == True:
        sys.stderr.write("[%s]Sorting bam...\n" %
                         strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        pysam.sort("-o", options.output + ".sorted.bam",
                   options.output + ".bam")
        if options.no_sorted_bam_index == False:
            sys.stderr.write("[%s]Indexing bam...\n" %
                             strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            pysam.index(options.output + ".sorted.bam")

    if options.del_bam == True:
        os.remove(options.output + ".bam")
Esempio n. 47
0
def main(args):
    numhum = nummou = numamb = 0
    #starttime = time.clock()
    # parse inputs
    humanfilename = args.A
    mousefilename = args.B
    samplenameprefix = args.prefix
    outputdir = args.output_dir
    intermdir = args.intermediate_dir
    disablesort = args.no_sort
    disambalgo = args.aligner
    supportedalgorithms = set(['tophat', 'bwa', 'star'])

    # check existence of input BAM files
    if not (file_exists(humanfilename) and file_exists(mousefilename)):
        sys.stderr.write("\nERROR in disambiguate.py: Two existing input BAM files "
                         "must be specified as positional arguments\n")
        sys.exit(2)
    if len(samplenameprefix) < 1:
        humanprefix = path.basename(humanfilename.replace(".bam",""))
        mouseprefix = path.basename(mousefilename.replace(".bam",""))
    else:
        if samplenameprefix.endswith(".bam"):
            samplenameprefix = samplenameprefix[0:samplenameprefix.rfind(".bam")] # the above if is not strictly necessary for this to work
        humanprefix = samplenameprefix
        mouseprefix = samplenameprefix
    samplenameprefix = None # clear variable
    if disambalgo.lower() not in supportedalgorithms:
        print(disambalgo+" is not a supported disambiguation scheme at the moment.")
        sys.exit(2)

    if disablesort:
        humanfilenamesorted = humanfilename # assumed to be sorted externally...
        mousefilenamesorted = mousefilename # assumed to be sorted externally...
    else:
        if not path.isdir(intermdir):
            makedirs(intermdir)
        humanfilenamesorted = path.join(intermdir,humanprefix+".speciesA.namesorted.bam")
        mousefilenamesorted = path.join(intermdir,mouseprefix+".speciesB.namesorted.bam")
        if not path.isfile(humanfilenamesorted):
            pysam.sort("-n","-m","2000000000",humanfilename,humanfilenamesorted.replace(".bam",""))
        if not path.isfile(mousefilenamesorted):
            pysam.sort("-n","-m","2000000000",mousefilename,mousefilenamesorted.replace(".bam",""))
    # read in the human and mouse reads from the name-sorted BAM files
    myHumanFile = pysam.Samfile(humanfilenamesorted, "rb" )
    myMouseFile = pysam.Samfile(mousefilenamesorted, "rb" )
    if not path.isdir(outputdir):
        makedirs(outputdir)
    myHumanUniqueFile = pysam.Samfile(path.join(outputdir, humanprefix+".disambiguatedSpeciesA.bam"), "wb", template=myHumanFile)
    myHumanAmbiguousFile = pysam.Samfile(path.join(outputdir, humanprefix+".ambiguousSpeciesA.bam"), "wb", template=myHumanFile)
    myMouseUniqueFile = pysam.Samfile(path.join(outputdir, mouseprefix+".disambiguatedSpeciesB.bam"), "wb", template=myMouseFile)
    myMouseAmbiguousFile = pysam.Samfile(path.join(outputdir, mouseprefix+".ambiguousSpeciesB.bam"), "wb", template=myMouseFile)
    summaryFile = open(path.join(outputdir,humanprefix+'_summary.txt'),'w')

    #initialise
    try:
        nexthumread=myHumanFile.next()
        nextmouread=myMouseFile.next()
    except StopIteration:
        print("No reads in one or either of the input files")
        sys.exit(2)

    EOFmouse = EOFhuman = False
    prevHumID = '-+=RANDOMSTRING=+-'
    prevMouID = '-+=RANDOMSTRING=+-'
    while not (EOFmouse and EOFhuman):
        while not (nat_cmp(nexthumread.qname,nextmouread.qname) == 0):
            # check order between current human and mouse qname (find a point where they're identical, i.e. in sync)
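            # Both inputs are name-sorted with the same ordering, so whichever
            # file is "behind" is advanced (writing its reads as species-unique)
            # until the qnames match or that file is exhausted.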
            while nat_cmp(nexthumread.qname,nextmouread.qname) > 0 and not EOFmouse: # mouse is "behind" human, output to mouse disambiguous
                myMouseUniqueFile.write(nextmouread)
                if not nextmouread.qname == prevMouID:
                    nummou+=1 # increment mouse counter for unique only
                prevMouID = nextmouread.qname
                try:
                    nextmouread=myMouseFile.next()
                except StopIteration:
                    EOFmouse=True
            while nat_cmp(nexthumread.qname,nextmouread.qname) < 0 and not EOFhuman: # human is "behind" mouse, output to human disambiguous
                myHumanUniqueFile.write(nexthumread)
                if not nexthumread.qname == prevHumID:
                    numhum+=1 # increment human counter for unique only
                prevHumID = nexthumread.qname
                try:
                    nexthumread=myHumanFile.next()
                except StopIteration:
                    EOFhuman=True
            if EOFhuman or EOFmouse:
                break
        # at this point the read qnames are identical and/or we've reached EOF
        humlist = list()
        moulist = list()
        if nat_cmp(nexthumread.qname,nextmouread.qname) == 0:
            humlist.append(nexthumread)
            nexthumread = read_next_reads(myHumanFile, humlist) # read more reads with same qname (the function modifies humlist directly)
            if nexthumread == None:
                EOFhuman = True
            moulist.append(nextmouread)
            nextmouread = read_next_reads(myMouseFile, moulist) # read more reads with same qname (the function modifies moulist directly)
            if nextmouread == None:
                EOFmouse = True

        # perform comparison to check mouse, human or ambiguous
        if len(moulist) > 0 and len(humlist) > 0:
            myAmbiguousness = disambiguate(humlist, moulist, disambalgo)
            if myAmbiguousness < 0: # mouse
                nummou+=1 # increment mouse counter
                for myRead in moulist:
                    myMouseUniqueFile.write(myRead)
            elif myAmbiguousness > 0: # human
                numhum+=1 # increment human counter
                for myRead in humlist:
                    myHumanUniqueFile.write(myRead)
            else: # ambiguous
                numamb+=1 # increment ambiguous counter
                for myRead in moulist:
                    myMouseAmbiguousFile.write(myRead)
                for myRead in humlist:
                    myHumanAmbiguousFile.write(myRead)
        if EOFhuman:
            #flush the rest of the mouse reads
            while not EOFmouse:
                myMouseUniqueFile.write(nextmouread)
                if not nextmouread.qname == prevMouID:
                    nummou+=1 # increment mouse counter for unique only
                prevMouID = nextmouread.qname
                try:
                    nextmouread=myMouseFile.next()
                except StopIteration:
                    #print("3")
                    EOFmouse=True
        if EOFmouse:
            #flush the rest of the human reads
            while not EOFhuman:
                myHumanUniqueFile.write(nexthumread)
                if not nexthumread.qname == prevHumID:
                    numhum+=1 # increment human counter for unique only
                prevHumID = nexthumread.qname
                try:
                    nexthumread=myHumanFile.next()
                except StopIteration:
                    EOFhuman=True

    summaryFile.write("sample\tunique species A pairs\tunique species B pairs\tambiguous pairs\n")
    summaryFile.write(humanprefix+"\t"+str(numhum)+"\t"+str(nummou)+"\t"+str(numamb)+"\n")
    summaryFile.close()
    myHumanFile.close()
    myMouseFile.close()
    myHumanUniqueFile.close()
    myHumanAmbiguousFile.close()
    myMouseUniqueFile.close()
    myMouseAmbiguousFile.close()
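A minimal, self-contained sketch of the qname-synchronisation idea used above: walk two name-sorted streams in lockstep, treat reads seen in only one stream as unique to that species, and collect reads seen in both for disambiguation. Plain string comparison stands in for nat_cmp, and Python lists stand in for the pysam iterators; none of this is part of the original tool.

def sync_by_qname(names_a, names_b):
    """Merge-style walk over two sorted qname lists (illustrative only)."""
    only_a, only_b, shared = [], [], []
    i = j = 0
    while i < len(names_a) and j < len(names_b):
        if names_a[i] == names_b[j]:        # in sync: candidate for disambiguation
            shared.append(names_a[i])
            i += 1
            j += 1
        elif names_a[i] < names_b[j]:       # A is "behind" B: unique to A
            only_a.append(names_a[i])
            i += 1
        else:                               # B is "behind" A: unique to B
            only_b.append(names_b[j])
            j += 1
    only_a.extend(names_a[i:])              # flush the remainder once one stream ends
    only_b.extend(names_b[j:])
    return only_a, only_b, shared

# sync_by_qname(['r1', 'r2', 'r4'], ['r2', 'r3', 'r4']) -> (['r1'], ['r3'], ['r2', 'r4'])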
Example n. 48
0
def filter_reads(bam,
                 positions,
                 fasta_length,
                 filter_cutoff=0.97,
                 max_insert_relative=3,
                 min_insert=50,
                 min_mapq=2,
                 write_data=None,
                 write_bam=False):

    # read sets
    observed_read1s = set()
    observed_read2s = set()
    mapped_pairs = set()
    final_reads = set()

    # counters
    total_read_count = 0
    total_read_pairs = 0
    total_mapped_pairs = 0
    mapped_read_lengths = 0

    # storing data
    read_data = {}
    pair_mapqs = {}
    pair_mismatch = {}
    pair_inserts = {}

    samfile = pysam.AlignmentFile(bam)

    #for printing out a new bam file
    if write_bam:
        logging.info("Copying header for new bam...")
        samfile_out = pysam.AlignmentFile(bam.split("/")[-1].split(".")[0] +
                                          "_filtered.bam",
                                          "wb",
                                          template=samfile)
        reads_all = defaultdict(list)

    logging.info("READING BAM: " + bam.split("/")[-1])
    logging.info("Using reads with >" + str(filter_cutoff) +
                 "% PID to consensus reference.")

    ## STEP 1: collect paired reads and their information
    for gene in tqdm(positions, desc='Getting read pairs: '):
        for read in samfile.fetch(gene[0], gene[1], gene[2]):
            total_read_count += 1

            #store all reads if we're going to write them back to a new bam file
            if write_bam:
                reads_all[read.query_name].append(read)

            ## If we've seen this read's pair before
            if (read.is_read2 and read.query_name in observed_read1s) or (
                    read.is_read1 and read.query_name in observed_read2s):

                #But if we haven't already seen this complete pair, then we can complete the pair and store the information
                #Also check that the pair is on the same scaffold
                if read.query_name not in mapped_pairs and gene[0] == read_data[read.query_name]['scaf']:
                    total_read_pairs += 1

                    if read.get_reference_positions() != []:
                        total_mapped_pairs += 1
                        mapped_pairs.add(read.query_name)  #add to found

                        #for calculating mean read length
                        mapped_read_lengths += float(read_data[read.query_name]['len'])

                        #set mismatch percentage
                        pair_mismatch[read.query_name] = 1 - (
                            (float(read_data[read.query_name]['nm']) + float(read.get_tag('NM'))) /
                            (float(read_data[read.query_name]['len']) + read.infer_query_length()))

                        #set insert size
                        if read.get_reference_positions()[-1] > read_data[read.query_name]['start']:
                            pair_inserts[read.query_name] = read.get_reference_positions()[-1] - read_data[read.query_name]['start']
                        else:
                            pair_inserts[read.query_name] = read_data[read.query_name]['stop'] - read.get_reference_positions()[0]

                        #set mapq
                        pair_mapqs[read.query_name] = read.mapping_quality
                        if read_data[read.query_name]['mapq'] > read.mapping_quality:
                            pair_mapqs[read.query_name] = read_data[read.query_name]['mapq']

            #this is the first time we see a read from this pair and don't double count
            elif (read.is_read1 and read.query_name not in observed_read1s) or \
                 (read.is_read2 and read.query_name not in observed_read2s):
                if read.get_reference_positions() != []:  # don't use unmapped reads
                    if read.is_read1:
                        observed_read1s.add(read.query_name)
                    else:
                        observed_read2s.add(read.query_name)
                    #record the data for this read
                    read_data[read.query_name] = {
                        "nm": read.get_tag('NM'),
                        "len": read.infer_query_length(),
                        "mapq": read.mapping_quality,
                        "start": read.get_reference_positions()[0],
                        'stop': read.get_reference_positions()[-1],
                        'scaf': gene[0]
                    }

    ## STEP 2: INSERT SIZE CUTOFF, MAPQ CUTOFF, AND MISMATCH CUTOFF
    mapped_read_lengths = mapped_read_lengths / total_mapped_pairs

    max_insert = np.median(
        list(pair_inserts.values())
    ) * max_insert_relative  #insert size should be less than max_insert_relative * median value
    too_short = 0.0
    too_long = 0.0
    good_length = 0.0
    mapq_good = 0.0
    filter_cutoff_good = 0.0

    logging.info("Filtering reads...")

    for read_pair in mapped_pairs:
        if pair_inserts[read_pair] > min_insert:
            if pair_inserts[read_pair] < max_insert:
                good_length += 2
                if pair_mapqs[read_pair] > min_mapq:
                    mapq_good += 2

                    # Which set does this read go into?
                    if pair_mismatch[read_pair] > filter_cutoff:
                        filter_cutoff_good += 2
                        final_reads.add(read_pair)

                        #write out to new bam file if option selected
                        if write_bam:
                            for read in reads_all[read_pair]:
                                samfile_out.write(read)
            else:
                too_long += 2
        else:
            too_short += 2

    table = defaultdict(list)
    table["total reads found"].append(str(total_read_count))
    table["average mapped read length"].append(str(mapped_read_lengths))
    table["total fasta length"].append(str(fasta_length))
    table["expected possible coverage"].append(
        str(float(total_read_count) * mapped_read_lengths / fasta_length))
    table["total paired reads"].append(str(total_read_pairs * 2))
    table["total paired reads (%)"].append(
        str(int(100 * total_read_pairs * 2.0 / total_read_count)))
    table["total same scaffold mapped paired reads"].append(
        str(total_mapped_pairs * 2))
    table["total same scaffold mapped paired reads (%)"].append(
        str(int(100 * total_read_pairs * 2.0 / total_read_count)))
    table["median insert size"].append(str(max_insert / max_insert_relative))
    table["paired reads < 50 bp apart"].append(str(too_short))
    table["max insert"].append(str(max_insert))
    table["paired reads > max insert apart"].append(str(too_long))
    table["reads which also pass both pair insert size filters"].append(
        str(good_length))
    table["reads which also pass both pair insert size filters (%)"].append(
        str(int(100 * float(good_length) / total_read_count)))
    table["minimum mapq threshold"].append(str(min_mapq))
    table["reads which pass minimum mapq threshold"].append(str(mapq_good))
    table["reads which pass minimum mapq threshold (%)"].append(
        str(int(100 * float(mapq_good) / total_read_count)))
    table['minimum PID'].append(str(filter_cutoff))
    table["(final) reads which also pass read pair PID"].append(
        filter_cutoff_good)
    table["(final) reads which also pass read pair PID (%)"].append(
        str(int(100 * float(filter_cutoff_good) / total_read_count)))
    table["(final) expected coverage"].append(
        str(float(filter_cutoff_good) * mapped_read_lengths / fasta_length))
    Rdb = pd.DataFrame(table)

    logging.debug("**READ STATSTICS**")
    logging.debug("total reads found: " + str(total_read_count))
    logging.debug("average mapped read length: " + str(mapped_read_lengths))
    logging.debug("total fasta length: " + str(fasta_length))
    logging.debug(
        "expected possible coverage: " +
        str(float(total_read_count) * mapped_read_lengths / fasta_length))
    logging.debug("total paired reads: " + str(total_read_pairs * 2) + " (" +
                  str(int(100 * total_read_pairs * 2.0 / total_read_count)) +
                  "%)")
    logging.debug("total same scaffold mapped paired reads: " +
                  str(total_mapped_pairs * 2) + " (" +
                  str(int(100 * total_mapped_pairs * 2.0 / total_read_count)) +
                  "%)")
    logging.debug("")
    logging.debug("median insert size: " +
                  str(max_insert / max_insert_relative))
    logging.debug("paired reads < 50 bp apart: " + str(too_short))
    logging.debug("paired reads > " + str(max_insert) + " apart: " +
                  str(too_long))
    logging.debug("reads which also pass both pair insert size filters: " +
                  str(good_length) + " (" +
                  str(int(100 * float(good_length) / total_read_count)) + "%)")
    logging.debug("reads which pass minimum mapq threshold of " +
                  str(min_mapq) + ": " + str(mapq_good) + " (" +
                  str(int(100 * float(mapq_good) / total_read_count)) + "%)")
    logging.debug("(final) reads which also pass read pair PID >" +
                  str(filter_cutoff) + "%: " + str(filter_cutoff_good) + " (" +
                  str(int(100 * float(filter_cutoff_good) /
                          total_read_count)) + "%)")
    logging.debug(
        "(final) expected coverage: " +
        str(float(filter_cutoff_good) * mapped_read_lengths / fasta_length))

    ## STEP 3: WRITE DATA IF NEEDED
    if write_data:
        f = open(write_data, 'w+')
        for read_pair in mapped_pairs:
            f.write(read_pair + "\t" + "\t" + str(pair_inserts[read_pair]) +
                    "\t" + str(pair_mapqs[read_pair]) + "\t" +
                    str(pair_mismatch[read_pair]) + "\n")
        f.close()
    ## STEP 4: WRITE NEW BAM IF NEEDED

    samfile.close()
    if write_bam:
        samfile_out.close()
        logging.info("sorting new bam")
        pysam.sort("-o",
                   bam.split("/")[-1].split(".")[0] + "_filtered_sort.bam",
                   bam.split("/")[-1].split(".")[0] + "_filtered.bam")
        os.system('rm ' + bam.split("/")[-1].split(".")[0] + "_filtered.bam")

    return final_reads, Rdb
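A minimal sketch of the pair percent-identity (PID) filter applied above, assuming NM edit-distance tags and query lengths for both mates; the function name is illustrative, not part of the original module.

def pair_percent_identity(nm1, len1, nm2, len2):
    """Fraction of matching bases across a read pair, as in pair_mismatch above."""
    return 1.0 - (float(nm1) + float(nm2)) / (float(len1) + float(len2))

# Two 150 bp mates with 2 and 1 mismatches:
# pair_percent_identity(2, 150, 1, 150) -> 0.99, which passes filter_cutoff=0.97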
Example n. 49
0
        if read2 and read2.flag & 0x4:
            print read

        if read1 is None and read2 is None:
            print "Somehow we ended up with a double empty"
            assert False
        elif read1 is not None and read2 is not None:
            print "Somehow we didn't clear out the double-map"
            assert False
        elif read2 is None:
            read1.rnext = -1
            read1.pnext = 0
            read1.tlen = 0
            read1.flag = read1.flag | 0x8
            outfile.write(read1)
        elif read1 is None:
            read2.rnext = -1
            read2.pnext = 0
            read2.tlen = 0
            read2.flag = read2.flag | 0x8
            outfile.write(read2)
        else:
            print "How did we get here?"
            assert False
    filename = outfile.filename
    outfile.close()

    sorted_filename = filename[:filename.index('_unsorted')]
    print "Sorting into", sorted_filename
    pysam.sort('-m', "%d" % 3e9, filename, sorted_filename)
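The snippet above relies on two standard SAM flag bits: 0x4 (this segment is unmapped) and 0x8 (the mate is unmapped). A sketch of the same "mark the missing mate" fix-up, assuming the current pysam AlignedSegment attribute names; the helper is illustrative only.

import pysam

MATE_UNMAPPED = 0x8  # standard SAM flag bit, set above with `| 0x8`

def mark_mate_unmapped(read):
    """Clear mate coordinates and set the mate-unmapped bit on a lone surviving read."""
    read.next_reference_id = -1     # rnext: no mate reference
    read.next_reference_start = 0   # pnext, mirroring the snippet above
    read.template_length = 0        # tlen
    read.flag |= MATE_UNMAPPED
    return read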
Example n. 50
0
def main(inputs,
         output,
         bam_file,
         strand_specific,
         library,
         protocol,
         median_fragment_size,
         stdev_fragment_size,
         read_length,
         reference_genome,
         annotations,
         masking,
         aligner_reference,
         start_time=int(time.time()),
         **kwargs):
    """
    Args:
        inputs (list): list of input files containing the breakpoint pairs
        output (str): path to the output directory
        bam_file (str): path the bam file
        strand_specific (bool): flag to indicate the input bam is using a strand specific protocol
        median_fragment_size (int): the median fragment size
        stdev_fragment_size (int): the standard deviation in fragment size
        read_length (int): read length
        reference_genome (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_reference_genome`
        annotations (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_reference_genes`
        masking (:class:`~mavis.annotate.file_io.ReferenceFile`): see :func:`~mavis.annotate.file_io.load_masking_regions`
        aligner_reference (:class:`~mavis.annotate.file_io.ReferenceFile`): path to the aligner reference file (e.g. 2bit file for blat)
    """
    mkdirp(output)
    # check the files exist early to avoid waiting for errors
    if protocol == PROTOCOL.TRANS:
        annotations.load()
    reference_genome.load()
    masking.load()

    validation_settings = {}
    validation_settings.update(DEFAULTS.items())
    validation_settings.update(
        {k: v
         for k, v in kwargs.items() if k in DEFAULTS})
    validation_settings = MavisNamespace(**validation_settings)

    raw_evidence_bam = os.path.join(output, 'raw_evidence.bam')
    contig_bam = os.path.join(output, 'contigs.bam')
    evidence_bed = os.path.join(output, 'evidence.bed')

    passed_output_file = os.path.join(output, PASS_FILENAME)
    passed_bed_file = os.path.join(output, 'validation-passed.bed')
    failed_output_file = os.path.join(output, 'validation-failed.tab')
    contig_aligner_fa = os.path.join(output, 'contigs.fa')
    if validation_settings.aligner == SUPPORTED_ALIGNER.BLAT:
        contig_aligner_output = os.path.join(output, 'contigs.blat_out.pslx')
        contig_aligner_log = os.path.join(output, 'contigs.blat.log')
    elif validation_settings.aligner == SUPPORTED_ALIGNER.BWA_MEM:
        contig_aligner_output = os.path.join(output, 'contigs.bwa_mem.sam')
        contig_aligner_log = os.path.join(output, 'contigs.bwa_mem.log')
    else:
        raise NotImplementedError('unsupported aligner',
                                  validation_settings.aligner)
    igv_batch_file = os.path.join(output, 'igv.batch')
    input_bam_cache = BamCache(bam_file, strand_specific)

    bpps = read_inputs(
        inputs,
        add_default={
            COLUMNS.cluster_id: None,
            COLUMNS.stranded: False
        },
        add={
            COLUMNS.protocol: protocol,
            COLUMNS.library: library
        },
        expand_strand=False,
        expand_orient=True,
        cast={COLUMNS.cluster_id: lambda x: str(uuid()) if not x else x})
    evidence_clusters = []
    for bpp in bpps:
        if bpp.data[COLUMNS.protocol] == PROTOCOL.GENOME:
            try:
                evidence = GenomeEvidence(
                    bpp.break1,
                    bpp.break2,
                    input_bam_cache,
                    reference_genome.content,
                    opposing_strands=bpp.opposing_strands,
                    stranded=bpp.stranded,
                    untemplated_seq=bpp.untemplated_seq,
                    data=bpp.data,
                    stdev_fragment_size=stdev_fragment_size,
                    read_length=read_length,
                    median_fragment_size=median_fragment_size,
                    **dict(validation_settings.items()))
                evidence_clusters.append(evidence)
            except ValueError as err:
                warnings.warn(
                    'Dropping breakpoint pair ({}) as bad input {}'.format(
                        str(bpp), str(err)))
        elif bpp.data[COLUMNS.protocol] == PROTOCOL.TRANS:
            try:
                evidence = TranscriptomeEvidence(
                    annotations.content,
                    bpp.break1,
                    bpp.break2,
                    input_bam_cache,
                    reference_genome.content,
                    opposing_strands=bpp.opposing_strands,
                    stranded=bpp.stranded,
                    untemplated_seq=bpp.untemplated_seq,
                    data=bpp.data,
                    stdev_fragment_size=stdev_fragment_size,
                    read_length=read_length,
                    median_fragment_size=median_fragment_size,
                    **dict(validation_settings.items()))
                evidence_clusters.append(evidence)
            except ValueError as err:
                warnings.warn('Dropping ({}) as bad input {}'.format(
                    str(bpp), str(err)))
        else:
            raise ValueError('protocol error', bpp.data[COLUMNS.protocol])

    extended_masks = {}
    for chrom, masks in masking.content.items():  # extend masking by read length
        extended_masks[chrom] = []
        for mask in masks:
            extended_masks[chrom].append(
                BioInterval(chrom,
                            mask.start - read_length,
                            mask.end + read_length,
                            name=mask.name))

    evidence_clusters, filtered_evidence_clusters = filter_on_overlap(
        evidence_clusters, extended_masks)
    contig_sequences = {}
    for i, evidence in enumerate(evidence_clusters):
        LOG()
        LOG('({} of {})'.format(i + 1, len(evidence_clusters)),
            'gathered evidence for:',
            evidence.cluster_id,
            '' if COLUMNS.tracking_id not in evidence.data else
            '(tracking_id: {})'.format(evidence.tracking_id),
            time_stamp=True)
        LOG(evidence, time_stamp=False)
        LOG('possible event type(s):',
            BreakpointPair.classify(evidence),
            time_stamp=False)
        LOG('outer window regions:  {}:{}-{}  {}:{}-{}'.format(
            evidence.break1.chr, evidence.outer_window1[0],
            evidence.outer_window1[1], evidence.break2.chr,
            evidence.outer_window2[0], evidence.outer_window2[1]),
            time_stamp=False)
        LOG('inner window regions:  {}:{}-{}  {}:{}-{}'.format(
            evidence.break1.chr, evidence.inner_window1[0],
            evidence.inner_window1[1], evidence.break2.chr,
            evidence.inner_window2[0], evidence.inner_window2[1]),
            time_stamp=False)
        evidence.load_evidence(log=LOG)
        LOG('flanking pairs: {};'.format(len(evidence.flanking_pairs)),
            'split reads: {}, {};'.format(
                *[len(a) for a in evidence.split_reads]),
            'half-mapped reads: {}, {};'.format(
                *[len(a) for a in evidence.half_mapped]),
            'spanning-reads: {};'.format(len(evidence.spanning_reads)),
            'compatible flanking pairs:',
            len(evidence.compatible_flanking_pairs),
            time_stamp=False)
        evidence.assemble_contig(log=LOG)
        LOG('assembled {} contigs'.format(len(evidence.contigs)),
            time_stamp=False)
        for contig in evidence.contigs:
            name = 'seq-{}'.format(
                hashlib.md5(contig.seq.encode('utf-8')).hexdigest())
            LOG('>',
                name,
                '(size={}; reads={:.0f}; coverage={:.2f})'.format(
                    len(contig.seq), contig.remap_score(),
                    contig.remap_coverage()),
                time_stamp=False)
            LOG(contig.seq[:140], time_stamp=False)
            contig_sequences[name] = contig.seq

    LOG('will output:', contig_aligner_fa, contig_aligner_output)
    raw_contig_alignments = align_sequences(
        contig_sequences,
        input_bam_cache,
        reference_genome=reference_genome.content,
        aligner_fa_input_file=contig_aligner_fa,
        aligner_output_file=contig_aligner_output,
        clean_files=validation_settings.clean_aligner_files,
        aligner=kwargs.get('aligner', validation_settings.aligner),
        aligner_reference=aligner_reference.name[0],
        aligner_output_log=contig_aligner_log,
        blat_min_identity=kwargs.get('blat_min_identity',
                                     validation_settings.blat_min_identity),
        blat_limit_top_aln=kwargs.get('blat_limit_top_aln',
                                      validation_settings.blat_limit_top_aln),
        log=LOG)
    for evidence in evidence_clusters:
        select_contig_alignments(evidence, raw_contig_alignments)
    LOG('alignment complete', time_stamp=True)
    event_calls = []
    total_pass = 0
    write_bed_file(
        evidence_bed,
        itertools.chain.from_iterable(
            [e.get_bed_repesentation() for e in evidence_clusters]))
    validation_counts = {}
    for index, evidence in enumerate(evidence_clusters):
        LOG()
        LOG('({} of {}) calling events for: {} {} (tracking_id: {})'.format(
            index + 1, len(evidence_clusters), evidence.cluster_id,
            evidence.putative_event_types(), evidence.tracking_id),
            time_stamp=True)
        LOG('source:', evidence)
        calls = []
        failure_comment = None
        try:
            calls = call_events(evidence)
            event_calls.extend(calls)
        except UserWarning as err:
            LOG('warning: error in calling events', repr(err))
            failure_comment = str(err)

        if not calls:
            failure_comment = [
                'zero events were called'
            ] if failure_comment is None else failure_comment
            evidence.data[COLUMNS.filter_comment] = failure_comment
            filtered_evidence_clusters.append(evidence)
        else:
            total_pass += 1

        LOG('called {} event(s)'.format(len(calls)), time_stamp=True)
        for call in calls:
            LOG(call)
            if call.call_method == CALL_METHOD.CONTIG:
                LOG('\t{} {} [{}] contig_alignment_score: {}, contig_alignment_mq: {} contig_alignment_rank: {}'
                    .format(call.event_type, call.call_method,
                            call.contig_alignment.query_name,
                            round(call.contig_alignment.score(), 2),
                            tuple(call.contig_alignment.mapping_quality()),
                            tuple(call.contig_alignment.alignment_rank())))
                LOG('\talignment:', call.contig_alignment.alignment_id())
            elif call.contig_alignment:
                LOG(
                    '\t{} {} alignment:'.format(call.event_type,
                                                call.call_method),
                    call.contig_alignment.alignment_id())
            else:
                LOG('\t{} {}'.format(call.event_type, call.call_method),
                    time_stamp=False)
            validation_counts[call.cluster_id] = validation_counts.get(
                call.cluster_id, 0) + 1
            call.data[COLUMNS.validation_id] = '{}-v{}'.format(
                call.cluster_id, validation_counts[call.cluster_id])
            LOG('\tremapped reads: {}; spanning reads: {}; split reads: [{} ({}), {} ({}), {}]'
                ', flanking pairs: {}{}'.format(
                    0 if not call.contig else len(call.contig.input_reads),
                    len(call.spanning_reads),
                    len(call.break1_split_read_names()),
                    len(call.break1_split_read_names(tgt=True)),
                    len(call.break2_split_read_names()),
                    len(call.break2_split_read_names(tgt=True)),
                    len(call.linking_split_read_names()),
                    len(call.flanking_pairs),
                    '' if not call.has_compatible else '(' +
                    str(len(call.compatible_flanking_pairs)) + ')'))

    # write the output validated clusters (split by type and contig)
    for i, call in enumerate(event_calls):
        b1_homseq = None
        b2_homseq = None
        try:
            b1_homseq, b2_homseq = call.breakpoint_sequence_homology(
                reference_genome.content)
        except AttributeError:
            pass
        call.data.update({
            COLUMNS.break1_homologous_seq: b1_homseq,
            COLUMNS.break2_homologous_seq: b2_homseq,
        })
    LOG('{} putative calls resulted in {} events with 1 or more event call'.
        format(len(evidence_clusters), total_pass),
        time_stamp=True)
    output_tabbed_file(event_calls, passed_output_file)
    output_tabbed_file(filtered_evidence_clusters, failed_output_file)
    write_bed_file(
        passed_bed_file,
        itertools.chain.from_iterable(
            [e.get_bed_repesentation() for e in event_calls]))

    if validation_settings.write_evidence_files:
        with pysam.AlignmentFile(contig_bam, 'wb',
                                 template=input_bam_cache.fh) as fh:
            LOG('writing:', contig_bam, time_stamp=True)
            for evidence in evidence_clusters:
                for contig in evidence.contigs:
                    for aln in contig.alignments:
                        aln.read1.cigar = _cigar.convert_for_igv(
                            aln.read1.cigar)
                        fh.write(aln.read1)
                        if aln.read2:
                            aln.read2.cigar = _cigar.convert_for_igv(
                                aln.read2.cigar)
                            fh.write(aln.read2)

        # write the evidence
        with pysam.AlignmentFile(raw_evidence_bam,
                                 'wb',
                                 template=input_bam_cache.fh) as fh:
            LOG('writing:', raw_evidence_bam, time_stamp=True)
            reads = set()
            for evidence in evidence_clusters:
                reads.update(evidence.supporting_reads())
            for read in reads:
                read.cigar = _cigar.convert_for_igv(read.cigar)
                fh.write(read)
        # now sort the contig bam
        sort = re.sub(r'.bam$', '.sorted.bam', contig_bam)
        LOG('sorting the bam file:', contig_bam, time_stamp=True)
        pysam.sort('-o', sort, contig_bam)
        contig_bam = sort
        LOG('indexing the sorted bam:', contig_bam)
        pysam.index(contig_bam)

        # then sort the evidence bam file
        sort = re.sub(r'.bam$', '.sorted.bam', raw_evidence_bam)
        LOG('sorting the bam file:', raw_evidence_bam, time_stamp=True)
        pysam.sort('-o', sort, raw_evidence_bam)
        raw_evidence_bam = sort
        LOG('indexing the sorted bam:', raw_evidence_bam)
        pysam.index(raw_evidence_bam)

        # write the igv batch file
        with open(igv_batch_file, 'w') as fh:
            LOG('writing:', igv_batch_file, time_stamp=True)

            fh.write('load {} name="{}"\n'.format(passed_bed_file,
                                                  'passed events'))
            fh.write('load {} name="{}"\n'.format(contig_bam,
                                                  'aligned contigs'))
            fh.write('load {} name="{}"\n'.format(evidence_bed,
                                                  'evidence windows'))
            fh.write('load {} name="{}"\n'.format(raw_evidence_bam,
                                                  'raw evidence'))
            fh.write('load {} name="{} {} input"\n'.format(
                bam_file, library, protocol))
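The sort-then-index pattern used above for both the contig and raw-evidence BAMs, pulled out as a stand-alone sketch; the helper name and file paths are illustrative.

import re

import pysam

def sort_and_index(bam_path):
    """Coordinate-sort a BAM through pysam's samtools wrapper, then index the sorted copy."""
    sorted_path = re.sub(r'.bam$', '.sorted.bam', bam_path)
    pysam.sort('-o', sorted_path, bam_path)
    pysam.index(sorted_path)
    return sorted_path

# sort_and_index('contigs.bam') -> 'contigs.sorted.bam' (plus 'contigs.sorted.bam.bai')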
Example n. 51
0
def buildNormalizedBAM(infiles, outfile, normalize=True):
    '''build a normalized BAM file.

    Infiles are merged and duplicated reads are removed.  If
    *normalize* is set, reads are removed such that all files will
    have approximately the same number of reads.

    Note that the duplicate removal here is not strictly correct, as
    strandedness is not taken into account.

    '''

    min_reads = getMinimumMappedReads(glob.glob("*.readstats"))

    samfiles = []
    num_reads = 0
    for infile, statsfile in infiles:
        samfiles.append(pysam.Samfile(infile, "rb"))
        num_reads += getMappedReads(statsfile)

    threshold = float(min_reads) / num_reads

    E.info("%s: min reads: %i, total reads=%i, threshold=%f" %
           (infiles, min_reads, num_reads, threshold))

    pysam_out = pysam.Samfile(outfile, "wb", template=samfiles[0])

    ninput, noutput, nduplicates = 0, 0, 0

    # iterate over mapped reads
    last_contig, last_pos = None, None
    for pysam_in in samfiles:
        for read in pysam_in.fetch():

            ninput += 1
            if read.rname == last_contig and read.pos == last_pos:
                nduplicates += 1
                continue

            if normalize and random.random() <= threshold:
                pysam_out.write(read)
                noutput += 1

            last_contig, last_pos = read.rname, read.pos

        pysam_in.close()

    pysam_out.close()

    logs = IOTools.openFile(outfile + ".log", "w")
    logs.write("# min_reads=%i, threshold= %5.2f\n" %
               (min_reads, threshold))
    logs.write("set\tcounts\tpercent\n")
    logs.write("ninput\t%i\t%5.2f%%\n" % (ninput, 100.0))
    nwithout_dups = ninput - nduplicates
    logs.write("duplicates\t%i\t%5.2f%%\n" %
               (nduplicates, 100.0 * nduplicates / ninput))
    logs.write("without duplicates\t%i\t%5.2f%%\n" %
               (nwithout_dups, 100.0 * nwithout_dups / ninput))
    logs.write("target\t%i\t%5.2f%%\n" %
               (min_reads, 100.0 * min_reads / nwithout_dups))
    logs.write("noutput\t%i\t%5.2f%%\n" %
               (noutput, 100.0 * noutput / nwithout_dups))

    logs.close()

    # if more than one samfile: sort
    if len(samfiles) > 1:
        tmpfilename = P.getTempFilename(".")
        pysam.sort(outfile, tmpfilename)
        shutil.move(tmpfilename + ".bam", outfile)
        os.unlink(tmpfilename)

    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, min_reads))
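A minimal sketch of the normalisation rule buildNormalizedBAM applies: each non-duplicate read is kept with probability min_reads / total_reads, so every merged input ends up contributing roughly the same number of reads. The helper name is illustrative.

import random

def keep_read(min_reads, total_reads):
    """Return True with probability min_reads / total_reads, as in the loop above."""
    threshold = float(min_reads) / total_reads
    return random.random() <= threshold

# Over ~10000 reads, roughly min_reads survive:
# sum(keep_read(1000, 10000) for _ in range(10000)) is close to 1000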
Example n. 52
0
def picVal(opts=None):
    """
    called with sam so no need to convert
    """
    assert opts <> None
    killme = string.punctuation + string.whitespace
    trantab = string.maketrans(killme, '_' * len(killme))
    title = opts.title.translate(trantab)
    tempout = os.path.join(opts.output_dir, 'rgPicardValidate.out')
    temptab = os.path.join(opts.output_dir, 'rgPicardValidate.xls')
    opts.log_file = opts.log or os.path.join(opts.output_dir,
                                             'rgPicardValidate_%s.log' % title)
    # Create output folder and save our R script in there.
    stf = open(opts.log_file, 'w')
    sortedfile = None
    if verbose:
        print '# opts.ignore', opts.ignore, ' opts.sortme=', opts.sortme
    if opts.sortme:
        fd, sortedfile = tempfile.mkstemp(suffix='rgcleansam.sorted.bam')
        if opts.datatype == 'sam':  # need to work with a bam
            tempbam = samToBam(opts.input, opts.outdir)
            pysam.sort(tempbam, sortedfile)
        else:  # is already bam
            pysam.sort(opts.input, sortedfile)
    cl = [
        'java -Xmx', opts.maxjheap, ' -jar ', opts.jar, ' O=', tempout,
        ' TMP_DIR=', opts.tmp_dir
    ]
    if verbose:
        print '# cl so far', cl
    if opts.sortme:
        cl.append(' I=%s' % sortedfile)
    else:
        cl.append(' I=%s' % opts.input)
    if opts.maxoutput == '0':
        opts.maxoutput = '65535'
    cl.append(' MAX_OUTPUT=%s' % opts.maxoutput)
    if opts.ignore[0] <> 'None':  # picard error values to ignore
        cl += [' IGNORE=%s' % x for x in opts.ignore if x <> 'None']
    if opts.bisulphite.lower() <> 'false':
        cl.append(' IS_BISULFITE_SEQUENCED=true')
    if opts.refseq <> '':
        cl += [
            ' R=%s' % opts.refseq,
        ]
    s1 = ' '.join(['"%s"' % x for x in cl])
    s = '## rgPicardValidate.py about to Popen:\n%s\n' % s1
    stf.write(s)
    if verbose:
        print s
    pefilename = os.path.join(opts.output_dir,
                              'rgPicardValidate_%s.errors' % title)
    picerrors = open(pefilename, 'w')
    process = Popen(''.join(cl),
                    shell=True,
                    stderr=picerrors,
                    stdout=picerrors,
                    cwd=opts.output_dir)
    return_value = process.wait()
    picerrors.close()
    pe = open(pefilename, 'r').readlines()
    stf.write('## got %d rows - first few =%s\n' %
              (len(pe), '\n'.join(pe[:5])))
    if opts.dryrun <> 'dryrun':  # want to run cleansam
        if opts.dryrun == 'sam':
            outformat = 'sam'
            newsam = opts.sam
        elif opts.dryrun == 'bam':
            outformat = 'bam'
            newsam = opts.bam
        cleanSam(insam=opts.input,
                 newsam=newsam,
                 picardErrors=pe,
                 outformat=outformat,
                 sortme=opts.sortme)
    stf.close()
    fixPicardOutputs(tempout=tempout,
                     output_dir=opts.output_dir,
                     log_file=opts.log_file,
                     html_output=opts.html_output,
                     progname=progname,
                     cl=cl,
                     transpose=False)
    if opts.sortme:
        os.unlink(sortedfile)
        if opts.datatype == 'sam':  # was converted
            os.unlink(tempbam)  # temporary
Example n. 53
0
def filter_bam_multihits(filename, max_hits, tmp_dir, read_tagger, omit_detail=True):
	"""Pre-processing function for cleaning up the input bam file.
	Args:
	Returns:
	"""
	logger.info('Filtering input bam..')
	
	in_bam = pysam.Samfile(filename,'rb')
	# unique read bam
	ubam_fn = os.path.join(tmp_dir, 'unique.bam')
	sorted_ubam_fn = os.path.join(tmp_dir, 'unique.sorted.bam')
	ubam=pysam.Samfile(ubam_fn, 'wb', template=in_bam)
	unique_counter = 0
	
	# multi-read bam
	mbam_fn = os.path.join(tmp_dir, 'multi.bam')
	sorted_mbam_fn = os.path.join(tmp_dir, 'multi.sorted.bam')
	mbam=pysam.Samfile(mbam_fn, 'wb', template=in_bam)
	mread_set = set()
	
	# splitting unique and multi- reads
	# and add the read taggers we need
	for read in tqdm(in_bam):
		read_tag = read_tagger(read)
		## skip reads with unassigned tagger
		if read_tag==-1:
			continue
		read.tags += [('RT', read_tag)] ## add the tag
		## omit the details in read sequence and quality
		## recommended for larger bam because this
		## can save some memory/storage for large bams
		if omit_detail:
			read.query_sequence = '*'
			read.query_qualities = [0]
		if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1):
			try:
				if read.opt("NH") < max_hits:
					mbam.write(read)
					mread_set.add(read.qname)
			except KeyError:
				#print read
				raise Exception('%s: missing NH tag when is_secondary=%s'%(read.qname,read.is_secondary))
		else:
			ubam.write(read)
			unique_counter += 1
	
	in_bam.close()
	ubam.close()
	mbam.close()
	
	# sorting
	pysam.sort('-o', sorted_ubam_fn, ubam_fn)
	os.remove(ubam_fn)
	pysam.sort('-o', sorted_mbam_fn, mbam_fn)
	os.remove(mbam_fn)
	pysam.index(sorted_ubam_fn)
	pysam.index(sorted_mbam_fn)
	
	# log the statistics
	multi_counter = len(mread_set)
	logger.info(
			'Unique reads = %s;  ' % unique_counter + \
			'Multi reads = %s (%.2f %%)' % \
			( multi_counter, float(multi_counter)/(multi_counter+unique_counter)*100 )
		)
	return
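filter_bam_multihits expects read_tagger to be a callable that maps each read to an integer tag (stored as the RT tag) or -1 to have the read skipped. A hypothetical tagger and invocation, for illustration only; the real tagger depends on the surrounding pipeline.

def tag_by_five_prime(read):
    """Hypothetical tagger: 5' alignment position of the read, or -1 if unmapped."""
    if read.is_unmapped:
        return -1
    return read.reference_end - 1 if read.is_reverse else read.reference_start

# filter_bam_multihits('input.bam', max_hits=10, tmp_dir='tmp',
#                      read_tagger=tag_by_five_prime, omit_detail=True)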
Example n. 54
0
            okread.is_paired = True
            okread.is_read1 = is_first
            okread.is_read2 = not is_first
            sout.write(okread)

    os.remove(mapped5)
    os.remove(mapped3)
    os.remove(mappedUs)

    # Sorting the resulting file. We do it this way because the original
    # file wasn't guaranteed to be sorted. To try to merge the three files
    # (split, 5' and 3'), we'd need to assume some sorting order and I'm
    # not willing to do that. Sorting afterward enforces the 'samtools'
    # name ordering over anything that might have been there originally.
    bsorted = os.path.join(tmpdir, "sorted.bam")
    pysam.sort("-o", bsorted, "-n", outbam)
    os.rename(bsorted, outbam)

####################################################################################################
# Stitching two files together to reform a single BAM file.

cmd = ["-n", "-f", args.output] + allnames
pysam.merge(*cmd)
for x in allnames:
    os.remove(x)

# Mopping up.
import shutil
shutil.rmtree(tmpdir)
dumpf.close()
def main():
    parser = ArgumentParser()
    parser.add_argument('--input',
                        dest='in_bam',
                        required=True,
                        help='Path to unaligned, paired-end, bam file.')
    parser.add_argument('--taglen',
                        dest='tag_len',
                        type=int,
                        default=12,
                        help='Length in bases of the duplex tag sequence.[12]')
    parser.add_argument(
        '--spacerlen',
        dest='spcr_len',
        type=int,
        default=5,
        help=
        'Length in bases of the spacer sequence between duplex tag and the start of target DNA. [5]'
    )
    parser.add_argument("--tagstats",
                        dest='tagstats',
                        action="store_true",
                        help="output tagstats file")
    parser.add_argument(
        '--minmem',
        dest='minmem',
        type=int,
        default=3,
        help="Minimum number of reads allowed to comprise a consensus. [3]")
    parser.add_argument(
        '--maxmem',
        dest='maxmem',
        type=int,
        default=200,
        help="Maximum number of reads allowed to comprise a consensus. [200]")
    parser.add_argument(
        '--cutoff',
        dest='cutoff',
        type=float,
        default=.7,
        help=
        "Percentage of nucleotides at a given position in a read that must be identical in order "
        "for a consensus to be called at that position. [0.7]")
    parser.add_argument(
        '--Ncutoff',
        dest='Ncutoff',
        type=float,
        default=1,
        help=
        "With --filt 'n', maximum fraction of Ns allowed in a consensus [1.0]")
    parser.add_argument('--write-sscs',
                        dest='write_sscs',
                        action="store_true",
                        help="Print the SSCS reads to file in FASTQ format")
    parser.add_argument('--without-dcs',
                        dest='without_dcs',
                        action="store_true",
                        help="Don't print final DCS reads")
    parser.add_argument(
        "--rep_filt",
        action="store",
        type=int,
        dest='rep_filt',
        help="Remove tags with homomeric runs of nucleotides of length x. [9]",
        default=9)
    parser.add_argument('--prefix',
                        dest='prefix',
                        type=str,
                        required=True,
                        help="Sample name to uniquely identify samples")
    o = parser.parse_args()

    dummy_header = {
        'HD': {
            'VN': '1.0'
        },
        'SQ': [{
            'LN': 1575,
            'SN': 'chr1'
        }, {
            'LN': 1584,
            'SN': 'chr2'
        }]
    }
    in_bam_file = pysam.AlignmentFile(o.in_bam, "rb", check_sq=False)
    temp_bam = pysam.AlignmentFile(o.prefix + ".temp.bam",
                                   'wb',
                                   header=dummy_header)
    paired_end_count = 1

    if o.write_sscs is True:

        read1_sscs_fq_file = gzip.open(o.prefix + '_read1_sscs.fq.gz', 'wb')
        read2_sscs_fq_file = gzip.open(o.prefix + '_read2_sscs.fq.gz', 'wb')

    if o.without_dcs is False:
        read1_dcs_fq_file = gzip.open(o.prefix + '_read1_dcs.fq.gz', 'wb')
        read2_dcs_fq_file = gzip.open(o.prefix + '_read2_dcs.fq.gz', 'wb')
    '''This block of code takes an unaligned bam file, extracts the tag sequences from the reads, and converts them
	to "ab/ba" format where 'a' and 'b' are the tag sequences from Read 1 and Read 2, respectively. Conversion occurs by
	putting the tag with the "higher" value in front of the tag with the "lesser" value. The original tag orientation is
	denoted by appending #ab or #ba to the end of the tag. After conversion, the resulting temporary bam file is then
	sorted by read name.'''

    print "Parsing tags..."

    for line in in_bam_file.fetch(until_eof=True):

        if paired_end_count % 2 == 1:

            temp_read1_entry = pysam.AlignedSegment()
            temp_read1_entry.query_name = line.query_name
            temp_read1_entry.query_sequence = line.query_alignment_sequence
            temp_read1_entry.query_qualities = line.query_alignment_qualities

        if paired_end_count % 2 == 0:

            temp_bam_entry = pysam.AlignedSegment()

            if temp_read1_entry.query_sequence[:o.tag_len] > line.query_alignment_sequence[:o.tag_len]:
                temp_bam_entry.query_name = temp_read1_entry.query_sequence[:o.tag_len] + \
                      line.query_alignment_sequence[:o.tag_len] + '#ab'

            elif temp_read1_entry.query_sequence[:o.tag_len] < line.query_alignment_sequence[:o.tag_len]:
                temp_bam_entry.query_name = line.query_alignment_sequence[:o.tag_len] + \
                       temp_read1_entry.query_sequence[:o.tag_len] + '#ba'

            elif temp_read1_entry.query_sequence[:o.tag_len] == line.query_alignment_sequence[:o.tag_len]:
                paired_end_count += 1
                continue

            # Write entries for Read 1
            temp_bam_entry.query_name += ":1"
            temp_bam_entry.query_sequence = temp_read1_entry.query_sequence[
                o.tag_len + o.spcr_len:]
            temp_bam_entry.query_qualities = temp_read1_entry.query_qualities[
                o.tag_len + o.spcr_len:]
            temp_bam_entry.set_tag('X?', temp_read1_entry.query_name, 'Z')
            temp_bam.write(temp_bam_entry)

            # Write entries for Read 2
            temp_bam_entry.query_name = temp_bam_entry.query_name.replace(
                '1', '2')
            temp_bam_entry.query_sequence = line.query_sequence[o.tag_len +
                                                                o.spcr_len:]
            temp_bam_entry.query_qualities = line.query_qualities[o.tag_len +
                                                                  o.spcr_len:]
            temp_bam_entry.set_tag('X?', line.query_name, 'Z')
            temp_bam.write(temp_bam_entry)

        paired_end_count += 1

    in_bam_file.close()
    temp_bam.close()

    print "Sorting reads on tag sequence..."

    pysam.sort("-n", o.prefix + ".temp.bam", "-o", o.prefix +
               ".temp.sort.bam")  # Sort by read name, which will be the
    # tag sequence in this case.
    os.remove(o.prefix + ".temp.bam")
    '''Extracting tags and sorting based on tag sequence is complete. This block of code now performs the consensus
	calling on the tag families in the temporary name sorted bam file.'''
    seq_dict = {'ab:1': [], 'ab:2': [], 'ba:1': [], 'ba:2': []}
    qual_dict = {'ab:1': [], 'ab:2': [], 'ba:1': [], 'ba:2': []}
    fam_size_x_axis = []
    fam_size_y_axis = []

    read1_dcs_len = 0
    read2_dcs_len = 0
    in_bam_file = pysam.AlignmentFile(o.prefix + '.temp.sort.bam',
                                      "rb",
                                      check_sq=False)
    first_line = in_bam_file.next()

    FinalValue = pysam.AlignedSegment()
    FinalValue.query_name = "FinalValue#ab:1"

    seq_dict[first_line.query_name.split('#')[1]].append(
        first_line.query_sequence)
    qual_dict[first_line.query_name.split('#')[1]].append(
        list(first_line.query_qualities))
    tag_count_dict = defaultdict(lambda: 0)

    print "Creating consensus reads..."

    for line in iteratorWrapper(in_bam_file.fetch(until_eof=True), FinalValue):
        tag, subtag_order = first_line.query_name.split(
            '#')[0], first_line.query_name.split('#')[1]

        if line.query_name.split('#')[0] == tag:
            seq_dict[line.query_name.split('#')[1]].append(line.query_sequence)
            qual_dict[line.query_name.split('#')[1]].append(
                list(line.query_qualities))

        else:

            if len(seq_dict['ab:1']) != len(seq_dict['ab:2']) or len(
                    seq_dict['ba:1']) != len(seq_dict['ba:2']):
                raise Exception(
                    'ERROR: Read counts for Read1 and Read 2 do not match for tag %s'
                    % tag)

            for tag_subtype in seq_dict.keys():

                if len(seq_dict[tag_subtype]) > 0:
                    tag_count_dict[len(seq_dict[tag_subtype])] += 1

                if len(seq_dict[tag_subtype]) < o.minmem:
                    seq_dict[tag_subtype] = []
                    qual_dict[tag_subtype] = []

                elif o.minmem <= len(
                        seq_dict[tag_subtype]
                ) <= o.maxmem:  # Tag types w/o reads should not be submitted
                    #  as long as minmem is > 0
                    seq_dict[tag_subtype] = [
                        consensus_caller(seq_dict[tag_subtype], o.cutoff, tag,
                                         True),
                        str(len(seq_dict[tag_subtype]))
                    ]
                    qual_dict[tag_subtype] = qual_calc(qual_dict[tag_subtype])

                elif len(seq_dict[tag_subtype]) > o.maxmem:
                    seq_dict[tag_subtype] = [
                        consensus_caller(seq_dict[tag_subtype][:o.maxmem],
                                         o.cutoff, tag, True),
                        str(len(seq_dict[tag_subtype]))
                    ]
                    qual_dict[tag_subtype] = qual_calc(qual_dict[tag_subtype])

            if o.write_sscs is True:

                if len(seq_dict['ab:1']) != 0 and len(seq_dict['ab:2']) != 0:
                    corrected_qual_score = map(lambda x: x if x < 41 else 41,
                                               qual_dict['ab:1'])
                    read1_sscs_fq_file.write(
                        '@%s#ab/1\n%s\n+%s\n%s\n' %
                        (tag, seq_dict['ab:1'][0], seq_dict['ab:1'][1],
                         "".join(chr(x + 33) for x in corrected_qual_score)))

                    corrected_qual_score = map(lambda x: x if x < 41 else 41,
                                               qual_dict['ab:2'])
                    read2_sscs_fq_file.write(
                        '@%s#ab/2\n%s\n+%s\n%s\n' %
                        (tag, seq_dict['ab:2'][0], seq_dict['ab:2'][1],
                         "".join(chr(x + 33) for x in corrected_qual_score)))

                if len(seq_dict['ba:1']) != 0 and len(seq_dict['ba:2']) != 0:
                    corrected_qual_score = map(lambda x: x if x < 41 else 41,
                                               qual_dict['ba:1'])
                    read1_sscs_fq_file.write(
                        '@%s#ba/1\n%s\n+%s\n%s\n' %
                        (tag, seq_dict['ba:1'][0], seq_dict['ba:1'][1],
                         "".join(chr(x + 33) for x in corrected_qual_score)))

                    corrected_qual_score = map(lambda x: x if x < 41 else 41,
                                               qual_dict['ba:2'])
                    read2_sscs_fq_file.write(
                        '@%s#ba/2\n%s\n+%s\n%s\n' %
                        (tag, seq_dict['ba:2'][0], seq_dict['ba:2'][1],
                         "".join(chr(x + 33) for x in corrected_qual_score)))

            if o.without_dcs is False:

                if len(seq_dict['ab:1']) != 0 and len(seq_dict['ba:2']) != 0:
                    dcs_read_1 = [
                        consensus_caller(
                            [seq_dict['ab:1'][0], seq_dict['ba:2'][0]], 1, tag,
                            False), seq_dict['ab:1'][1], seq_dict['ba:2'][1]
                    ]
                    dcs_read_1_qual = map(
                        lambda x: x if x < 41 else 41,
                        qual_calc([qual_dict['ab:1'], qual_dict['ba:2']]))
                    read1_dcs_len = len(dcs_read_1)
                    fam_size_x_axis.append(int(seq_dict['ab:1'][1]))
                    fam_size_y_axis.append(int(seq_dict['ba:2'][1]))

                    if dcs_read_1.count('N') / float(
                            read1_dcs_len) > o.Ncutoff:
                        dcs_read_1 = 'N' * read1_dcs_len
                        dcs_read_1_qual = '!' * read1_dcs_len

                if len(seq_dict['ba:1']) != 0 and len(seq_dict['ab:2']) != 0:
                    dcs_read_2 = [
                        consensus_caller(
                            [seq_dict['ba:1'][0], seq_dict['ab:2'][0]], 1, tag,
                            False), seq_dict['ba:1'][1], seq_dict['ab:2'][1]
                    ]
                    dcs_read_2_qual = map(
                        lambda x: x if x < 41 else 41,
                        qual_calc([qual_dict['ba:1'], qual_dict['ab:2']]))
                    read2_dcs_len = len(dcs_read_2)

                    if dcs_read_2.count('N') / float(
                            read2_dcs_len) > o.Ncutoff:
                        dcs_read_2 = 'N' * read2_dcs_len
                        dcs_read_2_qual = '!' * read2_dcs_len

                if read1_dcs_len != 0 and read2_dcs_len != 0 and tag.count('N') == 0 and \
                      'A' * o.rep_filt not in tag and 'C' * o.rep_filt not in tag and \
                      'G' * o.rep_filt not in tag and 'T' * o.rep_filt not in tag:
                    read1_dcs_fq_file.write(
                        '@%s/1\n%s\n+%s:%s\n%s\n' %
                        (tag, dcs_read_1[0], dcs_read_1[1], dcs_read_1[2],
                         "".join(chr(x + 33) for x in dcs_read_1_qual)))
                    read2_dcs_fq_file.write(
                        '@%s/2\n%s\n+%s:%s\n%s\n' %
                        (tag, dcs_read_2[0], dcs_read_2[1], dcs_read_2[2],
                         "".join(chr(x + 33) for x in dcs_read_2_qual)))
            if line != FinalValue:
                # reset conditions for next tag family
                first_line = line
                seq_dict = {'ab:1': [], 'ab:2': [], 'ba:1': [], 'ba:2': []}
                qual_dict = {'ab:1': [], 'ab:2': [], 'ba:1': [], 'ba:2': []}
                read1_dcs_len = 0
                read2_dcs_len = 0
                dcs_read_1 = ''
                dcs_read_2 = ''

                seq_dict[line.query_name.split('#')[1]].append(
                    line.query_sequence
                )  # Now add initializing data for new tag
                qual_dict[first_line.query_name.split('#')[1]].append(
                    list(first_line.query_qualities))

    if o.write_sscs is True:
        read1_sscs_fq_file.close()
        read2_sscs_fq_file.close()

    if o.without_dcs is False:
        read1_dcs_fq_file.close()
        read2_dcs_fq_file.close()


    # Try to plot the tag family sizes
    if o.tagstats is True:
        tag_stats_file = open(o.prefix + ".tagstats.txt", 'w')

        x_value = []
        y_value = []
        total_reads = sum([
            tag_count_dict[tag_family_size] * tag_family_size
            for tag_family_size in tag_count_dict.keys()
        ])

        for tag_family_size in sorted(tag_count_dict.keys()):
            fraction = (tag_count_dict[tag_family_size] *
                        tag_family_size) / float(total_reads)
            tag_stats_file.write(
                '%d\t%d\t%f\n' %
                (tag_family_size, tag_count_dict[tag_family_size], fraction))
            x_value.append(tag_family_size)
            y_value.append(fraction)

        try:
            import matplotlib
            matplotlib.use('Agg')
            import matplotlib.pyplot as plt

            plt.figure(1)
            plt.bar(x_value, y_value)
            plt.xlabel('Family Size')
            plt.ylabel('Proportion of Total Reads')
            plt.savefig(o.prefix + 'family_size.png', bbox_inches='tight')

            plt.figure(2)
            plt.scatter(fam_size_x_axis, fam_size_y_axis, alpha=.1)
            plt.xlabel('Family size for AB:1')
            plt.ylabel('Family size for BA:2')
            plt.xlim(0, max(fam_size_x_axis))
            plt.ylim(0, max(fam_size_y_axis))
            plt.savefig(o.prefix + 'fam_size_relation.png',
                        bbox_inches='tight')

        except ImportError:
            sys.stderr.write(
                'matplotlib not present. Only tagstats file will be generated.'
            )

        tag_stats_file.close()
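The ab/ba canonicalisation described in the docstring above, reduced to a pure function: both orientations of a read pair collapse onto the same family name, with #ab or #ba recording which mate carried which half of the duplex tag. Tags are compared lexicographically, as in the loop above; the function itself is only a sketch.

def canonical_tag(tag_read1, tag_read2):
    """Family name shared by both orientations of a duplex tag pair, or None when the halves are identical (such pairs are skipped above)."""
    if tag_read1 > tag_read2:
        return tag_read1 + tag_read2 + '#ab'
    elif tag_read1 < tag_read2:
        return tag_read2 + tag_read1 + '#ba'
    return None

# canonical_tag('TTAC', 'AAGG') -> 'TTACAAGG#ab'
# canonical_tag('AAGG', 'TTAC') -> 'TTACAAGG#ba'   (same family, opposite orientation)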
Example n. 56
0
def make_gnashyfile(bcfilename, outpath, genome):
    #make chromosome list
    if genome == 'hs':
        chr_list = [
            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
            'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
            'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
            'chrX', 'chrY'
        ]
        chr_dict = {
            'chr1': 1,
            'chr2': 2,
            'chr3': 3,
            'chr4': 4,
            'chr5': 5,
            'chr6': 6,
            'chr7': 7,
            'chr8': 8,
            'chr9': 9,
            'chr10': 10,
            'chr11': 11,
            'chr12': 12,
            'chr13': 13,
            'chr14': 14,
            'chr15': 15,
            'chr16': 16,
            'chr17': 17,
            'chr18': 18,
            'chr19': 19,
            'chr20': 20,
            'chr21': 21,
            'chr22': 22,
            'chrX': 23,
            'chrY': 24
        }
        print "making human gnashyfile"
    else:
        chr_list = [
            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
            'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
            'chr16', 'chr17', 'chr18', 'chr19', 'chrX', 'chrY'
        ]
        chr_dict = {
            'chr1': 1,
            'chr2': 2,
            'chr3': 3,
            'chr4': 4,
            'chr5': 5,
            'chr6': 6,
            'chr7': 7,
            'chr8': 8,
            'chr9': 9,
            'chr10': 10,
            'chr11': 11,
            'chr12': 12,
            'chr13': 13,
            'chr14': 14,
            'chr15': 15,
            'chr16': 16,
            'chr17': 17,
            'chr18': 18,
            'chr19': 19,
            'chrX': 20,
            'chrY': 21
        }
        print "making mouse gnashyfile"
    #read in experiments and barcodes.  Key = (primer barcode, Transposon barcode)
    #Value = expt name
    barcode_dict = read_barcode_file(bcfilename)
    #initialize quality control dictionary
    qc_dict = {}
    #LOOP THROUGH EXPERIMENTS
    #loop through experiments and make a separate gnashy file for each
    for expt in list(set(barcode_dict.values())):
        #for each experiment, there will be multiple bam files.  Loop through all of them
        #open output gnashyfile
        print "Analyzing " + expt
        output_filename = outpath + expt + ".gnashy"
        output_handle = file(output_filename, 'w')
        #LOOP THROUGH BAM FILES CORRESPONDING TO 1 experiment
        #this could be made more efficient, but it's clearer this way
        for key in barcode_dict.keys():
            if barcode_dict[key] == expt:
                primerBC = key[0]
                transposonBC = key[1]
                basename = outpath + expt + "_" + primerBC + "_" + transposonBC
                sbamFilename = basename + ".sorted"
                pysam.sort(basename + ".bam", sbamFilename)
                #sort and index bamfile
                sbamFilename = sbamFilename + ".bam"

                pysam.index(sbamFilename)
                print sbamFilename
                #initialize gnashy dictionary
                gnashy_dict = {}
                #make AlignmentFile object
                current_bamfile = pysam.AlignmentFile(sbamFilename, "rb")

                #loop through the chromosomes and pileup start sites
                for chr in chr_list:
                    aligned_reads_group = current_bamfile.fetch(chr)
                    #now loop through each read and pile up start sites
                    for aread in aligned_reads_group:
                        #is the read a reverse read?
                        if aread.is_reverse:
                            #does it align to a ttaa?
                            if (aread.query_sequence[-4:] == 'TTAA'
                                    or aread.query_sequence[-4:] == 'ttaa'):
                                #if so, get position and update dictionary
                                pos = aread.get_reference_positions()[-1]
                                if (chr, pos) in gnashy_dict:
                                    gnashy_dict[(chr, pos)] += 1
                                else:
                                    gnashy_dict[(chr, pos)] = 1
                        else:  #forward read
                            #does it align to a ttaa?
                            if (aread.query_sequence[0:4] == 'TTAA'
                                    or aread.query_sequence[0:4] == 'ttaa'):
                                #if so, get position and update dictionary
                                pos = aread.get_reference_positions()[0]
                                if (chr, pos) in gnashy_dict:
                                    gnashy_dict[(chr, pos)] += 1
                                else:
                                    gnashy_dict[(chr, pos)] = 1
                #output dictionary to gnashy file
                for key in gnashy_dict:
                    output_handle.write(
                        "%s\t%s\t%s\n" %
                        (chr_dict[key[0]], key[1], gnashy_dict[key]))
        output_handle.close()
        #OPEN GNASHY FILE AND SORT BY CHR THEN POS
        qc_dict[expt] = sort_gnashy_file(output_filename)
    #after all experiments have been analyzed, print out qc
    qc_handle = file(outpath + "gnashyQC.txt", 'w')
    for key in qc_dict:
        qc_handle.write("%s\t%s\t%s\n" %
                        (key, qc_dict[key][0], qc_dict[key][1]))
    qc_handle.close()
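
The two hard-coded chromosome tables above follow one rule: each autosome maps to its own number and chrX/chrY take the next two integers (23/24 for human, 20/21 for mouse). A hedged sketch of a helper that builds the same tables programmatically (the helper name is an assumption, not part of the original):

def build_chr_tables(n_autosomes):
    # n_autosomes: 22 for human ('hs'), 19 for mouse
    chr_list = ['chr%d' % i for i in range(1, n_autosomes + 1)] + ['chrX', 'chrY']
    chr_dict = dict((name, i + 1) for i, name in enumerate(chr_list))
    return chr_list, chr_dict

chr_list, chr_dict = build_chr_tables(22)  # human: chr_dict['chrX'] == 23, chr_dict['chrY'] == 24
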
Example no. 57
0
def testBlat(blc):
    if blc.count('1') > blc.count('0'): return 1
    return 0


#######
script_time = time.strftime("%d/%m/%Y %H:%M:%S", time.localtime(time.time()))
sys.stderr.write("Script time --> START: %s\n" % (script_time))
sys.stderr.write("Analysis ID: %s\n" % (pid))

if not os.path.exists(bamfile):
    usage()
    sys.exit('BAM file %s not found.' % (bamfile))
if sortbam:
    sys.stderr.write('Sorting BAM file.\n')
    pysam.sort(bamfile, 'sorted_%s' % (pid))
    os.rename(bamfile, bamfile + '_old')
    os.rename('sorted_%s.bam' % (pid), bamfile)
    sys.stderr.write('Indexing BAM file.\n')
    pysam.index(bamfile)
if not os.path.exists(bamfile + '.bai') and not sortbam:
    sys.stderr.write('Indexing BAM file.\n')
    pysam.index(bamfile)
if not os.path.exists(fastafile):
    usage()
    sys.exit('Fasta file %s not found.' % (fastafile))
if not os.path.exists(fastafile + '.fai'):
    sys.stderr.write('Indexing Fasta file.\n')
    pysam.faidx(fastafile)
if not os.path.exists(kfile):
    sys.exit('File containing RNA editing positions not found.')
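
The block above indexes the BAM and FASTA inputs only when the corresponding .bai/.fai files are missing. As a hedged sketch, the same guards as a reusable helper (the helper itself is an assumption, not part of the script; pysam.index and pysam.faidx are the standard samtools wrappers):

import os
import pysam

def ensure_indexes(bamfile, fastafile):
    # build the .bai BAM index if it does not exist yet
    if not os.path.exists(bamfile + '.bai'):
        pysam.index(bamfile)
    # build the .fai FASTA index if it does not exist yet
    if not os.path.exists(fastafile + '.fai'):
        pysam.faidx(fastafile)
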
Example no. 58
0
    def sort_bam(self, bamfile, outprefix):
        ''' samtools sort '''
        pysam.sort(bamfile, outprefix)

        return outprefix + ".bam"
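
This wrapper relies on the older samtools sort convention in which the second argument is an output prefix and ".bam" is appended automatically, hence the return value. A hedged sketch of the same wrapper against the newer call form, where the output path is passed explicitly with -o (as the later examples in this collection already do):

import pysam

def sort_bam(bamfile, outprefix):
    ''' samtools sort (newer API: output path given via -o) '''
    sorted_path = outprefix + ".bam"
    pysam.sort("-o", sorted_path, bamfile)
    return sorted_path
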
Example no. 59
0
def get_consensus_report(name,
                         sam_path,
                         ref_path,
                         is_circular,
                         coverage_threshold=0,
                         report_out_dir=None,
                         tmp_files_dir=None):
    basename = os.path.basename(sam_path)
    file_name, ext = os.path.splitext(basename)

    out_dir = tmp_files_dir
    keep_tmp_files = tmp_files_dir is not None
    if not keep_tmp_files:
        out_dir = tempfile.mkdtemp()

    os.makedirs(out_dir, exist_ok=True)
    tmp_sam_path = os.path.join(out_dir, file_name + '_tmp.sam')
    tmp_bam_path = os.path.join(out_dir, file_name + '_tmp.bam')
    bam_path = os.path.join(out_dir, file_name + '.bam')
    mpileup_path = bam_path + '.mpileup'

    logging.info("Split long aligments")
    split_alignments_in_sam(sam_path, tmp_sam_path)

    logging.info("Converting sam to bam")
    pysam.view('-S',
               tmp_sam_path,
               '-b',
               '-o',
               tmp_bam_path,
               catch_stdout=False)

    logging.info("Sorting bam file")
    pysam.sort(tmp_bam_path, '-o', bam_path, catch_stdout=False)

    logging.info("Creating bam index")
    pysam.index(bam_path, '-b')

    logging.info("Creating mpileup")

    mpileup_flags = ['-A', '-B', '-Q', '0']
    if is_circular:
        # use secondary alignments as well
        mpileup_flags.extend(['--ff', '0'])

    pysam.mpileup(*mpileup_flags,
                  '-f',
                  ref_path,
                  bam_path,
                  '-o',
                  mpileup_path,
                  catch_stdout=False)

    logging.info("Generating consensus and report")
    report = process_mpileup(name, sam_path, ref_path, mpileup_path,
                             coverage_threshold, report_out_dir)

    if not keep_tmp_files:
        logging.info("Cleaning tmp files")
        shutil.rmtree(out_dir)

    return report
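
A hypothetical call to the function above; every path and value here is a placeholder, not taken from the source:

report = get_consensus_report('sampleA',
                              'alignments/sampleA.sam',
                              'reference/ref.fasta',
                              is_circular=True,
                              coverage_threshold=10,
                              report_out_dir='reports',
                              tmp_files_dir=None)  # None: work in a temp dir and remove it afterwards
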
Example no. 60
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = U.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    group = U.OptionGroup(parser, "group-specific options")

    group.add_option(
        "--group-out",
        dest="tsv",
        type="string",
        help="Outfile name for file mapping read id to read group",
        default=None)

    group.add_option(
        "--output-bam",
        dest="output_bam",
        action="store_true",
        default=False,
        help=("output a bam file with read groups tagged using the UG tag"
              "[default=%default]"))

    group.add_option(
        "--output-unmapped",
        dest="output_unmapped",
        action="store_true",
        default=False,
        help=("Retain all unmapped reads in output[default=%default]"))

    parser.add_option("--umi-group-tag",
                      dest="umi_group_tag",
                      type="string",
                      help="tag for the outputted umi group",
                      default='BX')

    parser.add_option_group(group)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = U.Start(parser, argv=argv)

    U.validateSamOptions(options)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        if options.no_sort_output:
            out_name = options.stdout.name
        else:
            out_name = U.getTempFilename()
            sorted_out_name = options.stdout.name
        options.stdout.close()
        assert options.output_bam, (
            "To output a bam you must include --output-bam option")
    else:
        if options.no_sort_output:
            out_name = "-"
        else:
            out_name = U.getTempFilename()
            sorted_out_name = "-"

    if not options.no_sort_output:  # need to determine the output format for sort
        if options.out_sam:
            sort_format = "sam"
        else:
            sort_format = "bam"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "wh"
    else:
        out_mode = "wb"

    infile = pysam.Samfile(in_name, in_mode)

    if options.output_bam:
        outfile = pysam.Samfile(out_name, out_mode, template=infile)
    else:
        outfile = None

    if options.tsv:
        mapping_outfile = U.openFile(options.tsv, "w")
        mapping_outfile.write("%s\n" % "\t".join([
            "read_id", "contig", "position", "gene", "umi", "umi_count",
            "final_umi", "final_umi_count", "unique_id"
        ]))

    nInput, nOutput, unique_id, input_reads, output_reads = 0, 0, 0, 0, 0

    gene_tag = options.gene_tag
    metacontig2contig = None

    if options.chrom:
        inreads = infile.fetch(reference=options.chrom)
    else:
        if options.per_gene and options.gene_transcript_map:
            metacontig2contig = umi_methods.getMetaContig2contig(
                infile, options.gene_transcript_map)
            metatag = "MC"
            inreads = umi_methods.metafetcher(infile, metacontig2contig,
                                              metatag)
            gene_tag = metatag

        else:
            inreads = infile.fetch(until_eof=options.output_unmapped)

    bundle_iterator = umi_methods.get_bundles(
        options,
        all_reads=True,
        return_read2=True,
        return_unmapped=options.output_unmapped,
        metacontig_contig=metacontig2contig)

    for bundle, key, status in bundle_iterator(inreads):

        # write out read2s and unmapped (if these options are set)
        if status == 'single_read':
            # bundle is just a single read here
            nInput += 1

            if outfile:
                outfile.write(bundle)

            nOutput += 1
            continue

        umis = bundle.keys()
        counts = {umi: bundle[umi]["count"] for umi in umis}

        nInput += sum(counts.values())

        while nOutput >= output_reads + 10000:
            output_reads += 10000
            U.info("Written out %i reads" % output_reads)

        while nInput >= input_reads + 1000000:
            input_reads += 1000000
            U.info("Parsed %i input reads" % input_reads)

        # set up UMIClusterer functor with methods specific to
        # specified options.method
        processor = network.UMIClusterer(options.method)

        # group the umis
        groups = processor(umis, counts, threshold=options.threshold)

        for umi_group in groups:
            top_umi = umi_group[0]

            group_count = sum(counts[umi] for umi in umi_group)

            for umi in umi_group:
                reads = bundle[umi]['read']
                for read in reads:
                    if outfile:
                        # Add the 'UG' tag to the read
                        read.tags += [('UG', unique_id)]
                        read.tags += [(options.umi_group_tag, top_umi)]
                        outfile.write(read)

                    if options.tsv:
                        if options.per_gene:
                            gene = read.get_tag(gene_tag)
                        else:
                            gene = "NA"
                        mapping_outfile.write("%s\n" % "\t".join(
                            map(str,
                                (read.query_name, read.reference_name,
                                 umi_methods.get_read_position(
                                     read, options.soft_clip_threshold)[1],
                                 gene, umi.decode(), counts[umi],
                                 top_umi.decode(), group_count, unique_id))))

                    nOutput += 1

            unique_id += 1

    if outfile:
        outfile.close()
        if not options.no_sort_output:
            # sort the output
            pysam.sort("-o", sorted_out_name, "-O", sort_format, out_name)
            os.unlink(out_name)  # delete the tempfile

    if options.tsv:
        mapping_outfile.close()

    # write footer and output benchmark information.
    U.info("Reads: %s" % ", ".join([
        "%s: %s" % (x[0], x[1])
        for x in bundle_iterator.read_events.most_common()
    ]))
    U.info("Number of reads out: %i, Number of groups: %i" %
           (nOutput, unique_id))
    U.Stop()
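
The grouping step above delegates to network.UMIClusterer. A toy-sized, hedged sketch of that call pattern, mirroring the three-argument signature used in this script (UMIs are bytes, which is why the script calls umi.decode() when writing the TSV); the import path and the grouping shown in the comment are assumptions and may differ between umi_tools versions:

from umi_tools import network

counts = {b'ATAT': 10, b'ATAA': 2, b'CCGG': 8}    # toy UMI -> read-count map
processor = network.UMIClusterer('directional')   # same constructor as above
groups = processor(counts.keys(), counts, threshold=1)
# e.g. [[b'ATAT', b'ATAA'], [b'CCGG']]; the first UMI in each group serves as
# the representative ("top") UMI, as in the loop over umi_group above.
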