Esempio n. 1
0
def main():
    """Collect insert-size and indel-length histograms from a BAM file.

    Expects exactly three positional command-line arguments: the BAM file,
    the read length and the result directory.  Four files are written into
    the result directory:

    * ``histogram-data.insert-sizes`` -- histogram of
      ``isize - 2 * read_length`` (only positive values are recorded),
    * ``histogram-data.length-deletion-splits`` -- lengths of CIGAR
      deletions (operation code 2),
    * ``histogram-data.length-insertion-splits`` -- lengths of CIGAR
      insertions (operation code 1),
    * ``histogram-data.meta`` -- one tab-separated line holding the number
      of alignments, of alignments with at least one deletion and of
      alignments with at least one insertion.

    Returns:
        1 if the number of positional arguments is wrong (after printing
        the help text); None otherwise.
    """
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-v",
        action="store_true",
        dest="verbose",
        default=False,
        help=
        "Verbose. Prints regularly how many alignments have been processed.")
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.print_help()
        return 1

    bam_path, read_length_arg, result_dir = args
    read_length = int(read_length_arg)
    # An empty directory argument means "current directory"; only append a
    # separator when there is something to separate.
    if result_dir and not result_dir.endswith('/'):
        result_dir += '/'

    insert_sizes = CountData()  # insert size histogram
    length_deletion_splits = CountData()  # length of deletion splits histogram
    length_insertion_splits = CountData()  # length of insertion splits histogram
    # number of alignments found:
    n_align = 0
    n_align_del_split = 0
    n_align_ins_split = 0

    bam_reader = pysam.Samfile(bam_path, "rb")
    try:
        for align in bam_reader.fetch():
            if align.isize == 0:  # treated as "alignment is unmapped"
                continue
            inner_size = align.isize - 2 * read_length
            if inner_size > 0:
                insert_sizes.add(inner_size, 1)
            n_align += 1
            if options.verbose and n_align % 100000 == 0:
                print('Having processed %d alignments' % n_align,
                      file=sys.stderr)

            insertion_split_present = False
            deletion_split_present = False
            # Walk through the CIGAR operations; a record without CIGAR
            # information (None) simply contributes no splits.
            for (cigar_type, cigar_length) in align.cigar or ():
                if cigar_type == 1:  # insertion
                    insertion_split_present = True
                    length_insertion_splits.add(cigar_length, 1)
                elif cigar_type == 2:  # deletion
                    deletion_split_present = True
                    length_deletion_splits.add(cigar_length, 1)
            if deletion_split_present:
                n_align_del_split += 1
            if insertion_split_present:
                n_align_ins_split += 1
    finally:
        # Release the BAM handle even if processing fails half-way.
        bam_reader.close()

    # Print results to file; `with` guarantees every file is flushed and
    # closed.
    histograms = (
        ('histogram-data.insert-sizes', insert_sizes),
        ('histogram-data.length-deletion-splits', length_deletion_splits),
        ('histogram-data.length-insertion-splits', length_insertion_splits),
    )
    for file_name, histogram in histograms:
        with open(result_dir + file_name, 'w') as out:
            histogram.print(out)
    with open(result_dir + 'histogram-data.meta', 'w') as out:
        print("%d\t%d\t%d" % (n_align, n_align_del_split, n_align_ins_split),
              file=out)
Esempio n. 2
0
def _classifyReadPair(ReadStart, ReadEnd, MateStart, MateEnd,
                      RepeatStart, RepeatEnd):
    # Classify one read/mate pair relative to a repeat (TE) interval.
    # Returns one of 'discard', 'forceUp', 'up' or 'down'.
    #
    # Upstream   = Evidence of transcription going into a TE
    # Downstream = Evidence of transcription being generated in a TE
    #
    #     RepeatStart         RepeatEnd
    #           ___________________
    #__________|________TE_________|_______________
    #          |                   |                Read Class
    #  ;====   |                   |                Left
    #        ;====                 |                LeftEdge
    #          |      ;====        |                InExon
    #          |                ;====               RightEdge
    #          |                   |   ;====        Right
    #       ;========================               Span
    #          |                   |
    #          |                   |                Read-mate Cases
    # ;====---;====                |                Upstream
    #    ;====---;====             |                Upstream
    #        ;====---;====         |                Upstream
    #          |                   |
    # ;==--;== |                   |                Discard (external)
    #          |                   |  ;==--;==      Discard (external)
    #          |   ;====---;====   |                Discard (internal)
    #   ;====--|-------------------|-;====          Discard (Splice Span)
    #          |                   |
    #          |         ;====---;====              Downstream
    #          |             ;====---;====          Downstream
    #          |                 ;====---;====      Downstream
    #          |                   |
    #       ;========================----;====...   Force Upstream
    # ;====----|-----------------;====              Force Upstream
    #       ;====----------------;====              Force Upstream
    #       ;====------------------|-----;====      Force Upstream
    #
    # Legend:
    # ;     Leftmost position of read (ReadStart or MateStart)
    # ====  Aligned read length (ReadLength)
    # ---   Internal sequence

    if ReadStart <= RepeatStart:
        # Read starts at or left of the repeat start.
        if ReadEnd <= RepeatStart:
            # Read lies completely left of the repeat.
            return 'discard'
        if ReadEnd > RepeatEnd:
            # Read spans the entire repeat.
            return 'forceUp'
        # Read sits on the left edge: the mate decides.
        if MateEnd <= RepeatEnd:
            # Mate is left of, inside, or on the left edge of the repeat.
            return 'up'
        if MateStart <= RepeatStart:
            # Mate spans the repeat.
            return 'discard'
        # Mate is on the right edge or right of the repeat.
        return 'forceUp'

    # From here on the read starts right of the repeat start.
    if ReadEnd > RepeatEnd:
        if ReadStart > RepeatEnd:
            # Read lies completely right of the repeat.
            return 'discard'
        # Read sits on the right edge: the mate decides.
        if MateStart > RepeatStart:
            # Mate is inside, on the right edge, or right of the repeat.
            return 'down'
        if MateEnd > RepeatStart:
            # Mate is on the left edge or spans the repeat.
            return 'discard'
        # Mate lies completely left of the repeat.
        return 'forceUp'

    # Read is internal to the repeat: the mate decides.
    if MateStart > RepeatEnd:
        # Mate lies right of the repeat --> Downstream.
        return 'down'
    if MateEnd <= RepeatStart:
        # Mate lies left of the repeat --> Upstream.
        return 'up'
    # Mate is on an edge, internal, or spans the repeat.
    return 'discard'


def readThread( samfile_path, Coords, Strand ):
    """Count upstream/downstream read-through evidence for one element.

    <samfile_path>: BAM file input for analysis.
    <Coords>: 3-element object with Chromosome, Start, End of the element;
              the region fetched is 'chr<Chromosome>:<Start>-<End>'.
    <Strand>: True = positive strand, False = negative strand.

    Only reads that are paired, whose mate is on the same reference and
    whose mapping quality is above 0 are classified.  Classification is
    done assuming the positive strand; the result is flipped for the
    negative strand.  'Force Upstream' pairs are added to the first element
    of the returned tuple in both orientations.

    Returns a 2-tuple: (upstream, downstream) counts for the positive
    strand, (downstream + forced, upstream) for the negative strand.
    """
    counts = {'discard': 0, 'forceUp': 0, 'up': 0, 'down': 0}

    # Input Bam File
    samfile = pysam.Samfile( samfile_path, "rb" )
    try:
        RepeatStart = int(Coords[1])
        RepeatEnd = int(Coords[2])

        # Import reads overlapping the element coordinates
        # [0] = Chromosome , [1] = Start, [2] = End
        ParsedCoord = 'chr{0[0]}:{0[1]}-{0[2]}'.format(Coords)

        for read in samfile.fetch(region = ParsedCoord):
            # Paired reads on same chromosome only
            if not (read.is_paired and read.tid == read.rnext
                    and int(read.mapq) > 0):
                continue

            ReadLength = read.rlen
            ReadStart = read.pos              # leftmost read position
            ReadEnd = ReadStart + ReadLength

            # The mate record is not looked up; its end is approximated
            # using this read's length.
            MateStart = read.mpos
            MateEnd = MateStart + ReadLength

            counts[_classifyReadPair(ReadStart, ReadEnd, MateStart, MateEnd,
                                     RepeatStart, RepeatEnd)] += 1
    finally:
        # Always release the BAM handle, even if iteration fails.
        samfile.close()

    # Output
    if (Strand): # True = Positive Strand Orientation
        localResults = (counts['up'] + counts['forceUp'], counts['down'])
    else: # False = Negative Strand
        localResults = (counts['down'] + counts['forceUp'], counts['up'])

    return(localResults)
Esempio n. 3
0
                bar = progressbar.ProgressBar(maxval=total_reads,
                                              widgets=[
                                                  ' [',
                                                  progressbar.Timer(), '] ',
                                                  progressbar.Bar(), ' (',
                                                  progressbar.ETA(), ') '
                                              ]).start()
            except:
                # if BAM not indexed or some other issue, don't show progress bar
                print(
                    'Could not get total reads (BAM may not be indexed), skipping progress bar...'
                )
                show_progress_bar = False
        read_number = 0

        for read in pysam.Samfile(bam):
            # Progress bar
            if show_progress_bar and (read_number % 100000 == 0
                                      or read_number == total_reads - 1):
                bar.update(read_number)

            read_number += 1

            # Ignore mapped reads (can't belong to guide transcripts...)
            ## unless the user wants all reads processed
            if not read.is_unmapped and not args.all_reads:
                continue

            seq = read.seq.upper()
            tags = dict(read.tags)
            cell = tags.get('CB', None)
Esempio n. 4
0
#		elif len(words)==2 and words[1].split("=")[0]=="/label":
#			region_names[location]=words[1].split("=")[1].replace('"','')

#	for region in regions:
#		try:
#			print regions[region], region_names[region], region_lengths[region]
#		except StandardError:
#			print regions[region], region, region_lengths[region]
	
	print "Finding regions in bam files"
	sys.stdout.flush()
	for filename in args:
		print "\t"+filename+"..."
		sys.stdout.flush()
		if filename.split(".")[-1]=="bam":
			samfile = pysam.Samfile( filename, "rb" )
		elif filename.split(".")[-1]=="sam":
			samfile = pysam.Samfile( filename, "r" )
		else:
			print filename, "not a readable bam file"
			continue
		
		refs=samfile.references
		lengths=samfile.lengths
		
		if len(refs)!=len(refseqs):
			DoError("bam has different number of reference sequences to reference fasta file")
		else:
			for ref in refs:
				if not ref in refseqs:
					DoError("bam and reference fasta file do not match")
def sam_to_allele_counts(sam_fname, paired=False, qual_min=30, max_reads=-1,
                         max_isize = 700, VERBOSE = 0,
                         fwd_primer_regions = None, rev_primer_regions = None):
    '''
    Calculate the allele counts for a set of mapped reads.

    Parameters:
    sam_fname   --   sam or bam file with mapped reads
    paired      --   differentiates between read one or two if True,
                     otherwise the two reads are lumped together
    qual_min    --   ignore bases with quality less than qual_min
                     (qualities are decoded as ASCII value minus 33)
    max_reads   --   stop once the running record index equals this value;
                     the index counts every record, including skipped ones.
                     The default -1 never matches, i.e. no limit.
    max_isize   --   maximal insert size to consider. This can be used to
                     remove artifactual mappings
    VERBOSE     --   verbosity level; higher values print more progress
    fwd_primer_regions, rev_primer_regions
                --   optional mappings from reference name to an iterable of
                     (begin, end) pairs; read bases falling into a primer
                     region are excluded from the counts

    Returns:
    A list with one tuple per reference sequence of the file:
    (reference name, count array, insertion dict).
    The count array has shape (2, 6, length) indexed by
    [is_reverse, allele, position], or (2, 2, 6, length) indexed by
    [is_read2, is_reverse, allele, position] when paired is True.
    The allele axis follows the order of nuc_alpha (defined elsewhere in
    this module); index 4 is also incremented for deleted positions.
    The insertion dict maps position --> inserted sequence --> per read
    type counts.
    '''
    import pysam
    from collections import defaultdict

    # Alphabet whose order defines the allele axis of the count arrays.
    alpha = nuc_alpha

    def ac_array(length, paired):
        # Zero-initialised count array for one reference of the given length.
        if paired:
            return np.zeros((2,2,6,length), dtype =int)
        else:
            return np.zeros((2,6,length), dtype = int)

    # Note: the data structure for inserts is a nested dict with:
    # position --> string --> read type --> count
    #  (dict)      (dict)       (list)      (int)
    def insertion_data_structure(paired):
        if paired:
            return defaultdict(lambda: defaultdict(lambda: np.zeros((2,2), int)))
        else:
            return defaultdict(lambda: defaultdict(lambda: np.zeros(2, int)))


    # Open BAM or SAM file
    with pysam.Samfile(sam_fname) as samfile:
        ac =  []
        refs = {}
        # One (name, counts, insertions) entry per reference, in file order,
        # so that ac[read.rname] addresses the reference a read maps to.
        # NOTE(review): xrange exists only in Python 2 -- confirm the
        # interpreter version this module targets.
        for nref in xrange(samfile.nreferences):
            if VERBOSE: print("allocating for:", samfile.getrname(nref), "length:", samfile.lengths[nref])
            refs[nref]=samfile.getrname(nref)
            ac.append((samfile.getrname(nref), ac_array(samfile.lengths[nref], paired),
                        insertion_data_structure(paired)))

        # Iterate over single reads
        for i, read in enumerate(samfile):
            # Max number of reads
            if i == max_reads:
                if VERBOSE >= 2:
                    print('Max reads reached:', max_reads)
                break

            # Skip unmapped, secondary and supplementary records and those
            # whose absolute insert size exceeds max_isize.
            if read.is_unmapped or np.abs(read.isize)>max_isize or read.is_secondary or read.is_supplementary:
                continue

            # Print output
            if (VERBOSE > 2) and (not ((i +1) % 10000)):
                print(i+1)

            # Select the count slice and insertion dict for this read type.
            # Read CIGARs (they should be clean by now)
            if paired:
                counts = ac[read.rname][1][int(read.is_read2),int(read.is_reverse)]
                insertion = ac[read.rname][2]
            else:
                counts = ac[read.rname][1][int(read.is_reverse)]
                insertion = ac[read.rname][2]

            # Per-base arrays: nucleotides, phred qualities and a mask that
            # is True for bases NOT covered by a primer.
            # NOTE(review): np.fromstring on text/binary input is deprecated
            # in recent NumPy releases -- confirm the NumPy version in use.
            seq = np.fromstring(read.seq, 'S1')
            qual = np.fromstring(read.qual, np.int8) - 33
            not_primer = np.ones_like(seq, 'bool')
            pos = read.pos
            # all legit reads should be FR or RF!
            if rev_primer_regions:
                # Reverse reads (or reads as long as the whole insert): if
                # the read end falls strictly inside a primer region, mask
                # the trailing bases that overlap it.
                if read.is_reverse or np.abs(read.isize)==seq.shape[0]:
                    read_end = pos + seq.shape[0]
                    for b,e in rev_primer_regions[refs[read.rname]]:
                        p_length = e-b
                        if read_end-b>0 and read_end-b<p_length:
                            not_primer[-(read_end-b):]=False
                            break

            if fwd_primer_regions:
                # Forward reads (or reads as long as the whole insert): if
                # the read start falls strictly inside a primer region, mask
                # the leading bases up to the primer end.
                if (not read.is_reverse) or np.abs(read.isize)==seq.shape[0]:
                    for b,e in fwd_primer_regions[refs[read.rname]]:
                        p_length = e-b
                        if pos-b>0 and pos-b<p_length:
                            not_primer[:e-pos]=False
                            break

            # Debugging hook, disabled:
            # if pos+len(seq)>7267:
            #     import ipdb;ipdb.set_trace()
            # Iterate over CIGARs, consuming seq/qual/not_primer from the
            # front and advancing pos along the reference.
            for ic, (block_type, block_len) in enumerate(read.cigar):
                if block_type==4: # softclip
                    seq = seq[block_len:]
                    qual = qual[block_len:]
                    # Note the difference here: the reported position starts
                    # after the softclip, hence not_primer is already
                    # aligned at its front and is shortened from the end.
                    not_primer = not_primer[:-block_len]
                    continue
                if block_type==5: # hard clip
                    continue

                # Check for pos: it should never exceed the length of the fragment
                # (disabled check:)
#                if (block_type in [0, 1, 2]) and (pos >= length):
#                    raise ValueError('Pos exceeded the length of the fragment')

                # Inline block
                if block_type == 0:
                    seqb = seq[:block_len]
                    qualb = qual[:block_len]
                    not_primerb = not_primer[:block_len]
                    # Increment counts for every allele at the positions
                    # where the base matches, passes the quality threshold
                    # and is not part of a primer.
                    for j, a in enumerate(alpha):
                        posa = ((seqb == a) & (qualb >= qual_min) & (not_primerb)).nonzero()[0]
                        if len(posa):
                            counts[j,pos + posa] += 1

                    # Chop off this block (skipped after the last CIGAR
                    # block, where nothing remains to be processed)
                    if ic != len(read.cigar) - 1:
                        seq = seq[block_len:]
                        qual = qual[block_len:]
                        not_primer = not_primer[block_len:]
                        pos += block_len

                # Deletion
                elif block_type == 2:
                    # Increment gap counts
                    counts[4, pos:pos + block_len] += 1
                    # Chop off pos, but not sequence
                    pos += block_len

                # Insertion
                # an insert @ pos 391 means that seq[:391] is BEFORE the insert,
                # THEN the insert, FINALLY comes seq[391:]
                elif block_type == 1:
                    seqb = seq[:block_len]
                    qualb = qual[:block_len]
                    # not_primerb is computed here but not used for insertions.
                    not_primerb = not_primer[:block_len]
                    # Accept only high-quality inserts
                    # NOTE(review): ndarray.tostring is deprecated in recent
                    # NumPy releases (tobytes is the replacement) -- confirm.
                    if (qualb >= qual_min).all():
                        if paired:
                            insertion[pos][seqb.tostring()][int(read.is_read2), int(read.is_reverse)] += 1
                        else:
                            insertion[pos][seqb.tostring()][int(read.is_reverse)] += 1

                    # Chop off seq, but not pos
                    if ic != len(read.cigar) - 1:
                        seq = seq[block_len:]
                        qual = qual[block_len:]
                        not_primer = not_primer[block_len:]

                # Other types of cigar? They are reported at high verbosity
                # and otherwise ignored.
                else:
                    if VERBOSE>2:
                        print("unrecognized CIGAR type:", read.cigarstring)
                    #raise ValueError('CIGAR type '+str(block_type)+' not recognized')

    return ac
Esempio n. 6
0
def estimateInsertSizeDistribution(bamfile,
                                   alignments=10000,
                                   n=10,
                                   method="picard",
                                   similarity_threshold=1.0,
                                   max_chunks=1000):
    '''estimate insert size from a subset of alignments in a bam file.

    Several methods are implemented.

    picard
        The method works analogous to picard by restricting the estimates
        to a core distribution. The core distribution is defined as all
        values that lie within n-times the median absolute deviation of
        the full data set.
    convergence
        The method works similar to ``picard``, but continues reading
        chunks of `alignments` until the mean of the chunk means
        stabilizes, at most `max_chunks` chunks are read, or the file is
        exhausted. The values returned are the median mean and median
        standard deviation encountered.

    The method `convergence` is suited to RNA-seq data, as insert sizes
    fluctuate significantly depending on the current region
    being looked at.

    Only mapped, non-duplicate, proper pairs are considered in the
    computation, and only the mate that is not flagged as read1 and has a
    positive template length is used, so each pair is counted once.

    Parameters
    ----------
    bamfile : string
       Filename of the bam file.
    alignments : int
       Number of alignments to read (per chunk for ``convergence``).
    n : int
       Width of the core distribution in multiples of the median
       absolute deviation.
    method : string
       Estimation method, ``picard`` or ``convergence``.
    similarity_threshold : float
       ``convergence`` stops once the mean of chunk means changes by
       less than this value between two chunks.
    max_chunks : int
       Maximum number of chunks of size `alignments` to be used
       in the convergence method.

    Returns
    -------
    mean : float
       Mean of insert sizes.
    stddev : float
       Standard deviation of insert sizes.
    npairs : int
       Number of read pairs used for the estimation

    Raises
    ------
    ValueError
       If `method` is unknown.

    '''

    assert isPaired(bamfile), \
        'can only estimate insert size from ' \
        'paired bam files'

    def get_core_distribution(inserts, n):
        # compute median absolute deviation
        raw_median = numpy.median(inserts)
        raw_median_dev = numpy.median(numpy.absolute(inserts - raw_median))

        # set thresholds
        threshold_min = max(0, raw_median - n * raw_median_dev)
        threshold_max = raw_median + n * raw_median_dev

        # define core distribution
        return inserts[numpy.logical_and(inserts >= threshold_min,
                                         inserts <= threshold_max)]

    def collect_inserts(reads):
        # One template length per usable pair: taking only the mate that is
        # not read1 and has a positive template length avoids double counting.
        return numpy.array([
            read.template_length for read in reads
            if read.is_proper_pair and not read.is_unmapped
            and not read.mate_is_unmapped and not read.is_read1
            and not read.is_duplicate and read.template_length > 0
        ])

    samfile = pysam.Samfile(bamfile)
    try:
        if method == "picard":

            inserts = collect_inserts(samfile.head(n=alignments))
            core = get_core_distribution(inserts, n)

            return numpy.mean(core), numpy.std(core), len(inserts)

        elif method == "convergence":

            means, stds, counts = [], [], []
            last_mean = 0
            # The loop is bounded by max_chunks so that it always terminates.
            for _ in range(max_chunks):
                # multiple_iterators=False is presumably what makes
                # successive calls continue from the current file position
                # -- confirm against the pysam documentation.
                chunk = list(samfile.head(n=alignments,
                                          multiple_iterators=False))
                if not chunk:
                    # End of file: nothing more to learn.
                    break
                inserts = collect_inserts(chunk)
                if len(inserts) == 0:
                    # A chunk without usable pairs would only contribute
                    # NaN values to the estimates.
                    continue
                core = get_core_distribution(inserts, n)
                means.append(numpy.mean(core))
                stds.append(numpy.std(core))
                counts.append(len(inserts))
                mean_core = get_core_distribution(numpy.array(means), 2)
                mm = numpy.mean(mean_core)
                if abs(mm - last_mean) < similarity_threshold:
                    break
                last_mean = mm

            return numpy.median(means), numpy.median(stds), sum(counts)
        else:
            raise ValueError("unknown method '%s'" % method)
    finally:
        # Release the file handle on every exit path.
        samfile.close()
def main(argv=sys.argv):
    """Command line entry point: split reads of a BAM file.

    Parses the options, opens the input BAM and an output BAM that uses the
    input as header template, and hands both to ``bam2bam_split_reads``
    together with the default quality score, maximum read length and output
    mode.

    Parameters:
        argv: command line arguments; defaults to ``sys.argv``.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-bam",
                      dest="input_bam_file",
                      type="string",
                      help="input bam file [%default]")

    parser.add_option("-o",
                      "--output-bam",
                      dest="output_bam_file",
                      type="string",
                      help="output bam file [%default].")

    parser.add_option("-r",
                      "--max-read-length",
                      dest="max_read_length",
                      type="int",
                      help="maximum read length [%default].")

    parser.add_option(
        "-m",
        "--output-mode",
        dest="output_mode",
        type="choice",
        choices=["buffered", "direct"],
        help="output mode for files. 'buffered' will output reads in correct "
        "sort order, 'direct' will require the output BAM file to be sorted separately."
        "[%default].")

    # NOTE(review): --region is accepted but never forwarded to
    # bam2bam_split_reads below -- confirm whether it should be.
    parser.add_option(
        "--region",
        dest="region",
        type="string",
        help=
        "genomic region, only split in BAM file within this region [%default]."
    )

    parser.set_defaults(
        input_bam_file="-",
        output_bam_file="-",
        max_read_length=100,
        default_quality_score=10,
        region=None,
        output_mode="buffered",
    )

    (options, args) = E.start(parser, argv)

    pysam_in = pysam.Samfile(options.input_bam_file, "rb")
    try:
        pysam_out = pysam.Samfile(options.output_bam_file, "wb",
                                  template=pysam_in)
        try:
            bam2bam_split_reads(
                pysam_in,
                pysam_out,
                default_quality_score=options.default_quality_score,
                max_read_length=options.max_read_length,
                output_mode=options.output_mode)
        finally:
            # Closing explicitly ensures the output BAM is finalised even
            # when splitting fails.
            pysam_out.close()
    finally:
        pysam_in.close()

    E.stop()
Esempio n. 8
0
def main():
    """Command line entry point: print a per-transcript TIN table.

    For every BAM file given with ``-i`` and every transcript yielded by
    ``genomic_positions`` for the gene model given with ``-r``, a TIN score
    is computed (0.0 when ``check_min_reads`` does not return True).  A
    tab-separated table with one row per transcript and one column per
    sample is written to stdout; progress and errors go to stderr.

    Sample names come from ``--names``; when the option is omitted the
    base names of the BAM files are used.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    # NOTE(review): the help text advertises directories and list files as
    # input, but only the comma-separated form is handled below.
    parser.add_option(
        "-i",
        "--input",
        action="store",
        type="string",
        dest="input_files",
        help=
        'Input BAM file(s). "-i" takes these input: 1) a single BAM file. 2) "," separated BAM files (no spaces allowed). 3) directory containing one or more bam files. 4) plain text file containing the path of one or more bam files (Each row is a BAM file path). All BAM files should be sorted and indexed using samtools. [required]'
    )

    parser.add_option(
        "-r",
        "--refgene",
        action="store",
        type="string",
        dest="ref_gene_model",
        help=
        "Reference gene model in BED format. Must be strandard 12-column BED file. [required]"
    )

    parser.add_option(
        "-c",
        "--minCov",
        action="store",
        type="int",
        dest="minimum_coverage",
        default=10,
        help="Minimum number of read mapped to a transcript. default=%default")

    parser.add_option(
        "-n",
        "--sample-size",
        action="store",
        type="int",
        dest="sample_size",
        default=100,
        help=
        "Number of equal-spaced nucleotide positions picked from mRNA. Note: if this number is larger than the length of mRNA (L), it will be halved until it's smaller than L. default=%default"
    )

    parser.add_option(
        "--names",
        dest="sample_names",
        action="store",
        type="string",
        help=
        "sample names, comma separated (no spaces allowed); number must match the number of provided bam_files"
    )

    parser.add_option(
        "-s",
        "--subtract-background",
        action="store_true",
        dest="subtract_bg",
        help=
        "Subtract background noise (estimated from intronic reads). Only use this option if there are substantial intronic reads."
    )

    (options, args) = parser.parse_args()

    # sys.stderr.write is used so the messages are emitted identically
    # under Python 2 and Python 3.
    log = sys.stderr.write

    if options.sample_size < 0:
        log("Number of nucleotide can't be negative\n")
        sys.exit(0)
    elif options.sample_size > 1000:
        log("Warning: '-n' is too large! Please try smaller '-n' valeu if program is running slow.\n")

    if not (options.input_files and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.ref_gene_model):
        log('\n\n' + options.ref_gene_model + " does NOT exists" + '\n' + '\n')
        parser.print_help()
        sys.exit(0)

    # if '-s' was set; done only after the gene model has been validated
    exon_ranges = None
    if options.subtract_bg:
        exon_ranges = union_exons(options.ref_gene_model)

    printlog("Get BAM file(s) ...")
    # Drop empty entries (e.g. from a trailing comma).
    bamfiles = [f for f in options.input_files.split(",") if f]

    if not bamfiles:
        log("No BAM file found, exit.\n")
        sys.exit(0)
    else:
        log("Total %d BAM file(s):\n" % len(bamfiles))
        for f in bamfiles:
            log("\t" + f + "\n")

    # --names is optional: fall back to the BAM file base names.
    if options.sample_names:
        names = options.sample_names.split(",")
    else:
        names = [os.path.basename(f) for f in bamfiles]
    if len(names) != len(bamfiles):
        log("[ERROR] Number of bam files does not match number of names\n")
        sys.exit(2)

    # print header
    sys.stdout.write("transcript")
    for name in names:
        sys.stdout.write("\t%s" % name)
    sys.stdout.write("\n")

    # transcript name -> list of TIN scores, one per BAM file in input order
    sample_TINS_per_transcript = {}

    for f in bamfiles:
        printlog("Processing " + f)

        samfile = pysam.Samfile(f, "rb")
        try:
            finish = 0
            for gname, i_chr, i_tx_start, i_tx_end, intron_size, pick_positions in genomic_positions(
                    refbed=options.ref_gene_model,
                    sample_size=options.sample_size):
                finish += 1

                if gname not in sample_TINS_per_transcript:
                    sample_TINS_per_transcript[gname] = []

                # check minimum reads coverage
                if check_min_reads(samfile, i_chr, i_tx_start, i_tx_end,
                                   options.minimum_coverage) is not True:
                    sample_TINS_per_transcript[gname].append(0.0)
                    continue

                # Start from zero noise for every transcript so that a
                # transcript without introns does not inherit the estimate
                # of the previous one.
                noise_level = 0.0
                # estimate background noise if '-s' was specified
                if options.subtract_bg:
                    intron_signals = estimate_bg_noise(
                        i_chr, i_tx_start, i_tx_end, samfile, exon_ranges)
                    if intron_size > 0:
                        noise_level = intron_signals / intron_size

                coverage = genebody_coverage(samfile, i_chr,
                                             sorted(pick_positions),
                                             noise_level)

                tin1 = tin_score(cvg=coverage, l=len(pick_positions))
                sample_TINS_per_transcript[gname].append(tin1)
                if finish % 500 == 0:
                    log(" %d transcripts finished\n" % (finish))
        finally:
            # Release the BAM handle even if a transcript fails.
            samfile.close()

    # print table
    for ex in sample_TINS_per_transcript:
        sys.stdout.write("%s\t%s\n" % (ex, "\t".join(
            map(str, sample_TINS_per_transcript[ex]))))
Esempio n. 9
0
is_dupe = 0x400  # flag bit marking a read as a PCR or optical duplicate


def reheader(in_sam, keepstr='dmel'):
    """Return a copy of ``in_sam.header`` keeping only matching references.

    The header is copied with ``.copy()`` and its ``'SQ'`` list is replaced
    by the entries whose ``'SN'`` value contains ``keepstr``; every other
    header section is carried over unchanged.
    """
    trimmed_header = in_sam.header.copy()
    kept_sequences = []
    for seq_entry in trimmed_header['SQ']:
        if keepstr in seq_entry['SN']:
            kept_sequences.append(seq_entry)
    trimmed_header['SQ'] = kept_sequences
    return trimmed_header


# Operate on each file named on the command line independently.
for fname in sys.argv[1:]:
    # Per-key pair of slots, both initially empty; the code that fills this
    # mapping is not part of this excerpt.
    data = defaultdict(lambda: [None, None])
    infile = pysam.Samfile(fname)
    # Output name: input name minus its last four characters (presumably the
    # ".bam" extension -- confirm) plus a "_rescued_unsorted.bam" suffix.
    # The four characters just before that extension are passed to reheader()
    # as the substring that reference names must contain to be kept.
    outfile = pysam.Samfile(fname[:-4] + '_rescued_unsorted.bam',
                            'wb',
                            header=reheader(infile, fname[-8:-4]))
    # Reference-name lists of the input and of the filtered output header.
    irefs = infile.references
    orefs = outfile.references
    maxval, start = get_bam_length(infile)  # For progress bar goodness

    # Progress bar scaled to (maxval - start) and labelled with the file name.
    pbar = ProgressBar(
        maxval=maxval - start,
        widgets=[fname, ': ',
                 Percentage(), ' ',
                 Bar(), ' ',
                 ETA(), ' '])
    pbar.start()
Esempio n. 10
0
import sys
import pysam
from uuid import uuid4

# Command line: <input BAM> <output BAM>.
inbamfn = sys.argv[1]
outbamfn = sys.argv[2]

inbam = pysam.Samfile(inbamfn, 'rb')
# The output BAM reuses the input file as its header template.
outbam = pysam.Samfile(outbamfn, 'wb', template=inbam)

# Original read name -> replacement name for reads whose mate is still
# pending (presumably filled in by the else branch below, which is cut off
# in this excerpt -- confirm).
paired = {}

# Counters: n = all reads seen, p = paired reads, w = reads written,
# m = reads written after their name was found in `paired`.
# `u` is not updated in the visible part of the script.
n = 0
p = 0
u = 0
w = 0
m = 0

# until_eof=True iterates the file in stored order rather than via an index.
for read in inbam.fetch(until_eof=True):
    n += 1
    if read.is_paired:
        p += 1
        if read.qname in paired:
            # Name already registered: reuse the stored replacement name,
            # drop the pending entry and write the renamed read.
            uuid = paired[read.qname]
            del paired[read.qname]
            read.qname = uuid
            outbam.write(read)
            w += 1
            m += 1
        else:
            # First time this name is seen: mint a fresh random name.
            newname = str(uuid4())
Esempio n. 11
0
def _count_splice_support(rec, beg, end):
    """Return ``(junction_hits, linear_hits)`` of one read at a splice site.

    Walks the read's CIGAR operations while tracking ``idx``, the offset of
    the current reference base relative to ``beg``.  Each operation adds at
    most one hit to each counter, so a read may contribute more than once.
    """
    junction_hits = 0
    linear_hits = 0
    idx = rec.reference_start - beg
    for ctag, leng in rec.cigar:
        is_linear = 0
        is_junction = 0
        if ctag == 0:
            # Aligned block: linear support if any base intersects the site.
            for _ in range(leng):
                if is_intersect(beg, end, beg + idx):
                    is_linear = 1
                idx += 1
        elif ctag == 2:
            # Deletion: consumes reference bases, supports nothing.
            idx += leng
        elif ctag == 3:
            # Skipped region: junction support only when the site lies
            # within two bases of either end of the skip.
            for i in range(leng):
                if is_intersect(beg, end, beg + idx):
                    if i < 2 or (leng - i) < 2:
                        is_junction = 1
                idx += 1
        # Every other operation (insertion, clipping, ...) leaves idx alone.
        junction_hits += is_junction
        linear_hits += is_linear
    return junction_hits, linear_hits


def main():
    """Annotate each reference splice site with junction/linear read counts.

    Reads the BAM given as first positional argument (indexing it with
    ``samtools index`` when no ``<bam>.bai`` exists), then, for every line of
    ``options.ref_Splice``, fetches the reads around the position in column 2
    and writes the input line followed by the junction count and the linear
    count to ``options.out_file``.

    Exits with status 1 after printing the help text when the BAM argument
    is missing.
    """
    parser = prepare_optparser()
    (options, args) = parser.parse_args()
    try:
        raw_bam = args[0]
    except IndexError:
        parser.print_help()
        sys.exit(1)
    ref_Splice = options.ref_Splice
    out_file = options.out_file

    if not os.path.isfile("%s.bai" % (raw_bam)):
        shell_info = "samtools index %s" % (raw_bam)
        sys.stderr.write(shell_info + "\n")
        # Argument list + no shell: paths with spaces or shell
        # metacharacters are passed through verbatim.  call() blocks until
        # samtools has finished.
        subprocess.call(["samtools", "index", raw_bam])

    f_sam = pysam.Samfile(raw_bam, "rb")
    try:
        with open(ref_Splice, "r") as f_refSplice:
            with open(out_file, "w") as f_out_file:
                for line in f_refSplice:
                    line = line.strip('\n')
                    f = line.split()
                    if not f:
                        # Blank line: nothing to annotate.
                        continue
                    chrom = f[0]
                    beg = int(f[1])
                    end = int(f[1]) + 1

                    cnt_junc = 0
                    cnt_linear = 0
                    for rec in f_sam.fetch(reference=chrom,
                                           start=beg - 1,
                                           end=beg + 1):
                        junc, linear = _count_splice_support(rec, beg, end)
                        cnt_junc += junc
                        cnt_linear += linear

                    f_out_file.write("%s\t%d\t%d\n" %
                                     (line, cnt_junc, cnt_linear))
    finally:
        f_sam.close()
Esempio n. 12
0
        ATCGmap = gzip.open(ATCGmap_fname, 'wb')

        CGmap_fname = options.CGmap_file or ((options.output_prefix or options.infilename) + '.CGmap.gz')
        CGmap = gzip.open(CGmap_fname, 'wb')

    # to improve the performance
    options_RM_CCGG = options.RM_CCGG
    options_read_no = options.read_no
    options_RM_SX = options.RM_SX
    options_RM_OVERLAP = options.RM_OVERLAP

    wiggle_fname = options.wig_file or ((options.output_prefix or options.infilename) + '.wig')
    wiggle = open(wiggle_fname, 'w')
    wiggle.write('type wiggle_0\n')

    sorted_input = pysam.Samfile(sorted_input_filename, 'rb')
    
    chrom = None
    nucs = ['A', 'T', 'C', 'G', 'N']
    ATCG_fwd = dict((n, 0) for n in nucs)
    ATCG_rev = dict((n, 0) for n in nucs)

    # Define the context and subcontext exchanging dictionary
    ContextTable={"CAA":"CHH", "CAC":"CHH", "CAG":"CHG", "CAT":"CHH",
                  "CCA":"CHH", "CCC":"CHH", "CCG":"CHG", "CCT":"CHH",
                  "CGA":"CG",  "CGC":"CG",  "CGG":"CG",  "CGT":"CG",
                  "CTA":"CHH", "CTC":"CHH", "CTG":"CHG", "CTT":"CHH"}
    #
    SubContextTable={"CAA":"CA", "CAC":"CA", "CAG":"CA", "CAT":"CA",
                     "CCA":"CC", "CCC":"CC", "CCG":"CC", "CCT":"CC",
                     "CGA":"CG", "CGC":"CG", "CGG":"CG", "CGT":"CG",
Esempio n. 13
0
def bam2bed(bam_file, bed_file, track_header, interact_file, q_cut):
    '''
    Convert a BAM file of fusion-supporting reads into BED and Interact files.

    Alignment blocks of reads that share fusion name, read name and
    chromosome are merged into a single BED row. The BED file is then read
    back to produce an Interact track: consecutive blocks of intra-chromosome
    rows are connected pairwise, and for inter-chromosome read pairs every
    block of one row is connected to every block of the other.

    Each read must carry the tags "SR" (1: supported by split read only,
    2: supported by read pair only, 3: supported by both) and "FN" (fusion
    name).

    Parameters
    ----------
    bam_file : str
        Name of input BAM file.

    bed_file : str
        Name of output BED file.

    track_header : bool
        If True, add track header line to the bed file.

    interact_file : str
        Name of the output Interact file.

    q_cut : int
        Mapping quality score cutoff. Accepted for interface compatibility;
        the body does not currently filter on it.
    '''
    # BED itemRgb colour per "SR" tag value; anything else is drawn black.
    rgb_by_read_type = {
        1: '255,128,0',  # split read : orange
        2: '0,102,204',  # paired read : blue
        3: '255,0,0',  # both : red
    }

    #key is "<fusion>_@_<read_id> <chrom>", value is list of "aligned gapless blocks"
    block_list = {}
    # "SR" tag per key, so each BED row is coloured from its own reads
    # instead of from whichever read happened to be last in the file.
    read_types = {}
    strandness = collections.defaultdict(list)
    interChrom_IDs = set()

    samfile = pysam.Samfile(bam_file, 'rb')
    try:
        for aligned_read in samfile:
            read_type = aligned_read.get_tag(tag="SR")
            fusion_name = aligned_read.get_tag(tag="FN")
            chrom = aligned_read.reference_name
            s = '-' if aligned_read.is_reverse else '+'

            read_id = aligned_read.query_name
            if read_id.endswith('/1') or read_id.endswith('/2'):
                read_id = read_id[:-2]  #remove last 2 chars

            # In the output BED file
            # for intra chrom fusions, pair reads will be merged into a single bed record
            # for inter chrom fusions, pair reads will each has its own separated record
            key = fusion_name + '_@_' + read_id + ' ' + chrom
            strandness[key].append(s)
            read_types[key] = read_type
            #e.g. [(46650521, 46650555), (46650631, 46650645)] for 34M76N14M
            block_list.setdefault(key, []).extend(aligned_read.get_blocks())

            # Only a paired read has a mate chromosome to compare against;
            # unpaired reads are never flagged as inter-chromosomal.
            if (aligned_read.is_paired
                    and chrom != aligned_read.next_reference_name):
                interChrom_IDs.add(fusion_name + '_@_' + read_id)
    finally:
        samfile.close()
    print("Done", file=sys.stderr)

    #key as above, value is list of sorted, non-consecutive "aligned gapless blocks"
    sorted_block_list = {}
    for k, v in block_list.items():
        #remove redundancy; sort coordinates (small to large)
        tmp = sorted(set(v), key=lambda tup: tup[0])
        tmp = [list(i) for i in tmp]  #tuple list to list list
        sorted_block_list[k] = list(consec(tmp))  #combine consecutive regions

    #####################################
    # convert sorted block list into BED
    #####################################
    with open(bed_file, 'w') as OUT:
        if track_header:
            print(
                'track name="Supporting reads of Intra-Chrom gene fusion" description="Alignment blocks from the paired reads were combined" visibility=2 itemRgb="On"',
                file=OUT)
        for key, blocks in sorted_block_list.items():
            (name, chrom) = key.split(' ')
            chromStart = blocks[0][0]
            chromEnd = blocks[-1][-1]
            score = 0

            # A single strand only when every contributing read agrees.
            strands = list(set(strandness[key]))
            strand = strands[0] if len(strands) == 1 else '.'

            itemRgb = rgb_by_read_type.get(read_types[key], '0,0,0')

            thickStart = chromStart
            thickEnd = chromEnd

            blockCount = len(blocks)
            blockSizes = ','.join([str(i[1] - i[0]) for i in blocks])
            blockStarts = ','.join([str(i[0] - chromStart) for i in blocks])

            bed_blocks = [
                chrom, chromStart, chromEnd, name, score, strand, thickStart,
                thickEnd, itemRgb, blockCount, blockSizes, blockStarts
            ]
            print("\t".join([str(i) for i in bed_blocks]), file=OUT)

    #####################################
    # convert BED into Interact
    #####################################
    score = 100
    value = 100.0
    #k is read name, value is list of per-row block lists [chrom, st, end, strand]
    InterChrom_list = collections.defaultdict(list)
    with open(interact_file, 'w') as OUT2:
        print(
            'track type=interact name="Supporting reads of gene fusions" description="Connection map of gene fusion reads" maxHeightPixels=200:200:50 visibility=full',
            file=OUT2)

        with open(bed_file, 'r') as bed_in:
            for line in bed_in:
                if line.startswith(('#', 'track', 'browser')):
                    continue

                f = line.strip().split()
                chrom = f[0]
                chrom_start = int(f[1])
                name = f[3]
                strand = f[5]
                blockSizes = [int(i) for i in f[10].strip(',').split(',')]
                blockStarts = [
                    chrom_start + int(i) for i in f[11].strip(',').split(',')
                ]
                exon_blocks = [[chrom, base, base + offset, strand]
                               for base, offset in zip(blockStarts, blockSizes)]

                if name in interChrom_IDs:
                    # Handled below, once both rows of the pair are known.
                    InterChrom_list[name].append(exon_blocks)
                    continue

                exp = 'Intra_Chrom_fusion'
                color = 'blue'
                # Connect each block to the next one on the same row.
                for indx in range(0, len(exon_blocks) - 1):
                    block1 = exon_blocks[indx]
                    block2 = exon_blocks[indx + 1]
                    chromStart = min(block1[1], block2[1])
                    chromEnd = max(block1[2], block2[2])

                    sourceChrom, sourceStart, sourceEnd, sourceStrand = block1
                    sourceName = sourceChrom + ':' + str(
                        sourceStart) + '-' + str(sourceEnd)

                    targetChrom, targetStart, targetEnd, targetStrand = block2
                    targetName = targetChrom + ':' + str(
                        targetStart) + '-' + str(targetEnd)

                    print("\t".join([
                        str(i)
                        for i in (chrom, chromStart, chromEnd, name, score,
                                  value, exp, color, sourceChrom, sourceStart,
                                  sourceEnd, sourceName, sourceStrand,
                                  targetChrom, targetStart, targetEnd,
                                  targetName, targetStrand)
                    ]),
                          file=OUT2)

        exp = 'Inter_Chrom_fusion'
        color = 'red'
        for name, v in InterChrom_list.items():
            if len(v) != 2:  #paired reads must have two block_list
                continue
            for block1 in v[0]:
                sourceChrom, sourceStart, sourceEnd, sourceStrand = block1
                sourceName = sourceChrom + ':' + str(sourceStart) + '-' + str(
                    sourceEnd)
                for block2 in v[1]:
                    targetChrom, targetStart, targetEnd, targetStrand = block2
                    targetName = targetChrom + ':' + str(
                        targetStart) + '-' + str(targetEnd)
                    # Column order (block2 fields first, then block1) is kept
                    # exactly as the existing output format has it.
                    print("\t".join([
                        str(i)
                        for i in (sourceChrom, sourceStart, sourceEnd, name,
                                  score, value, exp, color, targetChrom,
                                  targetStart, targetEnd, targetName,
                                  targetStrand, sourceChrom, sourceStart,
                                  sourceEnd, sourceName, sourceStrand)
                    ]),
                          file=OUT2)
Esempio n. 14
0
import string
import pysam

# ------------------------------------
# constants
# ------------------------------------

# ------------------------------------
# Misc functions
# ------------------------------------

# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main
# ------------------------------------

if __name__ == "__main__":
    # Convert every mapped read of a BAM/SAM file into one six-column,
    # tab-separated BED-style line on standard output.
    if len(sys.argv) == 1:
        sys.exit("Example:" + sys.argv[0] + " *.bam/sam >*.bed ")
    # Names ending in "bam" are opened as binary BAM, anything else as SAM.
    open_mode = 'rb' if sys.argv[1].endswith("bam") else 'r'
    sam = pysam.Samfile(sys.argv[1], open_mode)
    try:
        for read in sam:
            if read.is_unmapped:
                continue
            strand = "-" if read.is_reverse else "+"
            # write() instead of the print statement so the script runs on
            # both Python 2 and Python 3.
            sys.stdout.write("%s\t%d\t%d\t%s\t%d\t%s\n" % (
                sam.references[read.tid], read.pos, read.aend, read.qname,
                read.mapq, strand))
    finally:
        # Release the file handle even if a record fails to format.
        sam.close()
Esempio n. 15
0
    action='store',
    type=str,
    default=None)
# --cbed: optional path stored as a plain string in args.cbed
# (None when the option is not given).
parser.add_argument(
    '--cbed',
    dest='cbed',
    help=
    "Optional bedfile defining 1000 10kbp genomic windows for coverage calcualtion",
    metavar='FILE',
    action='store',
    type=str,
    default=None)

args = parser.parse_args()

# Main alignment file: args.bam is indexed, so it is a one-element list.
# Files with a ".cram" extension are opened in CRAM mode, all others as BAM.
bamFile = pysam.Samfile(
    args.bam[0],
    'rc' if os.path.splitext(args.bam[0])[-1] == '.cram' else 'rb')

# Optional separate alignment file for coverage.  args.cbam is handed to
# pysam as-is, i.e. it is a plain path string, so the extension test must
# look at the whole string: indexing it with [0] would inspect only the
# first character and never detect a CRAM file.
cbam = None
if args.cbam is not None:
    cbam = pysam.Samfile(
        args.cbam,
        'rc' if os.path.splitext(args.cbam)[-1] == '.cram' else 'rb')
cbed = args.cbed

coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats")
cstats = None
# Coverage is computed from the dedicated file when one was supplied.
cb = cbam if cbam is not None else bamFile
Esempio n. 16
0
def filter(cfg,bamFileIn,bamFileOut):
    """Copy well-behaved read pairs from ``bamFileIn`` to ``bamFileOut``.

    Secondary and supplementary alignments are ignored.  The remaining
    records are consumed two at a time as (R1, R2); a pair is dropped, and
    counted under the matching reason, when either read is unmapped, the
    reads map to different chromosomes or more than 2000 bp apart, both have
    the same orientation, either carries an "SA" tag, either has MAPQ < 17,
    or either spans fewer than 25 reference bases.  A summary of the counts
    is printed at the end.

    Parameters:
        cfg: object whose ``deleteLocalFiles`` attribute enables removal of
            the input BAM afterwards (only when ``bamFileIn`` has no
            directory component).
        bamFileIn: path of the input BAM; mates must be adjacent records.
        bamFileOut: path of the BAM to write.

    Raises:
        Exception: if a read is unpaired, its mate is not the next primary
            record (including when the file ends first), or R1/R2 flags are
            not in the expected order.
    """
    print("consensus filter: starting...")

    # get params
    deleteLocalFiles = cfg.deleteLocalFiles

    # constants for read pair accounting
    NUM_PRIMER_SIDE_NOT_MAPPED = 0
    NUM_RANDOM_SIDE_NOT_MAPPED = 1
    NUM_R1_R2_NOT_AT_SAME_LOCUS = 2
    NUM_R1_R2_SAME_ORIENTATION = 3
    NUM_SPLIT_ALIGNMENT = 4
    NUM_LOW_MAPQ = 5
    NUM_LT_25BP_ALIGNED = 6
    NUM_WRITTEN_OUT = 7
    NUM_METRICS_TOTAL = 8

    # open BAM read alignment files
    bamIn  = pysam.Samfile(bamFileIn , "rb")
    bamOut = pysam.Samfile(bamFileOut, "wb", template=bamIn)

    readPairCounts = [0] * NUM_METRICS_TOTAL
    try:
        # loop over read alignments
        for read in bamIn:

            # this is dangerous, but drop these for now
            if read.is_secondary or read.is_supplementary:
                continue

            # crash if read is not paired
            if not read.is_paired:
                print(read.qname)
                raise Exception("read not paired!")

            # this should be R1
            read1 = read

            # get mate, assuming mate is the next primary record in the BAM
            # file; a default is passed to next() so that running off the
            # end of the file is reported explicitly instead of leaking a
            # bare StopIteration.
            while True:
                read2 = next(bamIn, None)
                if read2 is None:
                    print(read1.qname)
                    raise Exception(
                        "BAM ended before the mate of the last read was found!")
                if not read2.is_secondary and not read2.is_supplementary:
                    break

            # debug check
            if read1.qname != read2.qname:
                print((read1.qname, read2.qname))
                raise Exception("read mate is not next in BAM record order!")

            # debug check
            if not read1.is_read1 or not read2.is_read2:
                raise Exception("R1/R2 mixed up!")

            # skip but count unmapped R1 reads, even if R2 mapped.  Need to look at these later...
            if read1.is_unmapped:
                readPairCounts[NUM_PRIMER_SIDE_NOT_MAPPED] += 1
                continue

            # skip but count unmapped R2 reads
            if read2.is_unmapped:
                readPairCounts[NUM_RANDOM_SIDE_NOT_MAPPED] += 1
                continue

            # skip reads not mapped to same chrom
            chrom1 = bamIn.getrname(read1.tid)
            chrom2 = bamIn.getrname(read2.tid)
            if chrom1 != chrom2:
                readPairCounts[NUM_R1_R2_NOT_AT_SAME_LOCUS] += 1
                continue

            # skip reads not mapped to same locus; a reverse-strand read is
            # located by its last aligned base, a forward read by its first
            locRead1 = int(read1.aend) - 1 if read1.is_reverse else read1.pos
            locRead2 = int(read2.aend) - 1 if read2.is_reverse else read2.pos
            if abs(locRead1 - locRead2) > 2000:
                readPairCounts[NUM_R1_R2_NOT_AT_SAME_LOCUS] += 1
                continue

            # skip pairs with odd alignment orientation
            if read1.is_reverse == read2.is_reverse:
                readPairCounts[NUM_R1_R2_SAME_ORIENTATION] += 1
                continue

            # drop read pair if either end has a supplementary split alignment
            if read1.has_tag("SA") or read2.has_tag("SA"):
                readPairCounts[NUM_SPLIT_ALIGNMENT] += 1
                continue

            # drop read pair if R1 or R2 read has low mapq
            if read2.mapq < 17 or read1.mapq < 17:
                readPairCounts[NUM_LOW_MAPQ] += 1
                continue

            # require some significant alignment to genome
            if read2.aend - read2.pos < 25 or read1.aend - read1.pos < 25:
                readPairCounts[NUM_LT_25BP_ALIGNED] += 1
                continue

            # output
            bamOut.write(read1)
            bamOut.write(read2)
            readPairCounts[NUM_WRITTEN_OUT] += 1
    finally:
        # done (or failed): release both handles either way
        bamOut.close()
        bamIn.close()

    # delete input BAM file if local (only reached when filtering succeeded)
    if deleteLocalFiles and len(os.path.dirname(bamFileIn)) == 0:
        os.remove(bamFileIn)

    # report drop totals
    print("{} read fragments dropped, primer side read not mapped".format(readPairCounts[NUM_PRIMER_SIDE_NOT_MAPPED]))
    print("{} read fragments dropped, random side read not mapped".format(readPairCounts[NUM_RANDOM_SIDE_NOT_MAPPED]))
    print("{} read fragments dropped, R1 and R2 not mapped to same locus".format(readPairCounts[NUM_R1_R2_NOT_AT_SAME_LOCUS]))
    print("{} read fragments dropped, FF or RR mapping orientation".format(readPairCounts[NUM_R1_R2_SAME_ORIENTATION]))
    print("{} read fragments dropped, split alignment".format(readPairCounts[NUM_SPLIT_ALIGNMENT]))
    print("{} read fragments dropped, low mapping quality MAPQ < 17".format(readPairCounts[NUM_LOW_MAPQ]))
    print("{} read fragments dropped, less than 25 bp aligned to genome".format(readPairCounts[NUM_LT_25BP_ALIGNED]))
    print("{} read fragments written".format(readPairCounts[NUM_WRITTEN_OUT]))
Esempio n. 17
0
def getNumberOfAlignments(bamfile):
    '''return number of alignments in bamfile.

    The value is the ``mapped`` attribute of the opened file. The file is
    closed again before returning, so repeated calls do not accumulate open
    handles.
    '''
    samfile = pysam.Samfile(bamfile)
    try:
        return samfile.mapped
    finally:
        samfile.close()
Esempio n. 18
0
def main():
    """Command-line entry point: split two species' alignments of one sample.

    Parses the arguments, name-sorts both input BAMs unless ``--no-sort`` is
    given, then walks the two files in parallel by read name.  Reads whose
    name occurs in only one file go to that species' "disambiguated" BAM;
    reads present in both are passed to ``disambiguate()`` and written to the
    winning species' BAM, or to both "ambiguous" BAMs on a tie.  A one-line
    summary of the three counts is written to ``<prefix>_summary.txt``.

    Exits with status 2 when an input file is missing, the aligner is not
    supported, or either input contains no reads.
    """
    description = """
disambiguate.py disambiguates between two organisms that have alignments
from the same source of fastq files. An example where this might be
useful is as part of an explant RNA/DNA-Seq workflow where an informatics
approach is used to distinguish between human and mouse RNA/DNA reads.

For reads that have aligned to both organisms, the functionality is based on
comparing quality scores from either Tophat of BWA. Read
name is used to collect all alignments for both mates (_1 and _2) and
compared between human and mouse alignments.

For Tophat (default, can be changed using option -a) and Hisat2, the sum of the tags XO,
NM and NH is evaluated and the lowest sum wins the paired end reads. For equal
scores (both mates, both species), the reads are assigned as ambiguous.

The alternative algorithm (STAR, bwa) disambiguates (for aligned reads) by tags
AS (alignment score, higher better), followed by NM (edit distance, lower 
better).

The output directory will contain four files:\n
...disambiguatedSpeciesA.bam: Reads that could be assigned to species A
...disambiguatedSpeciesB.bam: Reads that could be assigned to species B
...ambiguousSpeciesA.bam: Reads aligned to species A that also aligned \n\tto B but could not be uniquely assigned to either
...ambiguousSpeciesB.bam: Reads aligned to species B that also aligned \n\tto A but could not be uniquely assigned to either
..._summary.txt: A summary of unique read names assigned to species A, B \n\tand ambiguous.

Examples:
disambiguate.py test/human.bam test/mouse.bam
disambiguate.py -s mysample1 test/human.bam test/mouse.bam
   """

    parser = ArgumentParser(description=description,
                            formatter_class=RawTextHelpFormatter)
    parser.add_argument('A', help='Input BAM file for species A.')
    parser.add_argument('B', help='Input BAM file for species B.')
    parser.add_argument('-o',
                        '--output-dir',
                        default="disambres",
                        help='Output directory.')
    parser.add_argument('-i',
                        '--intermediate-dir',
                        default="intermfiles",
                        help='Location to store intermediate files')
    parser.add_argument(
        '-d',
        '--no-sort',
        action='store_true',
        default=False,
        help='Disable BAM file sorting. Use this option if the '
        'files have already been name sorted.')
    parser.add_argument(
        '-s',
        '--prefix',
        default='',
        help='A prefix (e.g. sample name) to use for the output '
        'BAM files. If not provided, the input BAM file prefix '
        'will be used. Do not include .bam in the prefix.')
    parser.add_argument('-a',
                        '--aligner',
                        default='tophat',
                        choices=('tophat', 'hisat2', 'bwa', 'star'),
                        help='The aligner used to generate these reads. Some '
                        'aligners set different tags.')
    args = parser.parse_args()

    # counters of read names assigned to species A ("human"), species B
    # ("mouse") and to neither
    numhum = nummou = numamb = 0
    # parse inputs
    humanfilename = args.A
    mousefilename = args.B
    samplenameprefix = args.prefix
    outputdir = args.output_dir
    intermdir = args.intermediate_dir
    disablesort = args.no_sort
    disambalgo = args.aligner
    supportedalgorithms = set(['tophat', 'hisat2', 'bwa', 'star'])

    # check existence of input BAM files
    if not (file_exists(humanfilename) and file_exists(mousefilename)):
        sys.stderr.write(
            "\nERROR in disambiguate.py: Two existing input BAM files "
            "must be specified as positional arguments\n")
        sys.exit(2)
    if len(samplenameprefix) < 1:
        # no prefix given: derive one per species from the input file names
        humanprefix = path.basename(humanfilename.replace(".bam", ""))
        mouseprefix = path.basename(mousefilename.replace(".bam", ""))
    else:
        if samplenameprefix.endswith(".bam"):
            samplenameprefix = samplenameprefix[0:samplenameprefix.rfind(
                ".bam"
            )]  # the above if is not stricly necessary for this to work
        humanprefix = samplenameprefix
        mouseprefix = samplenameprefix
    samplenameprefix = None  # clear variable
    if disambalgo.lower() not in supportedalgorithms:
        print(disambalgo +
              " is not a supported disambiguation scheme at the moment.")
        sys.exit(2)

    if disablesort:
        humanfilenamesorted = humanfilename  # assumed to be sorted externally...
        mousefilenamesorted = mousefilename  # assumed to be sorted externally...
    else:
        if not path.isdir(intermdir):
            makedirs(intermdir)
        humanfilenamesorted = path.join(
            intermdir, humanprefix + ".speciesA.namesorted.bam")
        mousefilenamesorted = path.join(
            intermdir, mouseprefix + ".speciesB.namesorted.bam")
        # an existing name-sorted file from an earlier run is reused
        if not path.isfile(humanfilenamesorted):
            pysam.sort("-n", "-m", "2000000000", "-o", humanfilenamesorted,
                       humanfilename)
        if not path.isfile(mousefilenamesorted):
            pysam.sort("-n", "-m", "2000000000", "-o", mousefilenamesorted,
                       mousefilename)

    # open the name-sorted inputs and the four output BAMs
    myHumanFile = pysam.Samfile(humanfilenamesorted, "rb")
    myMouseFile = pysam.Samfile(mousefilenamesorted, "rb")
    if not path.isdir(outputdir):
        makedirs(outputdir)
    myHumanUniqueFile = pysam.Samfile(path.join(
        outputdir, humanprefix + ".disambiguatedSpeciesA.bam"),
                                      "wb",
                                      template=myHumanFile)
    myHumanAmbiguousFile = pysam.Samfile(path.join(
        outputdir, humanprefix + ".ambiguousSpeciesA.bam"),
                                         "wb",
                                         template=myHumanFile)
    myMouseUniqueFile = pysam.Samfile(path.join(
        outputdir, mouseprefix + ".disambiguatedSpeciesB.bam"),
                                      "wb",
                                      template=myMouseFile)
    myMouseAmbiguousFile = pysam.Samfile(path.join(
        outputdir, mouseprefix + ".ambiguousSpeciesB.bam"),
                                         "wb",
                                         template=myMouseFile)
    summaryFile = open(path.join(outputdir, humanprefix + '_summary.txt'), 'w')

    #initialise
    # The built-in next() is used throughout instead of the .next() method,
    # which only exists under the Python 2 iterator protocol.
    try:
        nexthumread = next(myHumanFile)
        nextmouread = next(myMouseFile)
    except StopIteration:
        print("No reads in one or either of the input files")
        sys.exit(2)

    EOFmouse = EOFhuman = False
    prevHumID = '-+=RANDOMSTRING=+-'
    prevMouID = '-+=RANDOMSTRING=+-'
    # keep going until BOTH inputs are exhausted
    while not (EOFmouse and EOFhuman):
        while not (nat_cmp(nexthumread.qname, nextmouread.qname) == 0):
            # check order between current human and mouse qname (find a point where they're identical, i.e. in sync)
            while nat_cmp(
                    nexthumread.qname, nextmouread.qname
            ) > 0 and not EOFmouse:  # mouse is "behind" human, output to mouse disambiguous
                myMouseUniqueFile.write(nextmouread)
                if not nextmouread.qname == prevMouID:
                    nummou += 1  # increment mouse counter for unique only
                prevMouID = nextmouread.qname
                try:
                    nextmouread = next(myMouseFile)
                except StopIteration:
                    EOFmouse = True
            while nat_cmp(
                    nexthumread.qname, nextmouread.qname
            ) < 0 and not EOFhuman:  # human is "behind" mouse, output to human disambiguous
                myHumanUniqueFile.write(nexthumread)
                if not nexthumread.qname == prevHumID:
                    numhum += 1  # increment human counter for unique only
                prevHumID = nexthumread.qname
                try:
                    nexthumread = next(myHumanFile)
                except StopIteration:
                    EOFhuman = True
            if EOFhuman or EOFmouse:
                break
        # at this point the read qnames are identical and/or we've reached EOF
        humlist = list()
        moulist = list()
        if nat_cmp(nexthumread.qname, nextmouread.qname) == 0:
            humlist.append(nexthumread)
            nexthumread = read_next_reads(
                myHumanFile, humlist
            )  # read more reads with same qname (the function modifies humlist directly)
            if nexthumread is None:
                EOFhuman = True
            moulist.append(nextmouread)
            nextmouread = read_next_reads(
                myMouseFile, moulist
            )  # read more reads with same qname (the function modifies moulist directly)
            if nextmouread is None:
                EOFmouse = True

        # perform comparison to check mouse, human or ambiguous
        if len(moulist) > 0 and len(humlist) > 0:
            myAmbiguousness = disambiguate(humlist, moulist, disambalgo)
            if myAmbiguousness < 0:  # mouse
                nummou += 1  # increment mouse counter
                for myRead in moulist:
                    myMouseUniqueFile.write(myRead)
            elif myAmbiguousness > 0:  # human
                numhum += 1  # increment human counter
                for myRead in humlist:
                    myHumanUniqueFile.write(myRead)
            else:  # ambiguous
                numamb += 1  # increment ambiguous counter
                for myRead in moulist:
                    myMouseAmbiguousFile.write(myRead)
                for myRead in humlist:
                    myHumanAmbiguousFile.write(myRead)
        if EOFhuman:
            #flush the rest of the mouse reads
            while not EOFmouse:
                myMouseUniqueFile.write(nextmouread)
                if not nextmouread.qname == prevMouID:
                    nummou += 1  # increment mouse counter for unique only
                prevMouID = nextmouread.qname
                try:
                    nextmouread = next(myMouseFile)
                except StopIteration:
                    EOFmouse = True
        if EOFmouse:
            #flush the rest of the human reads
            while not EOFhuman:
                myHumanUniqueFile.write(nexthumread)
                if not nexthumread.qname == prevHumID:
                    numhum += 1  # increment human counter for unique only
                prevHumID = nexthumread.qname
                try:
                    nexthumread = next(myHumanFile)
                except StopIteration:
                    EOFhuman = True

    summaryFile.write(
        "sample\tunique species A pairs\tunique species B pairs\tambiguous pairs\n"
    )
    summaryFile.write(humanprefix + "\t" + str(numhum) + "\t" + str(nummou) +
                      "\t" + str(numamb) + "\n")
    summaryFile.close()
    myHumanFile.close()
    myMouseFile.close()
    myHumanUniqueFile.close()
    myHumanAmbiguousFile.close()
    myMouseUniqueFile.close()
    myMouseAmbiguousFile.close()
Esempio n. 19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Every read of the input bamfile that is flagged as properly paired is
    classified by its tags: ``XT == "U"`` puts the read name in the
    "unique" category and ``X0 == 1`` puts it in the "best" category.
    Depending on the options, a tabular report with the number of read
    names per category is written (``--output-report``) and/or all
    properly paired reads whose name is in either category are copied to
    a new bamfile (``--outfile``).

    Raises:
        ValueError: if the aligner is not "bwa", if neither a report nor
            an outfile was requested, or if no input bamfile was given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--filename",
                      dest="filename",
                      type="string",
                      help="input bamfile")

    parser.add_option("-a",
                      "--aligner",
                      dest="aligner",
                      type="string",
                      help="aligner that produced the bamfile "
                      "(only bwa is supported)",
                      default="bwa")

    parser.add_option("-r",
                      "--output-report",
                      type="string",
                      dest="report",
                      help="filename of the tabular report to write",
                      default="")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="filename of the bamfile to write with the "
                      "unique/best proper pairs",
                      default="")

    # add common options (-h/--help, ...) and parse command line
    options, _ = E.Start(parser, argv=argv, add_output_options=True)

    # Check the aligner is supported
    if options.aligner != "bwa":
        raise ValueError(
            "Currently only bwa is supported as aligner specific flags are used"
        )

    # Check that either a report or outfile name has been specified
    if options.report == "" and options.outfile == "":
        raise ValueError("Nothing to do")

    # Fail early with a clear message instead of deep inside pysam
    if not options.filename:
        raise ValueError("No bamfile given, use -f/--filename")

    # Analyse the bamfile: collect the names of properly paired reads per
    # category and count how many properly paired reads were seen.
    unique_names, best_names, unique_or_best_names = set(), set(), set()
    properly_paired = 0

    samfile = pysam.Samfile(options.filename, "rb")
    try:
        for read in samfile.fetch():
            if not read.is_proper_pair:
                continue

            tagd = dict(read.tags)
            name = read.qname

            # .get() so that reads lacking a tag are simply not classified
            # instead of aborting the whole run with a KeyError.
            is_unique = tagd.get("XT") == "U"
            is_best = tagd.get("X0") == 1

            if is_unique:
                unique_names.add(name)
            if is_best:
                best_names.add(name)
            if is_unique or is_best:
                unique_or_best_names.add(name)

            properly_paired += 1
    finally:
        samfile.close()

    # The counter is per read, so halve it to get the number of pairs.
    npp = properly_paired / 2

    E.info("No proper pairs: %s" % npp)

    # Write a tabular report if report name given
    if options.report != "":

        E.info("Writing report on no. proper pairs with unique/best reads")

        header = "\t".join(
            ["pair_criteria", "n_proper_pairs", "percent_proper_pairs"])

        categories = (("unique", unique_names),
                      ("best", best_names),
                      ("unique_or_best", unique_or_best_names))

        with open(options.report, "w") as report:
            report.write(header + "\n")
            for label, names in categories:
                n = len(names)
                # Without any proper pair the percentage is reported as 0
                # rather than dividing by zero.
                pc = float(n) / npp * 100 if npp else 0.0
                report.write("%s\t%i\t%.2f\n" % (label, n, pc))

    # Create new bam containing uniquely mapping read pairs
    # if outfile specified
    if options.outfile != "":

        E.info("Writing proper pairs with unique or best read to %s" %
               options.outfile)

        samfile = pysam.Samfile(options.filename, "rb")
        try:
            outbam = pysam.Samfile(options.outfile, "wb", template=samfile)
            try:
                for read in samfile.fetch():
                    if (read.is_proper_pair
                            and read.qname in unique_or_best_names):
                        outbam.write(read)
            finally:
                outbam.close()
        finally:
            samfile.close()
Esempio n. 20
0
# Parse the cluster input and collect the IDs of the valid clusters
# (min_size = 2): a 'C' line is kept when its third tab-separated column
# is greater than 1, and its second column is stored as the cluster ID.
cluster_ids = set()
total_lines = 0  # number of lines that are not 'C' lines
for cluster in clusters:
    if cluster.startswith('C'):
        split_line = cluster.split('\t')
        if int(split_line[2]) > 1:
            cluster_ids.add(split_line[1])
    else:
        total_lines += 1
clusters.seek(0)  # rewind so the input can be read again below
print(len(cluster_ids))

header = create_SAM_header(clusters)
# NOTE: `outsam` is rebound here from the output name to the open handle.
outsam = pysam.Samfile(outsam, 'wh', header=header)
n = 0  # number of 'S'/'H' lines seen so far
cluster_list = seq_in.keys()

for line in clusters:
    # clusters contains an ordered list of sequences which should be processed
    # Forward reads /1 or merged fastq reads always represent crick reads.
    # These have number below CRICK_MAX
    if not n % 1000 and n:
        # single-argument print() behaves the same on Python 2 and 3
        print("processed %s out of %s lines" % (n, total_lines))
        # make_ref(cluster_records)
    if line.startswith('S') or line.startswith('H'):
        n += 1
    fields = line.split('\t')
    # skip every line whose second column is not a valid cluster ID
    if fields[1] not in cluster_ids:
        continue
    cluster_instance = Cluster_obj(line)
Esempio n. 21
0
def crossmap_bam_file(mapping,
                      chainfile,
                      infile,
                      outfile_prefix,
                      chrom_size,
                      IS_size=200,
                      IS_std=30.0,
                      fold=3,
                      addtag=True):
    '''

	Description
	-----------
	Convert genome coordinates (in BAM/SAM format) between assemblies.
	BAM/SAM format: http://samtools.sourceforge.net/
	chrom_size is target chromosome size

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	chainfile : file
		Input chain format file.

	infile : file
		Input BAM, SAM or CRAM format file.

	outfile_prefix : str
		Output prefix.

	chrom_size : dict
		Chromosome size of the *target* assembly, used to build bam header.

	IS_size : int
		Average insert size of pair-end sequencing.

	IS_std : float
		Standard deviation of insert size.

	fold : float
		A mapped pair is considered as \"proper pair\" if both ends mapped to
		different strand and the distance between them is less than fold * stdev
		from the mean.

	addtag : bool
		if addtag is set to True, will add tags to each alignment:
			Q = QC (QC failed)
			N = unmapped (originally unmapped or originally mapped but failed
			    to liftover to new assembly)
			M = multiple mapped (alignment can be liftover to multiple places)
			U = unique mapped (alignment can be liftover to only 1 place)

		tags for pair-end sequencing include:
			QF: QC failed
			NN: both read1 and read2 unmapped
			NU: read1 unmapped, read2 unique mapped
			NM: read1 unmapped, read2 multiple mapped
			UN: read1 uniquely mapped, read2 unmap
			UU: both read1 and read2 uniquely mapped
			UM: read1 uniquely mapped, read2 multiple mapped
			MN: read1 multiple mapped, read2 unmapped
			MU: read1 multiple mapped, read2 unique mapped
			MM: both read1 and read2 multiple mapped

		tags for single-end sequencing include:
			QF: QC failed
			SN: unmapped
			SM: multiple mapped
			SU: uniquely mapped
	'''

    # determine the input file format (BAM, CRAM or SAM)
    file_type = ''
    if infile.lower().endswith('.bam'):
        file_type = 'BAM'
        comments = ['ORIGINAL_BAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rb')
        if len(samfile.header) == 0:
            print("BAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.cram'):
        file_type = 'CRAM'
        comments = ['ORIGINAL_CRAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rc')
        if len(samfile.header) == 0:
            print("CRAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.sam'):
        file_type = 'SAM'
        comments = ['ORIGINAL_SAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'r')
        if len(samfile.header) == 0:
            print("SAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    else:
        print(
            "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
            file=sys.stderr)
        sys.exit(1)
    comments.append('CHAIN_FILE=' + chainfile)

    sam_ori_header = samfile.header.to_dict()

    # chromosome ID style of the original BAM file
    chrom_style = sam_ori_header['SQ'][0]['SN']  # either 'chr1' or '1'

    # update chrom_size of target genome
    target_chrom_sizes = {}
    for n, l in chrom_size.items():
        target_chrom_sizes[update_chromID(chrom_style, n)] = l

    (new_header, name_to_id) = sam_header.bam_header_generator(
        orig_header=sam_ori_header,
        chrom_size=target_chrom_sizes,
        prog_name="CrossMap",
        prog_ver=__version__,
        format_ver=1.0,
        sort_type='coordinate',
        co=comments)

    # write to file
    if outfile_prefix is not None:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam',
                                     "wb",
                                     header=new_header)
            logging.info("Liftover BAM file \"%s\" to \"%s\"" %
                         (infile, outfile_prefix + '.bam'))
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam',
                                     "wb",
                                     header=new_header)
            logging.info("Liftover CRAM file \"%s\" to \"%s\"" %
                         (infile, outfile_prefix + '.bam'))
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.sam',
                                     "wh",
                                     header=new_header)
            logging.info("Liftover SAM file \"%s\" to \"%s\"" %
                         (infile, outfile_prefix + '.sam'))
        else:
            logging.error(
                "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'."
            )
            sys.exit(1)
    # write to screen
    else:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            logging.info("Liftover BAM file: %s" % infile)
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            logging.info("Liftover CRAM file: %s" % infile)
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile('-', "w", header=new_header)
            logging.info("Liftover SAM file: %s" % infile)
        else:
            logging.error(
                "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'."
            )
            sys.exit(1)
    QF = 0
    NN = 0
    NU = 0
    NM = 0
    UN = 0
    UU = 0
    UM = 0
    MN = 0
    MU = 0
    MM = 0
    SN = 0
    SM = 0
    SU = 0
    total_item = 0
    try:
        while (1):
            total_item += 1
            old_alignment = next(samfile)
            new_alignment = pysam.AlignedRead()  # create AlignedRead object

            new_alignment.query_name = old_alignment.query_name  # 1st column. read name.
            new_alignment.query_sequence = old_alignment.query_sequence  # 10th column. read sequence. all bases.
            new_alignment.query_qualities = old_alignment.query_qualities  # 11th column. read sequence quality. all bases.
            new_alignment.set_tags(old_alignment.get_tags())  # 12 - columns

            # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes
            # Thanks Wolfgang Resch <*****@*****.**> identified this bug and provided solution.
            try:
                rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
            except KeyError:
                pass
            else:
                new_alignment.set_tag("RG", str(rg), rgt)

            ## Pair-end sequencing
            if old_alignment.is_paired:
                new_alignment.flag = 0x1  #pair-end in sequencing
                if old_alignment.is_read1:
                    new_alignment.flag = new_alignment.flag | 0x40
                elif old_alignment.is_read2:
                    new_alignment.flag = new_alignment.flag | 0x80

                if old_alignment.is_qcfail:
                    new_alignment.flag = new_alignment.flag | 0x200
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6
                    new_alignment.next_reference_id = -1  #7
                    new_alignment.next_reference_start = 0  #8
                    new_alignment.template_length = 0  #9

                    QF += 1
                    if addtag: new_alignment.set_tag(tag="QF", value=0)
                    OUT_FILE.write(new_alignment)
                    continue
                #==================================
                # R1 originally unmapped
                #==================================
                elif old_alignment.is_unmapped:
                    new_alignment.flag = new_alignment.flag | 0x4  #2
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6

                    # R1 & R2 originally unmapped
                    if old_alignment.mate_is_unmapped:
                        new_alignment.next_reference_id = -1  #7
                        new_alignment.next_reference_start = 0  #8
                        new_alignment.template_length = 0  #9

                        NN += 1
                        if addtag: new_alignment.set_tag(tag="NN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
                    # R1 unmap, R2 is mapped
                    else:
                        try:
                            read2_chr = samfile.get_reference_name(
                                old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(
                                mapping, read2_chr, read2_start, read2_end,
                                read2_strand)
                        except:
                            read2_maps = None

                        #------------------------------------
                        # R1 unmapped, R2 failed to liftover
                        #------------------------------------
                        if read2_maps is None:
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            NN += 1
                            if addtag: new_alignment.set_tag(tag="NN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 unique
                        #------------------------------------
                        elif len(read2_maps) == 2:
                            # 2-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1]
                                [0]]  #recommend to set the RNAME of unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][
                                1]  #recommend to set the POS of unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag: new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 multiple
                        #------------------------------------
                        else:
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2-9
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag: new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue
                #==================================
                # R1 is originally mapped
                #==================================
                else:
                    try:
                        read1_chr = samfile.get_reference_name(
                            old_alignment.reference_id)
                        read1_strand = '-' if old_alignment.is_reverse else '+'
                        read1_start = old_alignment.reference_start
                        read1_end = old_alignment.reference_end
                        read1_maps = map_coordinates(mapping, read1_chr,
                                                     read1_start, read1_end,
                                                     read1_strand)
                    except:
                        read1_maps = None

                    if not old_alignment.mate_is_unmapped:
                        try:
                            read2_chr = samfile.get_reference_name(
                                old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(
                                mapping, read2_chr, read2_start, read2_end,
                                read2_strand)
                        except:
                            read2_maps = None
                    #------------------------------------
                    # R1 failed to liftover
                    #------------------------------------
                    if read1_maps is None:
                        # read2 is unmapped or failed to convertion
                        if old_alignment.mate_is_unmapped or (read2_maps is
                                                              None):
                            # col2 - col9
                            new_alignment.flag = new_alignment.flag | 0x4  #2
                            new_alignment.reference_id = -1  #3
                            new_alignment.reference_start = 0  #4
                            new_alignment.mapping_quality = 255  #5
                            new_alignment.cigartuples = old_alignment.cigartuples  #6
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            if addtag: new_alignment.set_tag(tag="NN", value=0)
                            NN += 1
                            OUT_FILE.write(new_alignment)
                            continue

                        # read2 is unique mapped
                        elif len(read2_maps) == 2:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1]
                                [0]]  #recommend to set the RNAME of unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][
                                1]  #recommend to set the POS of unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]  #start
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag: new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # read2 is multiple mapped
                        else:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = 255  # mapq not available
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]  #start
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag: new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                    #------------------------------------
                    # R1 uniquely mapped
                    #------------------------------------
                    elif len(read1_maps) == 2:
                        # col2 - col5
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        new_alignment.reference_id = name_to_id[read1_maps[1]
                                                                [0]]
                        new_alignment.reference_start = read1_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        if read1_maps[0][3] != read1_maps[1][
                                3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][
                                3]:  #  same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # R2 unmapped before or after conversion
                        if (old_alignment.mate_is_unmapped) or (read2_maps is
                                                                None):
                            #2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[
                                read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][
                                1]
                            new_alignment.template_length = 0

                            UN += 1
                            if addtag: new_alignment.set_tag(tag="UN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = abs(
                                new_alignment.reference_start -
                                new_alignment.next_reference_start
                            ) + old_alignment.reference_length
                            # 2
                            if (read2_maps[1][3] != read1_maps[1][3]) and (
                                    new_alignment.template_length <=
                                    IS_size + fold * IS_std) and (
                                        new_alignment.template_length >=
                                        IS_size - fold * IS_std):
                                new_alignment.flag = new_alignment.flag | 0x2

                            UU += 1
                            if addtag: new_alignment.set_tag(tag="UU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is multiple mapped
                        else:
                            # 2 (strand)
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100

                            #7-9
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            UM += 1
                            if addtag: new_alignment.set_tag(tag="UM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue
                    #------------------------------------
                    # R1 multiple mapped
                    #-----------------------------------
                    elif len(read1_maps) > 2 and len(read1_maps) % 2 == 0:
                        # 2
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        # 3-5
                        new_alignment.tid = name_to_id[read1_maps[1]
                                                       [0]]  #chrom
                        new_alignment.pos = read1_maps[1][1]  #start
                        new_alignment.mapq = 255

                        if read1_maps[0][3] != read1_maps[1][
                                3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][
                                3]:  #  same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # (1) R2 is unmapped
                        if (old_alignment.mate_is_unmapped) or (read2_maps is
                                                                None):
                            #2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[
                                read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MN += 1
                            if addtag: new_alignment.set_tag(tag="MN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (2) read2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MU += 1
                            if addtag: new_alignment.set_tag(tag="MU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (3) R2 is multiple mapped
                        else:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MM += 1
                            if addtag: new_alignment.set_tag(tag="MM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

            # Single-end sequencing
            else:
                # 7-9
                new_alignment.next_reference_id = -1
                new_alignment.next_reference_start = 0
                new_alignment.template_length = 0

                # (1) originally unmapped
                if old_alignment.is_unmapped:
                    # 2-6
                    new_alignment.flag = new_alignment.flag | 0x4
                    new_alignment.reference_id = -1
                    new_alignment.reference_start = 0
                    new_alignment.mapping_quality = 255
                    new_alignment.cigartuples = old_alignment.cigartuples

                    SN += 1
                    if addtag: new_alignment.set_tag(tag="SN", value=0)
                    OUT_FILE.write(new_alignment)
                    continue
                else:
                    new_alignment.flag = 0x0
                    read_chr = samfile.get_reference_name(
                        old_alignment.reference_id)
                    read_strand = '-' if old_alignment.is_reverse else '+'
                    read_start = old_alignment.reference_start
                    read_end = old_alignment.reference_end
                    read_maps = map_coordinates(mapping, read_chr, read_start,
                                                read_end, read_strand)

                    # (2) unmapped after liftover
                    if read_maps is None:
                        new_alignment.flag = new_alignment.flag | 0x4
                        new_alignment.reference_id = -1
                        new_alignment.reference_start = 0
                        new_alignment.mapping_quality = 255

                        SN += 1
                        if addtag: new_alignment.set_tag(tag="SN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (3) unique mapped
                    if len(read_maps) == 2:
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            try:
                                new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                              -1]  #reverse quality string
                            except:
                                new_alignment.query_qualities = []
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.reference_id = name_to_id[read_maps[1]
                                                                [0]]
                        new_alignment.reference_start = read_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        SU += 1
                        if addtag: new_alignment.set_tag(tag="SU", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (4) multiple mapped
                    if len(read_maps) > 2 and len(read_maps) % 2 == 0:
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.tid = name_to_id[read_maps[1][0]]
                        new_alignment.pos = read_maps[1][1]
                        new_alignment.mapq = old_alignment.mapq

                        SM += 1
                        if addtag: new_alignment.set_tag(tag="SM", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
    except StopIteration:
        logging.info("Done!")
    OUT_FILE.close()

    if outfile_prefix is not None:
        if file_type == "BAM" or file_type == "CRAM":
            try:
                logging.info(
                    'Sort "%s" and save as "%s"' %
                    (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam'))
                pysam.sort("-o", outfile_prefix + '.sorted.bam',
                           outfile_prefix + '.bam')
            except:
                logging.warning("output BAM file was NOT sorted")
            try:
                logging.info('Index "%s" ...' %
                             (outfile_prefix + '.sorted.bam'))
                pysam.index(outfile_prefix + '.sorted.bam',
                            outfile_prefix + '.sorted.bam.bai')
            except:
                logging.warning("output BAM file was NOT indexed.")

    print("\nTotal alignments:" + str(total_item - 1))
    print("\tQC failed: " + str(QF))
    if max(NN, NU, NM, UN, UU, UM, MN, MU, MM) > 0:
        print("\tPaired-end reads:")
        print("\t\tR1 unique, R2 unique (UU): " + str(UU))
        print("\t\tR1 unique, R2 unmapp (UN): " + str(UN))
        print("\t\tR1 unique, R2 multiple (UM): " + str(UM))

        print("\t\tR1 multiple, R2 multiple (MM): " + str(MM))
        print("\t\tR1 multiple, R2 unique (MU): " + str(MU))
        print("\t\tR1 multiple, R2 unmapped (MN): " + str(MN))

        print("\t\tR1 unmap, R2 unmap (NN): " + str(NN))
        print("\t\tR1 unmap, R2 unique (NU): " + str(NU))
        print("\t\tR1 unmap, R2 multiple (NM): " + str(NM))
    if max(SN, SU, SM) > 0:
        print("\tSingle-end reads:")
        print("\t\tUniquley mapped (SU): " + str(SU))
        print("\t\tMultiple mapped (SM): " + str(SM))
        print("\t\tUnmapped (SN): " + str(SN))
Esempio n. 22
0
import traceback
import os

if __name__ == "__main__":
    # Command line: one or more unmapped BAM files followed by the path of
    # the FASTQ file that receives every read they contain.
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", dest="verbose", action="store_true")
    parser.add_argument("unmapped_bam", nargs="+")
    parser.add_argument("fastq", help="e.g. test.fastq")

    args = parser.parse_args()

    try:
        with open(args.fastq, "w") as fastq_file:
            for bam_file in args.unmapped_bam:
                # Inputs that do not exist are skipped without a message.
                if not os.path.exists(bam_file):
                    continue
                try:
                    samfile = pysam.Samfile(bam_file,
                                            mode="rb",
                                            check_header=False,
                                            check_sq=False)
                    try:
                        # until_eof=True iterates the file in order without
                        # requiring an index.
                        for x in samfile.fetch(until_eof=True):
                            fastq_file.write("@%s\n%s\n+\n%s\n" %
                                             (x.qname, x.seq, x.qual))
                    finally:
                        # Close the handle even when reading or writing fails,
                        # so one bad file does not leak a descriptor.
                        samfile.close()
                except Exception:
                    # Best effort: report the failure and go on with the next
                    # input file.
                    traceback.print_exc()
    except Exception:
        traceback.print_exc()
Esempio n. 23
0
if (args.filter is not None):
    # One key per (stripped) line of the filter file; the value is always True.
    with open(args.filter) as filterFile:
        filt = {l.strip(): True for l in filterFile}

printed = {}

for regionStr in args.region:
    region = Tools.FormatRegion(regionStr)

    if (region is None):
        # Single-argument call form: prints the same text under Python 2 and
        # Python 3 (the bare print statement is a syntax error on Python 3).
        print("malformatted region " + ' '.join(regionStr))
        sys.exit(1)

    # Widen the region by args.slop on both sides.
    region = (region[0], region[1] - args.slop, region[2] + args.slop)

    samFile = pysam.Samfile(args.bam)
    nAlns = samFile.count(region[0], region[1], region[2])
    if (nAlns <= args.max or args.subsample):
        if (nAlns > args.max and args.subsample):
            lengths = []
            index = 0
            # Read names already taken from this region.
            tmpPrinted = {}
            for aln in samFile.fetch(region[0], region[1], region[2]):
                # Skip alignments below the mapping-quality threshold.
                if (aln.mapq < args.minqv):
                    continue
                # With args.primary set, skip records carrying flag bit 256.
                if (args.primary and aln.flag & 256 != 0):
                    continue
                if (aln.qname in tmpPrinted):
                    continue

                tmpPrinted[aln.qname] = True
def add_tag_to_bam(tmp_bam, out_bam, tag_map):
    """Copy ``tmp_bam`` to ``out_bam``, setting a ``BX`` tag on every record.

    Args:
        tmp_bam: path of the alignment file to read.
        out_bam: path of the BAM file to write; it reuses the input header.
        tag_map: mapping from read name (``qname``) to the ``BX`` tag value.

    Raises:
        KeyError: if a record's read name is not a key of ``tag_map``.
    """
    sam_handle = pysam.Samfile(tmp_bam)
    try:
        out_handle = pysam.Samfile(out_bam, 'wb', template=sam_handle)
        try:
            for rec in sam_handle:
                rec.set_tag('BX', tag_map[rec.qname])
                out_handle.write(rec)
        finally:
            # Closing finalises the output; without it the BAM may be left
            # unflushed.
            out_handle.close()
    finally:
        sam_handle.close()
Esempio n. 25
0
Date : March 18, 2016

Author : Heather Landry

Remove reads in a BAM file that result from PCR duplicates. This script records every molecular barcode
together with the coordinate at which it was seen. For every BAM record, if the barcode/coordinate pair has
not been seen previously, the record is written to the output file; if the pair has been seen before, the
record is not written.

use : python removePCRdupsFromBAM.py    iBAM (input BAM file with only unique alignments) [1]
                                        oBAM (output BAM file containing only non duplicated reads) [2]
                                            
"""
import sys, pysam, os, numpy, re

# Input BAM is argv[1]; the output BAM (argv[2]) reuses the input's header.
iBAM = pysam.Samfile(sys.argv[1], 'rb')
oBAM = pysam.Samfile(sys.argv[2], 'wb', template=iBAM)

# Keys seen so far; presumably barcode/position combinations -- TODO confirm
# against the part of the loop that is not visible here.
MB = set()

# read through starting bam file
for read in iBAM:
    # The molecular barcode is the text that follows '_MolecularBarcode:' in
    # the read name; a name without that marker raises IndexError.
    mb = read.qname.split('_MolecularBarcode:')[1]
    chrom = iBAM.getrname(read.tid)
    
    # selecting the 3' position for pos strand
    # NOTE(review): reverse-aligned reads are labelled 'pos' here, presumably
    # because the sequenced read is antisense to the original molecule --
    # confirm with the library protocol.
    if read.is_reverse:
        start = read.aend
        std='pos'
    
    # selecting the 3' position for neg strand
Esempio n. 26
0
    def do_local_assembly(self, root_ctg, asmrootdir_path):
        """Locally assemble barcoded reads around ``root_ctg`` and merge them.

        The seed contig is first checked for sufficient read coverage and
        barcode count. If it fails the check, an empty
        ``local-asm-merged.fa`` is created in ``asmrootdir_path`` and the
        method returns. Otherwise local assembly candidates are generated
        and assembled, every output contig of at least 2000 bases is written
        to ``local-asm-merged.fa``, and an empty ``pass`` marker file is
        created.

        Args:
            root_ctg: name of the seed contig.
            asmrootdir_path: directory that receives the merged FASTA and the
                ``pass`` marker.
        """
        self.logger.log(
            'assembling barcoded reads for seed {}'.format(root_ctg))

        ctg_size_map = util.get_fasta_sizes(self.options.ctgfasta_path)

        # check enough read/barcode coverage to warrant denovo assembly
        numreads = 0
        bcodes = set()
        fhandle = pysam.Samfile(self.options.reads_ctg_bam_path, 'rb')
        try:
            for read in fhandle.fetch(root_ctg):
                if read.is_unmapped or read.mapq < 10:
                    continue
                numreads += 1
                bcode = util.get_barcode(read)
                if bcode is not None:
                    bcodes.add(bcode)
        finally:
            # Release the BAM handle even if fetching fails.
            fhandle.close()
        size = ctg_size_map[root_ctg]
        # NOTE(review): 95 is presumably the per-read length in bases -- confirm.
        cov = 95. * numreads / size
        if cov < 10. or len(bcodes) < 30.:
            self.logger.log(
                'seed {} contig does not have high enough coverage'.format(
                    root_ctg))
            self.logger.log('  - {} bcodes, {}x'.format(len(bcodes), cov))
            out_path = os.path.join(asmrootdir_path, 'local-asm-merged.fa')
            util.touch(out_path)
            return

        # create a local assembler for this root contig
        asm = LocalAssembler(
            root_ctg,
            self.options.ctgfasta_path,
            self.options.reads_ctg_bam_path,
            self.options.input_fqs,
            asmrootdir_path,
            self.logger,
        )

        self.logger.log('determing local assemblies')
        local_asms = asm.gen_local_cands()
        self.logger.log('  - found {} candidates'.format(len(local_asms)))

        # do not locally assemble with other seeds that are lexicographically
        # smaller than the root. these will get run in other bins
        seed_ctgs = {
            c for c in ctg_size_map
            if c > root_ctg and ctg_size_map[c] >= MIN_SEED_SIZE
        }

        self.logger.log('performing local assemblies')
        local_asm_results = asm.assemble(local_asms, filt_ctgs=seed_ctgs)
        self.logger.log('  - finished {}'.format(len(local_asms)))

        # merge output contigs from local assemblies
        self.logger.log('merge long output contigs from local assemblies')
        mergedasm_path = os.path.join(asmrootdir_path, 'local-asm-merged.fa')
        total_asm_contigs = 0
        total_asm_bp = 0
        with open(mergedasm_path, 'w') as fout:
            for i, (local_asm, contig_path) in enumerate(local_asm_results):
                if contig_path is None:
                    self.logger.log(
                        'contig path for local asm {} not generated'.format(
                            str(local_asm)))
                    continue
                fasta = pysam.FastaFile(contig_path)
                try:
                    # Longest contigs first, so the loop can stop at the first
                    # one below the 2000-base cutoff.
                    for contig in sorted(
                            fasta.references,
                            key=fasta.get_reference_length,
                            reverse=True,
                    ):
                        seq = str(fasta.fetch(contig).upper())
                        if len(seq) < 2000:
                            break
                        total_asm_contigs += 1
                        total_asm_bp += len(seq)
                        link_name = local_asm.link_ctg
                        if link_name is None:
                            link_name = 'seed'
                        fout.write('>{}.{}${}.{}\n'.format(
                            local_asm.root_ctg, link_name, contig, i))
                        fout.write(str(seq) + '\n')
                finally:
                    # One FASTA handle per local assembly: close each one.
                    fasta.close()

        self.logger.log('  - {} contigs covering {} bases'.format(
            total_asm_contigs, total_asm_bp))
        pass_path = os.path.join(asmrootdir_path, 'pass')
        util.touch(pass_path)
Esempio n. 27
0
def processReads(samfile_path, exonTrees, repeatTrees, chimericBedFile):
	"""Classify read pairs of a BAM file by exon / repeat overlap.

	Only first-in-pair reads that are paired, whose mate is on the same
	reference, whose mapping quality is above zero and that lie on
	chromosomes 1-22 or X are considered.

	Args:
		samfile_path: path of the BAM file to read.
		exonTrees: mapping from chromosome name (without the "chr" text) to
			an object whose findRange([start, end]) returns overlapping exons.
		repeatTrees: same layout as exonTrees, for repeats.
		chimericBedFile: writable file object that receives one BED line per
			chimeric pair (one read on an exon, the other on a repeat), or
			0 to disable that output.

	Returns:
		dict mapping (exon, repeat) result pairs to a list holding one
		two-letter type code per supporting read pair. Each letter is
		"D" (exon and repeat), "E" (exon), "R" (repeat) or "." (neither),
		and the two letters are sorted.

	Raises:
		KeyError: if exonTrees or repeatTrees has no entry for a chromosome
			that passes the filter.
	"""
	print("Processing all reads",file=sys.stderr)
	sys.stderr.flush()

	# Accepted chromosome names, built once. Exact membership replaces the
	# old substring test against str(list), which also let through names
	# such as "" or "0". chrY is deliberately left out, as before.
	valid_chroms = set(str(n) for n in range(1, 23))
	valid_chroms.add('X')

	localResults = {}

	# Input Bam File
	samfile = pysam.Samfile( samfile_path, "rb" )
	try:
		# Go through every read in the bam file
		for read in samfile.fetch():

			# Keep only read pairs where:
				# this is read 1 of a pair
				# the mate is on the same chromosome
				# map quality is greater than zero (not multi-mapping)
			if not (read.is_read1 and read.is_paired and read.tid==read.rnext and int(read.mapq)>0):
				continue

			# Chromosome name without the "chr" text ('chr3' --> '3')
			chrom = samfile.getrname(read.tid).replace("chr", "")

			# Skip reads that are not on a canonical chromosome
			if chrom not in valid_chroms:
				continue

			# Get start coordinates for both reads
			start1 = read.pos
			start2 = read.mpos

			# Get end coordinates for both reads
			# NOTE(review): the mate's end is derived from read 1's length
			# because only read 1 is at hand here; spliced alignments are
			# not taken into account.
			end1 = start1 + read.rlen
			end2 = start2 + read.rlen

			# This raises KeyError if there is no exon / repeat information
			# for the chromosome.
			exon_results1 = exonTrees[chrom].findRange([start1,end1])
			exon_results2 = exonTrees[chrom].findRange([start2,end2])
			repeat_results1 = repeatTrees[chrom].findRange([start1,end1])
			repeat_results2 = repeatTrees[chrom].findRange([start2,end2])

			# Get TRUE/FALSE if reads 1/2 intersect with exons or repeats
			e1 = (len(exon_results1) > 0)
			e2 = (len(exon_results2) > 0)
			r1 = (len(repeat_results1) > 0)
			r2 = (len(repeat_results2) > 0)

			# Classify read1 as D/E/R/.
			if (e1 and r1):
				type1 = "D"
			elif (e1):
				type1 = "E"
			elif (r1):
				type1 = "R"
			else:
				type1 = "."

			# Classify read2 as D/E/R/.
			if (e2 and r2):
				type2 = "D"
			elif (e2):
				type2 = "E"
			elif (r2):
				type2 = "R"
			else:
				type2 = "."

			# Sort (so "RE" becomes "ER")
			pair_type = "".join(sorted(type1 + type2))

			# Chimeric pair: one read on an exon, the other on a repeat
			if chimericBedFile != 0 and ((e1 and r2) or (r1 and e2)):
				feature_start = min(start1, start2)
				feature_end = max(end1, end2)
				gap = abs(start1-start2)
				line = str(chrom) + "\t" + str(feature_start) + "\t" + str(feature_end) + "\tchimericread\t960\t.\t" + str(feature_start) + "\t" + str(feature_end) + "\t0,0,250\t2\t" + str(read.rlen) + "," + str(read.rlen) + "\t0," + str(gap)
				chimericBedFile.write(line + "\n")

			# Pair exon hits of one read with repeat hits of the other
			pairs = list(zip(exon_results1, repeat_results2)) + list(zip(exon_results2, repeat_results1))

			# Use these (exonID,repeatID) pairs as key to dictionary
			# and store read type in dictionary
			for p in pairs:
				localResults.setdefault(p, []).append(pair_type)
	finally:
		# Release the BAM handle even if a lookup fails part-way through
		samfile.close()

	return localResults
Esempio n. 28
0
def main():
    """Compute the transcript integrity number (TIN) for every BAM file.

    Parses the command line, validates it, and for each input BAM file
    writes two files into the current directory, named after the BAM file:
    ``<name>.summary.txt`` (mean, median and standard deviation of the TIN
    over all transcripts) and ``<name>.tin.xls`` (one row per transcript).
    Transcripts that fail the minimum-coverage check get a TIN of 0.0 in the
    per-transcript file and are left out of the summary statistics.

    The process exits with status 0 after printing a message or the help
    text when the options are invalid or no BAM file is found.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input",
        action="store",
        type="string",
        dest="input_files",
        help=
        'Input BAM file(s). "-i" takes these input: 1) a single BAM file. 2) "," separated BAM files (no spaces allowed). 3) directory containing one or more bam files. 4) plain text file containing the path of one or more bam files (Each row is a BAM file path). All BAM files should be sorted and indexed using samtools. [required]'
    )
    parser.add_option(
        "-r",
        "--refgene",
        action="store",
        type="string",
        dest="ref_gene_model",
        help=
        "Reference gene model in BED format. Must be strandard 12-column BED file. [required]"
    )
    parser.add_option(
        "-c",
        "--minCov",
        action="store",
        type="int",
        dest="minimum_coverage",
        default=10,
        help="Minimum number of read mapped to a transcript. default=%default")
    parser.add_option(
        "-n",
        "--sample-size",
        action="store",
        type="int",
        dest="sample_size",
        default=100,
        help=
        "Number of equal-spaced nucleotide positions picked from mRNA. Note: if this number is larger than the length of mRNA (L), it will be halved until it's smaller than L. default=%default"
    )
    parser.add_option(
        "-s",
        "--subtract-background",
        action="store_true",
        dest="subtract_bg",
        help=
        "Subtract background noise (estimated from intronic reads). Only use this option if there are substantial intronic reads."
    )
    (options, args) = parser.parse_args()

    # sys.stderr.write / file.write are used instead of "print >>" so the
    # function runs unchanged on Python 2 and Python 3.
    if options.sample_size < 0:
        sys.stderr.write("Number of nucleotide can't be negative\n")
        sys.exit(0)
    elif options.sample_size > 1000:
        sys.stderr.write(
            "Warning: '-n' is too large! Please try smaller '-n' valeu if program is running slow.\n"
        )

    if not (options.input_files and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.ref_gene_model):
        sys.stderr.write('\n\n' + options.ref_gene_model + " does NOT exists" +
                         '\n\n')
        parser.print_help()
        sys.exit(0)

    # if '-s' was set. Done only after the gene model has been validated:
    # reading it first crashed when '-r' was missing or pointed nowhere.
    exon_ranges = None
    if options.subtract_bg:
        exon_ranges = union_exons(options.ref_gene_model)

    printlog("Get BAM file(s) ...")
    bamfiles = sorted(getBamFiles.get_bam_files(options.input_files))

    if len(bamfiles) <= 0:
        sys.stderr.write("No BAM file found, exit.\n")
        sys.exit(0)
    else:
        sys.stderr.write("Total %d BAM file(s):\n" % len(bamfiles))
        for f in bamfiles:
            sys.stderr.write("\t" + f + "\n")

    for f in bamfiles:
        printlog("Processing " + f)

        # Drop only a trailing "bam" so "x.bam" -> "x."; replacing every
        # occurrence mangled names that contain "bam" elsewhere.
        base_name = os.path.basename(f)
        prefix = base_name[:-3] if base_name.endswith('bam') else base_name

        with open(prefix + 'summary.txt', 'w') as SUM, \
                open(prefix + 'tin.xls', 'w') as OUT:
            SUM.write("\t".join(
                ['Bam_file', 'TIN(mean)', 'TIN(median)', 'TIN(stdev)']) + "\n")
            OUT.write("\t".join(
                ["geneID", "chrom", "tx_start", "tx_end", "TIN"]) + "\n")

            samfile = pysam.Samfile(f, "rb")
            try:
                sample_TINs = []  # sample level TIN, values are from different genes
                finish = 0
                for gname, i_chr, i_tx_start, i_tx_end, intron_size, pick_positions in genomic_positions(
                        refbed=options.ref_gene_model,
                        sample_size=options.sample_size):
                    finish += 1

                    # check minimum reads coverage
                    if check_min_reads(samfile, i_chr, i_tx_start, i_tx_end,
                                       options.minimum_coverage) is not True:
                        OUT.write('\t'.join([
                            str(i)
                            for i in (gname, i_chr, i_tx_start, i_tx_end, 0.0)
                        ]) + "\n")
                        continue

                    # Reset for every transcript so an intron-less transcript
                    # does not inherit the previous transcript's noise.
                    noise_level = 0.0

                    # estimate background noise if '-s' was specified
                    if options.subtract_bg:
                        intron_signals = estimate_bg_noise(
                            i_chr, i_tx_start, i_tx_end, samfile, exon_ranges)
                        if intron_size > 0:
                            noise_level = intron_signals / intron_size

                    coverage = genebody_coverage(samfile, i_chr,
                                                 sorted(pick_positions),
                                                 noise_level)

                    tin1 = tin_score(cvg=coverage, l=len(pick_positions))
                    sample_TINs.append(tin1)
                    OUT.write('\t'.join([
                        str(i)
                        for i in (gname, i_chr, i_tx_start, i_tx_end, tin1)
                    ]) + "\n")
                    # Progress counter, rewritten in place via '\r'.
                    sys.stderr.write(" %d transcripts finished\r" % (finish))

                SUM.write("\t".join([
                    str(i) for i in (os.path.basename(f), mean(sample_TINs),
                                     median(sample_TINs), std(sample_TINs))
                ]) + "\n")
            finally:
                # Release the BAM handle even if a transcript fails.
                samfile.close()
Esempio n. 29
0
def main(args, outs):
    """Convert one gzipped read chunk into unaligned BAM or FASTQ.

    The chunk (``args.read_chunk``) is read through ``gunzip`` as records of
    nine lines: header, R1 sequence, R1 qualities, R2 sequence, R2 qualities,
    barcode, barcode qualities, sample index, sample index qualities. The
    barcode line is either the raw barcode alone or
    ``processed,raw``. The first ``args.trim_length`` bases of R1 (and their
    qualities) are split off and stored as tags.

    Args:
        args: stage arguments; reads ``trim_length``, ``read_groups``,
            ``output_format`` ("bam" or "fastq") and ``read_chunk``.
        outs: stage outputs; reads ``barcoded_unaligned`` or ``barcoded``
            (output path) and sets ``num_pairs`` and ``correct_bc_pairs``.
    """
    def read_clusters(it):
        '''Yield (barcode, fields) per 9-line record; stop at end of input.'''
        def field():
            return str(next(it).strip())

        while True:
            # next() works on Python 2 and 3. StopIteration is caught here
            # because letting it escape a generator is an error under
            # PEP 479; a trailing partial record is dropped, as before.
            try:
                head = field()
                seq1 = field()
                q1 = field()
                seq2 = field()
                q2 = field()
                bc_line = field()
                bcq = field()
                si = field()
                siq = field()
            except StopIteration:
                return

            trim1 = seq1[:args.trim_length]
            seq1 = seq1[args.trim_length:]
            trim_q1 = q1[:args.trim_length]
            q1 = q1[args.trim_length:]

            # "processed,raw" when the barcode was corrected, else raw only.
            bc_parts = bc_line.split(',')
            if len(bc_parts) > 1:
                bc = bc_parts[0]
                rx = bc_parts[1]
            else:
                bc = None
                rx = bc_parts[0]

            lines = [
                head, seq1, q1, seq2, q2, rx, bc, bcq, si, siq, trim1, trim_q1
            ]
            yield (bc, lines)

    try:
        version = martian.get_pipelines_version()
    except NameError:
        version = 'unknown'

    def make_rg_header(packed_rg_string):
        '''Make the RG header, matching how it's done in Lariat.'''
        result = packed_rg_string.split(':')
        if len(result) != 5:
            raise Exception(
                "RG string must have this format - sample_id:library_id:gem_group:flowcell:lane"
            )
        sample_id, library_id, gem_group, flowcell, lane = result
        return {
            'ID': packed_rg_string,
            'SM': sample_id,
            'LB': library_id,
            'PU': gem_group,
            'PL': 'ILLUMINA'
        }

    def unmapped_read(qname, seq, qual, tags, first_in_pair):
        '''Build an unmapped record for one mate of a pair.'''
        rec = pysam.AlignedRead()
        rec.qname = qname
        # Sequence is assigned before the qualities, as in the original code.
        rec.seq = seq
        rec.qual = qual

        rec.is_unmapped = True
        if first_in_pair:
            rec.is_read1 = True
        else:
            rec.is_read2 = True
        rec.tid = -1
        rec.pos = -1
        rec.mapq = 0
        rec.cigar = [(4, len(seq))]

        rec.mrnm = -1
        rec.mpos = -1
        rec.tlen = -1

        rec.tags = tags
        return rec

    header = {
        'HD': {
            'VN': '1.3',
            'SO': 'unknown'
        },
        'RG': [make_rg_header(rg_string) for rg_string in args.read_groups],
        'PG': [{
            'ID': 'make_unaligned_bam',
            'PN': '10X longranger/make_unaligned_bam',
            'VN': version
        }],
        'CO': [
            '10x_bam_to_fastq:R1(RX:QX,TR:TQ,SEQ:QUAL)',
            '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)'
        ]
    }

    if args.output_format == "bam":
        out_bam = pysam.Samfile(outs.barcoded_unaligned,
                                mode='wb',
                                header=header)
        out_fastq = None

    elif args.output_format == "fastq":
        out_fastq = open(outs.barcoded, 'w')
        out_bam = None

    else:
        martian.exit("MAKE_UNALIGNED_OUPUT: invalid output format: '%s'" %
                     args.output_format)

    def wfq(head, seq, qual):
        out_fastq.write(head)
        out_fastq.write("\n")
        out_fastq.write(seq)
        out_fastq.write("\n+\n")
        out_fastq.write(qual)
        out_fastq.write("\n")

    # Open FASTQ input chunk
    proc = subprocess.Popen(["gunzip", "--stdout", args.read_chunk],
                            stdout=subprocess.PIPE)
    reader = proc.stdout

    num_pairs = 0
    correct_bc_pairs = 0

    for (bc, fields) in read_clusters(reader):

        (head, seq1, q1, seq2, q2, rx, bc, bcq, si, siq, trim,
         trim_qual) = fields
        head_parts = head.split(" ")
        qname = head_parts[0]
        rg = head_parts[-1]

        tags1 = [('RG', str(rg)), (SAMPLE_INDEX_TAG, si),
                 (SAMPLE_INDEX_QUAL_TAG, siq)]
        tags2 = [('RG', str(rg)), (SAMPLE_INDEX_TAG, si),
                 (SAMPLE_INDEX_QUAL_TAG, siq)]

        # The trimmed prefix belongs to R1 only.
        if len(trim) > 0:
            tags1.append((TRIM_TAG, str(trim)))
            tags1.append((TRIM_QUAL_TAG, str(trim_qual)))

        num_pairs += 1

        if bc:
            tags1.append((PROCESSED_BARCODE_TAG, bc))
            tags2.append((PROCESSED_BARCODE_TAG, bc))
            correct_bc_pairs += 1
        tags1.append((RAW_BARCODE_TAG, rx))
        tags1.append((RAW_BARCODE_QUAL_TAG, bcq))
        tags2.append((RAW_BARCODE_TAG, rx))
        tags2.append((RAW_BARCODE_QUAL_TAG, bcq))

        if out_bam is not None:
            out_bam.write(unmapped_read(qname, seq1, q1, tags1, True))
            out_bam.write(unmapped_read(qname, seq2, q2, tags2, False))

        if out_fastq is not None:
            # Named fq_header so it does not shadow the BAM header dict.
            fq_header = qname
            if bc:
                bc_header = "%s:Z:%s" % (PROCESSED_BARCODE_TAG, bc)
                fq_header = fq_header + " " + bc_header

            wfq(fq_header, seq1, q1)
            wfq(fq_header, seq2, q2)

    if out_bam is not None:
        out_bam.close()

    if out_fastq is not None:
        out_fastq.close()

    # Reap the decompressor and refuse to report success on a failed
    # decompression; gzip uses exit status 2 for warnings, which stay
    # non-fatal.
    reader.close()
    gunzip_status = proc.wait()
    if gunzip_status not in (0, 2):
        martian.exit(
            "MAKE_UNALIGNED_OUPUT: gunzip failed on '%s' (exit status %d)" %
            (args.read_chunk, gunzip_status))

    outs.num_pairs = num_pairs
    outs.correct_bc_pairs = correct_bc_pairs
0
def main():
    """Compute per-region RPKM (or RPM) scores for BED regions from a BAM file.

    All inputs are taken from ``sys.argv``::

        bedfilename chrField BAMfilename chrom.sizes outputfilename [options]

    ``chrField`` is the 0-based column of the chromosome name in the BED
    file; the two following columns are read as the region's left and right
    coordinates. Each input line that parses into a valid region is written
    to ``outputfilename`` with its score appended as an extra tab-separated
    column. Comment lines (starting with ``#``) and malformed regions are
    not written.

    Options:
        -nomulti            ignore alignments whose NH tag is greater than 1.
        -RPM                output reads-per-million instead of RPKM.
        -printSum           with -RPM, append a ``#Total RPM:`` line.
        -stranded +|-       only count alignments on the given strand (the
                            normalization still uses all reads).
        -readLength a b     only consider alignments whose sequence length
                            is within [a, b].
        -uniqueBAM          treat every alignment as unique (NH tags are
                            not consulted).
        -mappabilityNormalize track readLength
                            (ignored with -RPM) append the mappable fraction
                            of each region and the score divided by it.
        -noNH samtools      if the BAM has no NH tags, rebuild it with
                            ``bamPreprocessing.py`` (expected next to this
                            script) and re-index it with ``samtools``.

    Unless -uniqueBAM is given, each alignment contributes ``1 / NH`` reads.

    Exits with status 1 when too few arguments are supplied, when NH tags
    are missing and neither -noNH nor -uniqueBAM was given, or when the
    -noNH preprocessing step fails.
    """
    # Local import: only needed for the optional -noNH conversion path.
    import subprocess

    # Five positional arguments are required, i.e. at least six argv entries;
    # checking for fewer than five let sys.argv[5] raise IndexError.
    if len(sys.argv) < 6:
        print(
            'usage: python %s bedfilename chrField BAMfilename chrom.sizes outputfilename [-nomulti] [-RPM] [-stranded +|-] [-readLength min max] [-printSum] [-uniqueBAM] [-mappabilityNormalize mappability.wig readLength] [-noNH samtools]'
            % sys.argv[0])
        print('Note: the script will divide multireads by their multiplicity')
        print('\t-printSum option only working together with the RPM option')
        print(
            '\tuse the uniqueBAM option if the BAM file contains only unique alignments; this will save a lot of memory'
        )
        print(
            '\tuse the -mappabilityNormalize option to get mappability normalized RPKMs (it will not do anything to the RPMs; not that a mappability track that goes from 0 to the read length is assumed'
        )
        print(
            '\tuse the -noNH option and supply a path to samtools in order to have the file converted to one that has NH tags'
        )
        print(
            '\tthe stranded option will normalized against all reads, not just reads on the indicated strand'
        )
        sys.exit(1)

    bed = sys.argv[1]
    fieldID = int(sys.argv[2])
    SAM = sys.argv[3]
    chromSize = sys.argv[4]
    outfilename = sys.argv[5]

    def parseRegion(line):
        """Return (chrom, left, right) for a BED line, or None to skip it."""
        fields = line.strip().split('\t')
        if len(fields) < fieldID + 2:
            return None
        chrom = fields[fieldID]
        try:
            left = int(fields[fieldID + 1])
            right = int(fields[fieldID + 2])
        except (ValueError, IndexError):
            # The message promises a skip, so actually skip the line instead
            # of falling through with stale (or undefined) coordinates.
            print('problem with region, skipping:', line.strip())
            return None
        if left >= right:
            print('problem with region, skipping:', chrom, left, right)
            return None
        return chrom, left, right

    # Whole-chromosome intervals (name, 0, length) used when counting reads.
    chromInfoList = []
    with open(chromSize) as linelist:
        for line in linelist:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or truncated line: nothing to record.
                continue
            chromInfoList.append((fields[0], 0, int(fields[1])))

    noMulti = False
    if '-nomulti' in sys.argv:
        noMulti = True
        print('will discard multi-read alignments')

    doReadLength = False
    if '-readLength' in sys.argv:
        doReadLength = True
        minRL = int(sys.argv[sys.argv.index('-readLength') + 1])
        maxRL = int(sys.argv[sys.argv.index('-readLength') + 2])
        print('will only consider reads between', minRL, 'and', maxRL,
              'bp length')
        ORLL = 0  # alignments rejected by the read length filter

    doPrintSum = False

    doStranded = False
    if '-stranded' in sys.argv:
        doStranded = True
        thestrand = sys.argv[sys.argv.index('-stranded') + 1]
        print('will only consider', thestrand, 'strand reads')

    doRPM = False
    if '-RPM' in sys.argv:
        doRPM = True
        print('will output RPMs')
        if '-printSum' in sys.argv:
            doPrintSum = True
            RPMSum = 0

    doUniqueBAM = False
    if '-uniqueBAM' in sys.argv:
        print('will treat all alignments as unique')
        doUniqueBAM = True
        TotalReads = 0

    samfile = pysam.Samfile(SAM, "rb")
    try:
        print('testing for NH tags presence')
        for alignedread in samfile.fetch():
            # Only the first alignment is probed; opt() raises if NH is absent.
            alignedread.opt('NH')
            print('file has NH tags')
            break
    except Exception:
        if '-noNH' in sys.argv:
            print(
                'no NH: tags in BAM file, will replace with a new BAM file with NH tags'
            )
            samtools = sys.argv[sys.argv.index('-noNH') + 1]
            # Resolve the helper relative to this script even when the script
            # was started without a directory component in argv[0].
            scriptDir = os.path.dirname(os.path.abspath(sys.argv[0]))
            BAMpreprocessingScript = os.path.join(scriptDir,
                                                  'bamPreprocessing.py')
            samfile.close()
            # Argument lists (no shell) keep paths with spaces intact.
            status = subprocess.call(
                ['python', BAMpreprocessingScript, SAM, SAM + '.NH'])
            if status != 0:
                # Never delete the original BAM if the conversion failed.
                print('BAM preprocessing failed, original BAM file left '
                      'untouched; exiting')
                sys.exit(1)
            os.remove(SAM)
            os.rename(SAM + '.NH', SAM)
            if subprocess.call([samtools, 'index', SAM]) != 0:
                print('warning: indexing of the new BAM file failed')
            # Reopen so later fetches read the rewritten file.
            samfile = pysam.Samfile(SAM, "rb")
        else:
            if not doUniqueBAM:
                print('no NH: tags in BAM file, exiting')
                sys.exit(1)

    doMappabilityCorrection = False
    if not doRPM and '-mappabilityNormalize' in sys.argv:
        doMappabilityCorrection = True
        print('will correct for mappability')
        mappability = sys.argv[sys.argv.index('-mappabilityNormalize') + 1]
        readLength = int(sys.argv[sys.argv.index('-mappabilityNormalize') + 2])
        # WantedDict[chrom][position] -> mappability score of that position
        # MappabilityRegionDict[chrom][(left, right)] -> mappable fraction
        WantedDict = {}
        MappabilityRegionDict = {}
        i = 0
        print('inputting regions')
        with open(bed) as lineslist:
            for line in lineslist:
                if line.startswith('#'):
                    continue
                i += 1
                if i % 1000 == 0:
                    print(i, 'regions inputted')
                region = parseRegion(line)
                if region is None:
                    continue
                chrom, left, right = region
                if chrom not in MappabilityRegionDict:
                    MappabilityRegionDict[chrom] = {}
                    WantedDict[chrom] = {}
                MappabilityRegionDict[chrom][(left, right)] = 0
                for j in range(left, right):
                    WantedDict[chrom][j] = 0
        print('inputting mappability')
        i = 0
        with open(mappability) as lineslist:
            for line in lineslist:
                if line.startswith('#'):
                    continue
                i += 1
                if i % 1000000 == 0:
                    print(str(i // 1000000) + 'M lines processed')
                fields = line.strip().split('\t')
                if len(fields) == 1:
                    # Not tab-delimited: fall back to space-delimited columns.
                    fields = line.strip().split(' ')
                chrom = fields[0]
                left = int(fields[1])
                right = int(fields[2])
                score = float(fields[3])
                if chrom not in WantedDict:
                    continue
                wanted = WantedDict[chrom]
                for j in range(left, right):
                    if j in wanted:
                        wanted[j] = score
        print('calculating mappable fractions')
        for chrom in MappabilityRegionDict:
            wanted = WantedDict[chrom]
            for (left, right) in list(MappabilityRegionDict[chrom]):
                TotalScore = sum((wanted[j] for j in range(left, right)), 0.0)
                Score = TotalScore / (right - left)
                MappabilityRegionDict[chrom][(left, right)] = Score / readLength
        WantedDict = {}  # release the per-position table

    if doUniqueBAM and not doReadLength:
        # Every alignment is one read: take the mapped counts from the index.
        TotalReads = 0
        idxstats = pysam.idxstats(SAM)
        if not isinstance(idxstats, (list, tuple)):
            # Some pysam versions return the whole report as a single string.
            idxstats = idxstats.splitlines()
        for chrStats in idxstats:
            fields = chrStats.strip().split('\t')
            if len(fields) < 3:
                continue
            if fields[0] != '*':
                TotalReads += int(fields[2])
        UniqueReads = TotalReads
    else:
        # Names (plus mate suffix) of multi-reads; each counts once in total.
        MultiReadIDs = set()
        UniqueReads = 0
        i = 0
        samfile.close()
        samfile = pysam.Samfile(SAM, "rb")
        for (chrom, start, end) in chromInfoList:
            try:
                for alignedread in samfile.fetch(chrom, start, end):
                    i += 1
                    if i % 5000000 == 0:
                        print(
                            str(i // 1000000) + 'M alignments processed',
                            chrom, start, end)
                    if doReadLength:
                        seqLength = len(alignedread.seq)
                        if seqLength > maxRL or seqLength < minRL:
                            ORLL += 1
                            continue
                    if doUniqueBAM:
                        TotalReads += 1
                        continue
                    if alignedread.opt('NH') == 1:
                        UniqueReads += 1
                        continue
                    ID = alignedread.qname
                    if alignedread.is_read1:
                        ID = ID + '/1'
                    if alignedread.is_read2:
                        ID = ID + '/2'
                    MultiReadIDs.add(ID)
            except Exception:
                print('problem with region:', chrom, start, end, 'skipping')
        if doReadLength:
            print(ORLL, 'alignments outside of read length limits')
        if not doUniqueBAM:
            TotalReads = UniqueReads + len(MultiReadIDs)

    print(TotalReads, UniqueReads)

    normalizeBy = TotalReads / 1000000.

    with open(outfilename, 'w') as outfile, open(bed) as lineslist:
        i = 0
        for line in lineslist:
            i += 1
            if i % 10000 == 0:
                print(i, 'regions processed')
            if line.startswith('#'):
                continue
            region = parseRegion(line)
            if region is None:
                continue
            chrom, left, right = region
            reads = 0
            try:
                for alignedread in samfile.fetch(chrom, left, right):
                    if doReadLength:
                        seqLength = len(alignedread.seq)
                        if seqLength > maxRL or seqLength < minRL:
                            continue
                    if doStranded:
                        s = '-' if alignedread.is_reverse else '+'
                        if s != thestrand:
                            continue
                    if doUniqueBAM:
                        reads += 1
                    else:
                        NH = alignedread.opt('NH')
                        if noMulti and NH > 1:
                            continue
                        # Multi-reads are split evenly across their hits.
                        reads += 1. / NH
            except Exception:
                print('problem with region:', chrom, left, right,
                      'assigning 0 value')
                reads = 0
            if doRPM:
                score = reads / normalizeBy
            else:
                try:
                    score = reads / (((right - left) / 1000.) * normalizeBy)
                except ZeroDivisionError:
                    # left < right is guaranteed above, so this only fires
                    # when normalizeBy is 0 (no reads were counted).
                    print('region of size 0, skipping:', line.strip())
                    continue
            if doPrintSum:
                RPMSum += score
            outline = line.strip() + '\t' + str(score)
            if doMappabilityCorrection:
                mappableFraction = MappabilityRegionDict[chrom][(left, right)]
                outline = outline + '\t' + str(mappableFraction)
                if mappableFraction == 0:
                    outline = outline + '\t0'
                else:
                    outline = outline + '\t' + str(score / mappableFraction)
            outfile.write(outline + '\n')

        if doPrintSum:
            outfile.write('#Total RPM:' + str(RPMSum) + '\n')