Example no. 1
def Get_Skipend_dict(region_fetch, bamfile, strand):
    # Collect reference coordinates where reads are gapped (CIGAR 'N' ops)
    # within the fetched region, i.e. candidate skip ends.
    bam_reader = HTSeq.BAM_Reader(bamfile)
    read_seq = bam_reader.fetch(region=region_fetch)
    # Peek at one alignment to detect paired-end mode.
    read_seq_iter = iter(bam_reader.fetch())
    one_read = next(read_seq_iter)
    skip_list = []
    pe_mode = one_read.paired_end
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    for a in read_seq:
        if not pe_mode:
            if not a.aligned:
                continue
            if a.optional_field('NH') > 1:
                continue
            if strand == "+":
                skip_list.extend([int(cigop.ref_iv.end) for cigop in a.cigar if cigop.type == "N" and cigop.size >0])
            else:
                skip_list.extend([int(cigop.ref_iv.start) for cigop in a.cigar if cigop.type == "N" and cigop.size >0])
        else:
            # `minaqual` is assumed to be defined at module level.
            if ((a[0] and a[0].aQual < minaqual) or (a[1] and a[1].aQual < minaqual)):
                continue
            if ((a[0] and a[0].optional_field('NH') > 1) or (a[1] and a[1].optional_field('NH') > 1)):
                continue
            if a[0] is not None and a[0].aligned:
                if strand == "+":
                    skip_list.extend([int(cigop.ref_iv.end) for cigop in a[0].cigar if cigop.type == "N" and cigop.size > 0])
                else:
                    skip_list.extend([int(cigop.ref_iv.start) for cigop in a[0].cigar if cigop.type == "N" and cigop.size > 0])
            if a[1] is not None and a[1].aligned:
                if strand == "+":
                    skip_list.extend([int(cigop.ref_iv.end) for cigop in a[1].cigar if cigop.type == "N" and cigop.size > 0])
                else:
                    skip_list.extend([int(cigop.ref_iv.start) for cigop in a[1].cigar if cigop.type == "N" and cigop.size > 0])
    skip_dict = dict(collections.Counter(skip_list))
    return skip_dict
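A minimal call sketch (the region string, BAM path, and `minaqual` value below are placeholders; the function also assumes `HTSeq` and `collections` are imported at module level):

minaqual = 10  # assumed module-level threshold used in the paired-end branch
skip_ends = Get_Skipend_dict("chr1:100000-200000", "sample.bam", "+")
for position, n_reads in sorted(skip_ends.items()):
    print(position, n_reads)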
Example no. 2
def compute_quality(
        readfilename,
        file_type,
        nosplit,
        readlen,
        max_qual,
        gamma,
        primary_only=False,
        max_records=-1,
        ):

    if file_type in ("sam", "bam"):
        readfile = HTSeq.BAM_Reader(readfilename)
        isAlnmntFile = True
    elif file_type == "solexa-export":
        readfile = HTSeq.SolexaExportReader(readfilename)
        isAlnmntFile = True
    elif file_type == "fastq":
        readfile = HTSeq.FastqReader(readfilename)
        isAlnmntFile = False
    elif file_type == "solexa-fastq":
        readfile = HTSeq.FastqReader(readfilename, "solexa")
        isAlnmntFile = False
    else:
        raise ValueError('File format not recognized: {:}'.format(file_type))

    twoColumns = isAlnmntFile and (not nosplit)

    if readlen is None:
        readlen = get_read_length(readfile, isAlnmntFile)

    # Initialize count arrays
    base_arr_U = np.zeros((readlen, 5), np.int64)
    qual_arr_U = np.zeros((readlen, max_qual+1), np.int64)
    if twoColumns:
        base_arr_A = np.zeros((readlen, 5), np.int64)
        qual_arr_A = np.zeros((readlen, max_qual+1), np.int64)

    # Main counting loop
    i = 0
    try:
        for a in readfile:
            if isAlnmntFile:
                r = a.read
            else:
                r = a

            # Exclude non-primary alignments if requested
            if isAlnmntFile and primary_only:
                if a.aligned and a.not_primary_alignment:
                    continue

            if twoColumns and isAlnmntFile and a.aligned:
                r.add_bases_to_count_array(base_arr_A)
                r.add_qual_to_count_array(qual_arr_A)
            else:
                r.add_bases_to_count_array(base_arr_U)
                r.add_qual_to_count_array(qual_arr_U)

            i += 1

            if i == max_records:
                break

            if (i % 200000) == 0:
                if (not isAlnmntFile) or primary_only:
                    print(i, "reads processed")
                else:
                    print(i, "alignments processed")

    except:
        sys.stderr.write("Error occured in: %s\n" %
                         readfile.get_line_number_string())
        raise

    if (not isAlnmntFile) or primary_only:
        print(i, "reads processed")
    else:
        print(i, "alignments processed")

    # Normalize result
    def norm_by_pos(arr):
        arr = np.array(arr, np.float64)
        arr_n = (arr.T / arr.sum(1)).T
        arr_n[arr == 0] = 0
        return arr_n

    def norm_by_start(arr):
        arr = np.array(arr, np.float64)
        arr_n = (arr.T / arr.sum(1)[0]).T
        arr_n[arr == 0] = 0
        return arr_n

    result = {
        'isAlnmntFile': isAlnmntFile,
        'readlen': readlen,
        'twoColumns': twoColumns,
        'base_arr_U_n': norm_by_pos(base_arr_U),
        'qual_arr_U_n': norm_by_start(qual_arr_U),
        'nreads_U': base_arr_U[0, :].sum(),
        }

    if twoColumns:
        result['base_arr_A_n'] = norm_by_pos(base_arr_A)
        result['qual_arr_A_n'] = norm_by_start(qual_arr_A)
        result['nreads_A'] = base_arr_A[0, :].sum()

    return result
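A hedged usage sketch: `get_read_length` is defined elsewhere in the module, so a fixed read length is passed here; the file name is a placeholder.

stats = compute_quality("reads.fastq", "fastq", nosplit=True,
                        readlen=100, max_qual=41, gamma=0.3)
print(stats['nreads_U'], "reads of length", stats['readlen'])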
Example no. 3
def count_reads(start_codon_sites, stop_codon_sites, ORF_features, counts,
                map_file, stranded, min_quality, count_mode,
                first_exclude_codons, last_exclude_codons, min_read, max_read,
                exclude_min_ORF):

    lowqual = 0
    notaligned = 0
    nonunique = 0
    too_short = 0
    too_long = 0
    min_read_string = "__too_short(<%i)" % min_read
    max_read_string = "__too_long(<%i)" % max_read
    first_exclude_nt = first_exclude_codons * 3
    last_exclude_nt = last_exclude_codons * 3
    if count_mode not in ("intersection-strict", "union"):
        # Fail early: an unknown mode would leave `fs` undefined below.
        raise ValueError("Illegal count mode: %s" % count_mode)

    pysam_fh = pysam.AlignmentFile(map_file)
    is_bam = pysam_fh.is_bam
    pysam_fh.close()
    if is_bam:
        tracks = HTSeq.BAM_Reader(map_file)
    else:
        tracks = HTSeq.SAM_Reader(map_file)
    # for i,r in enumerate(tracks):
    for r in tracks:
        # if i % 100000 == 0:
        # 	sys.stderr.write("%d alignment record processed.\r" % i)
        if not r.aligned:
            notaligned += 1
            continue
        try:
            if r.optional_field("NH") > 1:
                nonunique += 1
                continue
        except KeyError:
            pass
        if r.aQual < min_quality:
            lowqual += 1
            continue
        read_len = len(r.read.seq)
        if read_len < min_read:
            too_short += 1
            continue
        if read_len > max_read:
            too_long += 1
            continue
        if stranded != "reverse":
            iv_seq = (co.ref_iv for co in r.cigar
                      if co.type == "M" and co.size > 0)
        else:
            iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                      if co.type == "M" and co.size > 0)

        try:
            if count_mode == "intersection-strict":
                fs = None
                for iv in iv_seq:
                    for iv2, fs2 in ORF_features[iv].steps():
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            elif count_mode == "union":
                fs = set()
                for iv in iv_seq:
                    for iv2, fs2 in ORF_features[iv].steps():
                        fs = fs.union(fs2)
            if fs is None or len(fs) == 0:
                continue
            elif len(fs) > 1:
                continue
            else:
                orf_id = list(fs)[0]
                if read_len < exclude_min_ORF:
                    counts[orf_id] += 1
                    continue
                try:
                    if abs(start_codon_sites[orf_id] -
                           r.iv.start_d) < first_exclude_nt:
                        continue
                    elif abs(r.iv.end_d -
                             stop_codon_sites[orf_id]) < last_exclude_nt:
                        continue
                    else:
                        counts[orf_id] += 1
                except KeyError:
                    # ORF without an annotated start/stop codon site.
                    counts[orf_id] += 1
        except:
            sys.stderr.write(
                "Error occurred when processing mapping file in line:%s\n" %
                r.get_sam_line())
    counts["__too_low_quality"] += lowqual
    counts["__not_aligned"] += notaligned
    counts[min_read_string] += too_short
    counts[max_read_string] += too_long
    counts["__alignment_not_unique"] += nonunique

    return counts
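A sketch of the inputs this function expects, inferred from how it indexes them (the names, paths, and numeric thresholds are placeholders):

import collections
import HTSeq

ORF_features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
counts = collections.defaultdict(int)
start_codon_sites, stop_codon_sites = {}, {}
# ... populate the above from a GTF, then:
counts = count_reads(start_codon_sites, stop_codon_sites, ORF_features, counts,
                     "sample.bam", stranded="yes", min_quality=10,
                     count_mode="union", first_exclude_codons=15,
                     last_exclude_codons=5, min_read=25, max_read=35,
                     exclude_min_ORF=60)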
Example no. 4
def Get_IPAevent(input_tuple):
    # `annot`, `np`, and the Get_* helper functions are module-level globals.
    label, all_bamfiles = input_tuple
    curr_label_all_gas = []
    curr_label_all_ga = []
    curr_label_all_gene_count = []
    IPA_result = []
    min_count = 30
    for bamfile in all_bamfiles:
        bam_reader = HTSeq.BAM_Reader(bamfile)
        gas,ga,gene_count = Get_label_information(label,annot,bam_reader)
        curr_label_all_gas.append(gas)
        curr_label_all_ga.append(ga)
        curr_label_all_gene_count.append(gene_count)
    for feature,rank,chrom,start,end,strand,length,exon_rank_left,exon_rank_right in annot[label]:
        if feature == "intron" and int(length)>250:
            intron_start = start
            intron_end = end
            end_value = 15
            index_list = [index for index, gene_count in enumerate(curr_label_all_gene_count) if gene_count[('intron', rank)] > min_count]
            if index_list:
                iv = HTSeq.GenomicInterval(chrom,intron_start,intron_end,strand)
                IPAtype = "Composite"
                curr_label_all_cov = []
                for index in index_list:
                    if strand == "-":
                        curr_label_all_cov.append(list(curr_label_all_ga[index][iv])[::-1])
                    else:
                        curr_label_all_cov.append(list(curr_label_all_ga[index][iv]))
                intron_region = chrom+":"+str(intron_start)+"-"+str(intron_end)
                skipend_dict_list = [Get_Skipend_dict(intron_region,bamfile,strand) for bamfile in all_bamfiles]
                for index,skipend_dict in enumerate(skipend_dict_list):
                    for key,value in skipend_dict.items():
                        if int(start)+50 < int(key) < int(end)-50 and int(value) > 10:
                            if strand == "+":
                                skip_position = int(key)-int(start)
                            else:
                                skip_position = int(end)-int(key)
                            curr_label_all_cov = [cvg_region[skip_position:] for cvg_region in curr_label_all_cov]
                            IPAtype = "Skipped"
                            start = int(key)
                            end = int(key)
                            end_value = int(value)
                            break
                    else:
                        # No qualifying skip end in this sample; try the next.
                        continue
                    break  # a skipped-type event was found; stop scanning
                min_mseratio_list,min_mse_point_list = Get_min_mseratio_list(curr_label_all_cov)
                min_mseratio = min(min_mseratio_list)
                min_mseratio_index = min_mseratio_list.index(min_mseratio)
                if min_mseratio < 0.5:
                    min_mseratio_list_refine,min_mse_point_list_refine = Get_min_mseratio_list_refine(curr_label_all_cov,min_mse_point_list[min_mseratio_index])
                    min_mseratio_refine = min(min_mseratio_list_refine)
                    min_mseratio_index_refine = min_mseratio_list_refine.index(min_mseratio_refine)
                    IPA_point = int(min_mse_point_list_refine[min_mseratio_index_refine])
                    up_down_diff = max([np.mean(coverage[:IPA_point])-np.mean(coverage[IPA_point:]) for coverage in curr_label_all_cov])
                    upstream_cov = max([len(list(filter(lambda x:x>5,coverage[:IPA_point])))/IPA_point for coverage in curr_label_all_cov])
                    downstream_cov = np.mean([len(list(filter(lambda x:x>5,coverage[IPA_point:])))/(len(coverage)-IPA_point) for coverage in curr_label_all_cov])
                    if min_mseratio_refine < 0.5 and up_down_diff > 1 and upstream_cov > 0.8 and downstream_cov < 0.5:
                        if strand == "+":
                            IPA_location = int(start)+IPA_point
                            IPA_inf = chrom+":"+str(start)+"-"+str(IPA_location)
                        else:
                            IPA_location = int(end)-IPA_point
                            IPA_inf = chrom+":"+str(IPA_location)+"-"+str(end)
                        skipstart_dict = Get_Skipstart_dict(intron_region,all_bamfiles,strand)
                        for key,value in skipstart_dict.items():
                            if IPA_location-20<int(key)<IPA_location+20 and int(value) > end_value*0.8:
                                break
                        else:
                            intronPA_inf = label + ";"+feature + "_" + str(rank) + ";" + IPA_inf + ";" +  IPAtype
                            IPA_information = Get_IPAsite_IPUI((intronPA_inf,curr_label_all_ga,gas))
                            IPA_result.append(IPA_information)
    return IPA_result
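A hypothetical invocation; `annot` and the `Get_*` helpers are module-level globals in the original script, and the label and BAM paths are placeholders:

ipa_events = Get_IPAevent(("GeneX", ["cond_rep1.bam", "cond_rep2.bam"]))
for record in ipa_events:
    print(record)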
if args.stranded == 'yes':
    feature_array = hts.GenomicArrayOfSets("auto", stranded=True)
elif args.stranded == 'no':
    feature_array = hts.GenomicArrayOfSets("auto", stranded=False)

for feature in gtf:
    if feature.type == args.type:
        feature_array[feature.iv] += feature.name

print "done.\n\n"

# create Reader class for samfile:
if args.format == 'sam':
    alnmt_file = hts.SAM_Reader(args.alignment_file[0])
else:
    alnmt_file = hts.BAM_Reader(args.alignment_file[0])

# count reads:
print "Counting reads..."

if args.read_type == 'single_end':
    counts = ungapped_se_counter(alnmt_file, feature_array)

    print "\nSample output for ungapped SE counts:"
    countlist = sorted(counts.items())
    for g, c in countlist[-10:]:
        print "%-10s %d" % (g, c)
else:
    counts = ungapped_pe_counter(alnmt_file, feature_array)

    print "\nSample output for ungapped PE counts:"
Example no. 6
def test_bam_inconsistent_mate():
    print('Test inconsistent BAM file')
    bamfile = HTSeq.BAM_Reader("example_data/inconsistent_mate.bam")
    for read in bamfile:
        pass
    print("Test passed")
Example no. 7
                                         attribute_label)

    #TE_gtf="/home/daniel/local_data/hg19/annotation/hg19_rmsk_TE.gtf"
    #TE_gtf="/home/daniel/local_data/hg19/annotation/hg19_rmsk_TE_exon_filtered.gtf"
    #TE_gtf="/home/daniel/local_data/hg19/annotation/hg19_rmsk_TE_fort2014_CAGEseq.gtf"
    attribute_label = 'transcript_id'
    feature_type = 'exon'
    TE_features = extract_GTF_features(args.TE_gtf, feature_type,
                                       attribute_label)

    TE_first_counts = collections.Counter()
    TE_second_counts = collections.Counter()
    TE_only_counts = collections.Counter()

    #bam_file="/home/daniel/local_data/hipsci/star/test_bam_chr19_sorted.bam"
    almnt_file = HTSeq.BAM_Reader(args.bam_file)

    nUnmapped = 0
    nMultipleAlignments = 0
    nAttributeErrors = 0
    for bundle in HTSeq.pair_SAM_alignments(almnt_file, bundle=True):

        if len(bundle) != 1:
            nMultipleAlignments += 1
            continue  # Skip multiple alignments

        first_almnt, second_almnt = bundle[0]  # extract pair
        if not (first_almnt and second_almnt):
            nUnmapped += 1
            continue
        if (first_almnt.iv is None) or (second_almnt.iv is None):
Example no. 8
def run(BED, BAMS1, BAMS2, mil_reads):
    sortedbamfile1rep1 = HTSeq.BAM_Reader(BAMS1[0])
    sortedbamfile1rep2 = HTSeq.BAM_Reader(BAMS1[1])
    sortedbamfile2rep1 = HTSeq.BAM_Reader(BAMS2[0])
    sortedbamfile2rep2 = HTSeq.BAM_Reader(BAMS2[1])

    bedfile = list()
    with open(BED) as F:
        for line in F:
            line = line.strip('\n').split('\t')
            chrom, start, stop = line[:3]
            start = int(start)
            stop = int(stop)
            if len(chrom) <= 5:
                #i added a window because it looked like we were missing peaks without it
                if start < 1000:
                    bedfile.append(
                        HTSeq.GenomicInterval(chrom, 0, stop + 1000, '.'))
                #normalizing for length below is not totally accurate because of this, but it's probably okay
                else:
                    bedfile.append(
                        HTSeq.GenomicInterval(chrom, start - 1000, stop + 1000,
                                              '.'))

    counts1rep1 = list()
    for region in bedfile:
        counts1rep1.append(0.0)
        length = region.length + 2000
        for almnt in sortedbamfile1rep1[region]:
            counts1rep1[-1] += 1.0
        counts1rep1[-1] /= (length / 1000.0)
        counts1rep1[-1] /= mil_reads[0][0]

    counts1rep2 = list()
    for region in bedfile:
        counts1rep2.append(0.0)
        length = region.length + 2000
        for almnt in sortedbamfile1rep2[region]:
            counts1rep2[-1] += 1.0
        counts1rep2[-1] /= (length / 1000.0)
        counts1rep2[-1] /= mil_reads[0][1]

    counts2rep1 = list()
    for region in bedfile:
        counts2rep1.append(0.0)
        length = region.length + 2000
        for almnt in sortedbamfile2rep1[region]:
            counts2rep1[-1] += 1.0
        counts2rep1[-1] /= (length / 1000.0)
        counts2rep1[-1] /= mil_reads[1][0]

    counts2rep2 = list()
    for region in bedfile:
        counts2rep2.append(0.0)
        length = region.length + 2000
        for almnt in sortedbamfile2rep2[region]:
            counts2rep2[-1] += 1.0
        counts2rep2[-1] /= (length / 1000.0)
        counts2rep2[-1] /= mil_reads[1][1]

    counts1avg = [(x + y) / 2.0 for x, y in zip(counts1rep1, counts1rep2)]
    counts2avg = [(x + y) / 2.0 for x, y in zip(counts2rep1, counts2rep2)]

    #log10 but excludes 0 (removes both entries)
    # Make real copies: aliasing the lists here would silently mutate
    # counts1avg/counts2avg, which are reused for the fold change below.
    counts1avgclean = list(counts1avg)
    counts2avgclean = list(counts2avg)
    # If either average is zero for a region, zero it in the other list too,
    # then drop all zeros so the cleaned lists stay paired.
    for i in range(len(counts1avg)):
        if counts1avg[i] == 0.0:
            counts2avgclean[i] = 0.0
        elif counts2avg[i] == 0.0:
            counts1avgclean[i] = 0.0
    counts1avgclean = [x for x in counts1avgclean if x != 0.0]
    counts2avgclean = [x for x in counts2avgclean if x != 0.0]

    counts1avglog = [math.log10(x) for x in counts1avgclean]
    counts2avglog = [math.log10(x) for x in counts2avgclean]

    #2-sample KS test
    KStest = stats.ks_2samp(counts1avglog, counts2avglog)

    #fold change
    foldchange = []
    for i in range(len(counts2avg)):
        try:
            foldchange.append(counts2avg[i] / counts1avg[i])
        except ZeroDivisionError:
            pass

    foldchangelog = []
    for x in foldchange:
        try:
            foldchangelog.append(math.log10(x))
        except ValueError:  # log10(0)
            pass

    return [counts1avglog, counts2avglog, KStest, foldchangelog]
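A hypothetical invocation: two conditions with two replicate BAMs each; `mil_reads` is read here as per-sample library sizes (in millions) used as the final normalization divisor, which is an assumption:

results = run("peaks.bed",
              ["cond1_rep1.bam", "cond1_rep2.bam"],
              ["cond2_rep1.bam", "cond2_rep2.bam"],
              mil_reads=[[25.1, 24.3], [26.7, 23.9]])
counts1avglog, counts2avglog, KStest, foldchangelog = results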
Example no. 9
import sys
import collections

import HTSeq

gtffile = sys.argv[1]
listOfBams = sys.argv[2:]

#print(listOfBams)

counts = collections.Counter()

exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
gtf = HTSeq.GFF_Reader(gtffile, end_included=True)

for feature in gtf:
    if feature.type == "exon":
        exons[feature.iv] += feature.name

for bamfile in listOfBams:
    bamObj = HTSeq.BAM_Reader(bamfile)
    for alignment in bamObj:
        if alignment.aligned:
            iset = None
            for interval2, step_set in exons[alignment.iv].steps():
                if iset is None:
                    iset = step_set.copy()
                else:
                    iset.intersection_update(step_set)
            # Count only after intersecting across all overlapped steps,
            # so each read is attributed to exactly one feature.
            if iset is not None and len(iset) == 1:
                counts[list(iset)[0]] += 1

for name in sorted(counts.keys()):
    print(name, counts[name])
Example no. 10
def mapping_reads2shared_exons_introns(refGene_txt, bam_filename, minaqual,
                                       stranded, order, max_buffer_size):
    # initialise counters
    counts = {}
    counts['_empty'] = 0
    counts['_ambiguous'] = 0
    counts['_lowaqual'] = 0
    counts['_notaligned'] = 0
    counts['_ambiguous_readpair_position'] = 0

    # Read BAM file
    bam_reader = HTSeq.BAM_Reader(bam_filename)
    # CIGAR match characters (alignment match, sequence match, sequence mismatch)
    cigar_char = ('M', '=', 'X')
    # Strand handling follows htseq-count.
    stranded_boolean = stranded == 'yes' or stranded == 'reverse'
    reverse_boolean = stranded == 'reverse'

    def invert_strand(iv):
        iv2 = iv.copy()
        if iv2.strand == "+":
            iv2.strand = "-"
        elif iv2.strand == "-":
            iv2.strand = "+"
        else:
            raise ValueError("Illegal strand")
        return iv2

    sys.stdout.write(
        "Gene\tfeature\trank\tposition\tlength\tread_counts\tread_counts_norm\tcoverage(%)\n"
    )

    annot = collections.OrderedDict()
    for line in open(refGene_txt):
        gene_label, feature, rank, position, length = line.strip().split('\t')
        chrom, iv_str, strand = position.strip().split(':')
        start, end = map(int, iv_str.strip().split('-'))
        annot.setdefault(gene_label, []).append(
            (feature, int(rank), chrom, start, end, strand, int(length)))

    for gene_name in annot:
        gene_count = {}
        gas = HTSeq.GenomicArrayOfSets("auto", stranded=stranded_boolean)
        ga = HTSeq.GenomicArray("auto",
                                stranded=stranded_boolean,
                                typecode="i")
        cvg_list = []

        # Annotation
        for feature, rank, chrom, start, end, strand, length in annot[
                gene_name]:
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            gas[iv] += (feature, rank)
            gene_count[(feature, rank)] = 0

        # Taking an iterator over bam_reader directly is problematic (the
        # author attributes this to a pysam bug); the fix is to use fetch().
        boundary_left = min(i[3] for i in annot[gene_name])
        boundary_right = max(i[4] for i in annot[gene_name])
        region_fetch = '%s:%d-%d' % (annot[gene_name][0][2],
                                     boundary_left - 500, boundary_right + 500)
        read_seq = bam_reader.fetch(region=region_fetch)

        # distinguish SE and PE mode:
        read_seq_iter = iter(bam_reader.fetch())
        one_read = next(read_seq_iter)
        pe_mode = one_read.paired_end

        if pe_mode:
            if order == 'name':
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == 'pos':
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    read_seq, max_buffer_size=max_buffer_size)
            else:
                raise ValueError("Illegal order name.")

        # Mapping
        for a in read_seq:
            if not pe_mode:
                if not a.aligned:
                    counts['_notaligned'] += 1
                    continue
                if a.optional_field('NH') > 1:
                    continue
                if a.aQual < minaqual:
                    counts['_lowaqual'] += 1
                    continue
                if not reverse_boolean:
                    # Use the same match-type filter as the reverse branch.
                    iv_seq = (cigop.ref_iv for cigop in a.cigar
                              if cigop.type in cigar_char and cigop.size > 0)
                else:
                    iv_seq = (invert_strand(cigop.ref_iv) for cigop in a.cigar
                              if cigop.type in cigar_char and cigop.size > 0)
            # pe mode
            else:
                if ((a[0] and a[0].aQual < minaqual)
                        or (a[1] and a[1].aQual < minaqual)):
                    counts['_lowaqual'] += 1
                    continue
                if ((a[0] and a[0].optional_field('NH') > 1)
                        or (a[1] and a[1].optional_field('NH') > 1)):
                    continue
                if a[0] is not None and a[0].aligned:
                    if not reverse_boolean:
                        iv_seq = (
                            cigop.ref_iv for cigop in a[0].cigar
                            if cigop.type in cigar_char and cigop.size > 0)
                    else:
                        iv_seq = (
                            invert_strand(cigop.ref_iv) for cigop in a[0].cigar
                            if cigop.type in cigar_char and cigop.size > 0)
                else:
                    iv_seq = tuple()
                if a[1] is not None and a[1].aligned:
                    if not reverse_boolean:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (invert_strand(cigop.ref_iv)
                             for cigop in a[1].cigar
                             if cigop.type in cigar_char and cigop.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq,
                            (cigop.ref_iv for cigop in a[1].cigar
                             if cigop.type in cigar_char and cigop.size > 0))

            feature_aligned = set()
            for iv in iv_seq:
                for iv2, val2 in gas[iv].steps():
                    feature_aligned |= val2
                    ga[iv] += 1  # for calculating coverage
            if len(feature_aligned) == 0:
                counts['_empty'] += 1
                continue
            # when mapping to intron, discard exons
            for f in [item for item in feature_aligned if item[0] == 'intron']:
                gene_count[f] += 1
            # when no mapping to intron, count all exons
            if 'intron' not in [x for x, y in feature_aligned]:
                for f in feature_aligned:
                    gene_count[f] += 1

        res = []
        for feature, rank, chrom, start, end, strand, length in annot[
                gene_name]:
            feature_count = gene_count[(feature, rank)]
            feature_count_norm = feature_count / length * 1000
            # Coverage calculation
            iv = HTSeq.GenomicInterval(chrom, start, end, strand)
            cvg_region = list(ga[iv])
            cvg = len([x for x in cvg_region if x > 0]) / len(cvg_region) * 100
            res.append([
                feature, rank, chrom, start, end, strand, length,
                feature_count, feature_count_norm, cvg
            ])

        # Output
        for feature, rank, chrom, start, end, strand, length, feature_count, feature_count_norm, cvg in res:
            pos = "%s:%d-%d:%s" % (chrom, start, end, strand)
            sys.stdout.write('\t'.join(
                map(str, [
                    gene_name, feature, rank, pos, length, feature_count,
                    feature_count_norm, cvg
                ])) + '\n')

    for fn in counts.keys():
        sys.stderr.write('%s\t%d\n' % (fn, counts[fn]))
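A hypothetical call; `refGene_shared.txt` stands in for the tab-separated gene/feature/rank/position/length annotation parsed above, and the numeric arguments are placeholders:

mapping_reads2shared_exons_introns("refGene_shared.txt", "sample.bam",
                                   minaqual=10, stranded="yes",
                                   order="pos", max_buffer_size=30000000)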
# read in gtf and create a Genomic Array of Sets for all exons we find
viral_gtf = HTSeq.GFF_Reader(args.viral_gtf_path)
exons = HTSeq.GenomicArrayOfSets('auto', stranded=True)

# get all contigs from the gtf file (GFF_Reader re-opens the file, so it can
# be iterated more than once)
viral_gtf_contigs = {f.iv.chrom for f in viral_gtf}

for feature in viral_gtf:
    if feature.type == 'exon':
        exons[feature.iv] += feature.attr['gene_id']  # add gene id to this feature's coordinates in the exons array

# get alignments by umi by cell barcode
alignments_by_umi_by_cell_barcode = dict()
umi_to_ignore_by_cell_barcode = dict()  # cell-umi barcodes that map to non-viral contigs
for almnt in HTSeq.BAM_Reader(args.bam_path):

    assert isinstance(almnt, HTSeq.SAM_Alignment)

    # ignore secondary alignments and unmapped reads
    if not almnt.aligned or almnt.not_primary_alignment:
        continue

    # ignore alignments with invalid cell barcode or umi
    tags_present = {kv[0] for kv in almnt.optional_fields}
    if 'CB' not in tags_present or 'UB' not in tags_present:
        continue

    cell_barcode = almnt.optional_field('CB').split('-')[0]  # get cell barcode

    # ignore cells not mapped to a predicted real cell
def readChrwithBam():
    # Count junction-supporting reads on each side of every splice junction.
    # `opt`, `sjnum`, and the output handle `w` are module-level globals.
    reads_dict = {}

    totalsjfile = opt.totalsj
    bamfile = opt.bam
    bam = HTSeq.BAM_Reader(bamfile)

    for eachLine in open(totalsjfile):
        line = eachLine.strip().split("\t")
        # chr7    34247275    34664347    +
        # if line[0] != chr:
        #     continue
        if line[0] == "chrM":
            continue
        if not line[0].startswith("chr"):
            continue

        reads_left = 0
        reads_right = 0

        # if int(line[4])<opt.sjreads:
        #     continue

        s = int(line[1])
        e = int(line[2])

        iv1 = HTSeq.GenomicInterval(line[0], s, s + opt.span, line[3])
        iv2 = HTSeq.GenomicInterval(line[0], e - opt.span, e, line[3])

        name = line[0] + "\t" + line[1] + "\t" + line[2]

        # chr = name.split("\t")[0]

        iv = iv1
        usedreads = {}
        for r in bam[iv]:
            flag = 0
            for co in r.cigar:
                if co.type == "N":
                    flag = 1
                    break
            if flag == 1:
                continue
            # if r.iv.strand != iv.strand:
            #     continue
            if ((r.iv.strand != iv.strand and (not r.paired_end))
                    or (r.paired_end and r.iv.strand != iv.strand
                        and r.pe_which == "first")
                    or (r.paired_end and r.iv.strand == iv.strand
                        and r.pe_which == "second")):
                continue

            if r.iv.start < iv.start and r.iv.end >= iv.end:
                r_name = r.read.name
                if r_name in usedreads:
                    continue
                else:
                    usedreads[r.read.name] = ""
                    reads_left += 1
        # print(reads_left)

        iv = iv2
        usedreads = {}
        for r in bam[iv]:
            flag = 0
            for co in r.cigar:
                if co.type == "N":
                    flag = 1
                    break
            if flag == 1:
                continue
            # if r.iv.strand != iv.strand:
            #     continue
            if ((r.iv.strand != iv.strand and (not r.paired_end))
                    or (r.paired_end and r.iv.strand != iv.strand
                        and r.pe_which == "first")
                    or (r.paired_end and r.iv.strand == iv.strand
                        and r.pe_which == "second")):
                continue
            if r.iv.start <= iv.start and r.iv.end > iv.end:
                r_name = r.read.name
                if r_name in usedreads:
                    continue
                else:
                    usedreads[r.read.name] = ""
                    reads_right += 1
        # print(reads_right)

        if name not in sjnum:
            sjnum[name] = "0"
        # print(d[c]["left"])
        w.writelines(eachLine.strip() + "\t" + sjnum[name] + "\t")
        if line[3] == "+":
            w.writelines(str(reads_left) + "\t" + str(reads_right) + "\n")
        else:
            w.writelines(str(reads_right) + "\t" + str(reads_left) + "\n")
Example no. 13
File: qa.py Project: wkopp/htseq
def main():

    try:
        import matplotlib
    except ImportError:
        sys.stderr.write("This script needs the 'matplotlib' library, which ")
        sys.stderr.write("was not found. Please install it.\n")
        sys.exit(1)
    matplotlib.use('PDF')
    from matplotlib import pyplot

    # Matplotlib <1.5 used lowercase 'normalize'; keep a fallback for old versions.
    try:
        from matplotlib.pyplot import Normalize
    except ImportError:
        from matplotlib.pyplot import normalize as Normalize


    # **** Parse command line ****

    optParser = optparse.OptionParser( usage = "%prog [options] read_file",
        description=
        "This script take a file with high-throughput sequencing reads " +
        "(supported formats: SAM, Solexa _export.txt, FASTQ, Solexa " +
        "_sequence.txt) and performs a simply quality assessment by " +
        "producing plots showing the distribution of called bases and " +
        "base-call quality scores by position within the reads. The " +
        "plots are output as a PDF file.",
        epilog =
        "Written by Simon Anders ([email protected]), European Molecular Biology " +
        " Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
        " Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__ )
    optParser.add_option( "-t", "--type", type="choice", dest="type",
        choices = ("sam", "bam", "solexa-export", "fastq", "solexa-fastq"),
        default = "sam", help="type of read_file (one of: sam [default], bam, " +
        "solexa-export, fastq, solexa-fastq)" )
    optParser.add_option( "-o", "--outfile", type="string", dest="outfile",
        help="output filename (default is <read_file>.pdf)" )
    optParser.add_option( "-r", "--readlength", type="int", dest="readlen",
        help="the maximum read length (when not specified, the script guesses from the file" )
    optParser.add_option( "-g", "--gamma", type="float", dest="gamma",
        default = 0.3,
        help="the gamma factor for the contrast adjustment of the quality score plot" )
    optParser.add_option( "-n", "--nosplit", action="store_true", dest="nosplit",
        help="do not split reads in unaligned and aligned ones" )
    optParser.add_option( "-m", "--maxqual", type="int", dest="maxqual", default=41,
        help="the maximum quality score that appears in the data (default: 41)" )

    if len( sys.argv ) == 1:
        optParser.print_help()
        sys.exit(1)

    (opts, args) = optParser.parse_args()

    if len( args ) != 1:
        sys.stderr.write( sys.argv[0] + ": Error: Please provide one argument (the read_file).\n" )
        sys.stderr.write( "  Call with '-h' to get usage information.\n" )
        sys.exit( 1 )

    readfilename = args[0]

    if opts.type == "sam":
        readfile = HTSeq.SAM_Reader( readfilename )
        isAlnmntFile = True
    elif opts.type == "bam":
        readfile = HTSeq.BAM_Reader( readfilename )
        isAlnmntFile = True
    elif opts.type == "solexa-export":
        readfile = HTSeq.SolexaExportReader( readfilename )
        isAlnmntFile = True
    elif opts.type == "fastq":
        readfile = HTSeq.FastqReader( readfilename )
        isAlnmntFile = False
    elif opts.type == "solexa-fastq":
        readfile = HTSeq.FastqReader( readfilename, "solexa" )
        isAlnmntFile = False
    else:
        sys.error( "Oops." )

    twoColumns = isAlnmntFile and not opts.nosplit

    if opts.outfile is None:
        outfilename = os.path.basename( readfilename ) + ".pdf"
    else:
        outfilename = opts.outfile


    # **** Get read length ****

    if opts.readlen is not None:
        readlen = opts.readlen
    else:
        readlen = 0
        if isAlnmntFile:
            reads = ( a.read for a in readfile )
        else:
            reads = readfile
        for r in islice( reads, 10000 ):
            if len( r ) > readlen:
                readlen = len( r )

    max_qual = opts.maxqual
    gamma = opts.gamma


    # **** Initialize count arrays ****

    base_arr_U = numpy.zeros( ( readlen, 5 ), numpy.int64 )
    qual_arr_U = numpy.zeros( ( readlen, max_qual+1 ), numpy.int64 )
    if twoColumns:
        base_arr_A = numpy.zeros( ( readlen, 5 ), numpy.int64 )
        qual_arr_A = numpy.zeros( ( readlen, max_qual+1 ), numpy.int64 )


    # **** Main counting loop ****

    i = 0
    try:
        for a in readfile:
            if isAlnmntFile:
                r = a.read
            else:
                r = a
            if twoColumns and (isAlnmntFile and a.aligned):
                r.add_bases_to_count_array( base_arr_A )
                r.add_qual_to_count_array( qual_arr_A )
            else:
                r.add_bases_to_count_array( base_arr_U )
                r.add_qual_to_count_array( qual_arr_U )
            i += 1
            if (i % 200000) == 0:
                print(i, "reads processed")
    except:
        sys.stderr.write( "Error occurred in: %s\n" %
            readfile.get_line_number_string() )
        raise
    print(i, "reads processed")


    # **** Normalize result ****

    def norm_by_pos( arr ):
        arr = numpy.array( arr, numpy.float64 )
        arr_n = ( arr.T / arr.sum( 1 ) ).T
        arr_n[ arr == 0 ] = 0
        return arr_n

    def norm_by_start( arr ):
        arr = numpy.array( arr, numpy.float64 )
        arr_n = ( arr.T / arr.sum( 1 )[ 0 ] ).T
        arr_n[ arr == 0 ] = 0
        return arr_n


    base_arr_U_n = norm_by_pos( base_arr_U )
    qual_arr_U_n = norm_by_start( qual_arr_U )
    nreads_U = base_arr_U[0,:].sum()
    if twoColumns:
        base_arr_A_n = norm_by_pos( base_arr_A )
        qual_arr_A_n = norm_by_start( qual_arr_A )
        nreads_A = base_arr_A[0,:].sum()


    # **** Make plot ****

    def plot_bases( arr ):
        xg = numpy.arange( readlen )
        pyplot.plot( xg, arr[ : , 0 ], marker='.', color='red')
        pyplot.plot( xg, arr[ : , 1 ], marker='.', color='darkgreen')
        pyplot.plot( xg, arr[ : , 2 ], marker='.',color='lightgreen')
        pyplot.plot( xg, arr[ : , 3 ], marker='.',color='orange')
        pyplot.plot( xg, arr[ : , 4 ], marker='.',color='grey')
        pyplot.axis( (0, readlen-1, 0, 1 ) )
        pyplot.text( readlen*.70, .9, "A", color="red" )
        pyplot.text( readlen*.75, .9, "C", color="darkgreen" )
        pyplot.text( readlen*.80, .9, "G", color="lightgreen" )
        pyplot.text( readlen*.85, .9, "T", color="orange" )
        pyplot.text( readlen*.90, .9, "N", color="grey" )

    pyplot.figure()
    pyplot.subplots_adjust( top=.85 )
    pyplot.suptitle( os.path.basename(readfilename), fontweight='bold' )

    if twoColumns:

        pyplot.subplot( 221 )
        plot_bases( base_arr_U_n )
        pyplot.ylabel( "proportion of base" )
        pyplot.title( "non-aligned reads\n%.0f%% (%.3f million)" %
        ( 100. * nreads_U / (nreads_U+nreads_A), nreads_U / 1e6 ) )

        pyplot.subplot( 222 )
        plot_bases( base_arr_A_n )
        pyplot.title( "aligned reads\n%.0f%% (%.3f million)" %
        ( 100. * nreads_A / (nreads_U+nreads_A), nreads_A / 1e6 ) )

        pyplot.subplot( 223 )
        pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
        norm=Normalize( 0, 1 ) )
        pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
        pyplot.xlabel( "position in read" )
        pyplot.ylabel( "base-call quality score" )

        pyplot.subplot( 224 )
        pyplot.pcolor( qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens,
        norm=Normalize( 0, 1 ) )
        pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
        pyplot.xlabel( "position in read" )

    else:

        pyplot.subplot( 211 )
        plot_bases( base_arr_U_n )
        pyplot.ylabel( "proportion of base" )
        pyplot.title( "%.3f million reads" % ( nreads_U / 1e6 ) )

        pyplot.subplot( 212 )
        pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
        norm=Normalize( 0, 1 ) )
        pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
        pyplot.xlabel( "position in read" )
        pyplot.ylabel( "base-call quality score" )


    pyplot.savefig( outfilename )
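Presumably the script closes with the usual entry point (not shown in this excerpt):

if __name__ == "__main__":
    main()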
Example no. 14
         0x4 : "segment unmapped",
         0x8 : "next segment in the template unmapped",
         0x10 : "SEQ being reverse complemented",
         0x20 : "SEQ of the next segment in the template being reversed",
         0x40 : "the first segment in the template",
         0x80 : "the last segment in the template",
         0x100 : "secondary alignment",
         0x200 : "not passing quality controls",
         0x400 : "PCR or optical duplicate",
         0x800 : "supplementary alignment"}


mate_mapped_same_chr, mate_mapped_dif_chr, mate_mapped_dif_chr_a5 = 0,0,0
unmapped, paired, read1, read2, properly, duplicate, total = 0, 0, 0, 0, 0, 0, 0

bamfile = HTSeq.BAM_Reader(bam)
for almnt in bamfile:
	if almnt.aligned:
		if almnt.flag & 0x900 == 0:
			total += 1
			if almnt.flag & 0x400 != 0:
				duplicate += 1
			if almnt.mate_aligned:
				paired +=1
				if almnt.proper_pair:
					properly += 1
				if almnt.iv.chrom == almnt.mate_start.chrom:
					mate_mapped_same_chr += 1
				else:
					mate_mapped_dif_chr += 1
					if almnt.aQual >= 5:
Example no. 15
#!/usr/bin/python
import os, sys, HTSeq

bam = HTSeq.BAM_Reader(sys.argv[1])

for each in bam:
    if each.aligned and each.mate_aligned:
        # Use only the first mate so each pair is counted once.
        if each.pe_which == 'first':
            print(abs(each.inferred_insert_size))
Example no. 16
def readChr_unstrand(chr, reads):
    print(chr)
    reads_dict = {}
    reads_dict["left"] = {}
    reads_dict["right"] = {}

    totalsjfile = opt.totalsj
    bamfile = opt.bam
    bam = HTSeq.BAM_Reader(bamfile)

    reads_dict["left"] = {}
    reads_dict["right"] = {}

    i = 0
    j = 0
    for eachLine in open(totalsjfile):
        line = eachLine.strip().split("\t")
        # chr7    34247275    34664347    +
        if line[0] != chr:
            continue
        # print(eachLine)

        j += 1
        if j > 0 and j % 1000 == 0:
            sys.stderr.write("%s : %d sj processed.\n" % (chr, j))

        i += 1
        key = str(i)
        # if line[0] == "chrM":
        #     continue
        # if not line[0].startswith("chr"):
        #     continue

        reads_left = 0
        reads_right = 0

        lss = line[0] + ":" + line[1] + ":" + line[3]
        rss = line[0] + ":" + line[2] + ":" + line[3]

        # if int(line[4])<opt.sjreads:
        #     continue

        s = int(line[1])
        e = int(line[2])

        iv1 = HTSeq.GenomicInterval(line[0], s - 1, s + opt.span, ".")
        iv2 = HTSeq.GenomicInterval(line[0], e - 1 - opt.span, e, ".")

        name = line[0] + "\t" + line[1] + "\t" + line[2]

        # chr = name.split("\t")[0]

        if lss in reads_dict["left"]:
            reads_left = reads_dict["left"][lss]
        else:
            iv = iv1
            usedreads = {}
            # print(">sj iv:")
            # print(iv)
            for r in bam[iv]:
                if r.iv.length > 150:
                    continue
                # print(r.iv)
                flag = 0
                for co in r.cigar:
                    if co.type == "N":
                        flag = 1
                        break
                if flag == 1:
                    continue
                # if r.iv.strand != iv.strand:
                #     continue
                # if ((r.iv.strand != iv.strand and (not r.paired_end)) or (r.paired_end and r.iv.strand != iv.strand and r.pe_which == "first") or (r.paired_end and r.iv.strand == iv.strand and r.pe_which == "second")):
                #     continue

                if r.iv.start < iv.start and r.iv.end >= iv.end:
                    r_name = r.read.name
                    if r_name in usedreads:
                        continue
                    else:
                        usedreads[r.read.name] = ""
                        reads_left += 1
            reads_dict["left"][lss] = reads_left
        # print(reads_left)

        if rss in reads_dict["right"]:
            reads_right = reads_dict["right"][rss]
        else:
            iv = iv2
            usedreads = {}
            for r in bam[iv]:
                if r.iv.length > 150:
                    continue
                flag = 0
                for co in r.cigar:
                    if co.type == "N":
                        flag = 1
                        break
                if flag == 1:
                    continue
                # if r.iv.strand != iv.strand:
                #     continue
                # if ((r.iv.strand != iv.strand and (not r.paired_end)) or (r.paired_end and r.iv.strand != iv.strand and r.pe_which == "first") or (r.paired_end and r.iv.strand == iv.strand and r.pe_which == "second")):
                #     continue
                if r.iv.start <= iv.start and r.iv.end > iv.end:
                    r_name = r.read.name
                    if r_name in usedreads:
                        continue
                    else:
                        usedreads[r.read.name] = ""
                        reads_right += 1
            reads_dict["right"][rss] = reads_right
        # print(reads_right)

        # if name not in sjnum:
        #     sjnum[name] = "0"
        # # print(d[c]["left"])

        # tmp=eachLine.strip() + "\t" + sjnum[name] + "\t"
        # if line[3] == "+":
        #     tmp+=str(reads_left) + "\t" + str(reads_right) + "\n"
        # else:
        #     tmp+=str(reads_right) + "\t" + str(reads_left) + "\n"
        # reads_dict[key] = tmp

    # print(reads_dict)
    reads[chr] = reads_dict.copy()
    del reads_dict

    logging.info("done %s" % chr)
Example no. 17
    bamfile = outdir + '/Aligned.toTranscriptome.out.bam'


############################################################
# process bam

if not os.path.exists(bamfile):
    print("cannot find bamfile", bamfile)
    sys.exit(2)

statfile = os.path.dirname(os.path.abspath(bamfile)) + '/ReadsPerGene.out.tab'

if not os.path.exists(statfile):
    print("Warning: cannot analyze the mapping stats because", statfile, "does not exist")

bam_reader = HTSeq.BAM_Reader(bamfile)

total = 0

print('readname\ttranscript')
for align in bam_reader:
    total += 1
    myread = align.read.name
    mytrpt = align.iv.chrom
    print('{}\t{}'.format(myread, mytrpt))

############################################################
# mapping stat

if not os.path.exists(statfile):
    print("cannot find mapping stat file", statfile)
Example no. 18
def count_biotype_overlaps(aligned_bam,
                           selected_features,
                           biotype_count_dict,
                           number_lines=10000000):
    """
    Go thorough an aligned bam, counting overlaps with biotype features
    """

    # Set up filenames & objects
    aligned_bam = os.path.realpath(aligned_bam)
    bamfile = HTSeq.BAM_Reader(aligned_bam)

    # Go through alignments, counting transcript biotypes
    logging.info("\nReading BAM file (will stop at {}): ".format(number_lines))
    aligned_reads = 0
    for i, alnmt in enumerate(bamfile):
        if i > int(number_lines):
            i -= 1
            logging.info(
                "Reached {} lines in the aligned file, exiting..".format(
                    number_lines))
            break
        if i % 1000000 == 0 and i > 0:
            logging.debug("{} lines processed..".format(i))

        if alnmt.aligned:
            aligned_reads += 1
            iset = None
            for iv2, step_set in selected_features[alnmt.iv].steps():
                if iset is None:
                    iset = step_set.copy()
                else:
                    iset.intersection_update(step_set)

            # Feature values were set as biotype label. Overlap with multiple
            # features with the same biotype will give length == 1
            key = 'multiple_features'
            if len(iset) == 1:
                key = list(iset)[0]
            elif len(iset) == 0:
                key = 'no_overlap'

            biotype_count_dict['biotype_counts'][key] += 1
            biotype_count_dict['biotype_lengths'][key][alnmt.iv.length] += 1

    logging.info("\n{} overlaps found from {} aligned reads ({} reads total)"
                 .format(aligned_reads - biotype_count_dict['biotype_counts']['no_overlap'],
                         aligned_reads, i))
    logging.info("{} reads had multiple feature overlaps\n"
                 .format(biotype_count_dict['biotype_counts']['multiple_features']))

    # Make a string table out of the counts
    counts_string = 'Type\tRead Count\n'
    for biotype in sorted(biotype_count_dict['biotype_counts'],
                          key=biotype_count_dict['biotype_counts'].get,
                          reverse=True):
        if biotype_count_dict['biotype_counts'][biotype] == 0:
            continue
        counts_string += "{}\t{}{}".format(
            biotype, biotype_count_dict['biotype_counts'][biotype], os.linesep)

    # Save to file
    file_basename = os.path.splitext(os.path.basename(aligned_bam))[0]
    counts_file = "{}_biotypeCounts.txt".format(file_basename)
    with open(counts_file, 'w') as fh:
        print(counts_string, file=fh)

    # Return the counts
    return biotype_count_dict
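A hypothetical setup matching how the function indexes its arguments: `selected_features` as a GenomicArrayOfSets keyed by biotype label, and nested counters for counts and length histograms (the defaultdict structure and paths are assumptions):

import collections
import HTSeq

selected_features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
biotype_count_dict = {
    'biotype_counts': collections.defaultdict(int),
    'biotype_lengths': collections.defaultdict(lambda: collections.defaultdict(int)),
}
biotype_count_dict = count_biotype_overlaps("sample.bam", selected_features,
                                            biotype_count_dict)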
Example no. 19
# We need this little helper below:
def reverse_strand(s):
    if s == "+":
        return "-"
    elif s == "-":
        return "+"
    else:
        raise SystemError("illegal strand")


# Now go through the aligned reads

if not is_BAM:
    tmp_obj = HTSeq.SAM_Reader(sam_file)
else:
    tmp_obj = HTSeq.BAM_Reader(sam_file)

if not is_PE:

    num_reads = 0
    #   for a in HTSeq.SAM_Reader( sam_file ):
    for a in tmp_obj:
        if not a.aligned:
            counts['_notaligned'] += 1
            continue
        if a.aQual < minaqual:
            counts['_lowaqual'] += 1
            continue
        rs = set()
        for cigop in a.cigar:
            if cigop.type != "M":
Example no. 20
import datetime

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Compute per-position read counts from a BAM file.')
    parser.add_argument('--input',
                        '-i',
                        type=argparse.FileType('rb'),
                        required=True)
    parser.add_argument('--output',
                        '-o',
                        type=argparse.FileType('w'),
                        required=True)
    args = parser.parse_args()

    almnt_file = HTSeq.BAM_Reader(args.input)

    counts = HTSeq.GenomicArray("auto", stranded=False, typecode='i')
    fcounts = HTSeq.GenomicArray("auto", stranded=False, typecode='i')

    curChrom = None
    for almnt in almnt_file:
        if not almnt.aligned or almnt.not_primary_alignment or almnt.supplementary:
            continue

        if curChrom != almnt.iv.chrom:

            dt = datetime.datetime.now()

            print(dt.isoformat(), args.input.name, "Switching Chromosome",
                  curChrom, almnt.iv.chrom)
Example no. 21
def modifHTSeq(bam_filename, gff_filename, out_file, overlap_mode,
               feature_type, id_attribute, minaqual, exclude_start_distance,
               exclude_stop_distance, min_len, max_len):
    #feature GenomicArrayOfSets
    features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    counts = {}
    start_codon_sites = {}
    stop_codon_sites = {}
    #GTF
    gff = HTSeq.GFF_Reader(gff_filename, end_included=True)
    i = 0
    for f in gff:
        if f.type == feature_type:
            if id_attribute in f.attr:
                # f.attr holds the parsed 9th GTF column as a dict.
                feature_id = f.attr[id_attribute]
            else:
                # Some CDS/exon records (e.g. in the rat GTF) lack gene_name,
                # but every record has gene_id, so fall back to it.
                feature_id = f.attr['gene_id']
            # Label the genomic interval with the feature id.
            features[f.iv] += feature_id
            counts[feature_id] = 0
        # If there are multiple TIS, use the 5'-most start codon and the
        # 3'-most stop codon.
        if f.type == "start_codon":
            if id_attribute in f.attr:
                gname = f.attr[id_attribute]
            else:
                gname = f.attr['gene_id']  # same fallback as above
            if gname not in start_codon_sites:
                start_codon_sites[gname] = f.iv.start_d
            else:
                if f.iv.strand == "+":
                    start_codon_sites[gname] = min(f.iv.start_d,
                                                   start_codon_sites[gname])
                else:
                    start_codon_sites[gname] = max(f.iv.start_d,
                                                   start_codon_sites[gname])
        if f.type == "stop_codon":
            if id_attribute in f.attr:
                gname = f.attr[id_attribute]
            else:
                gname = f.attr['gene_id']  # same fallback as above
            if gname not in stop_codon_sites:
                stop_codon_sites[gname] = f.iv.end_d
            else:
                if f.iv.strand == "+":
                    stop_codon_sites[gname] = max(f.iv.end_d,
                                                  stop_codon_sites[gname])
                else:
                    stop_codon_sites[gname] = min(f.iv.end_d,
                                                  stop_codon_sites[gname])
        i += 1
        if i % 100000 == 0:
            sys.stderr.write("%d GFF lines processed.\n" % i)
    #bam
    read_seq = HTSeq.BAM_Reader(bam_filename)
    #counts
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0:
            sys.stderr.write("%d SAM alignment record processed.\n" % i)
        i += 1
        if not r.aligned:
            notaligned += 1
            continue
        if r.optional_field("NH") > 1:
            nonunique += 1
            continue
        if r.aQual < minaqual:
            lowqual += 1
            continue
        ###
        if len(r.read.seq) < min_len or len(r.read.seq) > max_len:
            continue
        iv_seq = (co.ref_iv for co in r.cigar
                  if co.type == "M" and co.size > 0)
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
        elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
            fs = None
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
        else:
            sys.exit("Illegal overlap mode.")
        if fs is None or len(fs) == 0:
            empty += 1
        elif len(fs) > 1:
            ambiguous += 1
        else:
            try:  # some genes may lack an annotated start or stop codon
                if abs(start_codon_sites[list(fs)[0]] -
                       r.iv.start_d) < exclude_start_distance:
                    continue
                elif abs(r.iv.end_d - stop_codon_sites[list(fs)[0]]
                         ) < exclude_stop_distance:
                    continue
                else:
                    counts[list(fs)[0]] += 1
            except KeyError:
                counts[list(fs)[0]] += 1
    #output
    with open(out_file, "w") as fout:
        fout.write("%s\t%s\n" % (id_attribute.strip(), "count"))
        for fn in sorted(counts.keys()):
            fout.write("%s\t%s\n" % (fn, counts[fn]))
        fout.write("__no_feature\t%d\n" % empty)
        fout.write("__ambiguous\t%d\n" % ambiguous)
        fout.write("__too_low_aQual\t%d\n" % lowqual)
        fout.write("__not_aligned\t%d\n" % notaligned)
        fout.write("__alignment_not_unique\t%d\n" % nonunique)
def readChrwithBam(chr, reads):
    print(chr)
    reads_dict = {}
    reads_dict["left"] = {}
    reads_dict["right"] = {}

    usedreads = {}

    totalsjfile = opt.totalsj
    ga = HTSeq.GenomicArrayOfSets([chr], stranded=True)
    ga2 = HTSeq.GenomicArrayOfSets([chr], stranded=True)
    minpos = 10000000000
    maxpos = 0
    for eachLine in open(totalsjfile):
        line = eachLine.strip().split("\t")
        # chr7    34247275    34664347    +
        if line[0] != chr:
            continue
        # chr = line[0]

        lss = line[0] + ":" + line[1] + ":" + line[3]
        rss = line[0] + ":" + line[2] + ":" + line[3]

        reads_dict["left"][lss] = 0
        reads_dict["right"][rss] = 0

        s = int(line[1])
        e = int(line[2])

        if s < minpos:
            minpos = s
        if e > maxpos:
            maxpos = e

        iv1 = HTSeq.GenomicInterval(line[0], s - 1, s, line[3])
        iv2 = HTSeq.GenomicInterval(line[0], e - 1, e, line[3])

        usedreads[iv1] = {}
        usedreads[iv2] = {}

        ga[iv1] += lss
        ga2[iv2] += rss

    # utemp = dict(usedreads).copy()

    bamfile = opt.bam
    bam = HTSeq.BAM_Reader(bamfile)

    giv = HTSeq.GenomicInterval(chr, minpos, maxpos, ".")
    j = 0
    print("start reading bam of " + chr)
    # print(giv)
    for r in bam[giv]:
        j += 1
        if j > 0 and j % 100000 == 0:
            sys.stderr.write("%s : %d sj processed.\n" % (chr, j))
        if j > 0 and j % 1000000 == 0:
            # periodically drop read names for junctions far behind the
            # current position, to bound memory use
            for tiv in usedreads:
                if tiv.start < r.iv.start - 5000:
                    usedreads[tiv].clear()

        r_name = r.read.name
        # print(r_name + ":" + r.iv.strand + ":" + r.pe_which)
        iv_seq = []
        if opt.unstrand:
            iv_seq1 = [
                co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0
            ]
            iv_seq2 = [
                invert_strand(co.ref_iv) for co in r.cigar
                if co.type == "M" and co.size > 0
            ]
            iv_seq = iv_seq1 + iv_seq2
        else:
            if ((not r.paired_end)
                    or (r.paired_end and r.pe_which == "first")):
                # keep only the matched (M) segments, recorded in iv_seq
                iv_seq = [
                    co.ref_iv for co in r.cigar
                    if co.type == "M" and co.size > 0
                ]
            elif r.paired_end and r.pe_which == "second":
                # the second mate reports the opposite strand (the original
                # nested this test unreachably inside the first-mate branch)
                iv_seq = [
                    invert_strand(co.ref_iv) for co in r.cigar
                    if co.type == "M" and co.size > 0
                ]

        for iv2 in iv_seq:
            # print(iv2)
            for iv, fs in ga[iv2].steps():
                if len(fs) == 1:
                    # the aligned segment must cover the left splice-site base
                    # and extend at least opt.span bases beyond it
                    if iv.start - 1 >= iv2.start and iv.start + opt.span <= iv2.end:
                        if r_name in usedreads[iv]:
                            # print(r_name)
                            continue
                        else:
                            # print(r_name)
                            usedreads[iv][r_name] = True
                        ss = list(fs)[0]
                        reads_dict["left"][ss] += 1
            for iv, fs in ga2[iv2].steps():
                if len(fs) == 1:
                    # the aligned segment must extend at least opt.span bases
                    # upstream of the right splice-site base
                    if iv.start - opt.span >= iv2.start and iv.start + 1 <= iv2.end:
                        if r_name in usedreads[iv]:
                            continue
                        else:
                            # print(r_name)
                            # print(iv2)
                            usedreads[iv][r_name] = True
                        ss = list(fs)[0]
                        reads_dict["right"][ss] += 1

    # print(reads_dict)
    reads[chr] = reads_dict.copy()
    del reads_dict
    del usedreads

    logging.info("done %s" % chr)
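readChrwithBam fills a shared mapping keyed by chromosome, which suggests one worker per chromosome. A plausible driver, assuming the global opt object used above is already parsed (this invocation is a sketch, not part of the original):

import multiprocessing

def run_all_chromosomes(chromosomes):
    manager = multiprocessing.Manager()
    reads = manager.dict()  # shared dict; readChrwithBam fills reads[chr]
    workers = []
    for c in chromosomes:
        p = multiprocessing.Process(target=readChrwithBam, args=(c, reads))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()
    return dict(reads)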
Example n. 23
0
def pool(infile, targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss):

    SI_counts = defaultdict(int)
    junction_counts = defaultdict(int)

    for f, s in HTSeq.pair_SAM_alignments_with_buffer(
            HTSeq.BAM_Reader('%s/%s.bam' % (infile, infile))):

        if f is not None and f.aligned and f.aQual > 5:
            chrome = f.iv.chrom
            start = f.iv.start
            end = f.iv.end
            strand = f.iv.strand
            if strand == '+':
                geneint = HTSeq.GenomicPosition(chrome, start, strand)
            else:
                geneint = HTSeq.GenomicPosition(chrome, end, strand)
            if len(targets[geneint]) == 0:
                introns = set()
                junctions = set()

                for i, cigop in enumerate(f.cigar):
                    if cigop.type == 'M':
                        for iv, val in targets[cigop.ref_iv].steps():
                            introns |= val

                    elif cigop.type == 'N':
                        # require more than 3 matched bases flanking the gap
                        if (f.cigar[i - 1].type == 'M' and f.cigar[i - 1].size > 3
                                and f.cigar[i + 1].type == 'M'
                                and f.cigar[i + 1].size > 3):
                            for iv, val in targets[cigop.ref_iv].steps():
                                junctions |= val

                            chrom = cigop.ref_iv.chrom
                            if cigop.ref_iv.strand == '+':
                                first = cigop.ref_iv.end
                                second = cigop.ref_iv.start + 1
                                strand = "+"
                            else:
                                first = cigop.ref_iv.start + 1
                                second = cigop.ref_iv.end
                                strand = '-'

                            if (chrom, first,
                                    strand) in fiveSS and (chrom, second,
                                                           strand) in threeSS:
                                up = fiveSS[chrom, first, strand]
                                down = threeSS[chrom, second, strand]
                                if up[0] == down[0]:
                                    if up[1] == down[1]:
                                        junction_counts[(infile, up[0],
                                                         int(up[1]),
                                                         int(down[1]) + 1,
                                                         "Constituitive")] += 1
                                    else:
                                        junction_counts[(infile, up[0],
                                                         int(up[1]),
                                                         int(down[1]) + 1,
                                                         "Exon Skipping")] += 1
                            elif (chrom, first, strand) in fiveSS:
                                # only the 5'SS is annotated; report the observed
                                # 3' coordinate (the original reused stale up/down
                                # values from a previous junction here)
                                up = fiveSS[chrom, first, strand]
                                junction_counts[(infile, up[0], int(up[1]),
                                                 second + 1,
                                                 "Alternative 3'")] += 1
                            elif (chrom, second, strand) in threeSS:
                                # only the 3'SS is annotated; report the observed
                                # 5' coordinate
                                down = threeSS[chrom, second, strand]
                                junction_counts[(infile, down[0], first,
                                                 int(down[1]) + 1,
                                                 "Alternative 5'")] += 1

                intron_num_mat = {}
                intron_num_pre = {}
                intron = ''
                junction = ''

                if len(introns) > 0:
                    for i in introns:
                        a = i.split(';')
                        # compare numerically, not lexicographically
                        # (assumes the second field is a number)
                        intron_num_pre[i] = int(a[1])
                    intron = max(intron_num_pre.items(), key=lambda x: x[1])[0]

                if len(junctions) > 0:
                    for i in junctions:
                        a = i.split(';')
                        intron_num_mat[i] = int(a[1])
                    junction = max(intron_num_mat.items(), key=lambda x: x[1])[0]

                if junction == intron:
                    intron = ''
                    junction = ''

                if junction and intron:
                    if int(junction.split(';')[1]) > int(intron.split(';')[1]):
                        intron = ''
                    else:
                        junction = ''

                candidate_genes = set()
                for i in introns:
                    candidate_genes.add(i.split(';')[0])
                for i in junctions:
                    candidate_genes.add(i.split(';')[0])

                if len(candidate_genes) == 1:
                    if junction:
                        SI_counts[('mature', junction)] += 1
                    if intron:
                        SI_counts[('premature', intron)] += 1
                    if (f.proper_pair and s is not None and s.proper_pair
                            and s.aligned and s.aQual > 5):
                        if junction:
                            SI_counts[('concordant_mature', junction)] += 1
                        if intron:
                            SI_counts[('concordant_premature', intron)] += 1

                # Count start positions of read 2s that fall within the specified
                # lariat-intermediate and branch-to-3'SS windows
                if (intron and s is not None and s.aligned
                        and s.proper_pair and s.aQual > 5):
                    chrome = s.iv.chrom
                    start = s.iv.start
                    end = s.iv.end
                    strand = s.iv.strand
                    if strand == '+':
                        geneint = HTSeq.GenomicPosition(chrome, start, strand)
                    else:
                        geneint = HTSeq.GenomicPosition(chrome, end, strand)
                    if intron in Branches[geneint] and len(
                            Branches[geneint]) == 1:
                        SI_counts[('lariat_int', intron)] += 1
                    if intron in Branchto3ss[geneint] and len(
                            Branchto3ss[geneint]) == 1:
                        SI_counts[('branch_to3ss', intron)] += 1

    with open('%s/%s_splicing_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' % (intron, SI_counts[('mature', intron)],
                                        SI_counts[('premature', intron)]))

    with open('%s/%s_concordant_splicing_counts.txt' % (infile, infile),
              'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\t%d\n' %
                      (intron, SI_counts[('concordant_mature', intron)],
                       SI_counts[('concordant_premature', intron)]))

    with open('%s/%s_lariat_int_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' % (intron, SI_counts[('lariat_int', intron)]))

    with open('%s/%s_branch_to3ss_counts.txt' % (infile, infile), 'w') as out:
        for intron in sorted(intron_set):
            out.write('%s\t%d\n' %
                      (intron, SI_counts[('branch_to3ss', intron)]))

    with open('%s/%s_junction_counts.txt' % (infile, infile), 'w') as out:
        out.write('Gene\tUpstream\tDownstream\tType\tCount\n')
        for junc in sorted(junction_counts):
            out.write(
                '%s\t%d\t%d\t%s\t%d\n' %
                (junc[1], junc[2], junc[3], junc[4], junction_counts[junc]))
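pool() looks up splice sites through (chrom, position, strand) keys whose values are used as (gene, coordinate) pairs. A hedged sketch of building those tables from a hypothetical tab-separated intron annotation (the column layout and coordinate conventions are assumptions, not the original pipeline's format):

def build_splice_site_tables(intron_table):
    fiveSS, threeSS = {}, {}
    with open(intron_table) as fh:
        for line in fh:
            chrom, start, end, gene, _, strand = line.rstrip('\n').split('\t')[:6]
            start, end = int(start), int(end)
            if strand == '+':
                fiveSS[(chrom, start + 1, strand)] = (gene, start + 1)
                threeSS[(chrom, end, strand)] = (gene, end)
            else:
                fiveSS[(chrom, end, strand)] = (gene, end)
                threeSS[(chrom, start + 1, strand)] = (gene, start + 1)
    return fiveSS, threeSS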
Example n. 24
0
def count_circrna(args):
    import HTSeq
    import numpy as np
    import pandas as pd
    from collections import OrderedDict, defaultdict
    from ioutils import open_file_or_stdout

    logger.info('read input BAM/SAM file: ' + args.input_file)
    if args.input_file.endswith('.sam'):
        sam = HTSeq.SAM_Reader(args.input_file)
    elif args.input_file.endswith('.bam'):
        sam = HTSeq.BAM_Reader(args.input_file)
    else:
        raise ValueError('unsupported file extension')

    # extract junction positions from SAM header
    logger.info('extract junction positions')
    junction_positions = OrderedDict()
    for sq in sam.get_header_dict()['SQ']:
        junction_positions[sq['SN']] = sq['LN'] // 2
    # initialize counts
    gene_ids = list(junction_positions.keys())
    counts = pd.Series(np.zeros(len(gene_ids), dtype='int'), index=gene_ids)
    # count reads
    min_mapping_quality = args.min_mapping_quality
    strandness = args.strandness
    if args.paired_end:
        logger.info('count paired-end fragments')
        stats = defaultdict(int)
        for bundle in HTSeq.pair_SAM_alignments(sam, bundle=True):
            stats['total_pairs'] += 1
            # ignore multi-mapped pairs
            if len(bundle) != 1:
                stats['multi_mapping'] += 1
                continue
            read1, read2 = bundle[0]
            # ignore singletons
            if (read1 is None) or (read2 is None):
                stats['singleton'] += 1
                continue
            # ignore unmapped reads
            if not (read1.aligned and read2.aligned):
                stats['unmapped'] += 1
                continue
            # ignore pairs with mapping quality below threshold
            if (read1.aQual < min_mapping_quality) or (read2.aQual <
                                                       min_mapping_quality):
                stats['low_mapping_quality'] += 1
                continue
            if (strandness == 'forward') and (not ((read1.iv.strand == '+') and
                                                   (read2.iv.strand == '-'))):
                stats['improper_strand'] += 1
                continue
            if (strandness == 'reverse') and (not ((read1.iv.strand == '-') and
                                                   (read2.iv.strand == '+'))):
                stats['improper_strand'] += 1
                continue
            # ignore pairs on different chromosomes
            if read1.iv.chrom != read2.iv.chrom:
                stats['diff_chrom'] += 1
                continue
            pos = junction_positions[read1.iv.chrom]
            if read1.iv.start < pos <= read2.iv.end:
                counts[read1.iv.chrom] += 1
        for key, val in stats.items():
            logger.info('{}: {}'.format(key, val))
    else:
        logger.info('count single-end reads')
        for read in sam:
            # ignore unmapped read
            if not read.aligned:
                continue
            # ignore reads with mapping quality below threshold
            if read.aQual < min_mapping_quality:
                continue
            if (strandness == 'forward') and (read.iv.strand == '-'):
                continue
            # a reverse-stranded library should keep '-' reads; the original
            # condition here kept '+' reads, duplicating the 'forward' case
            if (strandness == 'reverse') and (read.iv.strand == '+'):
                continue
            pos = junction_positions[read.iv.chrom]
            if read.iv.start < pos <= read.iv.end:
                counts[read.iv.chrom] += 1
    # output counts
    logger.info('count fragments: {}'.format(counts.sum()))
    logger.info('write counts to file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        counts.to_csv(fout, sep='\t', header=False, index=True, na_rep='NA')
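The counting above relies on each reference sequence carrying the back-splice junction at its midpoint (LN // 2 from the SAM header), so a read or fragment is counted only if it straddles that position. A tiny illustration of the test:

ref_len = 200                    # hypothetical 'LN' from the SAM header
junction = ref_len // 2          # junction position = 100
read_start, read_end = 88, 130   # 0-based, half-open alignment interval
spans_junction = read_start < junction <= read_end
print(spans_junction)            # True -> this fragment increments the count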
Example n. 25
0
def centipede_footprint(bed_file,
                        bam_file,
                        sites,
                        sample_name,
                        plots_dir,
                        fragmentsize=1,
                        orientation=True,
                        duplicates=True,
                        strand_specific=True):
    """
    Gets read coverage in genomic intervals. Passes coverage to centipede_call_footprints and returns posterior probabilities.

    :param bed_file: Bed file.
    :type bed_file: str
    :param bam: HTSeq.BAM_Reader object, must be sorted and indexed with .bai file.
    :type bam: HTSeq.BAM_Reader
    :type fragmentsize: int
    :type stranded: bool
    :type duplicates: bool
    :returns: OrderedDict with regionName:numpy.array(coverage)
    :rtype: collections.OrderedDict
    """
    import pybedtools
    import os
    import HTSeq
    import numpy as np

    # read in bedfile
    motifs = pybedtools.BedTool(bed_file)
    # get motif name
    motif_name = os.path.basename(bed_file.split(".")[0])
    # get motif length (length of first interval)
    motif_length = motifs[0].length

    # convert intervals to HTSeq.GenomicInterval (as a list: len() and
    # indexing are needed below, and map() returns an iterator in Python 3)
    intervals = list(map(bedtools_interval_to_genomic_interval, motifs))

    # Handle bam file
    bam = HTSeq.BAM_Reader(bam_file)

    # exclude bad chroms
    chroms_exclude = ['chrM', 'chrX', 'chrY']

    # get dimensions of matrix to store profiles of Tn5 transposition
    n = len(intervals)
    m = intervals[0].length

    # create empty matrix
    if not strand_specific:
        coverage = np.zeros((n, m), dtype=np.float64)
    else:
        # if "strand_specific", get signal for both strands independently, but concatenated
        coverage = np.zeros((n, m * 2), dtype=np.float64)

    # Loop through intervals, get coverage, increment matrix count
    for i, feature in enumerate(intervals):
        # counter just to track
        if i % 1000 == 0:
            print(n - i)

        # Check if feature is not in bad chromosomes
        if feature.chrom in chroms_exclude:
            continue

        # Fetch alignments in interval
        for aln in bam[feature]:
            # check it's aligned
            if not aln.aligned:
                continue

            # check if duplicate
            if not duplicates and aln.pcr_or_optical_duplicate:
                continue

            aln.iv.length = fragmentsize  # adjust reads to specified size

            # get position relative to window if required (motif-oriented)
            if orientation:
                if feature.strand == "+" or feature.strand == ".":
                    start_in_window = aln.iv.start - feature.start - 1
                    end_in_window = aln.iv.end - feature.start - 1
                else:
                    start_in_window = feature.length - abs(feature.start -
                                                           aln.iv.end) - 1
                    end_in_window = feature.length - abs(feature.start -
                                                         aln.iv.start) - 1
            else:
                start_in_window = aln.iv.start - feature.start - 1
                end_in_window = aln.iv.end - feature.start - 1

            # check fragment is within window; this is because of fragmentsize adjustment
            if start_in_window < 0 or end_in_window > feature.length:
                continue

            # add +1 to all positions overlapped by read within window
            if not strand_specific:
                coverage[i, start_in_window:end_in_window] += 1
            else:
                if aln.iv.strand == "+":
                    coverage[i, start_in_window:end_in_window] += 1
                else:
                    coverage[i, m + start_in_window:m + end_in_window] += 1
    # Call footprints, get posterior probabilities
    try:
        probs = centipede_call_footprints(
            coverage, np.ones([len(coverage), 1]), motif_length,
            os.path.join(plots_dir, sample_name + "." + motif_name + ".pdf"))
        if len(probs) != len(coverage):
            probs = np.zeros(len(coverage))
    except Exception:
        # if the footprint call fails for any reason, return zeros
        probs = np.zeros(len(coverage))
    return probs
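A hedged invocation sketch (paths and sample names are illustrative; centipede_call_footprints and bedtools_interval_to_genomic_interval must already be in scope):

probs = centipede_footprint(
    bed_file="motifs/CTCF.bed",        # hypothetical motif BED file
    bam_file="sample1.sorted.bam",     # must be coordinate-sorted and indexed
    sites=None,                        # unused by the body shown above
    sample_name="sample1",
    plots_dir="plots",
    fragmentsize=1,
    orientation=True,
    duplicates=False,                  # False skips PCR/optical duplicates
    strand_specific=True,
)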
Example n. 26
0
#example 3
# sample_name = 'yuww165'
# gene_sym = 'WASH7P'
# sj_pos = 'chr1:17055-17605'

#example 4
# sample_name = 'yuhimo'
# # sample_name = 'yukadi'
# gene_sym = 'LOC100132287'
# sj_pos = 'chr1:27623647-27624428'

#example 5
sample_name = 'gapi'
# sample_name = 'yukadi'
gene_sym = 'PSMC6'
sj_pos = 'chr14:53185756-53190682'

#retrieve the mapped reads to see which reads uniquely mapped
path_bam = DIR_RNASEQ + "/tophat_sample_" + sample_name + "/accepted_hits.bam"
bam_reader = HTSeq.BAM_Reader(path_bam)
pysam_file = pysam.AlignmentFile(path_bam, 'rb')

# hash_htseq = sj_read_support( bam_reader, sj_pos )
# print "see read counts: ", hash_htseq

# sj_read_support_TEST( bam_reader, sj_pos )

sj_read_support_variety_reads(bam_reader, sj_pos)

print "------------ TDD Completed: 170601_SJ_Metrics_V2.py ------------"
Example n. 27
0
import sys
import HTSeq
import numpy
import matplotlib as mpl
mpl.use('pdf')
from matplotlib import pyplot

#bamfile = HTSeq.BAM_Reader( "Chr1.unique.bam" )
#sortedbamfile = HTSeq.BAM_Reader( "../input/Nucleosome.Chr1.unique.bam" )
#sortedbamfile = HTSeq.BAM_Reader( "../input/DHS.unique.bam" )
#gtffile = HTSeq.GFF_Reader( "../input/MSU7.gene.exon_number.gtf" )
sortedbamfile = HTSeq.BAM_Reader(sys.argv[1])
gtffile = HTSeq.GFF_Reader(sys.argv[2])

halfwinwidth = 2000
fragmentsize = 73
readlen = 36
#total = 60745783.00/1000000 ## nucleosome
#total = 7480914/1000000 ## nucleosome chr1
#total = 23299296/1000000 #DHS unique
#gsize = 372000000

#coverage = HTSeq.GenomicArray( "auto", stranded=False, typecode="i" )
#for almnt in bamfile:
#   if almnt.aligned:
#      #almnt.iv.length = fragmentsize
#      print almnt.iv
#      if not almnt.iv.start < 500:
#          coverage[ almnt.iv ] += 1

tsspos = set()
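The example breaks off after initializing tsspos. A plausible continuation, following the standard HTSeq TSS-profile pattern that this setup matches (the exon_number test and output filename are assumptions):

for feature in gtffile:
    if feature.type == "exon" and feature.attr["exon_number"] == "1":
        tsspos.add(feature.iv.start_d_as_pos)

coverage = HTSeq.GenomicArray("auto", stranded=False, typecode="i")
for almnt in sortedbamfile:
    if almnt.aligned:
        almnt.iv.length = fragmentsize
        coverage[almnt.iv] += 1

profile = numpy.zeros(2 * halfwinwidth, dtype="i")
for p in tsspos:
    window = HTSeq.GenomicInterval(
        p.chrom, p.pos - halfwinwidth, p.pos + halfwinwidth, ".")
    wincvg = numpy.fromiter(coverage[window], dtype="i", count=2 * halfwinwidth)
    profile += wincvg if p.strand == "+" else wincvg[::-1]

pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile)
pyplot.savefig("tssprofile.pdf")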
Example n. 28
0
import sys
import HTSeq
import numpy
import matplotlib as mpl
mpl.use('pdf')
from matplotlib import pyplot

#bamfile = HTSeq.BAM_Reader( "Chr1.unique.bam" )
#bamfile = HTSeq.BAM_Reader( "../input/Nucleosome.unique.bam" )
bamfile = HTSeq.BAM_Reader("../input/DHS.Chr1.unique.bam")
#gtffile = HTSeq.GFF_Reader( "../input/MSU7.gene.exon_number.HighExp.gtf" )
gtffile = HTSeq.GFF_Reader(sys.argv[1])

halfwinwidth = 2000
fragmentsize = 150
total = 60745783.00 / 1000000  ## nucleosome
gsize = 372000000

coverage = HTSeq.GenomicArray("auto", stranded=False, typecode="i")
for almnt in bamfile:
    if almnt.aligned:
        #almnt.iv.length = fragmentsize
        if almnt.iv.start >= 500:
            coverage[almnt.iv] += 1

tsspos = set()
for feature in gtffile:
    if feature.type == "exon" and feature.attr["exon_number"] == "-1":
        #print feature.iv.start_d_as_pos.pos
        if feature.iv.start_d_as_pos.pos > 5000:
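            # The example is truncated here; a plausible continuation, mirroring
            # the TSS-profile sketch after the previous example, would collect
            # this position (an assumption, not the original code):
            tsspos.add(feature.iv.start_d_as_pos)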
Example n. 29
0
def count_reads_with_barcodes(
        sam_filename,
        features,
        feature_attr,
        order,
        max_buffer_size,
        stranded,
        overlap_mode,
        multimapped_mode,
        secondary_alignment_mode,
        supplementary_alignment_mode,
        feature_type,
        id_attribute,
        additional_attributes,
        quiet,
        minaqual,
        samout_format,
        samout_filename,
        cb_tag,
        ub_tag,
        ):

    def write_to_samout(r, assignment, samoutfile, template=None):
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                if samout_format in ('SAM', 'sam'):
                    samoutfile.write(read.get_sam_line() + "\n")
                else:
                    samoutfile.write(read.to_pysam_AlignedSegment(template))

    def identify_barcodes(r):
        '''Identify barcode from the read or pair (both must have the same)'''
        if not pe_mode:
            r = (r,)
        # cell, UMI
        barcodes = [None, None]
        nbar = 0
        for read in r:
            if read is not None:
                for tag, val in read.optional_fields:
                    if tag == cb_tag:
                        barcodes[0] = val
                        nbar += 1
                        if nbar == 2:
                            return barcodes
                    elif tag == ub_tag:
                        barcodes[1] = val
                        nbar += 1
                        if nbar == 2:
                            return barcodes
        return barcodes

    try:
        if sam_filename == "-":
            read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            read_seq_file = HTSeq.BAM_Reader(sam_filename)

        # Get template for output BAM
        if samout_filename is None:
            template = None
            samoutfile = None
        elif samout_format in ('bam', 'BAM'):
            template = read_seq_file.get_template()
            samoutfile = pysam.AlignmentFile(
                    samout_filename, 'wb',
                    template=template,
                    )
        else:
            template = None
            samoutfile = open(samout_filename, 'w')

        read_seq_iter = iter(read_seq_file)
        # Catch empty BAM files
        try:
            first_read = next(read_seq_iter)
            pe_mode = first_read.paired_end
        except StopIteration:
            first_read = None
            pe_mode = False
        if first_read is not None:
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq = []
    except Exception:
        sys.stderr.write(
            "Error occurred when reading beginning of SAM/BAM file.\n")
        raise

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    try:
        if pe_mode:
            if ((supplementary_alignment_mode == 'ignore') and
               (secondary_alignment_mode == 'ignore')):
                primary_only = True
            else:
                primary_only = False
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(
                        read_seq,
                        primary_only=primary_only)
            elif order == "pos":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                        read_seq,
                        max_buffer_size=max_buffer_size,
                        primary_only=primary_only)
            else:
                raise ValueError("Illegal order specified.")

        # The nesting is cell barcode, UMI, feature
        counts = defaultdict(lambda: defaultdict(Counter))
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write(
                    "%d alignment %s processed.\n" %
                    (i, "records" if not pe_mode else "record pairs"))
                sys.stderr.flush()

            i += 1

            cb, ub = identify_barcodes(r)

            if not pe_mode:
                if not r.aligned:
                    counts[cb][ub]['__not_aligned'] += 1
                    write_to_samout(
                            r, "__not_aligned", samoutfile,
                            template)
                    continue
                if ((secondary_alignment_mode == 'ignore') and
                   r.not_primary_alignment):
                    continue
                if ((supplementary_alignment_mode == 'ignore') and
                   r.supplementary):
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        write_to_samout(
                                r,
                                "__alignment_not_unique",
                                samoutfile,
                                template)
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    counts[cb][ub]['__too_low_aQual'] += 1
                    write_to_samout(
                            r, "__too_low_aQual", samoutfile,
                            template)
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar if co.type in com
                              and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv)
                              for co in r.cigar if (co.type in com and
                                                    co.size > 0))
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type in com and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv) for co in r[1].cigar
                                if co.type in com and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type in com and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        write_to_samout(
                                r, "__not_aligned", samoutfile,
                                template)
                        counts[cb][ub]['__not_aligned'] += 1
                        continue
                if secondary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].not_primary_alignment:
                        continue
                    elif (r[1] is not None) and r[1].not_primary_alignment:
                        continue
                if supplementary_alignment_mode == 'ignore':
                    if (r[0] is not None) and r[0].supplementary:
                        continue
                    elif (r[1] is not None) and r[1].supplementary:
                        continue
                try:
                    if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                       (r[1] is not None and r[1].optional_field("NH") > 1)):
                        write_to_samout(
                                r, "__alignment_not_unique", samoutfile,
                                template)
                        counts[cb][ub]['__alignment_not_unique'] += 1
                        if multimapped_mode == 'none':
                            continue
                except KeyError:
                    pass
                if ((r[0] and r[0].aQual < minaqual) or
                   (r[1] and r[1].aQual < minaqual)):
                    write_to_samout(
                            r, "__too_low_aQual", samoutfile,
                            template)
                    counts[cb][ub]['__too_low_aQual'] += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode in ("intersection-strict",
                                      "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if ((len(fs2) > 0) or
                               (overlap_mode == "intersection-strict")):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")

                if fs is None or len(fs) == 0:
                    write_to_samout(
                            r, "__no_feature", samoutfile,
                            template)
                    counts[cb][ub]['__no_feature'] += 1
                elif len(fs) > 1:
                    write_to_samout(
                            r, "__ambiguous[" + '+'.join(fs) + "]",
                            samoutfile,
                            template)
                    counts[cb][ub]['__ambiguous'] += 1
                else:
                    write_to_samout(
                            r, list(fs)[0], samoutfile,
                            template)

                if fs is not None and len(fs) > 0:
                    if multimapped_mode == 'none':
                        if len(fs) == 1:
                            counts[cb][ub][list(fs)[0]] += 1
                    elif multimapped_mode == 'all':
                        for fsi in list(fs):
                            counts[cb][ub][fsi] += 1
                    else:
                        sys.exit("Illegal multimap mode.")


            except UnknownChrom:
                write_to_samout(
                        r, "__no_feature", samoutfile,
                        template)
                counts[cb][ub]['__no_feature'] += 1

    except Exception:
        sys.stderr.write(
            "Error occurred when processing input (%s):\n" %
            (read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write(
            "%d %s processed.\n" %
            (i, "alignments" if not pe_mode else "alignment pairs"))
        sys.stderr.flush()

    if samoutfile is not None:
        samoutfile.close()

    # Get rid of UMI by majority rule
    cbs = sorted(counts.keys())
    counts_noumi = {}
    for cb in cbs:
        counts_cell = Counter()
        for ub, udic in counts.pop(cb).items():
            # In case of a tie, do not increment either feature
            top = udic.most_common(2)
            if (len(top) == 2) and (top[0][1] == top[1][1]):
                continue
            counts_cell[top[0][0]] += 1
        counts_noumi[cb] = counts_cell

    return {
        'cell_barcodes': cbs,
        'counts': counts_noumi,
        }
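The UMI collapse above keeps, for each (cell, UMI) pair, only the single most frequent feature and drops exact ties. A tiny worked illustration of that rule:

from collections import Counter

udic = Counter({'geneA': 5, 'geneB': 2})  # one UMI, reads split over two genes
top = udic.most_common(2)                 # [('geneA', 5), ('geneB', 2)]
if not (len(top) == 2 and top[0][1] == top[1][1]):
    print(top[0][0])                      # geneA gets one count for this UMI

tie = Counter({'geneA': 3, 'geneB': 3}).most_common(2)
# tie[0][1] == tie[1][1] -> the UMI contributes to neither gene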
Example n. 30
0
def main():
    arg_parser = argparse.ArgumentParser(description='Processes a BAM file into TSV.')
    arg_parser.add_argument("input_file",type=str, help='<input file>, can be a stream indicating "-"')
    arg_parser.add_argument("-id","--min_id",type=float, default=95.0, help='Minimal %% of identity to reference sequence to gather the read. (Default = 95.0)')
    arg_parser.add_argument("-len","--min_len",type=int, default=60, help='Minimal lenght of the read to be proccessed. (Default = 60)')
    arg_parser.add_argument("-clip","--max_clip",type=float, default=0.3, help='Max clipping allowed on the alignment. (Default = 0.30)')
    arg_parser.add_argument("--out_dir",type=str, default='./', help='Folder where to store the output files.')
    arg_parser.add_argument("--mode",type=str, default='paired', help='Alignment type of the input files. (paired or single)')
    arg_parser.add_argument("--dataset",type=str, help='Custom dataset name.')
    args = arg_parser.parse_args()

    if not args.input_file:
        sys.exit("No input file given. exiting...")
    elif args.input_file == '-':
        bam_file = HTSeq.SAM_Reader(sys.stdin)
        if args.dataset:
            dataset_id = args.dataset
        else:
            sys.exit("If using a stream you need to provide a name for the dataset.")
    else:
        import os
        bam_file = HTSeq.BAM_Reader(args.input_file)
        dataset_id = os.path.basename(args.input_file)
        dataset_id = dataset_id.split('.')[0]

    if args.min_id:
        min_id = float(args.min_id)
    if args.min_len:
        min_len = int(args.min_len)
    if args.max_clip:
        max_clip = float(args.max_clip)
    if args.out_dir:
        if args.out_dir != './':
            import os

            out_dir = str(args.out_dir) + '/'
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
        else:
            out_dir = str(args.out_dir)
    if args.mode in ('paired', 'single'):
        mode = str(args.mode)
    else:
        sys.exit("No valid alignment type.")

    '''DF containing the raw alignments'''
    df = bam_parser_2(bam_file, min_len=min_len, max_clip=max_clip, min_id=min_id, mode=mode)

    try:
        if len(df) > 0:
            if dataset_id == '':
                dataset_id = df.iloc[0]['QUERY'].split('.')[0]
    except Exception:
        if args.input_file != '-':
            sys.exit('Error: No alignments in input file. ' + args.input_file)
        raise

    amb_summary = None
    aligned_aln_list = list()
    amb_list = list()

    df2 = df.sort_values(by=['ALN','SCORE'], ascending=[1,0]).drop_duplicates('ALN')

    df2['MASTER_QUERY'] = df2['QUERY'].apply(get_read_name)
    gdf2 = df2.groupby('MASTER_QUERY')

    aligned_aln_list, amb_list = dupe_remover(gdf2)

    if len(aligned_aln_list) > 0:
        unique_df = pd.concat(aligned_aln_list)
    else:
        error_msg = "Error: No relevant alignments to process in " + args.input_file
        sys.exit(error_msg)
    '''If there are ambiguous reads it will write the FASTA and TSV files'''
    if len(amb_list) > 0:
        amb_df = pd.concat(amb_list)

        g_amb_df = amb_df.groupby('MASTER_QUERY')
        amb_df = g_amb_df.apply(amb_cluster)
        amb_df = amb_df.reset_index(level=0, drop=True)

        amb_df.columns = ['ALN','QUERY','REF','SEQ','LEN','ID','SCORE','CLIP_PCT','MASTER_QUERY','AMB_STR']

        '''Counts the ambiguous reads'''
        amb_count = len(amb_df.drop_duplicates('MASTER_QUERY'))
        amb_summary = 'ambiguous\t' + str(amb_count) + '\n'
        for ref in sorted(amb_df['REF'].unique()):
            amb_count = len(amb_df.loc[amb_df['REF'] == ref])
            amb_summary += ref + '-amb\t' + str(amb_count) + '\n'

        '''FASTA file writing of ambiguously aligned reads'''
        with open(out_dir + dataset_id + '.amb.fasta','w') as fh_amb:
            ambiguous_reads = amb_df.apply(lambda x: df_2_fasta(x), axis = 1).reset_index(drop=True)
            for ambiguous_read in ambiguous_reads:
                fh_amb.write(ambiguous_read)

        output_columns = ['MASTER_QUERY','REF','SCORE','ID','AMB_STR']
        amb_df = amb_df[output_columns]

        amb_df.rename(columns={'MASTER_QUERY': 'QUERY'}, inplace=True)

        amb_df.to_csv(out_dir + dataset_id + '.amb.tsv', sep='\t', header=False, index=False)

    '''FASTA file writing'''
    with open(out_dir + dataset_id + '.fasta','w') as fh_aligned:

        aligned_reads = unique_df.apply(lambda x: df_2_fasta(x), axis = 1).reset_index(drop=True)
        for read in aligned_reads:
            fh_aligned.write(read)

    '''tsv file writing'''
    output_columns = ['QUERY','REF','SCORE','ID']
    unique_df = unique_df[output_columns]

    unique_df.to_csv(out_dir + dataset_id + '.unique_counts.tsv', sep='\t', header=False, index=False)
    '''Counts file writing'''
    with open(out_dir + dataset_id + '.counts','w') as fh_aligned_counts:
        g_unique = unique_df.groupby('REF')

        for query in sorted(unique_df['REF'].unique()):

            query_count = len(unique_df.loc[unique_df['REF'] == query])
            query_string = query + '\t' + str(query_count) + '\n'

            fh_aligned_counts.write(query_string)

        if amb_summary:
            fh_aligned_counts.write(amb_summary)
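A conventional entry point, assumed rather than shown in the original snippet:

if __name__ == '__main__':
    # e.g.:  samtools view -h aln.bam | python bam_to_tsv.py - --dataset sample1
    # (the script name above is illustrative)
    main()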