예제 #1
0
def count_reads_paired(read_seq, counter, order, quiet, minaqual):
    """Classify paired-end alignments and hand them to ``counter``.

    Args:
        read_seq: iterable of SAM alignments; paired up here via HTSeq.
        counter: object providing ``not_aligned``, ``non_unique``,
            ``too_low_quality``, ``forward_count`` and ``reverse_count``.
        order: "name" or "pos"; selects the HTSeq pairing function.
        quiet: when true, nothing is written to stderr.
        minaqual: pairs with a mate whose ``aQual`` is below this are
            reported through ``counter.too_low_quality``.

    Raises:
        ValueError: if ``order`` is neither "name" nor "pos".
    """
    if order not in ("name", "pos"):
        raise ValueError("Illegal order specified.")
    if order == "name":
        pairs = HTSeq.pair_SAM_alignments(read_seq)
    else:
        pairs = HTSeq.pair_SAM_alignments_with_buffer(read_seq)

    def plain_ivs(aln):
        # Intervals of the non-empty match operations, strand untouched.
        return (op.ref_iv for op in aln.cigar
                if op.type == "M" and op.size > 0)

    def flipped_ivs(aln):
        # Same intervals, reported on the opposite strand.
        return (invert_strand(op.ref_iv) for op in aln.cigar
                if op.type == "M" and op.size > 0)

    processed = 0
    for pair in pairs:
        if processed > 0 and processed % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % processed)
        processed += 1

        first, second = pair[0], pair[1]
        first_usable = first is not None and first.aligned
        second_usable = second is not None and second.aligned

        if first_usable:
            forward_iv_seq = plain_ivs(first)
            reverse_iv_seq = flipped_ivs(first)
        else:
            forward_iv_seq = tuple()
            reverse_iv_seq = tuple()

        if second_usable:
            # The second mate is sequenced from the opposite strand, so its
            # orientation is swapped relative to the first mate.
            forward_iv_seq = itertools.chain(forward_iv_seq,
                                             flipped_ivs(second))
            reverse_iv_seq = itertools.chain(reverse_iv_seq,
                                             plain_ivs(second))
        elif not first_usable:
            counter.not_aligned(pair)
            continue

        try:
            multi_mapped = (
                (first is not None and first.optional_field("NH") > 1) or
                (second is not None and second.optional_field("NH") > 1))
        except KeyError:
            # No NH tag available: the pair is treated as uniquely aligned.
            multi_mapped = False
        if multi_mapped:
            counter.non_unique(pair)
            continue

        if (first and first.aQual < minaqual) or \
                (second and second.aQual < minaqual):
            counter.too_low_quality(pair)
            continue

        counter.forward_count(forward_iv_seq, pair)
        counter.reverse_count(reverse_iv_seq, pair)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % processed)
def count_reads_paired(read_seq, counter, order, stranded,
                       quiet, minaqual, write_to_samout):
    """Classify paired-end alignments and count them through ``counter``.

    Args:
        read_seq: iterable of SAM alignments; paired up here via HTSeq.
        counter: object whose ``notaligned``, ``nonunique`` and ``lowqual``
            attributes are incremented and whose ``count(iv_seq, pair)``
            method receives every accepted pair.
        order: "name" or "pos"; selects the HTSeq pairing function.
        stranded: when equal to "reverse" the strand orientation of both
            mates is swapped.
        quiet: when true, nothing is written to stderr.
        minaqual: pairs with a mate whose ``aQual`` is below this are
            rejected as low quality.
        write_to_samout: callable ``(pair, assignment)`` invoked for every
            rejected pair.

    Raises:
        ValueError: if ``order`` is neither "name" nor "pos".
    """
    if order == "name":
        read_seq = HTSeq.pair_SAM_alignments(read_seq)
    elif order == "pos":
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    else:
        raise ValueError("Illegal order specified.")

    def match_ivs(aln, invert):
        # Reference intervals of the non-empty CIGAR match operations.
        if invert:
            return (invert_strand(co.ref_iv) for co in aln.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in aln.cigar
                if co.type == "M" and co.size > 0)

    reverse_stranded = stranded == "reverse"

    i = 0
    for r in read_seq:
        # This function only ever handles pairs, so the messages say so
        # directly instead of consulting an undefined ``pe_mode`` flag.
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write(
                "%d SAM alignment record pairs processed.\n" % i)

        i += 1
        first_aligned = r[0] is not None and r[0].aligned
        second_aligned = r[1] is not None and r[1].aligned

        if first_aligned:
            iv_seq = match_ivs(r[0], reverse_stranded)
        else:
            iv_seq = tuple()

        if second_aligned:
            # The second mate comes from the opposite strand, hence the
            # inverted orientation relative to the first mate.
            iv_seq = itertools.chain(
                iv_seq, match_ivs(r[1], not reverse_stranded))
        elif not first_aligned:
            write_to_samout(r, "__not_aligned")
            counter.notaligned += 1
            continue

        try:
            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                    (r[1] is not None and r[1].optional_field("NH") > 1):
                counter.nonunique += 1
                write_to_samout(r, "__alignment_not_unique")
                continue
        except KeyError:
            # No NH tag: treat the pair as uniquely aligned.
            pass

        if (r[0] and r[0].aQual < minaqual) or \
                (r[1] and r[1].aQual < minaqual):
            # Tally on the counter like the other rejection categories; the
            # bare local ``lowqual`` was never initialised.
            counter.lowqual += 1
            write_to_samout(r, "__too_low_aQual")
            continue

        counter.count(iv_seq, r)

    if not quiet:
        sys.stderr.write("%d SAM alignment pairs processed.\n" % i)
def parse_fastx_sam_parallel(fastx_infile, sam_infile):
    """ Parse fastx and resulting sam file in parallel - generator yielding (name, seq, alignment_list) tuples.

    The sam file may contain multiple alignments per read.  Program checks that the readnames match
    (only the first whitespace-delimited token of each name is compared).

    Raises DeepseqError if one file runs out of records before the other, or if the read names
    at the same position in the two files differ.
    """
    fastx_generator = basic_seq_utilities.name_seq_generator_from_fasta_fastq(fastx_infile)
    sam_generator = iter(HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(sam_infile)))
    exhausted = object()    # private sentinel: cannot collide with a real record
    while True:
        # Always advance both sources, so that "one ended early" is detected whichever file is shorter.
        fastx_record = next(fastx_generator, exhausted)
        alns = next(sam_generator, exhausted)
        if_finished_fastx = fastx_record is exhausted
        if_finished_sam = alns is exhausted
        # if both finished, good, we're done
        if if_finished_fastx and if_finished_sam:
            # plain return: raising StopIteration inside a generator is a RuntimeError under PEP 479
            return
        # if one file was finished but the other wasn't, error!
        if if_finished_fastx or if_finished_sam:
            raise DeepseqError("Parsing seq/aln files in parallel - inconsistent finished states! "
                              +"(If finished: %s %s, %s %s)"%(fastx_infile, if_finished_fastx, sam_infile, if_finished_sam))
        # if all the files still contained data, yield it
        name, seq = fastx_record
        name = name.split()[0]
        name2 = alns[0].read.name.split()[0]
        if name2 != name:
            # arguments follow the "<name> in <file>" order of the message
            raise DeepseqError("Non-matching readnames between files! %s in %s, %s in %s"%(name, fastx_infile,
                                                                                           name2, sam_infile))
        yield (name, seq, alns)
예제 #4
0
def HTseq_count(bam_file, gtf_file, out_dir, identifier, parallel = True ):
    """Count uniquely aligned read pairs per annotated feature and print them.

    Args:
        bam_file: path handed to ``HTSeq.SAM_Reader``.
        gtf_file: path handed to ``HTSeq.GFF_Reader``.
        out_dir: not used by this function.
        identifier: GFF attribute key whose value names a feature
            (looked up as ``feature.attr[identifier]``).
        parallel: not used by this function.

    Every feature in the GTF file is used (no filtering on feature type).
    Bundles that do not contain exactly one pair are skipped.  Pairs with a
    missing or unaligned mate are tallied as "_unmapped"; the remaining
    pairs are tallied under their single overlapping feature, or as
    "_no_feature" / "_ambiguous".  Results are printed as
    "<key> <count>" lines.
    """
    features = HTSeq.GenomicArrayOfSets("auto", stranded=True)

    print("extracting features from gtf file")
    for feature in HTSeq.GFF_Reader(gtf_file):
        features[feature.iv] += feature.attr[identifier]

    counts = collections.Counter()
    almnt_file = HTSeq.SAM_Reader(bam_file)
    for bundle in HTSeq.pair_SAM_alignments(almnt_file, bundle=True):
        if len(bundle) != 1:
            continue  # Skip multiple alignments
        first_almnt, second_almnt = bundle[0]  # extract pair
        # Both mates are needed below; a missing mate (None) or an
        # unaligned one has no usable interval.
        if (first_almnt is None or second_almnt is None
                or not (first_almnt.aligned and second_almnt.aligned)):
            counts["_unmapped"] += 1
            continue
        gene_ids = set()
        for almnt in (first_almnt, second_almnt):
            for iv, val in features[almnt.iv].steps():
                gene_ids |= val
        if len(gene_ids) == 1:
            counts[next(iter(gene_ids))] += 1
        elif not gene_ids:
            counts["_no_feature"] += 1
        else:
            counts["_ambiguous"] += 1

    for gene_id in counts:
        print("%s %s" % (gene_id, counts[gene_id]))
예제 #5
0
파일: bamcount.py 프로젝트: orionzhou/robin
def bam_count(args):
    """Count read pairs per feature and print "<feature>\\t<count>" lines.

    ``args.fi`` is the alignment file handed to ``HTSeq.SAM_Reader``.

    Pairs carrying an NH tag greater than 1 on either mate are skipped as
    multiply aligned.  Pairs with a missing or unaligned mate are tallied
    as "_unmapped"; the others are tallied under their single overlapping
    feature, or as "_no_feature" / "_ambiguous".
    """
    bam = HTSeq.SAM_Reader(args.fi)
    # NOTE(review): ``exons`` has no visible definition in this function; the
    # former local assignment ``exons = htseq_read_gtf(args.fg)`` was
    # commented out.  Presumably it must exist at module level -- confirm.
    cnts = collections.Counter()
    # pair_SAM_alignments_with_buffer yields (mate1, mate2) tuples rather
    # than bundles, so each item is unpacked directly.
    for aln1, aln2 in HTSeq.pair_SAM_alignments_with_buffer(bam):
        try:
            if (aln1 is not None and aln1.optional_field("NH") > 1) or \
                    (aln2 is not None and aln2.optional_field("NH") > 1):
                continue
        except KeyError:
            # No NH tag: treat the pair as uniquely aligned.
            pass
        # Both mates are needed below; a missing (None) or unaligned mate
        # has no usable interval.
        if aln1 is None or aln2 is None or \
                not (aln1.aligned and aln2.aligned):
            cnts["_unmapped"] += 1
            continue
        gids = set()
        for aln in (aln1, aln2):
            for iv, val in exons[aln.iv].steps():
                gids |= val
        if len(gids) == 1:
            cnts[next(iter(gids))] += 1
        elif not gids:
            cnts["_no_feature"] += 1
        else:
            cnts["_ambiguous"] += 1
    for gid in cnts:
        print("%s\t%d" % (gid, cnts[gid]))
예제 #6
0
    def listFromCIGAR(cls, cigarstring,position_b0, refname, strand):
        """Build read-fragment objects from the match operations of a CIGAR string.

        The first "M" operation creates a fragment via ``cls(...)``.  A later
        "M" operation that directly follows an "M" + "D" or "M" + "I"
        sequence extends the most recent fragment; any other additional "M"
        is only reported in the debug log.

        For ``strand == MINUS`` the order of the CIGAR operations is reversed
        before parsing.

        Returns the list of fragments created.
        """
        read_parts = []
        if strand == MINUS: # need to reverse the CIGAR
            logger.debug("Reversing CIGAR for minus strand read fragment")
            cigarstring = "".join(reversed(re.findall(r"\d+[MIDNSHP=X]", cigarstring)))

        op_type_list = []
        for op in HTSeq.parse_cigar(cigarstring, position_b0, refname, strand):
            # list(...) so the values are logged on Python 3 too, rather than a map object
            logger.debug(list(map(str,(op, op.query_from, op.query_to, op.ref_iv))))
            if op.type == "M":
                fragment_args = (op.query_from, op.query_to, op.ref_iv.start,
                                 op.ref_iv.end, op.ref_iv.chrom, strand)
                if "M" not in op_type_list:
                    logger.debug(list(map(str,("appending:", op, op.query_from, op.query_to, op.ref_iv))))
                    read_parts.append(cls(*fragment_args))
                elif (len(op_type_list) >= 2 and op_type_list[-1] in ("D", "I")
                        and op_type_list[-2] == "M"):
                    # a match resuming after a deletion/insertion belongs to the same fragment
                    label = "extending (%s):" % op_type_list[-1]
                    logger.debug(list(map(str,(label, op, op.query_from, op.query_to, op.ref_iv))))
                    read_parts[-1].extend(*fragment_args)
                else:
                    logger.debug("CIGAR WARNING: Number of matches > 1: {0}".format(cigarstring))
            op_type_list.append(op.type)
        return read_parts
예제 #7
0
def set_up_IO(fileIN,fileOUT,gff,downstream,upstream):
    '''Create the readers and the output table used for alignment processing.

    Returns a tuple (bundled alignment iterator, GFF reader, output file).
    The output file is opened for writing and already holds a header line:
    "name" followed by one tab-separated column per offset in
    range(-upstream, downstream).  The file is returned open.
    '''
    # Alignments, grouped by HTSeq into bundles of multiple alignments
    bundled_alignments = HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(fileIN))

    # Annotation reader
    gff_reader = HTSeq.GFF_Reader(gff, end_included=True)

    # Output table with its header line
    table_handle = open(fileOUT, 'w')
    offsets = [str(position) for position in range(-upstream, downstream)]
    header = 'name\t{coord}\n'.format(coord='\t'.join(offsets))
    table_handle.write(header)
    return bundled_alignments, gff_reader, table_handle
예제 #8
0
파일: BP-1.py 프로젝트: ppflrs/scripts
def bam_parser_2(bam_file, min_len, max_clip, min_id, mode):
    """Parse alignments into a DataFrame with one row per accepted alignment.

    Args:
        bam_file: iterable of alignments.  In 'paired' mode it is handed to
            ``HTSeq.pair_SAM_alignments``; in 'single' mode it is iterated
            directly.
        min_len, max_clip, min_id: passed through to ``parser_aln_list``.
        mode: 'paired' or 'single'.  Any other value processes nothing.

    Returns:
        pandas.DataFrame with columns ALN, QUERY, REF, SEQ, LEN, ID, SCORE,
        CLIP_PCT.  It holds every non-None result of ``parser_aln_list``;
        within a pair, mate 1 comes before mate 2.
    """
    df_columns = ['ALN', 'QUERY', 'REF', 'SEQ', 'LEN', 'ID', 'SCORE', 'CLIP_PCT']
    rows = []

    def parse(query, aln_number, pair_pos):
        # Single place for the filter settings shared by both modes.
        return parser_aln_list(query, aln_number=aln_number, pair_pos=pair_pos,
                               min_len=min_len, max_clip=max_clip, min_id=min_id)

    if mode == 'paired':
        pairs = HTSeq.pair_SAM_alignments(bam_file)
        for query_counter, (query_1, query_2) in enumerate(pairs, 1):
            for pair_pos, query in enumerate((query_1, query_2), 1):
                row = parse(query, query_counter, pair_pos)
                # identity test: never relies on ``==`` of the parsed row
                if row is not None:
                    rows.append(row)
    elif mode == 'single':
        for query_counter, query in enumerate(bam_file, 1):
            row = parse(query, query_counter, 1)
            if row is not None:
                rows.append(row)

    return pd.DataFrame(rows, columns=df_columns)
def ungapped_pe_counter(sam_reader, feature_array):
    """Count read-pair bundles per feature.

    Each bundle (all pairs of one multiply-mapped read, as grouped by
    ``hts.pair_SAM_alignments(..., bundle=True)``) is classified by
    ``assess_bundle``.

    To evaluate the multiply mapped bundles, each pair in a bundle must
    still ALWAYS and ONLY map to a single feature.  Thus, every aligned pair
    has come from the same feature (gene), and the bundle counts as evidence
    of one read for this gene.

    If any of the read pairs maps to a different gene, or no gene, or
    multiple genes, then the bundle is considered ambiguous.

    If all pairs in a bundle map as _no_feature, _unmapped or _ambiguous,
    then the bundle counts as one count towards this feature type.

    Returns a ``collections.Counter`` keyed by feature / status name.
    """
    counts = collections.Counter()
    # bundle puts all multiply-mapped pairs together.
    pair_iterator = hts.pair_SAM_alignments(sam_reader, bundle=True)

    t0 = datetime.datetime.now()
    for ic, bundle in enumerate(pair_iterator):

        # report progress (to prove that it is still alive):
        if ic % 1000000 == 0:
            t1 = datetime.datetime.now()
            print("\r%d read bundles counted in %s\r" % (ic, t1 - t0))
            sys.stdout.flush()

        if bundle == []:  # first bundle for some reason is always an empty list
            continue

        bcounts = assess_bundle(bundle, feature_array)

        if len(bcounts) > 1:  # ie, is a multiply mapped feature with multiple gene mappings
            counts["_ambiguous"] += 1
        elif len(bcounts) == 0:  # uh oh! There is an error somewhere.
            print("#" * 40)
            print("Error! bundle was not assigned any status")
            print("Contents of bundle:")
            print(bundle)
        else:
            # exactly one key; next(iter(...)) also works where keys() is a view
            counts[next(iter(bcounts))] += 1

    return counts
예제 #10
0
 def searchGeneName(self,annotationstring):
     """Return the gene name(s) found in a collapsed annotation string.

     ``annotationstring`` is either '.' (no annotation, giving 'N/A') or a
     ','-joined list of GFF attribute strings (as collapsed by bedtools
     groupby).  For each entry the first available attribute among
     gene_name, gene, gene_id and transcript_id is used, else 'N/A'.
     Entries that cannot be parsed are delegated to
     ``self.searchGeneName1``.

     Returns the distinct names joined by ','; 'N/A' is dropped when at
     least one other name is present.
     """
     if annotationstring == '.':
         return 'N/A'

     # gene_name: ensembl gtf; gene: GFF; then progressively weaker fallbacks
     lookup_order = ('gene_name', 'gene', 'gene_id', 'transcript_id')
     collect = set()
     for annotation in annotationstring.split(','):
         try:
             attr = HTSeq.parse_GFF_attribute_string(annotation)
             gene = 'N/A'
             for key in lookup_order:
                 try:
                     gene = attr[key]
                 except KeyError:
                     continue
                 break
         except Exception:
             # Anything that fails to parse goes to the fallback parser;
             # KeyboardInterrupt / SystemExit are no longer swallowed.
             gene = self.searchGeneName1(annotation)
         collect.add(gene)

     # Collapse all genes together
     if len(collect) > 1:
         collect.discard('N/A')
     return ','.join(collect)
예제 #11
0
def _cigar_match_ivs(alignment, invert):
    """Yield the reference interval of each non-empty CIGAR "M" operation,
    strand-inverted when ``invert`` is true."""
    for co in alignment.cigar:
        if co.type == "M" and co.size > 0:
            yield invert_strand(co.ref_iv) if invert else co.ref_iv


def _overlapped_features(iv_seq, features, overlap_mode):
    """Return the feature set hit by ``iv_seq`` under ``overlap_mode``.

    ``overlap_mode`` must be "union", "intersection-strict" or
    "intersection-nonempty" (validated by the caller).  Returns a set, or
    None when an intersection mode saw no contributing step.  Raises
    UnknownChrom if an interval lies on a chromosome absent from
    ``features.chrom_vectors``.
    """
    union = overlap_mode == "union"
    strict = overlap_mode == "intersection-strict"
    fs = set() if union else None
    for iv in iv_seq:
        if iv.chrom not in features.chrom_vectors:
            raise UnknownChrom
        for iv2, fs2 in features[iv].steps():
            if union:
                fs.update(fs2)
            elif len(fs2) > 0 or strict:
                fs = fs2.copy() if fs is None else fs.intersection(fs2)
    return fs


def _write_strand_report(strand, sam_filename, directory,
                         lowqual, notaligned, nonunique):
    """Write the count table of one strand to a new file and print its
    summary lines."""
    name = strand["name"]
    title = name.capitalize()
    strand_counts = strand["counts"]
    output = brenninc_utils.create_new_file(sam_filename,
                                            "_%s_count" % name,
                                            outputdir=directory,
                                            extension="txt",
                                            gzipped=False)
    print("%s written to %s" % (title, output))
    with open(output, "w") as output_file:
        for fn in sorted(strand_counts.keys()):
            output_file.write("%s\t%d\n" % (fn, strand_counts[fn]))
        output_file.write("__no_feature\t%d\n" % strand["empty"])
        output_file.write("__ambiguous\t%d\n" % strand["ambiguous"])
        output_file.write("__too_low_aQual\t%d\n" % lowqual)
        output_file.write("__not_aligned\t%d\n" % notaligned)
        output_file.write("__alignment_not_unique\t%d\n" % nonunique)
    print("%s features with alignment\t%d" % (title, len(strand_counts)))
    print("%s alignments asigned to feature\t%d"
          % (title, sum(strand_counts.values())))
    print("__%s_no_feature\t%d" % (name, strand["empty"]))
    print("__%s_ambiguous\t%d" % (name, strand["ambiguous"]))


def count_reads(sam_filename, features, counts, samtype, order, forward,
                reverse, overlap_mode, quiet, minaqual, samout, directory):
    """Count alignments per feature for the forward and/or reverse strand.

    Args:
        sam_filename: input path, or "-" to read from stdin.
        features: indexable by interval (``features[iv].steps()``) and
            exposing ``chrom_vectors``.
        counts: mapping of feature name to count; a shallow copy is made
            per strand, the original is not modified.
        samtype: "sam", "bam", or None to call ``detect_sam_type``.
        order: "name" or "pos"; only used for paired-end input.
        forward, reverse: enable counting with the read orientation kept /
            inverted.  Each enabled strand gets its own output file.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: suppress progress messages on stderr.
        minaqual: alignments with ``aQual`` below this are skipped.
        samout: path for annotated SAM output, or "" for none.
        directory: output directory passed to
            ``brenninc_utils.create_new_file``.

    Raises:
        ValueError: unknown ``samtype`` or illegal ``order``.
        SystemExit: illegal ``overlap_mode``.
    """

    def write_to_samout(r, assignment):
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    samoutfile = open(samout, "w") if samout != "" else None

    # try/finally so the samout file is closed on the error paths as well.
    try:
        if samtype is None:
            samtype = detect_sam_type(sam_filename)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read: put the peeked record back in
                # front of the remaining stream.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except BaseException:
            sys.stderr.write("Error occured when reading beginning "
                             "of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")

            # One state record per enabled strand.  "invert" is the
            # orientation applied to a single-end read / the first mate.
            strands = []
            if forward:
                strands.append({"name": "forward", "invert": False,
                                "empty": 0, "ambiguous": 0,
                                "counts": copy.copy(counts)})
            if reverse:
                strands.append({"name": "reverse", "invert": True,
                                "empty": 0, "ambiguous": 0,
                                "counts": copy.copy(counts)})
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))

                i += 1
                # mates: (alignment, is_second_mate) for every aligned mate
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    mates = [(r, False)]
                else:
                    mates = []
                    if r[0] is not None and r[0].aligned:
                        mates.append((r[0], False))
                    if r[1] is not None and r[1].aligned:
                        mates.append((r[1], True))
                    if not mates:
                        write_to_samout(r, "__not_aligned")
                        notaligned += 1
                        continue
                    try:
                        if ((r[0] is not None and
                             r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if ((r[0] and r[0].aQual < minaqual) or
                            (r[1] and r[1].aQual < minaqual)):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                if overlap_mode not in ("union", "intersection-strict",
                                        "intersection-nonempty"):
                    sys.exit("Illegal overlap mode.")

                try:
                    feature_sets = []
                    for strand in strands:
                        # The second mate is always oriented opposite to
                        # the first one.
                        iv_seq = itertools.chain.from_iterable(
                            [_cigar_match_ivs(mate,
                                              strand["invert"] != is_second)
                             for mate, is_second in mates])
                        feature_sets.append(
                            _overlapped_features(iv_seq, features,
                                                 overlap_mode))
                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    # Only strands that are enabled have a counter.
                    for strand in strands:
                        strand["empty"] += 1
                    continue

                for strand, fs in zip(strands, feature_sets):
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature")
                        strand["empty"] += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" +
                                        '+'.join(fs) + "]")
                        strand["ambiguous"] += 1
                    else:
                        feature_name = next(iter(fs))
                        write_to_samout(r, feature_name)
                        strand["counts"][feature_name] += 1

        except BaseException:
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise
    finally:
        if samoutfile is not None:
            samoutfile.close()

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n" %
                         (i, "alignments "
                          if not pe_mode else "alignment pairs"))

    for strand in strands:
        _write_strand_report(strand, sam_filename, directory,
                             lowqual, notaligned, nonunique)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
예제 #12
0
def softclipping_realignment(mapq_cutoff, max_del_len, input, output,
                             ref_genome, gtf, splice_bin):
    """Re-evaluate alignments via ``detect_sv_from_cigar`` and write a
    sorted, indexed BAM file.

    Reads with ``mapq >= mapq_cutoff`` that are not secondary and carry no
    XA tag are passed to ``detect_sv_from_cigar``.  When it proposes a
    different CIGAR, the new CIGAR and position are adopted and the old
    ones are stored in an ``OA`` tag -- unless the new CIGAR contains a
    deletion that looks like a splice junction, in which case the original
    alignment is restored and tagged ``JM`` with the reason ('shift',
    'GTF', or the matching splice motif).

    Every read is written to ``<output>.temp.bam``, which is then sorted
    into ``output`` and indexed with samtools.  The temporary file is not
    removed.

    Args:
        mapq_cutoff: minimum mapping quality for a read to be examined.
        max_del_len: passed through to ``detect_sv_from_cigar``.
        input: path of the input BAM file.
        output: path of the sorted output BAM file.
        ref_genome: reference FASTA path.
        gtf: annotation path passed to ``extract_splice_sites``.
        splice_bin: passed through to ``extract_splice_sites``.

    Exits the process with status 1 when the reference, the GTF file or
    the input cannot be read, or when ``samtools sort`` fails.
    """
    temp_bam_path = '{}.temp.bam'.format(output)
    bwa_bam = pysam.AlignmentFile(input, 'rb')
    output_bam = pysam.AlignmentFile(temp_bam_path, 'wb', template=bwa_bam)

    # for RNAseq
    splice_motif = ['GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT']
    try:
        fastafile = pysam.Fastafile(ref_genome)
    except IOError as e:
        print('read reference genome ' + ref_genome + ' error!', e)
        sys.exit(1)
    try:
        cvg = extract_splice_sites(gtf, splice_bin)
    except IOError as e:
        print('read GTF file ' + gtf + ' error!', e)
        sys.exit(1)

    try:
        for read in bwa_bam.fetch(until_eof=True):
            if read.mapq >= mapq_cutoff and not read.is_secondary and not read.has_tag(
                    'XA'):
                chrm = bwa_bam.getrname(read.rname)
                newcigar, newpos = detect_sv_from_cigar(
                    chrm, read, mapq_cutoff, max_del_len)
                if newcigar != 'NA' and newcigar != read.cigar:
                    old_cigarstring, old_cigar, old_pos = read.cigarstring, read.cigar, read.pos
                    read.cigar, read.pos = newcigar, newpos
                    if 'D' in read.cigarstring:
                        strand = '-' if read.is_reverse else '+'
                        junc_start = read.blocks[0][1]
                        junc_end = read.blocks[1][0]
                        htpos1 = HTSeq.GenomicPosition(chrm, junc_start, '.')
                        htpos2 = HTSeq.GenomicPosition(chrm, junc_end, '.')

                        # The checks run in this order and stop at the first
                        # hit; the value becomes the JM tag.
                        junction_tag = None
                        if splice_checker(fastafile, chrm, junc_start,
                                          junc_end, strand):
                            junction_tag = 'shift'
                        elif cvg[htpos1] > 0 or cvg[htpos2] > 0:
                            junction_tag = 'GTF'
                        else:
                            m1 = fastafile.fetch(chrm, junc_start, junc_start + 2)
                            m2 = fastafile.fetch(chrm, junc_end - 2, junc_end)
                            motif = m1.upper() + m2.upper()
                            if motif in splice_motif:
                                junction_tag = motif

                        if junction_tag is not None:
                            # Looks like a splice junction: keep the
                            # original alignment.
                            read.cigar, read.pos = old_cigar, old_pos
                            read.setTag('JM', junction_tag)
                            output_bam.write(read)
                            continue
                    read.setTag('OA', str(old_pos + 1) + ',' + old_cigarstring)
            output_bam.write(read)
    except ValueError as e:
        print('Bam index file is not found!', e, file=sys.stderr)
        sys.exit(1)
    bwa_bam.close()
    output_bam.close()

    # Argument lists instead of shell strings: paths containing spaces or
    # shell metacharacters are passed to samtools verbatim.
    try:
        subprocess.check_call(
            ['samtools', 'sort', temp_bam_path, '-o', output],
            stdin=subprocess.PIPE)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing samtools executable, which the shell
        # previously reported as a non-zero exit status.
        print('Execution failed for samtools:', e, file=sys.stderr)
        sys.exit(1)

    subprocess.check_call(['samtools', 'index', output])
예제 #13
0
#!/usr/bin/python

import HTSeq as h
from collections import defaultdict

# Script body: opens two SAM files and groups the alignment pairs of the
# first one by read name.  Earlier input paths are kept commented out below.
#reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned_masked.sam")
#reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned.sam")

# NOTE(review): `reader` points into the "ERR188022_masked" directory while
# `reader_masked` points into "ERR188022" -- possibly swapped; confirm.
reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022_masked/Aligned.out.filtered.new.1017680.sam")
reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/Aligned.out.filtered.new1mb.sam")
# Pair iterators over both files; neither is consumed in the visible part of
# the script (the loop below builds its own iterator over `reader`).
it_p = iter(h.pair_SAM_alignments(reader))
it_p_m = iter(h.pair_SAM_alignments(reader_masked))

# Tallies for the comparison; none of them is updated in the visible part of
# the script.
same_aligned = 0
one_same_pos = 0
both_same_pos = 0
masked_more_pos = 0
simple_more_pos = 0
#cur_read = {}
#cur_m_read = {}
not_in_simple = 0
not_in_masked = 0

# Read name of the first mate -> list of (mate1, mate2) alignment pairs.
n_m = defaultdict(list)
i = 0
# NOTE(review): r1 is dereferenced without a None check; this assumes
# pair_SAM_alignments never yields a pair whose first mate is missing --
# confirm.
for r1, r2 in h.pair_SAM_alignments(reader):
	n_m[r1.read.name].append((r1,r2))
	i += 1
	# progress report every 10000 pairs
	if i%10000 == 0:
		print i, " lines"
#for k,v in n_m.items():
예제 #14
0
def _write_intron_counts(path, intron_set, SI_counts, kinds):
    """Write a tab-separated table with one row per intron, in sorted order.

    Each row holds the intron name followed by ``SI_counts[(kind, intron)]``
    for every entry of *kinds*, in the order given.
    """
    with open(path, 'w') as out:
        for intron in sorted(intron_set):
            counts = ['%d' % SI_counts[(kind, intron)] for kind in kinds]
            out.write('%s\t%s\n' % (intron, '\t'.join(counts)))


def pool(infile, targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss):
    """Count spliced and unspliced read pairs per intron and write the tables.

    Read pairs are taken from ``<infile>/<infile>.bam``. Only pairs whose
    first mate is aligned with ``aQual > 5`` and whose anchor position (start
    on '+', end otherwise) has no entry in *targets* are examined.

    :param infile: name used both as the directory and as the file stem of
        the BAM input and of every output file
    :param targets: genomic array indexed by position/interval; the values
        yielded by ``.steps()`` are sets of strings that are split on ';'
        (field 0 is collected as candidate gene, field 1 is used for ranking)
    :param intron_set: iterable of intron names; one output row per entry
    :param fiveSS: mapping keyed by ``(chrom, position, strand)`` whose
        values are indexed with ``[0]`` and ``[1]``
    :param threeSS: mapping with the same key and value layout as *fiveSS*
    :param Branches: genomic array indexed by position; the second mate's
        anchor is looked up in it for the ``lariat_int`` count
    :param Branchto3ss: like *Branches*, used for the ``branch_to3ss`` count

    Nothing is returned. Five files are written into the ``infile``
    directory: ``_splicing_counts``, ``_concordant_splicing_counts``,
    ``_lariat_int_counts``, ``_branch_to3ss_counts`` and
    ``_junction_counts`` (each with a ``.txt`` extension).
    """
    SI_counts = defaultdict(int)
    junction_counts = defaultdict(int)

    for f, s in HTSeq.pair_SAM_alignments_with_buffer(
            HTSeq.BAM_Reader('%s/%s.bam' % (infile, infile))):

        # Only confidently aligned first mates are considered.
        if f is None or not f.aligned or f.aQual <= 5:
            continue

        # Anchor of the first mate: its start on '+', its end otherwise.
        if f.iv.strand == '+':
            geneint = HTSeq.GenomicPosition(f.iv.chrom, f.iv.start,
                                            f.iv.strand)
        else:
            geneint = HTSeq.GenomicPosition(f.iv.chrom, f.iv.end,
                                            f.iv.strand)
        # Pairs whose anchor position already carries a target are skipped.
        if len(targets[geneint]) != 0:
            continue

        # The second mate may be missing (None); every use of it below goes
        # through this flag so a lone first mate no longer raises.
        mate_ok = (s is not None and s.aligned and s.proper_pair
                   and s.aQual > 5)

        introns = set()
        junctions = set()
        cigar = f.cigar
        last_index = len(cigar) - 1

        for i, cigop in enumerate(cigar):
            if cigop.type == 'M':
                for _, val in targets[cigop.ref_iv].steps():
                    introns |= val
                continue
            if cigop.type != 'N':
                continue
            # A junction needs a neighbour on both sides. Without this bounds
            # check a leading N would wrap around to the last operation and
            # a trailing N would raise IndexError.
            if not 0 < i < last_index:
                continue
            left, right = cigar[i - 1], cigar[i + 1]
            if not (left.type == 'M' and left.size > 3
                    and right.type == 'M' and right.size > 3):
                continue

            for _, val in targets[cigop.ref_iv].steps():
                junctions |= val

            chrom = cigop.ref_iv.chrom
            if cigop.ref_iv.strand == '+':
                first = cigop.ref_iv.end
                second = cigop.ref_iv.start + 1
                strand = '+'
            else:
                first = cigop.ref_iv.start + 1
                second = cigop.ref_iv.end
                strand = '-'
            five_key = (chrom, first, strand)
            three_key = (chrom, second, strand)

            if five_key in fiveSS and three_key in threeSS:
                up = fiveSS[five_key]
                down = threeSS[three_key]
                # Counted only when both sites share field 0; equal field 1
                # is labelled constitutive, anything else exon skipping.
                if up[0] == down[0]:
                    if up[1] == down[1]:
                        label = "Constituitive"
                    else:
                        label = "Exon Skipping"
                    junction_counts[(infile, up[0], int(up[1]),
                                     int(down[1]) + 1, label)] += 1
            # NOTE(review): the two branches below never look up `up`/`down`
            # for the current junction. They reuse whatever an earlier
            # junction left behind, or raise if none has been seen yet.
            # The intended key for a half-annotated junction is not evident
            # from this code, so the behaviour is kept as-is -- confirm.
            elif five_key in fiveSS:
                junction_counts[(infile, up[0], int(up[1]),
                                 int(down[1]) + 1,
                                 "Alternative 3'")] += 1
            elif three_key in threeSS:
                junction_counts[(infile, up[0], int(up[1]),
                                 int(down[1]) + 1,
                                 "Alternative 5'")] += 1

        # Pick one intron and one junction: the entry whose second
        # ';'-separated field is largest.
        # NOTE(review): the fields are compared as strings, so "10" sorts
        # below "9"; confirm whether a numeric comparison was intended.
        intron = ''
        junction = ''

        if introns:
            intron_num_pre = {name: name.split(';')[1] for name in introns}
            intron = max(intron_num_pre.items(), key=lambda x: x[1])[0]

        if junctions:
            intron_num_mat = {name: name.split(';')[1] for name in junctions}
            junction = max(intron_num_mat.items(), key=lambda x: x[1])[0]

        # The same entry seen both as intron and as junction is discarded.
        if junction == intron:
            intron = ''
            junction = ''

        # When both remain, keep only the one with the larger second field
        # (the intron wins ties).
        if junction and intron:
            if junction.split(';')[1] > intron.split(';')[1]:
                intron = ''
            else:
                junction = ''

        candidate_genes = {name.split(';')[0] for name in introns | junctions}

        # Splicing counts are recorded only when all hits agree on field 0.
        if len(candidate_genes) == 1:
            if junction:
                SI_counts[('mature', junction)] += 1
            if intron:
                SI_counts[('premature', intron)] += 1
            if f.proper_pair and mate_ok:
                if junction:
                    SI_counts[('concordant_mature', junction)] += 1
                if intron:
                    SI_counts[('concordant_premature', intron)] += 1

        # Count second mates whose anchor falls in a lariat-intermediate or
        # branch-to-3'SS window. `intron` is a string, so it is tested for
        # being non-empty rather than compared with 0.
        if intron and mate_ok:
            if s.iv.strand == '+':
                geneint = HTSeq.GenomicPosition(s.iv.chrom, s.iv.start,
                                                s.iv.strand)
            else:
                geneint = HTSeq.GenomicPosition(s.iv.chrom, s.iv.end,
                                                s.iv.strand)
            if intron in Branches[geneint] and len(Branches[geneint]) == 1:
                SI_counts[('lariat_int', intron)] += 1
            if intron in Branchto3ss[geneint] and len(
                    Branchto3ss[geneint]) == 1:
                SI_counts[('branch_to3ss', intron)] += 1

    _write_intron_counts('%s/%s_splicing_counts.txt' % (infile, infile),
                         intron_set, SI_counts, ('mature', 'premature'))
    _write_intron_counts(
        '%s/%s_concordant_splicing_counts.txt' % (infile, infile),
        intron_set, SI_counts,
        ('concordant_mature', 'concordant_premature'))
    _write_intron_counts('%s/%s_lariat_int_counts.txt' % (infile, infile),
                         intron_set, SI_counts, ('lariat_int',))
    _write_intron_counts('%s/%s_branch_to3ss_counts.txt' % (infile, infile),
                         intron_set, SI_counts, ('branch_to3ss',))

    with open('%s/%s_junction_counts.txt' % (infile, infile), 'w') as out:
        out.write('Gene\tUpstream\tDownstream\tType\tCount\n')
        for junc in sorted(junction_counts):
            out.write(
                '%s\t%d\t%d\t%s\t%d\n' %
                (junc[1], junc[2], junc[3], junc[4], junction_counts[junc]))
예제 #15
0
# Collect the known taxonomic ids: the first '|'-separated column of every
# line in nodes.dmp, with surrounding whitespace removed.
with open('nodes.dmp') as nodes:
    taxids = {row.split("|")[0].strip() for row in nodes}

# Walk the downloaded genomes and check the mmp accession of each protein
# FASTA (*.faa) sequence before mapping it to a lineage.
processed_mmp = set()
warnings = defaultdict(int)
for root, dirs, files in os.walk('genomes/'):
    for filename in files:
        if not filename.endswith(".faa"):
            continue
        counter = 1
        for seq in HTSeq.FastaReader(os.path.join(root, filename)):
            # The accession is the text after the last underscore of the name.
            mmp_id = seq.name.split("_")[-1]

            # Decide whether this accession must be rejected, and why.
            reason = None
            if mmp_id in processed_mmp:
                # Already processed: a duplicate accession.
                reason = 'duplicate'
            elif not mapdict[mmp_id]:
                # No taxonomic lineage recorded for this accession.
                reason = 'nolineage'
            elif mapdict[mmp_id] not in taxids:
                # The mapped taxonomic id is absent from nodes.dmp.
                reason = 'notax'

            if reason is not None:
                warnings[reason] += 1
                # Leaving the loop abandons the remaining sequences of this
                # file as well, not just the current one.
                break
예제 #16
0
파일: snp_parser.py 프로젝트: sadikmu/mgkit
def _allele_frequencies(info):
    """
    Return the allele frequencies stored in an unpacked INFO dictionary

    The `AF` entry is used when present. Otherwise the frequencies are
    computed as `AC / AN`, one per entry when `AC` is a list. A scalar
    numeric result is wrapped in a one-element list so callers can iterate.

    :param dict info: unpacked INFO fields of one VCF record
    :raises KeyError: if `AF` is absent and `AC` or `AN` is missing
    """
    try:
        allele_freqs = info['AF']
    except KeyError:
        # Samtools mpileup -> bcftools call doesn't output the allele freq.
        # it can be calculated with AC/AN for each ALT nucleotide
        # checked on bcftools (roh command) manual
        # https://samtools.github.io/bcftools/bcftools.html
        allele_counts = info['AC']
        # float() keeps the ratio fractional even where `/` applied to two
        # integers would truncate
        allele_number = float(info['AN'])
        if isinstance(allele_counts, list):
            return [count / allele_number for count in allele_counts]
        return [allele_counts / allele_number]

    if isinstance(allele_freqs, (int, float)):
        return [allele_freqs]
    return allele_freqs


def _samples_with_allele(sample_calls, allele_number):
    """
    Return the set of sample ids whose genotype contains *allele_number*

    :param dict sample_calls: per sample dictionaries with a `GT` entry
    :param int allele_number: 1-based index of the ALT allele
    """
    carriers = set()
    for sample_id, sample_info in sample_calls.items():
        # phased ('|') and unphased ('/') genotypes are split the same way
        genotypes = sample_info['GT'].replace('|', '/').split('/')
        for genotype in genotypes:
            # '.' marks a missing call
            if genotype != '.' and int(genotype) == allele_number:
                carriers.add(sample_id)
    return carriers


def parse_vcf(vcf_file,
              snp_data,
              min_reads,
              min_af,
              min_qual,
              annotations,
              seqs,
              options,
              line_num=100000):
    """
    Parse a VCF file and pass every accepted SNP to
    :func:`check_snp_in_set`

    Records are skipped when their sequence has no annotations, when the
    quality is below *min_qual*, when they are flagged as indels or when
    the depth (`DP`) is below *min_reads*. Each remaining ALT allele is
    skipped when its allele frequency is below *min_af*.

    :param file vcf_file: file handle to a VCF file
    :param dict snp_data: dictionary from :func:`init_count_set` with per
        sample SNPs information
    :param int min_reads: minimum number of reads to accept a SNP
    :param float min_af: minimum allele frequency to accept a SNP
    :param int min_qual: minimum quality (Phred score) to accept a SNP
    :param dict annotations: annotations grouped by their reference sequence
    :param dict seqs: reference sequences
    :param options: object whose `bcftools_vcf` attribute selects how the
        samples carrying a SNP are found: from the per sample genotypes if
        true, from the '-' separated `set` INFO field otherwise
    :param int line_num: the interval in number of lines at which progress
        will be printed
    """
    vcf_handle = HTSeq.VCF_Reader(compressed_handle(vcf_file))

    vcf_handle.parse_meta()
    vcf_handle.make_info_dict()

    # total number of SNPs accepted
    count_tot = 0
    # number of SNPs skipped for low depth
    skip_dp = 0
    # number of SNPs skipped for low allele frequency
    skip_af = 0
    # number of SNPs skipped for low quality
    skip_qual = 0
    # indels
    skip_indels = 0

    for vcf_record in vcf_handle:
        # the SNP is a sequence with no annotations
        if vcf_record.chrom not in annotations:
            continue

        if float(vcf_record.qual) < min_qual:
            # low quality SNP
            skip_qual += 1
            continue

        # unpack info records (needed for vcf_record.info to be a dictionary)
        vcf_record.unpack_info(vcf_handle.infodict)

        # a record with no INDEL entry at all is treated as not an indel
        if vcf_record.info.get('INDEL'):
            skip_indels += 1
            continue

        depth = vcf_record.info['DP']
        if not isinstance(depth, int):
            # unexpected depth type: report it, the comparison is still made
            LOG.warning(depth)

        if depth < min_reads:
            # not enough reads (depth) for the SNP
            skip_dp += 1
            continue

        allele_freqs = _allele_frequencies(vcf_record.info)

        # alt is the nucleotidic change
        iter_data = zip(allele_freqs, vcf_record.alt)
        for alt_index, (allele_freq, change) in enumerate(iter_data):
            if allele_freq < min_af:
                # the allele frequency for the SNP is too low, it'll be
                # skipped
                skip_af += 1
                continue

            if options.bcftools_vcf:
                # genotype numbers are 1-based for the ALT alleles
                samples = _samples_with_allele(vcf_record.samples,
                                               alt_index + 1)
            else:
                # the samples that contain the SNP is a string separated
                # by '-'
                samples = vcf_record.info['set'].split('-')
            check_snp_in_set(samples, snp_data, vcf_record.pos.start, change,
                             annotations[vcf_record.chrom],
                             seqs[vcf_record.chrom])
            # increase the total number of snps available
            count_tot += 1

        # only reached for records that passed every record-level filter
        if vcf_handle.line_no % line_num == 0:
            LOG.info(
                "Line %d, SNPs passed %d; skipped for: qual %d, "
                "depth %d, freq %d, indels %d", vcf_handle.line_no, count_tot,
                skip_qual, skip_dp, skip_af, skip_indels)
예제 #17
0
def count_reads_in_features( sam_filename, gff_filename, stranded, overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, \
							 filename_read_names_gene_names,filename_read_names_gene_names_amb_unique):
   """
	Main function to count reads in features i.e. genes. 
	
	Input:
		+ sam_filename: Input alignment with all the ambiguously mapped reads
		+ gff_filename: GTF containing all genes for a given species
		+ stranded: specify whether data are stranded - see -s option
		+ overlap_mode: mode to handle reads overlapping more than one feature (e.g. union) - 
		  See -m option: choices = ( "union", "intersection-strict", "intersection-nonempty")
		+ feature_type: see -t option
		+ id_attribute: see -i option
		+ quiet: see -q option
		+ minaqual: see -a option 
		+ samout: SAM output file storing disambiguated reads (see -o option).
		+ filename_read_names_gene_names: filename for the output file containing the mappings readName to geneNames for multimapped reads
		+ filename_read_names_gene_names_amb_unique: filename for the output file containing the mappings readName to geneNames for ambiguously mapped reads
      
	Output:
		+ Writes readName to geneName outputs.
		+ Writes SAM output file for ddisambiguated uniquely mapped reads.
		+ Writes to stdout the genes and their read counts with read count for distinct read type: non-ambiguous unique, multimapped and ambiguous unique. 
		  This output redirected and stored to an output file in main peakRescue pipeline. 
		  This output is used in the later stage of the peakRescue pipeline to rescue the reads present in the readName to genNames mappings.
	
   """
   # Output filhandles for readName to geneNames mappings
   fh_read_names_gene_names = open(filename_read_names_gene_names, 'w')
   fh_read_names_gene_names_amb_unique = open(filename_read_names_gene_names_amb_unique, 'w')
   
   def write_to_samout( r, assignment ):
      if samoutfile is None:
         return
      if not pe_mode:
         r = (r,)
      for read in r:
         if read is not None:
            samoutfile.write( read.original_sam_line.rstrip() + 
               "\tXF:Z:" + assignment + "\n" )
   if quiet:
      warnings.filterwarnings( action="ignore", module="HTSeq" ) 
      
   if samout != "":
      samoutfile = open( samout, "w" )
   else:
      samoutfile = None
      
   features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )     
   ## Hash table to store unique reads per exon (if modified GTF)
   counts = {}
   ## Hash table to store original non unique reads per gene (without 
   dict_nonunique = {}
   ## Hash table to store all unique reads as per original GTF
   dict_gene_unique_counts = {}
   ## hast table to store ambigouous read count for unique reads...
   dict_gene_unique_counts_ambiguous = {}
   ## Hash table to store all non-unique reads including shared reads 
   ## (either split reads or read pair matching on two distinct exons, same gene)
   dict_gene_nonunique_counts = {}
   ## Hash to store the non-unique read-names as key and genes names as values (fragments)
   dict_read_name_genes_names = {}
   ## Hash to store the non-unique read-names as key and genes names as values (fragments) including instances of a given multimapped read on same gene
   dict_read_name_genes_names_final = {} 
   dict_read_name_genes_names_ambiguous = {}
   ## @todo: tag_gff - parameter to be removed - only deal with gene level information 
   ## tag_gff: type to specify whether it contains gene or exons information 
   tag_gff = "gene_gff" 
   # Try to open samfile and fail early in case it is not there
   if sam_filename != "-":
      open( sam_filename ).close() 
      
   gff = HTSeq.GFF_Reader( gff_filename )   
   exons = HTSeq.GenomicArrayOfSets( "auto", stranded=False )
   
   i = 0
   try:
      for f in gff:
         if f.type == feature_type:
	    exons[ f.iv ] += f # added to get exon interval data
            try:
               feature_id = f.attr[ id_attribute ]
            except KeyError:
               sys.exit( "Feature %s does not contain a '%s' attribute" % 
                  ( f.name, id_attribute ) )
            if stranded != "no" and f.iv.strand == ".":
               sys.exit( "Feature %s at %s does not have strand information but you are "
                  "running htseq-count in stranded mode. Use '--stranded=no'." % 
                  ( f.name, f.iv ) )
            features[ f.iv ] += feature_id
            counts[ f.attr[ id_attribute ] ] = 0
	    # -- Initialisation 
	    feature_name = f.attr[ id_attribute ]
	    # -- Added tag_gff for GFF type
	    if tag_gff == "gene_gff":
		# Original GTF (genes) 
		dict_nonunique = initialise_counts_per_feature(dict_nonunique, feature_name)
		dict_gene_unique_counts = initialise_counts_per_feature(dict_gene_unique_counts, feature_name)
		dict_gene_nonunique_counts = initialise_counts_per_feature(dict_gene_nonunique_counts, feature_name)
		dict_gene_unique_counts_ambiguous = initialise_counts_per_feature(dict_gene_unique_counts_ambiguous, feature_name)
         i += 1
         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d GFF lines processed.\n" % i )
   except:
      sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
      raise
      
   if not quiet:
      sys.stderr.write( "%d GFF lines processed.\n" % i )
      
   if len( counts ) == 0 and not quiet:
      sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )
   
   try:
      if sam_filename != "-":
         read_seq = HTSeq.SAM_Reader( sam_filename )
         first_read = iter(read_seq).next()
      else:
         read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
         first_read = read_seq.next()
         read_seq = itertools.chain( [ first_read ], read_seq )
      pe_mode = first_read.paired_end
      #pe_mode = 1 ## Added by us
   except:
      sys.stderr.write( "Error occured when reading first line of sam file.\n" )
      raise

   ###################################################################################################   
   try:
      if pe_mode:
         read_seq_pe_file = read_seq
         read_seq = HTSeq.pair_SAM_alignments( read_seq )
      empty = 0
      ambiguous = 0
      ambiguous_tag=0
      notaligned = 0
      lowqual = 0
      nonunique = 0
      nonunique_nonamb_to_be_rescued = 0
      temp_read_name="NA"
      previous_read_name="NA"
      temp_interval_r0="NA"
      temp_interval_r1="NA"
      counter_fragment = 0	
      flag_result = 0
      i = 0   
      pe_mode_for_SE = 0
      ## -- Added pe_mode on for SE files so that multireads reads will be accounted for
      if not pe_mode: # real SE
      	pe_mode_for_SE = 1 #
      	read_seq_pe_file = read_seq
      	pe_mode=1
      ## -- End
      index_fragment = 0
      for r in read_seq:
         prev_index_fragment = index_fragment
	 tag_nonunique_NH = 0
	 tag_overlapping_genes = 0
	 flag_aln_not_unique = 0 #
	 flag_ambiguous = 0 #
	 #-- LOOP OVER ALL READS IN INPUT BAM FILE
	 if pe_mode_for_SE:
	 	r = (r, None)
      	 counter_fragment += 1	
         i += 1
         if not pe_mode:
	    # -- SINGLE_END mode
            if not r.aligned:
               notaligned += 1
               #write_to_samout( r, "not_aligned" )
               continue
            try:
               if r.optional_field( "NH" ) > 1:
		  # --- Rescue multimappers in singel-end mode
                  #write_to_samout( r, "alignment_not_unique" )
                  #nonunique += 1
                  continue
            except KeyError:
               pass
            if r.aQual < minaqual:
               lowqual += 1
               #write_to_samout( r, "too_low_aQual" )
               continue
            if stranded != "reverse":
               iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
            else:
               iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )            
         else:
	    # -- PAIRED-END
            if r[0] is not None and r[0].aligned:
               if stranded != "reverse":
                  iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
               else:
                  iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
            else:
               iv_seq = tuple()
            if r[1] is not None and r[1].aligned:            
               if stranded != "reverse":
                  iv_seq = itertools.chain( iv_seq, 
                     ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
               else:
                  iv_seq = itertools.chain( iv_seq, 
                     ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
            else:
               if ( r[0] is None ) or not ( r[0].aligned ):
                  #write_to_samout( r, "not_aligned" )
                  notaligned += 1
                  continue         
            try:
               if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                     ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
	       	  tag_nonunique_NH = 1
               	  if ( r[0] is not None and r[1] is None ):
			result, fs_genes, fs_exons,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[0], features,dict_read_name_genes_names,ambiguous_tag, exons)
			if result:
				flag_result = 1
				(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
												temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
			else:
				if len(fs_genes) != 0:
					(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
													temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
               	  if ( r[0] is None and r[1] is not None ):
			result, fs_genes, fs_exons,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[1], features,dict_read_name_genes_names,ambiguous_tag,exons)
			if result:
				flag_result = 1
				(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
												temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
			else:
				if len(fs_genes) != 0:
					(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
													temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
               	  if ( r[0] is not None and r[1] is not None ):
			result1, fs_genes1, fs_exons1,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[0], features,dict_read_name_genes_names,ambiguous_tag,exons)
			result2, fs_genes2, fs_exons2,dict_read_name_genes_names,ambiguous_tag = is_read_in_gene_interval(r[1], features,dict_read_name_genes_names,ambiguous_tag,exons)

		        if len(fs_genes1.intersection(fs_genes2)) > 0:
				fs_genes = fs_genes1.intersection(fs_genes2)
		        elif len(fs_genes1.intersection(fs_genes2))==0:
				fs_genes = fs_genes1.union(fs_genes2)

			if result1 and not result2:
				flag_result = 1
				(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[0], \
												temp_interval_r0, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
			elif result2 and not result1:
				flag_result = 1
				(dict_nonunique, flag_aln_not_unique, dict_read_name_genes_names) = _add_non_unique_counts_per_feature_based_on_read_interval_and_readname(r[1], \
												temp_interval_r1, temp_read_name, fs_genes, dict_nonunique, dict_read_name_genes_names)
			else:
				if len(fs_genes1) != 0 or len(fs_genes2) != 0:
					flag_result = 1
					if ( ( ((temp_interval_r0 != str(r[0].iv)) or (temp_interval_r1 != str(r[1].iv))) or (temp_read_name != r[0].read.name) ) ):
						(dict_nonunique)= add_non_unique_counts_per_feature(fs_genes, dict_nonunique)
						dict_read_name_genes_names = _populate_read_name_gene_name(dict_read_name_genes_names, fs_genes, r[0].read.name, tag_report_instances_same_multiread_on_same_gene)
						flag_aln_not_unique = 1
                  #write_to_samout( r, "alignment_not_unique" )
	          nonunique += 1

		  if flag_result:
			
			  if r[0] is not None and r[1] is None:		
				non_uniq_read_name = r[0].read.name
			  elif r[0] is None and r[1] is not None:		
				non_uniq_read_name = r[1].read.name
			  elif r[0] is not None and r[1] is not None:		
				non_uniq_read_name= r[0].read.name
			  non_uniq_read_name2 = dict_read_name_genes_names.keys()[0]
			  if flag_aln_not_unique:
				nonunique_nonamb_to_be_rescued += 1
	          	  # -- Re-initialise hash
			  # previous_read_name: read which falls into at least one gene interval
			  # tmp_read_name: the previous read in the bam file
			  # BAM is sorted by read name hence each multimapper will be arranged one after another 
			  if previous_read_name == "NA":
				previous_read_name = non_uniq_read_name

		  	  if non_uniq_read_name != previous_read_name:
				if previous_read_name in dict_read_name_genes_names.keys():
					fs_genes_names = dict_read_name_genes_names[previous_read_name]
					fh_read_names_gene_names.write("%s\t%s\n" % (previous_read_name, "\t".join(list(fs_genes_names)) ))
				previous_read_name = non_uniq_read_name
				tmp_dict = {}
				if non_uniq_read_name in dict_read_name_genes_names.keys():
					#print "non_uniq_read_name IN dict_read_name_genes_names.keys()"
					tmp_dict[non_uniq_read_name] = dict_read_name_genes_names[non_uniq_read_name]
				dict_read_name_genes_names.clear() # only one read stored
				dict_read_name_genes_names = tmp_dict	

		  flag_result = 0
		  flag_aln_not_unique = 0 #
		  (temp_read_name, temp_interval_r0, temp_interval_r1) = initalize_read_name_and_interval(r[0], r[1]) 
		  continue
            # except KeyError:
            except KeyError:
               pass
            if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
               lowqual += 1
               #write_to_samout( r, "too_low_aQual" )
               continue         
          
         try:
	    # --
            if overlap_mode == "union":
               fs = set()
               for iv in iv_seq: # interval from bam file for each fragment
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
		     	#if debug:
				#print "****Unique_feature %s and feature_interval %s" %(fs2,iv2)	
		        fs = fs.union( fs2 )
			
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
               fs = None
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                           fs = fs2.copy()
                        else:
                           fs = fs.intersection( fs2 )
            else:
               sys.exit( "Illegal overlap mode." )

	    fs_genes = fs
            if fs_genes is None or len( fs_genes ) == 0:
               #write_to_samout( r, "no_feature" )
               empty += 1
		# ambiguous read count and/or one of the read pair mapping on different gene (potential gene fusion events)...
		# elif len( fs ) > 1:
            elif len( fs_genes ) > 1:
	       ###############################################################
	       ## AMBIGUOUS UNIQUE
	       ###############################################################
	       is_disambiguated = 0
	       if not tag_nonunique_NH:
                  if ( r[0] is not None and r[1] is None ):
			       result, fs_genes, fs_exons,dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
			       if result:
		       			(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
	       				is_disambiguated = 1
			       if ambiguous_tag:
			       		(dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
					flag_ambiguous = 1
					# write in the file ambiguous read name gene name data...
					fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) ))
                  if ( r[0] is None and r[1] is not None ):
			       result, fs_genes, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag = is_read_in_gene_interval(r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
			       if result:
		       			(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
	       				is_disambiguated = 1
			       if ambiguous_tag:
			       		(dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
					flag_ambiguous = 1
					fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[1].read.name, "\t".join(list(fs_genes)) ))
                  if ( r[0] is not None and r[1] is not None ):
			       result1, fs_genes1, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag1 = is_read_in_gene_interval(r[0], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
			       result2, fs_genes2, fs_exons, dict_read_name_genes_names_ambiguous, ambiguous_tag2 = is_read_in_gene_interval(r[1], features, dict_read_name_genes_names_ambiguous, ambiguous_tag,exons)
			       if debug:
			       		print "IN UNIQUE DISAMBIGUATION -->r[0].read.name=%s\t%s\t%s\t%s\t%s\n" % (r[0].read.name,result1, result2, fs_genes1, fs_genes2)
			       if len(fs_genes1.intersection(fs_genes2))==1:
					fs_genes = fs_genes1.intersection(fs_genes2)
					(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
	       				is_disambiguated = 1
			       elif len(fs_genes1.intersection(fs_genes2)) > 1:
					fs_genes = fs_genes1.intersection(fs_genes2)
					(dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
					flag_ambiguous = 1
					fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) ))
			       elif len(fs_genes1.intersection(fs_genes2))==0:
					fs_genes = fs_genes1.union(fs_genes2)
					if (fs_genes1 == set([]) or fs_genes2 == set([])) and len(fs_genes) == 1: 					
						## Disambiguate the uniquely mapped to the single gene it maps on
						(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
	       					is_disambiguated = 1
					elif (fs_genes1 != set([]) or fs_genes2 != set([])):
						## Add fragment to the RN-GN for ambiguous uniquely mapped based on 
						## union of both fs_genes (fs_genes1 & fs_genes2) > 1
						(dict_gene_unique_counts_ambiguous) = add_unique_counts_per_feature_ambiguous(fs_genes, dict_gene_unique_counts_ambiguous)
						flag_ambiguous = 1
						fh_read_names_gene_names_amb_unique.write("%s\t%s\n" % (r[0].read.name, "\t".join(list(fs_genes)) ))

	       if flag_ambiguous:
			ambiguous += 1
			#write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
               if is_disambiguated:
			write_to_samout( r, list(fs_genes)[0] )
            else:
	       if debug:
		       #print "DEBUG::CR:: len(fs) <-> 1:: fs = %s" %fs
			pass
               write_to_samout( r, list(fs)[0] )

               rr2 = r[0] if r[0] is not None else r[1]

	       if not tag_nonunique_NH:
			(dict_gene_unique_counts) = add_unique_counts_per_feature(dict_gene_unique_counts, fs_genes)
			
         except UnknownChrom:
            if not pe_mode:
               rr = r 
            else: 
               rr = r[0] if r[0] is not None else r[1]
            if not quiet:
               sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " +
                  "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % 
                  ( rr.read.name, iv.chrom ) )

         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

	 flag_ambiguous = 0 ## re-initialise....
	 index_fragment += 1
      #########################
      # This is to store the last read/fragment since it will no pass in previous condition:
      # => if non_uniq_read_name != previous_read_name:
      # -- At same level as the for loop (outside of the for loop) - column: 7
      #fh_read_names_gene_names.close()
      if dict_read_name_genes_names.keys() != []:
	#print "dict_read_name_genes_names passing"
	non_uniq_read_name = dict_read_name_genes_names.keys()[0]
	fs_genes_names = dict_read_name_genes_names[non_uniq_read_name]
	fh_read_names_gene_names.write("%s\t%s\n" % (non_uniq_read_name, "\t".join(list(fs_genes_names)) ))
      # -- 
      fh_read_names_gene_names.close() 
      fh_read_names_gene_names_amb_unique.close()
   ###################################################################################################   
   #except UnboundLocalError:
   except AttributeError:
   #except:
      if not pe_mode:
         sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
      else:
         sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
      raise

   if not quiet:
      sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )
         
   if samoutfile is not None:
      samoutfile.close()

   if tag_gff == "gene_gff":
	   tuples_genenames_exontag = [(fn, fn) for fn in dict_gene_unique_counts.keys()]
   tuples_genenames_exontag.sort()

   previous_gene_name = "NA"

   for gene_name, fn in tuples_genenames_exontag:
	gene_name = gene_name.strip()
	fn = fn.strip()
	
   	if tag_gff == "gene_gff": #
		if gene_name in dict_gene_unique_counts.keys():
			print "%s\t%i\t%i\t%s" % ( fn, dict_gene_unique_counts[gene_name], dict_nonunique[gene_name],dict_gene_unique_counts_ambiguous[gene_name] )
		else:
			# -- No non-unique reads for that gene_name
			print "%s\t%i\t%i\t%i" % ( fn, dict_gene_unique_counts[gene_name], 0,dict_gene_unique_counts_ambiguous[gene_name] )
		
	# -- Re-initialise gene name
	previous_gene_name = gene_name
			
   print "no_feature\t%d" % empty
   print "ambiguous\t%d" % ambiguous
   print "too_low_aQual\t%d" % lowqual
   print "not_aligned\t%d" % notaligned
   print "alignment_not_unique\t%d" % nonunique
   print "nonunique_nonamb_to_be_rescued:\t%d"  % nonunique_nonamb_to_be_rescued
def sciRNA_count_parallel(gtf_file, input_folder, sample_ID, core_number):
    """Build annotation tables from a GTF file and count all samples in parallel.

    Three annotation files are written into ``input_folder``
    (``gene_name_annotate.txt``, ``cell_annotate.txt`` and
    ``report_annotate.txt``).  Stranded exon and gene interval arrays and a
    per-gene set of transcript ends are built from the GTF, and
    ``sciRNAseq_count`` is then mapped over every sample ID with a process
    pool.

    Args:
        gtf_file: Path of the GTF annotation, read with ``HTSeq.GFF_Reader``.
            ``gene`` records must carry ``gene_id``, ``gene_biotype`` and
            ``gene_name`` attributes.
        input_folder: Folder that receives the annotation files; it is also
            forwarded to ``sciRNAseq_count``.
        sample_ID: Path of a header-less CSV file whose first column lists the
            sample IDs.
        core_number: Number of worker processes (converted with ``int``).
    """
    # read in the gtf file, and then construct the genome interval for exons,
    # genes, and gene end dictionary
    gtf_file = HTSeq.GFF_Reader(gtf_file, end_included=True)
    gene_annotat_file = input_folder + "/gene_name_annotate.txt"
    cell_annotat_file = input_folder + "/cell_annotate.txt"
    report_annotate_file = input_folder + "/report_annotate.txt"

    exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    genes = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    gene_end = {}
    exon_n = 0
    gene_n = 0
    transcript_n = 0
    gene_count = 0
    print("Start generating exon genomic arrays....")
    print("Start generating gene genomic arrays....")
    print("Start calculating transcript end of genes....")

    # The gene annotation file must be closed (flushed) before pandas reads it
    # back below; the ``with`` block also closes it if GTF parsing fails.
    with open(gene_annotat_file, "w") as gene_annotat_out:
        for feature in gtf_file:
            if feature.type == "exon":
                exon_n += 1
                exons[feature.iv] += feature.attr["gene_id"]
            elif feature.type == "gene":
                gene_n += 1
                genes[feature.iv] += feature.attr["gene_id"]

                # Every gene gets two consecutive rows/indices: one for its
                # exonic counts and one for its intronic counts.
                # for human and mouse gtf file
                gene_count += 1
                gene_annotat_out.write(",".join([
                    feature.attr["gene_id"],
                    feature.attr["gene_biotype"],
                    "exon",
                    feature.attr["gene_name"],
                    str(gene_count),
                ]) + "\n")

                # for human and mouse gtf file
                gene_count += 1
                gene_annotat_out.write(",".join([
                    feature.attr["gene_id"] + "_intron",
                    feature.attr["gene_biotype"],
                    "intron",
                    feature.attr["gene_name"] + "_intron",
                    str(gene_count),
                ]) + "\n")
            elif feature.type == "transcript":
                transcript_n += 1
                # Collect the directional end of every transcript of a gene.
                gene_end.setdefault(feature.attr["gene_id"],
                                    set()).add(feature.iv.end_d)

    print("Detected gene number: ", gene_n)
    print("Detected transcript number: ", transcript_n)
    print("Detected exon number: ", exon_n)

    # Re-load the table just written, indexed by its first column (gene id).
    gene_annotat = pd.read_csv(gene_annotat_file, header=None)
    gene_annotat.index = gene_annotat[0]

    sample_ID = list(pd.read_csv(sample_ID, header=None)[0])

    # generate the cell ID annotate file: "<sample>,<1-based index>"
    with open(cell_annotat_file, "w") as cell_annotat_out:
        for cell_count, sample in enumerate(sample_ID, 1):
            cell_annotat_out.write(sample + "," + str(cell_count) + "\n")

    # Generate the report annotate file
    report_lines = (
        "1, Perfect intersect exon match\n",
        "2, Nearest intersect exon match\n",
        "3, Perfect combine exon match\n",
        "4, Nearest combine exon match\n",
        "5, Perfect intersect gene match\n",
        "6, Nearest intersect gene match\n",
        "7, Perfect combine gene match\n",
        "8, Nearest combine gene match\n",
        "9, ambiguous match for exons\n",
        "10, ambiguous match for genes\n",
        "11, No match\n",
    )
    with open(report_annotate_file, "w") as report_out:
        for line in report_lines:
            report_out.write(line)

    # Run the per-sample counting in parallel.
    func = partial(sciRNAseq_count,
                   input_folder=input_folder,
                   exons=exons,
                   genes=genes,
                   gene_end=gene_end,
                   gene_annotat=gene_annotat,
                   sample_ID=sample_ID)
    pool = Pool(processes=int(core_number))
    try:
        pool.map(func, sample_ID)
    finally:
        # Always release the workers, even when a sample fails.
        pool.close()
        pool.join()

    print("All analysis done~")
예제 #19
0
def count_reads_in_features(sam_filename, gff_filename, samtype, order, overlap_mode,
    feature_type, id_attribute, quiet, minaqual, mapping_file, scale_method):
    """Count aligned reads per GFF feature and print the resulting abundances.

    Features of type ``feature_type`` are loaded from the GFF file into an
    unstranded interval array.  Every read (or read pair) of the SAM/BAM input
    is then assigned to the feature(s) it overlaps according to
    ``overlap_mode``.  If a mapping file is given, per-feature counts are
    aggregated into higher-order categories.  The abundance table is printed
    to stdout; the summary counters are written to stderr.

    Args:
        sam_filename: Path of the alignment file, or "-" to read from stdin.
        gff_filename: Path of the GFF file, read with ``HTSeq.GFF_Reader``.
        samtype: "sam" or "bam".
        order: "name" or "position"; only consulted for paired-end input.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: GFF feature type to count; other types are ignored.
        id_attribute: GFF attribute used as feature ID.  Features lacking it
            are skipped silently.
        quiet: Suppress progress messages on stderr when true.
        minaqual: Reads with an alignment quality below this value are
            counted as low quality and skipped.
        mapping_file: Optional tab-separated file with exactly four columns
            per row (feature, category, feature length, organism).  A falsy
            value disables the aggregation step.
        scale_method: 'none' keeps raw counts; any other value passes the
            count and the integer feature length to ``scale_abundance``.

    Raises:
        ValueError: For an unknown ``samtype``, an illegal ``order`` with
            paired-end input, or a mapping-file row without four columns.
    """
    features = HTSeq.GenomicArrayOfSets("auto", False)
    counts = {}

    # Try to open samfile to fail early in case it is not there
    if sam_filename != "-":
        open(sam_filename).close()

    # Try to open mapping file to fail early in case it is not there
    if mapping_file:
        open(mapping_file).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    continue
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} GFF lines processed.\n".format(i))
    except:
        # Deliberately broad: only adds the GFF position to the error output
        # and then re-raises whatever went wrong.
        sys.stderr.write("Error occured when processing GFF file ({}):\n"
            .format(gff.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} GFF lines processed.\n".format(i))

    num_features = len(counts)
    if num_features == 0:
        sys.stderr.write("Warning: No features of type '{}' found.\n"
            .format(feature_type))

    if samtype == "sam":
        align_reader = HTSeq.SAM_Reader
    elif samtype == "bam":
        align_reader = HTSeq.BAM_Reader
    else:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ValueError("Unknown input format {} specified.".format(samtype))

    try:
        if sam_filename != "-":
            read_seq_file = align_reader(sam_filename)
            read_seq = read_seq_file
            # Peek at the first record to detect paired-end data; the reader
            # is iterated again from the start by the main loop below.
            first_read = next(iter(read_seq))
        else:
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            # stdin cannot be re-read, so put the peeked record back in front.
            read_seq = itertools.chain([first_read], read_seq_iter)
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading SAM/BAM file.\n" )
        raise

    try:
        if pe_mode:
            if order == "name":
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            elif order == "position":
                read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
            else:
                raise ValueError("Illegal order specified.")
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            if i > 0 and i % 100000 == 0 and not quiet:
                sys.stderr.write("{!s} SAM alignment record{} processed.\n"
                    .format(i, "s" if not pe_mode else " pairs"))

            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: treat the alignment as unique.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M" and co.size > 0)
            else:
                # Collect the matched intervals of both mates; the first
                # mate's intervals are strand-inverted, the second's are not.
                if r[0] is not None and r[0].aligned:
                    iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    iv_seq = itertools.chain( iv_seq,
                        (co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1 ) or \
                            (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH tag: treat the pair as uniquely aligned.
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                         if iv.chrom not in features.chrom_vectors:
                             raise UnknownChrom
                         for iv2, fs2 in features[iv].steps():
                             fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[ iv ].steps():
                            # "nonempty" ignores stretches without any feature;
                            # "strict" lets them empty the intersection.
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Reads on chromosomes absent from the GFF hit no feature.
                empty += 1

    except:
        # Deliberately broad: report the input position, then re-raise.
        sys.stderr.write("Error occured when processing SAM input ({}):\n"
            .format(read_seq_file.get_line_number_string()))
        raise

    if not quiet:
        sys.stderr.write("{!s} SAM {} processed.\n"
            .format(i, "alignments " if not pe_mode else "alignment pairs"))

    # map to higher order features if applicable
    if mapping_file:
        abundances = {}
        with open(mapping_file) as mapping_h:
            for row in csv.reader(mapping_h, delimiter='\t'):
                try:
                    feature, feature_category, feature_length, organism = row
                except ValueError:
                    sys.stderr.write("Can't determine the format of '{}'".format(mapping_file))
                    raise
                if feature not in counts:
                    continue
                if not feature_category:
                    # Features without a category are reported under their own ID.
                    feature_category = feature
                abund = counts[feature] if scale_method == 'none' else scale_abundance(counts[feature], int(feature_length))
                if ',' in feature_category:
                    # A feature listed under several categories contributes
                    # its full abundance to each of them.
                    cats = feature_category.split(',')
                    for category in cats:
                        abundances[category] = abundances.get(category, 0) + abund
                else:
                    abundances[feature_category] = abundances.get(feature_category, 0) + abund

        if num_features > 0 and len(abundances) == 0:
            sys.stderr.write("Warning: No higher order features found. Please "
                "make sure the mapping file is formatted correctly.\n")

        # NOTE(review): this tests feature IDs against *category* keys, so a
        # feature whose category name differs from its own ID is added to
        # 'UNMAPPED' even though it was aggregated above -- confirm that this
        # is intended.
        for feature in counts:
            if feature not in abundances:
                abundances['UNMAPPED'] = abundances.get('UNMAPPED', 0) + counts[feature]

    else:
        # No mapping requested: report the per-feature counts directly
        # (``abundances`` aliases ``counts`` from here on).
        abundances = counts

    # "UNMAPPED" can be interpreted as a single unknown gene of length 1
    # kilobase recruiting all reads that failed to map to known sequences
    abundances['UNMAPPED'] = (abundances.get('UNMAPPED', 0) + empty + ambiguous + lowqual + notaligned + nonunique)

    for fn in sorted(abundances.keys()):
        print("{}\t{!s}".format(fn, abundances[fn]))
    sys.stderr.write("__no_feature\t{!s}\n".format(empty))
    sys.stderr.write("__ambiguous\t{!s}\n".format(ambiguous))
    sys.stderr.write("__too_low_aQual\t{!s}\n".format(lowqual))
    sys.stderr.write("__not_aligned\t{!s}\n".format(notaligned))
    sys.stderr.write("__alignment_not_unique\t{!s}\n".format(nonunique))
예제 #20
0
def count_reads_in_features( sam_filename, gff_filename, stranded,
      overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, custom_stat ):
   """Count reads per GFF feature, with optional SAM output and detailed stats.

   Features of type ``feature_type`` are loaded from the GFF file, every read
   (or read pair) of the SAM input is assigned according to ``overlap_mode``,
   and the per-feature counts followed by the summary counters are printed to
   stdout.

   Args:
      sam_filename: Path of the SAM file, or "-" to read from stdin.
      gff_filename: Path of the GFF file, read with ``HTSeq.GFF_Reader``.
      stranded: "no" builds an unstranded feature array; "reverse" inverts
         the strand interpretation of the reads; any other value counts in
         the forward-stranded sense.
      overlap_mode: "union", "intersection-strict" or
         "intersection-nonempty".
      feature_type: GFF feature type to count.
      id_attribute: GFF attribute used as feature ID; the program exits if a
         matching feature lacks it.
      quiet: Suppress progress messages and HTSeq warnings when true.
      minaqual: Reads with an alignment quality below this are skipped.
      samout: Path of a file receiving every input line with an added
         ``XF:Z:<assignment>`` field, or "" to disable.
      custom_stat: Path of a file (opened in append mode) receiving the
         detailed statistics, or "" to disable.
   """

   def write_to_samout( r, assignment ):
      # Append the assignment as an XF tag to each original SAM line.
      if samoutfile is None:
         return
      if not pe_mode:
         r = (r,)
      for read in r:
         if read is not None:
            samoutfile.write( read.original_sam_line.rstrip() +
               "\tXF:Z:" + assignment + "\n" )

   def is_full_pair( r ):
      # Only index into the record in paired-end mode: there it is a
      # (mate1, mate2) tuple, in single-end mode it is one alignment.
      return pe_mode and r[0] is not None and r[1] is not None

   if quiet:
      warnings.filterwarnings( action="ignore", module="HTSeq" )

   if samout != "":
      samoutfile = open( samout, "w" )
   else:
      samoutfile = None

   # MB
   if custom_stat != "":
      custom_stat_file = open( custom_stat, "a" )
   else:
      custom_stat_file = None
   # endMB

   features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
   counts = {}

   # Try to open samfile to fail early in case it is not there
   if sam_filename != "-":
      open( sam_filename ).close()

   gff = HTSeq.GFF_Reader( gff_filename )
   i = 0
   try:
      for f in gff:
         if f.type == feature_type:
            try:
               feature_id = f.attr[ id_attribute ]
            except KeyError:
               sys.exit( "Feature %s does not contain a '%s' attribute" %
                  ( f.name, id_attribute ) )
            if stranded != "no" and f.iv.strand == ".":
               sys.exit( "Feature %s at %s does not have strand information but you are "
                  "running htseq-count in stranded mode. Use '--stranded=no'." %
                  ( f.name, f.iv ) )
            features[ f.iv ] += feature_id
            counts[ feature_id ] = 0
         i += 1
         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d GFF lines processed.\n" % i )
   except:
      # Deliberately broad: report the GFF position, then re-raise.
      sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
      raise

   if not quiet:
      sys.stderr.write( "%d GFF lines processed.\n" % i )

   if len( counts ) == 0 and not quiet:
      sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

   try:
      if sam_filename != "-":
         read_seq = HTSeq.SAM_Reader( sam_filename )
         # Peek at the first record; the reader is iterated again from the
         # start by the main loop.
         first_read = next( iter( read_seq ) )
      else:
         read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
         first_read = next( read_seq )
         # stdin cannot be re-read, so put the peeked record back in front.
         read_seq = itertools.chain( [ first_read ], read_seq )
      pe_mode = first_read.paired_end
   except:
      sys.stderr.write( "Error occured when reading first line of sam file.\n" )
      raise

   try:
      if pe_mode:
         read_seq_pe_file = read_seq
         read_seq = HTSeq.pair_SAM_alignments( read_seq )
      empty = 0
      ambiguous = 0
      notaligned = 0
      lowqual = 0
      nonunique = 0
      # MB: Creating detailed stats
      if custom_stat_file:
         skipped = 0
         assigned_reads = 0
         assigned_reads_s = 0
         assigned_reads_p = 0
         assigned_genes = 0
         empty_s = 0
         empty_p = 0
         ambiguous_s = 0
         ambiguous_p = 0
         # read name -> positions / uniqueness flags of multi-mapped pairs
         anu_dict = {}
      # endMB
      i = 0
      for r in read_seq:
         i += 1
         if not pe_mode:
            if not r.aligned:
               notaligned += 1
               write_to_samout( r, "not_aligned" )
               continue
            try:
               if r.optional_field( "NH" ) > 1:
                  write_to_samout( r, "alignment_not_unique" )
                  nonunique += 1
                  continue
            except KeyError:
               # No NH tag: treat the alignment as unique.
               pass
            if r.aQual < minaqual:
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue
            if stranded != "reverse":
               iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
            else:
               iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )
         else:
            if r[0] is not None and r[0].aligned:
               if stranded != "reverse":
                  iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
               else:
                  iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
            else:
               iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
               # The second mate is interpreted on the opposite strand of the
               # first one.
               if stranded != "reverse":
                  iv_seq = itertools.chain( iv_seq,
                     ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
               else:
                  iv_seq = itertools.chain( iv_seq,
                     ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
            else:
               if ( r[0] is None ) or not ( r[0].aligned ):
                  write_to_samout( r, "not_aligned" )
                  notaligned += 1
                  continue
            try:
               if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                     ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                  nonunique += 1
                  write_to_samout( r, "alignment_not_unique" )
                  # MB: Counting the 'alignment_not_unique' for one or both mates
                  if custom_stat_file:
                     if r[0] is not None and r[1] is not None: # Both mates are present
                        fields1 = r[0].original_sam_line.split('\t')
                        fields2 = r[1].original_sam_line.split('\t')
                        read_id = fields1[0]
                        if read_id not in anu_dict: # The read is not indexed yet
                           anu_dict[read_id] = {
                              'chr1': fields1[2],
                              'chr2': fields2[2],
                              'start1': fields1[3],
                              'start2': fields2[3],
                              'al_unique1': True,
                              'al_unique2': True,
                           }
                        else: # Read already indexed
                           entry = anu_dict[read_id]
                           # An entry created by a single-mate record below
                           # lacks the other mate's keys; .get() keeps that
                           # case from raising a KeyError that the surrounding
                           # "NH" handler would silently swallow.
                           if entry.get( 'al_unique1' ):
                              if entry['chr1'] != fields1[2] or entry['start1'] != fields1[3]:
                                 # At least two positions exist for mate r[0]
                                 entry['al_unique1'] = False
                           if entry.get( 'al_unique2' ):
                              if entry['chr2'] != fields2[2] or entry['start2'] != fields2[3]:
                                 # At least two positions exist for mate r[1]
                                 entry['al_unique2'] = False
                     elif r[0] is not None: # Only r[0] is present
                        anu_dict[r[0].original_sam_line.split('\t')[0]] = { 'al_unique1': False }
                     else: # Only r[1] is present
                        anu_dict[r[1].original_sam_line.split('\t')[0]] = { 'al_unique2': False }
                  # endMB
                  continue
            except KeyError:
               # No NH tag: treat the pair as uniquely aligned.
               pass
            if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue

         try:
            if overlap_mode == "union":
               fs = set()
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     fs = fs.union( fs2 )
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
               fs = None
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     # "nonempty" ignores stretches without any feature;
                     # "strict" lets them empty the intersection.
                     if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                           fs = fs2.copy()
                        else:
                           fs = fs.intersection( fs2 )
            else:
               sys.exit( "Illegal overlap mode." )
            if fs is None or len( fs ) == 0:
               write_to_samout( r, "no_feature" )
               empty += 1
               # MB
               if custom_stat_file:
                  if is_full_pair( r ):
                     empty_p += 1
                  else:
                     empty_s += 1
               # endMB
            elif len( fs ) > 1:
               write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
               ambiguous += 1
               # MB
               if custom_stat_file:
                  if is_full_pair( r ):
                     ambiguous_p += 1
                  else:
                     ambiguous_s += 1
               # endMB
            else:
               write_to_samout( r, list(fs)[0] )
               counts[ list(fs)[0] ] += 1
               # MB
               if custom_stat_file:
                  # First read for this feature: one more feature with reads.
                  if counts[ list(fs)[0] ] == 1:
                     assigned_genes += 1
                  assigned_reads += 1
                  if is_full_pair( r ):
                     assigned_reads_p += 1
                  else:
                     assigned_reads_s += 1
               # endMB
         except UnknownChrom:
            if not pe_mode:
               rr = r
            else:
               rr = r[0] if r[0] is not None else r[1]
            # MB
            if custom_stat_file:
               skipped += 1
            #endMB
            if not quiet:
               sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " +
                  "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                  ( rr.read.name, iv.chrom ) )

         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   except:
      # Deliberately broad: report the SAM position, then re-raise.
      if not pe_mode:
         sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
      else:
         sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
      raise

   if not quiet:
      sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   if samoutfile is not None:
      samoutfile.close()

   # Single parenthesised argument: prints identically on Python 2 and 3.
   for fn in sorted( counts.keys() ):
      print( "%s\t%d" % ( fn, counts[fn] ) )
   print( "no_feature\t%d" % empty )
   print( "ambiguous\t%d" % ambiguous )
   print( "too_low_aQual\t%d" % lowqual )
   print( "not_aligned\t%d" % notaligned )
   print( "alignment_not_unique\t%d" % nonunique )

   # MB: Adding stats in the custom_stat file
   if custom_stat_file:
      try:
         if sam_filename != "-":
            # Count the non-header lines in a second pass over the file.
            with open( sam_filename ) as sam_in:
               sam_line_count = "{:,}".format( sum( 1 for line in sam_in if not line.startswith('@') ) )
         else:
            # stdin has already been consumed and cannot be re-read.
            sam_line_count = "n/a"
         custom_stat_file.write("Input SAM file line count\t"+sam_line_count+"\n\n")
         custom_stat_file.write("SAM lines (pairs or singles) processed\t"+"{:,}".format(i)+"\n\n")
         custom_stat_file.write("Skipped pairs (chr.not found)\t"+"{:,}".format(skipped)+"\n\n")
         custom_stat_file.write("Assigned_genes\t"+"{:,}".format(assigned_genes)+"\n\n")
         custom_stat_file.write("Assigned_reads\t"+"{:,}".format(assigned_reads)+"\n")
         custom_stat_file.write("\tSingle reads\t"+"{:,}".format(assigned_reads_s)+"\n")
         custom_stat_file.write("\tPaired reads\t"+"{:,}".format(assigned_reads_p)+"\n\n")
         custom_stat_file.write("No_features\t"+"{:,}".format(empty)+"\n")
         custom_stat_file.write("\tSingle reads\t"+"{:,}".format(empty_s)+"\n")
         custom_stat_file.write("\tPaired reads\t"+"{:,}".format(empty_p)+"\n\n")
         custom_stat_file.write("Ambiguous\t"+"{:,}".format(ambiguous)+"\n")
         custom_stat_file.write("\tSingle reads\t"+"{:,}".format(ambiguous_s)+"\n")
         custom_stat_file.write("\tPaired reads\t"+"{:,}".format(ambiguous_p)+"\n\n")
         custom_stat_file.write("Alignment_not_unique\t"+"{:,}".format(nonunique)+"\n")
         custom_stat_file.write("\tSAM lines (pairs or singles)\t"+"{:,}".format(len(anu_dict))+"\n")
         # Counting the 'alignment_not_unique' with one or both mates multiply aligned
         simpl = 0
         multipl = 0
         for read_id in anu_dict:
            entry = anu_dict[read_id]
            if 'al_unique1' in entry and 'al_unique2' in entry:
               if entry['al_unique1'] or entry['al_unique2']:
                  simpl += 1
               else:
                  multipl += 1
            else:
               # Entries recorded from a single mate carry only one flag.
               multipl += 1
         custom_stat_file.write("\tOne_mate_uniquely_mapped\t"+"{:,}".format(simpl)+"\n")
         custom_stat_file.write("\tTwo_mates_multiply_mapped\t"+"{:,}".format(multipl)+"\n")
      finally:
         custom_stat_file.close()
예제 #21
0
    def count_reads_in_features( sam_filenames, colnames, gff_filename, opts ):
        """ Hacked version of htseq count.py

        Loads the features of type ``opts.feature_type`` from ``gff_filename`` and
        then tallies, for every alignment file in ``sam_filenames``, how many
        reads (or read pairs) overlap exactly one feature.

        Parameters:
            sam_filenames: paths of the SAM/BAM files, one per sample.
            colnames: sample labels, parallel to ``sam_filenames``; its length
                also fixes the length of every per-feature count list.
            gff_filename: path of the GFF/GTF annotation.
            opts: options object. The attributes read here are ``quiet``,
                ``stranded``, ``mapqMin``, ``feature_type``, ``id_attribute``,
                ``filter_extras`` and ``mode``.

        ``sam_exts`` and ``sam_bais`` (parallel to ``sam_filenames``) are not
        parameters; they are resolved from the enclosing scope.

        Returns:
            ``( counts, empty, ambiguous, lowqual, notaligned, nonunique,
            filtered, nreads )``. ``counts`` maps a feature id to a list with one
            count per sample; the other values are totals over all input files.

        Raises:
            SystemExit: for a feature lacking both ``opts.id_attribute`` and
                ``gene_id``, for an unstranded feature in stranded mode, or for
                an unknown ``opts.mode``.
            Any exception raised while reading the GFF or an alignment file is
            re-raised after a short location message has been written to stderr.
        """
        if opts.quiet:
            warnings.filterwarnings( action="ignore", module="HTSeq" )
        features = HTSeq.GenomicArrayOfSets( "auto", opts.stranded != "no" )
        mapqMin = int(opts.mapqMin)
        counts = {}
        nreads = 0
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        filtered = 0  # new filter_extras - need a better way to do this - independent filter tool?
        gff = HTSeq.GFF_Reader( gff_filename )
        # Pre-set so the summary below works even when the GFF yields no rows;
        # enumerating from 1 makes `i` the number of rows seen so far.
        i = 0
        try:
            for i, f in enumerate(gff, 1):
                if f.type == opts.feature_type:
                    try:
                        feature_id = f.attr[ opts.id_attribute ]
                    except KeyError:
                        # Fall back to gene_id before giving up on the feature.
                        try:
                            feature_id = f.attr[ 'gene_id' ]
                        except KeyError:
                            sys.exit( "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?" %
                                      ( i, f.name, opts.id_attribute ) )
                    if opts.stranded != "no" and f.iv.strand == ".":
                        sys.exit( "Feature %s at %s does not have strand information but you are "
                                  "running htseq-count in stranded mode. Use '--stranded=no'." %
                                  ( f.name, f.iv ) )
                    features[ f.iv ] += feature_id
                    counts[ feature_id ] = [0 for x in colnames]  # we use sami as an index here to bump counts later
        except:
            sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
            raise

        if not opts.quiet:
            sys.stdout.write( "%d GFF lines processed.\n" % i )

        if len( counts ) == 0 and not opts.quiet:
            sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type )
        for sami, sam_filename in enumerate(sam_filenames):
            colname = colnames[sami]
            isbam = sam_exts[sami] == 'bam'
            hasbai = sam_bais[sami] > ''
            if hasbai:
                # Hard-link the BAM and its index under a shared stem in the
                # working directory -- presumably so the reader can locate the
                # index next to the BAM; TODO confirm. The links are not removed
                # by this function.
                tempname = os.path.splitext(os.path.basename(sam_filename))[0]
                tempbam = '%s_TEMP.bam' % tempname
                tempbai = '%s_TEMP.bai' % tempname
                os.link(sam_filename, tempbam)
                os.link(sam_bais[sami], tempbai)
            try:
                if isbam:
                    if hasbai:
                        read_seq = HTSeq.BAM_Reader( tempbam )
                    else:
                        read_seq = HTSeq.BAM_Reader( sam_filename )
                else:
                    read_seq = HTSeq.SAM_Reader( sam_filename )
                # Peek at the first record only to learn whether the file is paired-end.
                first_read = next( iter(read_seq) )
                pe_mode = first_read.paired_end
            except:
                if isbam:
                    print >> sys.stderr, "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename, colname )
                else:
                    print >> sys.stderr, "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename, colname )
                raise

            seqi = 0  # number of reads / read pairs seen in this file
            try:
                if pe_mode:
                    # Keep the underlying reader for error reporting below.
                    read_seq_pe_file = read_seq
                    read_seq = HTSeq.pair_SAM_alignments( read_seq )
                for seqi, r in enumerate(read_seq, 1):
                    nreads += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            continue
                        try:
                            # Drop the read as soon as any filter tag is set. The
                            # test has to sit directly in the read loop: a
                            # `continue` nested inside a `for extra in ...` loop
                            # would only advance that inner loop and let the read
                            # through to the counting code.
                            if any( r.optional_field( extra ) for extra in opts.filter_extras ):
                                filtered += 1
                                continue
                            if r.optional_field( "NH" ) > 1:
                                nonunique += 1
                                continue
                        except KeyError:
                            # A missing tag means the read is not rejected by it.
                            pass
                        if r.aQual < mapqMin:
                            lowqual += 1
                            continue
                        if opts.stranded != "reverse":
                            iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" and co.size > 0 )
                    else:
                        # Paired-end: the second mate's intervals get the
                        # opposite strand treatment from the first mate's.
                        if r[0] is not None and r[0].aligned:
                            if opts.stranded != "reverse":
                                iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 )
                            else:
                                iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            if opts.stranded != "reverse":
                                iv_seq = itertools.chain( iv_seq,
                                                          ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                            else:
                                iv_seq = itertools.chain( iv_seq,
                                                          ( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                        else:
                            # Neither mate is aligned.
                            if r[0] is None or not r[0].aligned:
                                notaligned += 1
                                continue
                        try:
                            if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                                    ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                                nonunique += 1
                                continue
                        except KeyError:
                            pass
                        if ( r[0] and r[0].aQual < mapqMin ) or ( r[1] and r[1].aQual < mapqMin ):
                            lowqual += 1
                            continue

                    try:
                        if opts.mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[ iv ].steps():
                                    fs = fs.union( fs2 )
                        elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty":
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[ iv ].steps():
                                    if len(fs2) > 0 or opts.mode == "intersection-strict":
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection( fs2 )
                        else:
                            sys.exit( "Illegal overlap mode %s" % opts.mode )
                        if fs is None or len( fs ) == 0:
                            empty += 1
                        elif len( fs ) > 1:
                            ambiguous += 1
                        else:
                            ck = list(fs)[0]
                            counts[ck][sami] += 1  # end up with counts for each sample as a list
                    except UnknownChrom:
                        if not pe_mode:
                            rr = r
                        else:
                            rr = r[0] if r[0] is not None else r[1]
                        # Reads on chromosomes absent from the GFF count as "no feature".
                        empty += 1
                        if not opts.quiet:
                            sys.stdout.write( ( "Warning: Skipping read '%s', because chromosome " +
                                                "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                                              ( rr.read.name, iv.chrom ) )
            except:
                if not pe_mode:
                    sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
                else:
                    sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
                raise

            if not opts.quiet:
                sys.stdout.write( "%d sam %s processed for %s.\n" % ( seqi, "lines " if not pe_mode else "line pairs", colname ) )
        return counts, empty, ambiguous, lowqual, notaligned, nonunique, filtered, nreads
예제 #22
0
파일: parse.py 프로젝트: NickGoldman/rlsim
 def next_pair(self):
     """Yield (first, second) mate tuples produced by pairing self.read_iter."""
     paired_stream = ht.pair_SAM_alignments(self.read_iter)
     for mates in paired_stream:
         # Unpack and re-pack so every yielded item is a fresh 2-tuple.
         first_mate, second_mate = mates
         yield (first_mate, second_mate)
예제 #23
0
def count_reads_in_features( sam_filename, gff_filename, stranded,
      overlap_mode, feature_type, id_attribute, quiet, minaqual, samout ):
   """Count reads per feature, htseq-count style, and print the result table.

   Features of type `feature_type` are loaded from `gff_filename`; every read
   (or read pair, if the first record is flagged paired-end) from the SAM input
   is then assigned to a feature according to `overlap_mode`.

   Parameters:
      sam_filename: path of the SAM file, or "-" to read from stdin.
      gff_filename: path of the GFF/GTF annotation.
      stranded: "no" builds an unstranded feature array; "reverse" inverts the
         strand of the read intervals; any other value uses them as aligned.
      overlap_mode: "union", "intersection-strict" or "intersection-nonempty".
      feature_type: value of the GFF type column to select.
      id_attribute: GFF attribute used as feature id.
      quiet: suppress progress messages and HTSeq warnings.
      minaqual: reads with aQual below this value are skipped.
      samout: path for an annotated SAM copy (XF:Z tags); "" disables it.

   Returns:
      None. The per-feature counts and the special counters (no_feature,
      ambiguous, too_low_aQual, not_aligned, alignment_not_unique) are printed
      to stdout as tab-separated lines.

   Raises:
      SystemExit: for a feature without `id_attribute`, an unstranded feature
         in stranded mode, or an illegal `overlap_mode`.
      Any other exception from reading the inputs is re-raised after a short
      message has been written to stderr.
   """

   def write_to_samout( r, assignment ):
      # Write each available mate's SAM line with the assignment as XF:Z tag.
      if samoutfile is None:
         return
      if not pe_mode:
         r = (r,)
      for read in r:
         if read is not None:
            samoutfile.write( read.original_sam_line.rstrip() +
               "\tXF:Z:" + assignment + "\n" )

   if quiet:
      warnings.filterwarnings( action="ignore", module="HTSeq" )

   if samout != "":
      samoutfile = open( samout, "w" )
   else:
      samoutfile = None

   features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
   counts = {}
   # Per-feature tally of multi-mapped reads (added by CR). It is maintained
   # below but not printed or returned by this function.
   dict_nonunique = {}

   # Try to open samfile to fail early in case it is not there
   if sam_filename != "-":
      open( sam_filename ).close()

   gff = HTSeq.GFF_Reader( gff_filename )
   i = 0
   try:
      for f in gff:
         if f.type == feature_type:
            try:
               feature_id = f.attr[ id_attribute ]
            except KeyError:
               sys.exit( "Feature %s does not contain a '%s' attribute" %
                  ( f.name, id_attribute ) )
            if stranded != "no" and f.iv.strand == ".":
               sys.exit( "Feature %s at %s does not have strand information but you are "
                  "running htseq-count in stranded mode. Use '--stranded=no'." %
                  ( f.name, f.iv ) )
            features[ f.iv ] += feature_id
            counts[ f.attr[ id_attribute ] ] = 0
            dict_nonunique[ f.attr[ id_attribute ] ] = 0
         i += 1
         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d GFF lines processed.\n" % i )
   except:
      sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
      raise

   if not quiet:
      sys.stderr.write( "%d GFF lines processed.\n" % i )

   if len( counts ) == 0 and not quiet:
      sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )

   try:
      if sam_filename != "-":
         read_seq = HTSeq.SAM_Reader( sam_filename )
         first_read = next( iter(read_seq) )
      else:
         # stdin cannot be re-read, so push the peeked record back in front.
         read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
         first_read = next( read_seq )
         read_seq = itertools.chain( [ first_read ], read_seq )
      pe_mode = first_read.paired_end
   except:
      sys.stderr.write( "Error occured when reading first line of sam file.\n" )
      raise

   try:
      if pe_mode:
         read_seq_pe_file = read_seq
         read_seq = HTSeq.pair_SAM_alignments( read_seq )
      empty = 0
      ambiguous = 0
      notaligned = 0
      lowqual = 0
      nonunique = 0
      # Name/interval of the multi-mapped read tallied most recently (added by SB).
      temp_read_name = "NA"
      temp_interval_r0 = "NA"
      temp_interval_r1 = "NA"
      i = 0
      for r in read_seq:
         i += 1
         if not pe_mode:
            if not r.aligned:
               notaligned += 1
               write_to_samout( r, "not_aligned" )
               continue
            try:
               if r.optional_field( "NH" ) > 1:
                  write_to_samout( r, "alignment_not_unique" )
                  nonunique += 1
                  continue
            except KeyError:
               pass
            if r.aQual < minaqual:
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue
            if stranded != "reverse":
               iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" )
            else:
               iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" )
         else:
            if r[0] is not None and r[0].aligned:
               if stranded != "reverse":
                  iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" )
               else:
                  iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" )
            else:
               iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
               if stranded != "reverse":
                  iv_seq = itertools.chain( iv_seq,
                     ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" ) )
               else:
                  iv_seq = itertools.chain( iv_seq,
                     ( co.ref_iv for co in r[1].cigar if co.type == "M" ) )
            else:
               if ( r[0] is None ) or not ( r[0].aligned ):
                  write_to_samout( r, "not_aligned" )
                  notaligned += 1
                  continue
            try:
               if (( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or
                     ( r[1] is not None and r[1].optional_field( "NH" ) > 1 )):
                  nonunique += 1
                  # Per-feature tally of multi-mapped pairs. A KeyError raised
                  # anywhere in this section (missing NH tag, or a feature id
                  # absent from dict_nonunique) is caught below, in which case
                  # the pair is NOT skipped and continues to the normal
                  # counting path.
                  # NOTE(review): the `is not` tests compare object identity,
                  # not interval equality -- confirm that this is intended.
                  if ( r[0] is not None and r[1] is None ):
                     result, fs_new = is_read_in_gene_interval(r[0], features)
                     if result:
                        if ((temp_read_name != r[0].read.name) and ( temp_interval_r0 is not r[0].iv) ):
                           temp_read_name = r[0].read.name
                           temp_interval_r0 = r[0].iv
                        dict_nonunique[ list(fs_new)[0] ] += 1
                  if ( r[0] is None and r[1] is not None ):
                     result, fs_new = is_read_in_gene_interval(r[1], features)
                     if result:
                        if ((temp_read_name != r[1].read.name) and ( temp_interval_r1 is not r[1].iv) ):
                           temp_read_name = r[1].read.name
                           temp_interval_r1 = r[1].iv
                        dict_nonunique[ list(fs_new)[0] ] += 1
                  if ( r[0] is not None and r[1] is not None ):
                     result1, fs_new1 = is_read_in_gene_interval(r[0], features)
                     result2, fs_new2 = is_read_in_gene_interval(r[1], features)

                     if result1 and not result2:
                        if ((temp_read_name != r[0].read.name) and ( temp_interval_r0 is not r[0].iv) ):
                           temp_read_name = r[0].read.name
                           temp_interval_r0 = r[0].iv
                           dict_nonunique[ list(fs_new1)[0] ] += 1
                     elif result2 and not result1:
                        if ((temp_read_name != r[1].read.name) and ( temp_interval_r1 is not r[1].iv)):
                           temp_read_name = r[1].read.name
                           temp_interval_r1 = r[1].iv
                           dict_nonunique[ list(fs_new2)[0] ] += 1
                     elif result1 and result2:
                        if ((temp_read_name != r[0].read.name) and (temp_interval_r0 is not r[0].iv ) and
                              ( temp_interval_r1 is not r[1].iv) ):
                           temp_read_name = r[0].read.name
                           temp_interval_r0 = r[0].iv
                           temp_interval_r1 = r[1].iv
                           # Mates hitting two different features credit both.
                           if list(fs_new1)[0] != list(fs_new2)[0]:
                              dict_nonunique[ list(fs_new1)[0] ] += 1
                              dict_nonunique[ list(fs_new2)[0] ] += 1
                           else:
                              dict_nonunique[ list(fs_new1)[0] ] += 1

                  write_to_samout( r, "alignment_not_unique" )
                  continue
            except KeyError:
               pass
            if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
               lowqual += 1
               write_to_samout( r, "too_low_aQual" )
               continue

         try:
            if overlap_mode == "union":
               fs = set()
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     fs = fs.union( fs2 )
            elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
               fs = None
               for iv in iv_seq:
                  if iv.chrom not in features.chrom_vectors:
                     raise UnknownChrom
                  for iv2, fs2 in features[ iv ].steps():
                     if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                           fs = fs2.copy()
                        else:
                           fs = fs.intersection( fs2 )
            else:
               sys.exit( "Illegal overlap mode." )
            if fs is None or len( fs ) == 0:
               write_to_samout( r, "no_feature" )
               empty += 1
            elif len( fs ) > 1:
               write_to_samout( r, "ambiguous[" + '+'.join( fs ) + "]" )
               ambiguous += 1
            else:
               write_to_samout( r, list(fs)[0] )
               counts[ list(fs)[0] ] += 1

         except UnknownChrom:
            if not pe_mode:
               rr = r
            else:
               rr = r[0] if r[0] is not None else r[1]
            if not quiet:
               sys.stderr.write( ( "Warning: Skipping read '%s', because chromosome " +
                  "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                  ( rr.read.name, iv.chrom ) )

         if i % 100000 == 0 and not quiet:
            sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   except:
      # NOTE(review): when reading from stdin, read_seq is an itertools.chain,
      # which has no get_line_number_string() -- confirm how this should report.
      if not pe_mode:
         sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
      else:
         sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
      raise
   finally:
      # Close the annotated SAM output on success and on failure alike.
      if samoutfile is not None:
         samoutfile.close()

   if not quiet:
      sys.stderr.write( "%d sam %s processed.\n" % ( i, "lines " if not pe_mode else "line pairs" ) )

   # Single-argument print calls behave identically under Python 2 and 3.
   print( "Gene\tUnique_reads" )
   for fn in sorted( counts.keys() ):
      print( "%s\t%d" % ( fn, counts[fn] ) )
   print( "no_feature\t%d" % empty )
   print( "ambiguous\t%d" % ambiguous )
   print( "too_low_aQual\t%d" % lowqual )
   print( "not_aligned\t%d" % notaligned )
   print( "alignment_not_unique\t%d" % nonunique )
                    choices=['single_end', 'paired_end'],
                    type=str,
                    default="paired_end",
                    help="""
                    whether the data is from a single-end read library, or a paired-end
                    library. Default is paired-end.
                    """)
args = parser.parse_args()

# define some test files:
samfile = '/home/antqueen/booster/PRO_Odontomachus/trinity_denovo_normalized_camponotus/Star/Cplan_Q2_16Aligned.out.sam'
gtffile = '/home/antqueen/genomics/experiments/analyses/PRO20160405_camponotus/trinity_denovo_normalized_camponotus/Transdecoder_ss/merge_genesets/Cpla_td_gff.Apr21_11.15.families.gtf'

# create gtf iterator
print "\nReading gtf file %s..." % (args.gtf_file[0]),
gtf = hts.GFF_Reader(args.gtf_file[0])
print " done."

# create genomic array and populate with exon features (transcripts and genes)
print "Populating genomic array with GTF features...",
sys.stdout.flush()

if args.stranded == 'yes':
    feature_array = hts.GenomicArrayOfSets("auto", stranded=True)
elif args.stranded == 'no':
    feature_array = hts.GenomicArrayOfSets("auto", stranded=False)

for feature in gtf:
    if feature.type == args.type:
        feature_array[feature.iv] += feature.name
예제 #25
0
#Sort your SAM file by read ID, so that multiple mappings are in adjacent lines, and then run a script like this one to keep only the best one
#Written by Simon Anders
import sys, re
import HTSeq

# Read SAM records from stdin; the input must be sorted by read name so that
# all alignments of one read arrive on adjacent lines.
insam = HTSeq.SAM_Reader( sys.stdin )

# Go through all reads, with their alignments bundled up:
for bundle in HTSeq.bundle_multiple_alignments( insam ):
   bestAlmt = None
   bestIsTied = False
   # Go through all alignments of a given read, looking
   # for the one with the best alignment score
   for almt in bundle:
      if bestAlmt is None or almt.aQual > bestAlmt.aQual:
         bestAlmt = almt
         bestIsTied = False
      elif almt.aQual == bestAlmt.aQual:
         # If there are more than one best alignment,
         # better skip the read. Remember the tie in a flag instead of
         # clearing bestAlmt, so a later alignment cannot be mistaken for
         # the first one seen.
         bestIsTied = True
   if bestAlmt is not None and not bestIsTied:
      # Change the NH field to 1 and print the line
      print( re.sub( r"NH:i:\d+", "NH:i:1", bestAlmt.original_sam_line ) )
      
#call this script with the command sort samfile.sam | python chooseBest.py > filtered.sam    
예제 #26
0
                if line.startswith(
                        '@'
                ):  # count lines starting with '@' so extra newlines at the end don't throw off the read count
                    n_reads = n_reads + 1
    else:
        with open(filepath) as f:
            for line in f:
                if line.startswith('@'):
                    n_reads = n_reads + 1
    return n_reads


# Count the mate-1 records up front (read_count tallies '@' header lines).
n_reads_r1 = read_count(args.r1)

# read in fastq file:
fastq_r1 = HTSeq.FastqReader(args.r1)
#n_reads_r1=len(list(fastq_r1))         # this was accurate, but way too memory intensive for large fastqs.

PAIRED_END = False  # default is to process single-end reads

if args.r2:
    # Mate 2 is a FASTQ file like mate 1 (its '@' headers are counted just
    # below), so it must be parsed with the FASTQ reader, not the FASTA one.
    fastq_r2 = HTSeq.FastqReader(args.r2)
    PAIRED_END = True

    #n_reads_r2=len(list(fastq_r2))
    n_reads_r2 = read_count(args.r2)
    # Both mate files must hold the same number of records to stay in step.
    if n_reads_r1 != n_reads_r2:
        sys.exit("r1 and r2 have different read counts!")

# determine how many reads to return:
if args.percent:
def main():
    exe_parser = argparse.ArgumentParser()
    exe_parser.add_argument('infile', type=str, help='<input file> [(full path), -b/-s required]')
    exe_parser.add_argument("-u", "--not_aligned",
                            help="output reads that were not aligned, including those that were aligned multiple times(flat file).",
                            type=str)
    exe_parser.add_argument("-s", "--samout", help="output not aligned reads to [file path].", type=str)
    exe_parser.add_argument("-b", "--ambiguous_out", help="output a fasta file of ambiguous hits [file path].",
                            type=str)
    exe_parser.add_argument("-v", "--verbose", help="verbose. (default = TRUE).", action="store_true")
    exe_parser.add_argument("gff", help="<gff file> [(full path)]", type=str)
    exe_parser.add_argument("-f", "--fasta", help="output fasta file of hits (full path).", type=str)
    exe_parser.add_argument("-m", "--min_read_length", help="minimal read length to consider. (default = 60b).",
                            type=int)
    exe_parser.add_argument("-i", "--min_id", help="minimal percent id of hit to consider. (default = 80).", type=int)
    exe_parser.add_argument("-z", "--min_score", help="minimal aligner score to consider. (default = 0).", type=int)
    exe_parser.add_argument("-c", "--max_clip",
                            help="proportion of bases clipped from read for alignment. (default = 0.3).", type=float)
    exe_parser.add_argument("--stranded", help="whether the data is stranded (y, n, reverse). (default = n).", type=str,
                            choices=["y", "n", "reverse"], default="n")
    exe_parser.add_argument("--idattr", help="GFF attribute to be used as feature ID. (default = GeneID).", type=str)
    exe_parser.add_argument("--type", help="feature type (3rd column in GFF file) to be used. (default = CDS).",
                            type=str)
    exe_parser.add_argument("-a", "--minaqual", help="min. alignment quality (default = 0).", type=str)
    exe_parser.add_argument("-p", "--paired_end_mode",
                            help="input is paired end sorted by name (n) or position (p) . (default = p).", type=str,
                            choices=["p", "n"], default="p")
    exe_parser.add_argument("-o", "--out", help="name of counts output file.", type=str)
    args = exe_parser.parse_args()

    if args.paired_end_mode == 'p':
        paired_end = True
        pe_order = 'p'
    elif args.paired_end_mode == 'n':
        paired_end = True
        pe_order = 'n'

    if args.infile:
        try:
            if args.infile == '-':  # get sam on a stream
                seqfile = HTSeq.SAM_Reader(sys.stdin)
                if args.paired_end_mode:
                    # read_seq_iter = iter(seqfile)
                    # first_read = read_seq_iter.next()
                    # read_seq = itertools.chain([first_read], read_seq_iter)
                    # reader = HTSeq.pair_SAM_alignments(read_seq)
                    if pe_order == 'p':
                        reader = HTSeq.pair_SAM_alignments_with_buffer(seqfile)
                    elif pe_order == 'n':
                        reader = HTSeq.pair_SAM_alignments(seqfile)  # (read_seq)
                else:
                    reader = seqfile
            elif args.infile != '-':
                seqfile = HTSeq.SAM_Reader(args.infile)
                if args.paired_end_mode:
                    read_seq_iter = iter(seqfile)
                    first_read = read_seq_iter.next()
                    read_seq = itertools.chain([first_read], read_seq_iter)
                    reader = HTSeq.pair_SAM_alignments(read_seq)
                    if pe_order == 'p':
                        reader = HTSeq.pair_SAM_alignments_with_buffer(reader)
                    elif pe_order == 'n':
                        reader = HTSeq.pair_SAM_alignments(reader)
                else:
                    reader = seqfile
                    # fread_seq_iter = iter(reader)
                    # first_read = iter(read_seq).next()
            elif args.infile == '':
                print "no input file type given. exiting..."
                sys.exit(1)
        except:
            print "failed processing SAM/BAM file"
            raise
    elif not args.infile:
        print "no input file given. exiting..."
        sys.exit(1)

    if args.gff:
        gff_file = args.gff
    else:
        print "no gff file given. exiting..."
        sys.exit(1)

    if args.verbose:
        verbose = True
    else:
        verbose = False

    if args.min_read_length:
        min_read_len = args.min_read_length
    else:
        min_read_len = 60  # default read length

    if args.max_clip:
        max_clip_ = float(args.max_clip)
    else:
        max_clip_ = float(0.3)  # default read length

    if args.min_id:
        min_id = float(args.min_id)
    else:
        min_id = float(80)

    if args.min_score:
        min_score = int(args.min_score)
    else:
        min_score = 0

    if args.stranded == 'n':
        stranded = 'no'
    elif args.stranded == 'y':
        stranded = 'yes'
    elif args.stranded == 'reverse':
        stranded = 'reverse'

    if args.minaqual:
        minaqual = args.minaqual
    else:
        minaqual = 0

    if args.idattr:
        id_attribute = args.idattr
    else:
        id_attribute = "GeneID"
    if args.type:
        feature_type = args.type
    else:
        feature_type = 'CDS'

    # ###
    # parse GFF file
    features, counts = gff_reader(gff_file, feature_type, id_attribute, verbose, stranded)
    # ###
    if args.samout:
        samoutfile = open(args.samout, "w")
    else:
        samoutfile = None
    if args.ambiguous_out:
        ambiguousfile = open(args.ambiguous_out, "w")
    else:
        ambiguousfile = None
    if args.fasta:
        fastafile = open(args.fasta, "w")
    else:
        fastafile = None
    if args.not_aligned:
        not_aligned_file = open(args.not_aligned, "w")
    else:
        not_aligned_file = None
    if args.out:
        outfile = open(args.out, "w")
    else:
        outfile = None

        # if outfile and samoutfile and  ambiguousfile and fastafile and not_aligned_file == None:
        # print "None of the possible output file options specified. exiting..."
        # sys.exit(1)
    # #######
    # decalre counter variables
    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    # #######

    read_counter = 0
    for alignment in reader:  # for alignment entry (line in fact) in sam file
        # iv_seq
        # print alignment
        if not paired_end:
            if read_counter % 1000000 == 0 and verbose:
                if verbose:
                    print read_counter, 'non paired-end alignments processed'
            read_name = alignment.read.name
            # read = alignment.read  # READ. Note that def invert_strand( iv ):
            read_seq = alignment.read.seq
            read_length = len(alignment.read.seq)
            if not alignment.aligned:  # check if read is aligned to ref sequence
                if alignment is not None:
                    notaligned += 1
                    if args.samout:
                        write_to_samout(samoutfile, paired_end, alignment, "not_aligned")
                    if args.not_aligned:
                        not_aligned_file.write(read_name + '\t' + 'not_aligned' + '\n')
                        # continue
            elif alignment.aligned:

                opt_fields = alignment.optional_fields
                # flag = alignment.flag
                cigar_string = parse_cigar(alignment.original_sam_line.split('\t')[
                    5])  # just the cigar string without the fancy HTseq additions
                cigar_soft_clipped, cigar_m, cigar_insertions, cigar_deletions, cigar_insertions = parse_cigar_alignment(cigar_string)  # get alignment data from cigar string
                score, md_matches, md_deletions, md_mismatches = parse_opt_fields(
                    opt_fields)  # get alignment data from md string
                percent_id = 100.0 * (
                    float(md_matches) / (float(read_length - cigar_soft_clipped + cigar_insertions + cigar_deletions)))
                if alignment[0] is not None:  # check if read is aligned to ref sequence
                    if alignment.optional_field("NH") > 1:  # check if read is mapped more than once
                        # By default these reads are discarded. CHANGE?
                        if args.samout:
                            write_to_samout(samoutfile, paired_end, alignment, "alignment_not_unique")
                        nonunique += 1
                        if args.not_aligned:
                            not_aligned_file.write(read_name + '\t' + 'alignment_not_unique' + '\n')
                            # continue
                    if alignment.aQual < minaqual:  # check quality. default is 0
                        lowqual += 1
                        if args.samout:
                            write_to_samout(samoutfile, paired_end, alignment, "too_low_aQual")
                        if args.not_aligned:
                            not_aligned_file.write(read_name + '\t' + 'too_low_aQual' + '\n')
                            # continue
                    clipped = (float(cigar_soft_clipped) / float(read_length))
                    if read_length >= min_read_len:
                        if (float(cigar_soft_clipped) / float(read_length)) <= max_clip_:
                            if score >= args.min_score:
                                if percent_id >= float(min_id):
                                    if stranded == "reverse":
                                        iv_seq = (
                                            (invert_strand(cigar_operation.ref_iv) for cigar_operation in
                                             alignment[1].cigar
                                             if cigar_operation.type == "M" and cigar_operation.size > 0))
                                    else:
                                        iv_seq = (cigar_operation.ref_iv for cigar_operation in alignment.cigar if
                                                  cigar_operation.type == "M" and cigar_operation.size > 0)
                                    iv_seq_good = True
                                    # collects hits to chromosomes/features.
                                    """
                                    cigarOperation in HTSeq:
                                    HTSeq.parse_cigar( "20M6I10M", 1000, "chr2", "+" ) #ref_iv == genomicInterval object
                                    of htSeq
                                    [< CigarOperation: 20 base(s) matched on ref iv chr2:[1000,1020)/+,query iv[0,20)>,
                                    < CigarOperation: 6 base(s) inserted on ref iv chr2:[1020,1020)/+,query iv[20,26)>,]
                                    """
                                    # if args.fasta:
                                    # fastafile.write('>' + read_name + '\n' + read_seq + '\n')

                                else:
                                    iv_seq_good = False
                                    if args.samout:
                                        write_to_samout(samoutfile, paired_end, alignment,
                                                        "percent_id_too_low=" + str(percent_id))
                                    if args.not_aligned:
                                        not_aligned_file.write(
                                            read_name + '\t' + 'percent_id_too_low=' + str(percent_id) + '\n')
                            else:
                                iv_seq_good = False
                                if args.samout:
                                    write_to_samout(samoutfile, paired_end, alignment,
                                                    'alignment_score_too_low=' + str(score))
                                if args.not_aligned:
                                    not_aligned_file.write(
                                        read_name + '\t' + 'alignment_score_too_low=' + str(score) + '\n')
                        else:
                            iv_seq_good = False
                            if args.samout:
                                write_to_samout(samoutfile, paired_end, alignment,
                                                'too_many_bases_clipped_from_read=' + str(cigar_soft_clipped))
                            if args.not_aligned:
                                not_aligned_file.write(read_name + '\t' + 'too_many_bases_clipped_from_read=' + str(
                                    cigar_soft_clipped) + '\n')
        elif paired_end:
            # print "read counter=", read_counter
            if read_counter % 100000 == 0 and verbose:
                if verbose:
                    print read_counter, 'alignment pairs processed'
            if (alignment[0] is None) or not alignment[0].aligned:
                notaligned += 1
                try:
                    read_1_name = alignment[0].read.name
                except:
                    read_1_name = 'None'
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, "not_aligned")
                if args.not_aligned:
                    not_aligned_file.write(read_1_name + '\t' + 'not_aligned' + '\n')
            elif (alignment[1] is None) or not alignment[1].aligned:
                notaligned += 1
                try:
                    read_2_name = alignment[1].read.name
                except:
                    read_2_name = 'None'
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, "not_aligned")
                if args.not_aligned:
                    not_aligned_file.write(read_2_name + '\t' + 'not_aligned' + '\n')
            else:
                # else:
                read_1_name = alignment[0].read.name
                # read_1 = alignment[0].read  #READ.
                read_1_length = len(alignment[0].read.seq)
                read_1_seq = alignment[0].read.seq
                read_2_name = alignment[1].read.name
                # read_2 = alignment[1].read  #READ.
                # read_2_length = len(alignment[1].read.seq)
                read_2_seq = alignment[1].read.seq
                iv_seq = tuple()
                if (alignment[0] is not None) and alignment[0].aligned:  # check if read is aligned to ref sequence
                    opt_1_fields = alignment[0].optional_fields
                    # flag_1 = alignment[0].flag
                    cigar_1_string = parse_cigar(alignment[0].original_sam_line.split('\t')[
                        5])  # just the cigar string without the fancy HTseq additions
                    cigar_1_soft_clipped, cigar_1_m, cigar_1_insertions, cigar_1_deletions, cigar_1_insertions = parse_cigar_alignment(
                        cigar_1_string)
                    score_1, md_1_matches, md_1_deletions, md_1_mismatches = parse_opt_fields(
                        opt_1_fields)  # get alignment data from md string
                    percent_1_id = (100.0 * ((float(md_1_matches) / (
                        float(read_1_length - cigar_1_soft_clipped + cigar_1_insertions + cigar_1_deletions)))))
                    clipped_1 = (float(cigar_1_soft_clipped) / float(read_1_length))
                    if int(read_1_length) >= int(min_read_len):
                        if (float(cigar_1_soft_clipped) / float(read_1_length)) <= float(max_clip_):

                            # if int(score_1) >= int(args.min_score):
                            if int(score_1) >= int(min_score):
                                # if float(percent_1_id) >= float(args.min_id):
                                if float(percent_1_id) >= float(min_id):
                                    if stranded == "reverse":
                                        iv_seq = itertools.chain(iv_seq, (invert_strand(cigar_operation.ref_iv) for
                                                                          cigar_operation in alignment[0].cigar if
                                                                          cigar_operation.type == "M" and cigar_operation.size > 0))
                                    else:
                                        iv_seq = itertools.chain(iv_seq, (cigar_operation.ref_iv for cigar_operation in
                                                                          alignment[0].cigar if
                                                                          cigar_operation.type == "M" and cigar_operation.size > 0))
                                    # if args.fasta:
                                    # fastafile.write('>' + read_1_name + '\n' + read_1_seq + '\n')
                                    iv_seq_good_1 = True

                                else:
                                    iv_seq_good_1 = False
                                    if args.samout:
                                        write_to_samout(samoutfile, paired_end, alignment,
                                                        "percent_id_too_low=" + str(percent_1_id))
                                    if args.not_aligned:
                                        not_aligned_file.write(
                                            read_1_name + '\t' + 'percent_id_too_low=' + str(percent_1_id) + '\n')
                            else:
                                iv_seq_good_1 = False
                                if args.samout:
                                    write_to_samout(samoutfile, paired_end, alignment,
                                                    'alignment_score_too_low=' + str(score_1))
                                if args.not_aligned:
                                    not_aligned_file.write(
                                        read_1_name + '\t' + 'alignment_score_too_low=' + str(score_1) + '\n')
                        else:
                            iv_seq_good = False
                            if args.samout:
                                write_to_samout(samoutfile, paired_end, alignment,
                                                'too_many_bases_clipped_from_read=' + str(cigar_1_soft_clipped))
                            if args.not_aligned:
                                not_aligned_file.write(read_1_name + '\t' + 'too_many_bases_clipped_from_read=' + str(
                                    cigar_1_soft_clipped) + '\n')
                # else:
                # iv_seq = tuple()

                if (alignment[1] is not None) and alignment[1].aligned:  # check if read is aligned to ref sequence
                    opt_2_fields = alignment[1].optional_fields
                    # flag_2 = alignment[1].flag  # ',  #'bit_length', 'conjugate', 'denominator', 'imag', 'numerator', 'real']
                    cigar_2_string = parse_cigar(alignment[1].original_sam_line.split('\t')[
                        5])  # just the cigar string without the fancy HTseq additions
                    cigar_2_soft_clipped, cigar_2_m, cigar_2_insertions, cigar_2_deletions, cigar_2_insertions = parse_cigar_alignment(
                        cigar_2_string)
                    score_2, md_2_matches, md_2_deletions, md_2_mismatches = parse_opt_fields(
                        opt_2_fields)  # get alignment data from md string
                    read_2_name = alignment[1].read.name
                    read_2_length = len(alignment[1].read.seq)
                    # read_2 = alignment[1].read  # READ.
                    read_2_seq = alignment[1].read.seq
                    percent_2_id = (100.0 * (float(md_2_matches) / (
                        float(read_2_length - cigar_2_soft_clipped + cigar_2_insertions + cigar_2_deletions))))
                    clipped_2 = (float(cigar_2_soft_clipped) / float(read_2_length))
                    if int(read_2_length) >= int(min_read_len):
                        if (float(cigar_2_soft_clipped) / float(read_2_length)) <= float(max_clip_):
                            if int(score_2) >= int(min_score):
                                if float(percent_2_id) >= float(min_id):
                                    if stranded == "reverse":
                                        iv_seq = itertools.chain(iv_seq, (invert_strand(cigar_operation.ref_iv) for
                                                                          cigar_operation in alignment[1].cigar if
                                                                          cigar_operation.type == "M" and cigar_operation.size > 0))
                                    else:
                                        iv_seq = itertools.chain(iv_seq, (cigar_operation.ref_iv for cigar_operation in
                                                                          alignment[1].cigar if
                                                                          cigar_operation.type == "M" and cigar_operation.size > 0))
                                        iv_seq_good_2 = True
                                    try:
                                        if (alignment[0].optional_field("NH") > 1) or (alignment[1].optional_field(
                                                "NH") > 1):
                                            # or (alignment[1].optional_field("NH") > 1): #check if read is mapped more
                                            # than once
                                            # By default these reads are discarded. CHANGE?
                                            iv_seq_good_1 = False
                                            iv_seq_good_2 = False
                                            if args.samout:
                                                write_to_samout(samoutfile, paired_end, alignment,
                                                                "alignment_not_unique")
                                                nonunique += 1
                                            if args.not_aligned:
                                                not_aligned_file.write(read_1_name + '\t' + 'not_aligned' + '\n')
                                                not_aligned_file.write(read_2_name + '\t' + 'not_aligned' + '\n')
                                                continue
                                    except KeyError:
                                        pass
                                    if (alignment[0] and alignment[0].aQual < minaqual) or (alignment[1] and alignment[1].aQual < minaqual):
                                        # check quality. default is 0
                                        iv_seq_good_2 = False
                                        lowqual += 1
                                        if args.samout:
                                            write_to_samout(samoutfile, paired_end, alignment, "too_low_aQual")
                                        if args.not_aligned:
                                            not_aligned_file.write(read_1_name + '\t' + 'not_aligned' + '\n')
                                            not_aligned_file.write(read_2_name + '\t' + 'not_aligned' + '\n')
                                        continue
                                else:
                                    iv_seq_good_2 = False
                                    if args.samout:
                                        write_to_samout(samoutfile, paired_end, alignment,
                                                        "percent_id_too_low=" + str(percent_2_id))
                                    if args.not_aligned:
                                        not_aligned_file.write(
                                            read_2_name + '\t' + 'percent_id_too_low=' + str(percent_2_id) + '\n')
                            else:
                                iv_seq_good_2 = False
                                if args.samout:
                                    write_to_samout(samoutfile, paired_end, alignment,
                                                    'alignment_score_too_low=' + str(score_2))
                                if args.not_aligned:
                                    not_aligned_file.write(
                                        read_2_name + '\t' + 'alignment_score_too_low=' + str(score_2) + '\n')
                        else:
                            iv_seq_good_2 = False
                            if args.samout:
                                write_to_samout(samoutfile, paired_end, alignment,
                                                'too_many_bases_clipped_from_read=' + str(cigar_2_soft_clipped))
                            if args.not_aligned:
                                not_aligned_file.write(read_2_name + '\t' + 'too_many_bases_clipped_from_read=' + str(
                                    cigar_2_soft_clipped) + '\n')
        read_counter += 1

        """
        overlap_mode == "union"
        will count a hit even if read is mapped across an intron or there is an insertion.
        """
        try:
            feature_set = set()
            for iv in iv_seq:
                # print iv
                if iv.chrom not in features.chrom_vectors:  # check if alignment feaure name in features from GFF file
                    # The name of a sequence (i.e., chromosome, contig, or the like).
                    # check the gff features dictionary
                    raise UnknownChrom
                for iv2, fs2 in features[iv].steps():  # fs == feature steps.
                    """
                    from HTseq manual:
                    GenomicArray objects use by default so-called StepVectors that store the data internally in steps of
                    constant value
                    """
                    feature_set = feature_set.union(fs2)
                    # print feature_set
            if feature_set is None or len(feature_set) == 0:
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, "no_feature")
                if args.not_aligned:
                    not_aligned_file.write('None' + '\t' + 'no_feature' + '\n')
                empty += 1
            elif len(feature_set) > 1:
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, "ambiguous[" + '+'.join(feature_set) + "]")
                if ambiguousfile:
                    if paired_end:
                        if iv_seq_good_1:
                            ambiguousfile.write('>' + read_1_name + '_' + "ambiguous[" + '+'.join(
                                feature_set) + "]" + '_clipped_' + str(clipped_1) + '_score_' + str(score_2) + '_percent_id_' + str(percent_1_id) + '\n' + read_1_seq + '\n')
                        if iv_seq_good_2:
                            ambiguousfile.write('>' + read_2_name + '_' + "ambiguous[" + '+'.join(
                                feature_set) + "]" + '_clipped_' + str(clipped_2) + '_score_' + str(score_2) + '_percent_id_' + str(percent_2_id) + '\n' + read_2_seq + '\n')
                    else:
                        if iv_seq_good:
                            ambiguousfile.write('>' + alignment.read.name + '_' + "ambiguous[" + '+'.join(
                                feature_set) + "]" + '_clipped_' + str(clipped) + '_score_' + str(score) + '_percent_id_' + str(percent_id) + '\n' + read_seq + '\n')

                """
                #if args.not_aligned:
                #    if paired_end:
                #    not_aligned_file.write(alignment[0].read.name + '\t' + 'ambiguous['+'+'.join(feature_set)+']' + '\n')
                #        not_aligned_file.write(alignment[1].read.name + '\t' + 'ambiguous['+'+'.join(feature_set)+']' + '\n')
                #    else:
                #    not_aligned_file.write(alignment.read.name + '\t' + 'ambiguous['+'+'.join(feature_set)+']' + '\n')
                """
                ambiguous += 1
            elif len(feature_set) == 1:
                if args.samout:
                    write_to_samout(samoutfile, paired_end, alignment, list(feature_set)[0])
                if args.fasta:
                    if paired_end:
                        if iv_seq_good_1:
                            fastafile.write('>' + read_1_name + '_' + ''.join(list(feature_set)[0]) + '_clipped_' + str(
                                clipped_1) + '_score_' + str(score_1) + '_percent_id_' + str(percent_1_id) + '\n' + read_1_seq + '\n')
                        if iv_seq_good_2:
                            fastafile.write('>' + read_2_name + '_' + ''.join(list(feature_set)[0]) + '_clipped_' + str(
                                clipped_2) + '_score_' + str(score_2) + '_percent_id_' + str(percent_2_id) + '\n' + read_2_seq + '\n')
                    else:
                        if iv_seq_good:
                            fastafile.write('>' + read_name + '_' + ''.join(list(feature_set)[0]) + '_clipped_' + str(
                                clipped) + '_score_' + str(score) + '_percent_id_' + str(percent_id) + '\n' + read_seq + '\n')

                counts[list(feature_set)[0]] += 1
        except:
            if args.samout:
                write_to_samout(samoutfile, paired_end, alignment, "__no_feature")
            empty += 1

            # if not paired_end:
            # al = alignment
            # else:
            # al = alignment[0] if alignment[0] is not None else alignment[1]

            # if args.not_aligned:
            # not_aligned_file.write(al.read.name + '\t' + 'feature_not_in_gff_file' + '\n')
            # if not verbose:
            #    print (("Warning: Skipping read '%s', because chromosome " +
            #    "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
            #     (al.read.name, iv.chrom) )
    print 'total', read_counter, 'alignments processed'
    if samoutfile is not None:
        samoutfile.close()
    if fastafile is not None:
        fastafile.close
    if not_aligned_file is not None:
        not_aligned_file.close()

    if outfile is not None:
        for feature in sorted(counts.keys()):
            outfile.write("%s\t%d\n" % (feature, counts[feature]))
        outfile.write("no_feature\t%d\n" % empty)
        outfile.write("ambiguous\t%d\n" % ambiguous)
        outfile.write("too_low_aQual\t%d\n" % lowqual)
        outfile.write("not_aligned\t%d\n" % notaligned)
        outfile.write("alignment_not_unique\t%d\n" % nonunique)
    if outfile is not None:
        outfile.close()
예제 #28
0
def build_gene_model(g, GFF_dict):
    """Return a per-base model of the coding sequence of gene ``g``.

    Every exonic base between the CDS start and the CDS end is labelled, in
    transcription order, with the string ``"<exon>_<codon>_<cdna>_<phase>"``:

    * ``exon``  - rank of the exon along the transcript (1-based, counting
      the exons that precede the one holding the start codon),
    * ``codon`` - 1-based codon number,
    * ``cdna``  - 1-based position within the coding sequence,
    * ``phase`` - position within the codon (0, 1 or 2).

    Coordinates are treated as half-open ``[start, end)`` intervals.

    Args:
        g: key of the gene in ``GFF_dict``.
        GFF_dict: mapping from gene key to a record with the entries
            ``chrom``, ``strand``, ``exonCount``, ``exonStarts``,
            ``exonEnds`` (comma-separated strings), ``cdsStart`` and
            ``cdsEnd``.

    Returns:
        An unstranded ``HTSeq.GenomicArrayOfSets`` in which each coding base
        carries its label.

    Raises:
        IndexError: if no exon contains the CDS start or the CDS end.
    """
    gene_model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    record = GFF_dict[g]
    exon_count = int(record['exonCount'])
    exon_start = [int(j) for j in record['exonStarts'].split(",")[:exon_count]]
    exon_end = [int(j) for j in record['exonEnds'].split(",")[:exon_count]]
    minus_strand = record['strand'] == "-"

    if minus_strand:
        # On the minus strand transcription runs from high to low
        # coordinates: the start codon sits at cdsEnd and the exons are
        # visited in reverse genomic order.
        start_codon = int(record['cdsEnd'])
        stop_codon = int(record['cdsStart'])
        exons = [[s, e] for s, e in zip(reversed(exon_start), reversed(exon_end))]
        start_exon = [x for x in exons if x[0] < start_codon <= x[1]][0]
        pre_exon = len([x for x in exons if x[1] >= start_codon])
        end_exon = [x for x in exons if x[0] <= stop_codon < x[1]][0]
    else:
        start_codon = int(record['cdsStart'])
        stop_codon = int(record['cdsEnd'])
        exons = [[s, e] for s, e in zip(exon_start, exon_end)]
        start_exon = [x for x in exons if x[0] <= start_codon < x[1]][0]
        pre_exon = len([x for x in exons if x[0] <= start_codon])
        end_exon = [x for x in exons if x[0] < stop_codon <= x[1]][0]

    start_index = exons.index(start_exon)
    end_index = exons.index(end_exon)

    if start_index == end_index:
        # The whole CDS lies inside one exon.  Trimming that exon separately
        # at each end would yield two overlapping segments that spill into
        # the UTRs, so emit the single [cdsStart, cdsEnd) segment instead.
        coding_segments = [[min(start_codon, stop_codon),
                            max(start_codon, stop_codon)]]
    else:
        if minus_strand:
            first_segment = [start_exon[0], start_codon]
            last_segment = [stop_codon, end_exon[1]]
        else:
            first_segment = [start_codon, start_exon[1]]
            last_segment = [end_exon[0], stop_codon]
        coding_segments = [first_segment]
        coding_segments.extend(exons[start_index + 1:end_index])
        coding_segments.append(last_segment)

    chrom = record['chrom']
    exon_no = pre_exon - 1  # incremented to the start exon's rank below
    cdna_pos = 0
    codon_n = 1
    codon_partition = 0
    for seg_start, seg_end in coding_segments:
        exon_no += 1
        if minus_strand:
            # walk the segment in transcription order (descending)
            positions = range(seg_end - 1, seg_start - 1, -1)
        else:
            positions = range(seg_start, seg_end)
        for location in positions:
            cdna_pos += 1
            if codon_partition == 3:
                codon_n += 1
                codon_partition = 0
            in_name = "%d_%d_%d_%d" % (exon_no, codon_n, cdna_pos, codon_partition)
            codon_partition += 1
            gene_model[HTSeq.GenomicInterval(chrom, location, location + 1)] += in_name
    return gene_model
예제 #29
0
	def intersectcirc(self, circ_file, modified_gtf_file):
		"""Map circle start/end intervals to the custom exon ids they overlap.

		``circ_file`` (the result file of print_start_end_file) is intersected
		with ``modified_gtf_file`` via pybedtools, using the ``wa``, ``wb``
		and ``loj`` options, i.e. the equivalent of
		``intersectBed -a <circ_file> -b <modified_gtf_file> -wa -wb -loj``.

		Returns a dict keyed by ``HTSeq.GenomicInterval`` (built from columns
		1-3 of each output line, with column 10 as strand) whose values are
		sets of ``custom_exon_id`` values parsed from column 12.  Lines whose
		column 12 is ``'.'`` are skipped.
		"""
		import pybedtools
		circ_bed = pybedtools.BedTool(circ_file)
		gtf_bed = pybedtools.BedTool(modified_gtf_file)
		overlaps = circ_bed.intersect(gtf_bed, wa=True, wb=True, loj=True)
		# circle start or end interval -> set of custom_exon_id
		circExons = {}
		for record in overlaps:
			fields = str(record).split('\t')
			attributes = fields[11]
			if attributes.strip('\n') == '.':
				# presumably the left-outer-join placeholder for "no overlap"
				continue
			interval = HTSeq.GenomicInterval(fields[0], int(fields[1]), int(fields[2]), fields[9])
			exon_ids = circExons.setdefault(interval, set())
			exon_ids.add(HTSeq.parse_GFF_attribute_string(attributes)['custom_exon_id'])
		return circExons
예제 #30
0
    def add_raw_reads_to_a_peak_region(self, peak, ga, range_to_mark=None):
        """Return per-base values for the window around the centre of a peak.

        The window runs from ``peak_center - 1000`` (or 1 when the centre is
        at or below 1000) to ``peak_center + 1000``.

        Args:
            peak: sequence whose first two items are the peak boundaries; the
                centre is their integer midpoint.
            ga: object indexed by an ``HTSeq.GenomicInterval`` whose result
                offers ``steps()`` yielding ``(interval, score)`` pairs.
                Only used when ``range_to_mark`` is None.
            range_to_mark: optional ``(start, end)`` pair.  When given, no
                scores are read; instead a 0/1 mask over the window is
                returned, with 1 on the bases inside the range.

        Returns:
            ``[0]`` when fewer than two bases lie left of the centre;
            otherwise the mask (if ``range_to_mark`` is given) or the list of
            per-base scores for the left half followed by the right half.
        """
        peak_center = int(float(peak[1] + peak[0]) / 2.)
        left_raw = []
        right_raw = []
        if peak_center <= 1000:
            left_border = 1
            # NOTE(review): to_pad is never positive in this branch, so the
            # "padding" below is always an empty list.  It looks like the
            # intent was 1000 - peak_center + 1; left unchanged because it
            # would alter the length of the returned list -- confirm.
            to_pad = peak_center - 1000 - 1
            left_raw = [0] * to_pad
        else:
            left_border = peak_center - 1000
        right_border = peak_center + 1000
        if peak_center - left_border < 2:
            return [0]
        if range_to_mark is not None:
            # Clamp the range to the window.  Deciding overlap from the
            # clamped borders (rather than from whether an endpoint falls
            # inside the window) also handles a range that fully encloses
            # the window, which must be marked everywhere.
            left_marks_border = max(range_to_mark[0], left_border)
            right_marks_border = min(range_to_mark[1], right_border)
            if right_marks_border <= left_marks_border:
                # No base of the window lies inside the range.
                return [0] * (right_border - left_border)
            marks = [0] * (left_marks_border - left_border)
            marks += [1] * (right_marks_border - left_marks_border)
            marks += [0] * (right_border - right_marks_border)
            return marks
        left_iv = HTSeq.GenomicInterval(self.chrom, int(left_border),
                                        peak_center, self.strand)
        right_iv = HTSeq.GenomicInterval(self.chrom, peak_center,
                                         peak_center + 1000, self.strand)
        # Expand each constant-value step into one entry per base.
        for iv, score in ga[left_iv].steps():
            left_raw += [score] * (iv.end - iv.start)
        for iv, score in ga[right_iv].steps():
            right_raw += [score] * (iv.end - iv.start)
        peak_raw = left_raw + right_raw
        return peak_raw
예제 #31
0
def count_reads(features, counts, pe_mode, read_seq, order, stranded,
                overlap_mode, quiet, minaqual, write_to_samout):
    """Assign reads (or read pairs) to features and print the count table.

    Each record from ``read_seq`` is either rejected (not aligned, ``NH``
    optional field greater than 1, or alignment quality below ``minaqual``)
    or resolved to a set of feature names by looking up the intervals of its
    matched ("M") CIGAR operations in ``features``.  A record that resolves
    to exactly one feature increments that feature's entry in ``counts``.
    Every record is also reported through ``write_to_samout``.

    When all records are consumed, one ``name<TAB>count`` line per entry of
    ``counts`` (sorted by name) is printed to standard output, followed by
    the ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``,
    ``__not_aligned`` and ``__alignment_not_unique`` totals.

    Args:
        features: Object indexable by interval that exposes ``chrom_vectors``
            and whose items offer ``steps()`` yielding ``(interval, set)``.
        counts: Dict mapping feature name to count; updated in place.
        pe_mode: True when ``read_seq`` holds paired-end alignments.
        read_seq: Iterable of alignment records.
        order: "name" or "pos"; selects the HTSeq pairing function and is
            only consulted when ``pe_mode`` is true.
        stranded: When equal to "reverse", the strand of the first mate (or
            of a single-end read) is inverted; otherwise the strand of the
            second mate is inverted.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: Suppress the progress messages written to stderr.
        minaqual: Records whose ``aQual`` is below this value are skipped.
        write_to_samout: Callable ``(record, assignment_string)``.

    Raises:
        ValueError: ``pe_mode`` is true and ``order`` is not "name"/"pos".
        SystemExit: ``overlap_mode`` is not one of the supported values
            (raised when the first usable record reaches the lookup).
    """
    if pe_mode:
        if order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        elif order == "pos":
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
        else:
            # Call syntax is valid on Python 2 and 3 alike.
            raise ValueError("Illegal order specified.")

    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0 and not quiet:
            sys.stderr.write("%d SAM alignment record%s processed.\n"
                             % (i, "s" if not pe_mode else " pairs"))

        i += 1
        if not pe_mode:
            if not r.aligned:
                notaligned += 1
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                # No NH field on this record: treat it as uniquely aligned.
                pass
            if r.aQual < minaqual:
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue
            if stranded != "reverse":
                iv_seq = (co.ref_iv for co in r.cigar
                          if co.type == "M" and co.size > 0)
            else:
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                          if co.type == "M" and co.size > 0)
        else:
            # r is a (first mate, second mate) pair; either may be None.
            if r[0] is not None and r[0].aligned:
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                              if co.type == "M" and co.size > 0)
            else:
                iv_seq = tuple()
            if r[1] is not None and r[1].aligned:
                # The second mate gets the opposite strand treatment.
                if stranded != "reverse":
                    iv_seq = itertools.chain(
                        iv_seq,
                        (invert_strand(co.ref_iv) for co in r[1].cigar
                         if co.type == "M" and co.size > 0))
                else:
                    iv_seq = itertools.chain(
                        iv_seq,
                        (co.ref_iv for co in r[1].cigar
                         if co.type == "M" and co.size > 0))
            else:
                if (r[0] is None) or not (r[0].aligned):
                    write_to_samout(r, "__not_aligned")
                    notaligned += 1
                    continue
            try:
                if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                        (r[1] is not None and r[1].optional_field("NH") > 1):
                    nonunique += 1
                    write_to_samout(r, "__alignment_not_unique")
                    continue
            except KeyError:
                pass
            if (r[0] and r[0].aQual < minaqual) or \
                    (r[1] and r[1].aQual < minaqual):
                lowqual += 1
                write_to_samout(r, "__too_low_aQual")
                continue

        try:
            if overlap_mode == "union":
                fs = set()
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        fs = fs.union(fs2)
            elif overlap_mode == "intersection-strict" or \
                    overlap_mode == "intersection-nonempty":
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom
                    for iv2, fs2 in features[iv].steps():
                        # "nonempty" ignores stretches without any feature.
                        if len(fs2) > 0 or overlap_mode == "intersection-strict":
                            if fs is None:
                                fs = fs2.copy()
                            else:
                                fs = fs.intersection(fs2)
            else:
                sys.exit("Illegal overlap mode.")
            if fs is None or len(fs) == 0:
                write_to_samout(r, "__no_feature")
                empty += 1
            elif len(fs) > 1:
                write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                ambiguous += 1
            else:
                assigned = list(fs)[0]
                write_to_samout(r, assigned)
                counts[assigned] += 1
        except UnknownChrom:
            # Chromosome absent from the annotation: counts as no feature.
            write_to_samout(r, "__no_feature")
            empty += 1

    if not quiet:
        sys.stderr.write("%d SAM %s processed.\n"
                         % (i, "alignments " if not pe_mode else "alignment pairs"))

    # Single-argument print(...) yields the same output on Python 2 and 3.
    for fn in sorted(counts.keys()):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
# Script fragment: unpack the command-line options into module-level names.
# `parser` is created outside the lines shown here.
args = parser.parse_args()
sample_name = args.sample_name
chr_no = args.chr_no
bam_file = args.bam_file
ref_file = args.ref_file
print("chr_no ", chr_no)
snp_file = args.snp_file
indel_file = args.indel_file
# Chromosome label used for the output BAM name: optional prefix + chr_no.
# NOTE(review): `chr` shadows the builtin of the same name.
if (args.chr_prefix):
    chr = args.chr_prefix + str(chr_no)
else:
    chr = str(chr_no)

# Load every record yielded by the FASTA reader, keyed by its `name`.
sequence = {}
for s in HTSeq.FastaReader(ref_file):
    sequence[s.name] = s
# NOTE(review): the lookup always uses the literal "chr" prefix, not
# args.chr_prefix (so it differs from `chr` above) -- confirm this is intended.
reference_seq = sequence["chr" + str(chr_no)]
pos_ref = 0
samfile = pysam.Samfile(bam_file, "rb")

# Start one `tabix` subprocess per variant file, passing chr_no as the
# argument after the file name; their stdout is piped back to this process.
# NOTE(review): chr_no is passed as-is (no prefix applied) -- verify that it
# matches the contig naming inside the SNP/indel files.
haplotyped_snp_file = subprocess.Popen(['tabix', snp_file, chr_no],
                                       stdout=subprocess.PIPE)
haplotyped_indel_file = subprocess.Popen(['tabix', indel_file, chr_no],
                                         stdout=subprocess.PIPE)

#d={'hc':0,'hd':0,'bt':0,'ot':0,'rf':0,'fr':0}

# Output BAM "haplotypeC_<chr>.bam", opened for writing with the input BAM
# passed as `template`.
haplotypeC_bam = pysam.Samfile("haplotypeC_" + chr + ".bam",
                               "wb",
                               template=samfile)
def count_reads_onto_prebuilt_features(
    sam_filename, features, feature_ids, stranded, overlap_mode, quiet, minaqual, samout, umis=False
):
    """Count SAM reads (or read pairs) against an already-built feature array.

    The SAM input is read with ``HTSeq.SAM_Reader``; whether it is paired-end
    is decided from the first record.  Each read/pair is either rejected (not
    aligned, ``NH`` > 1, quality below ``minaqual``) or resolved to a set of
    feature ids via ``features``; a unique hit increments that feature.

    Args:
        sam_filename: Path of the SAM file, or "-" to read ``sys.stdin``.
        features: Object indexable by interval that exposes ``chrom_vectors``
            and whose items offer ``steps()`` yielding ``(interval, set)``.
        feature_ids: Iterable of feature ids; each starts with a count of 0.
        stranded: When equal to "reverse" the strand of the first mate (or
            single-end read) is inverted; otherwise that of the second mate.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        quiet: Suppress HTSeq warnings and progress messages.
        minaqual: Reads whose ``aQual`` is below this value are skipped.
        samout: Path for the annotated SAM output, or "" for none.
        umis: When true, the value returned per feature is the number of
            distinct UMIs (taken from ``:UMI:<seq>:`` in the read name) rather
            than the number of reads.

    Returns:
        ``(feats, counts)``: two parallel lists holding the sorted feature ids
        followed by "no_feature", "ambiguous", "too_low_aQual", "not_aligned"
        and "alignment_not_unique", and the corresponding numbers.

    Raises:
        EmptySamError: The SAM input contains no record.
        SystemExit: ``overlap_mode`` is not one of the supported values.
    """

    def write_to_samout(r, assignment):
        # Echo the original SAM line(s) with the assignment in an XF:Z tag.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" + assignment + "\n")

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    if umis:
        # Raw string: "\w" inside a plain literal is an invalid escape.
        umi_re = re.compile(r":UMI:(\w+):")
        umi_counts = {}

        def count_umis(fs, read_name):
            umi_seq = umi_re.search(read_name).group(1)
            umi_counts[fs][umi_seq] += 1

        for feature_id in feature_ids:
            umi_counts[feature_id] = Counter()
    else:

        def count_umis(x, y):
            return None

    counts = {}
    for feature_id in feature_ids:
        counts[feature_id] = 0

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # Everything below runs under try/finally so the samout handle is closed
    # on every exit path, including errors.
    try:
        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        try:
            if sam_filename != "-":
                read_seq_file = HTSeq.SAM_Reader(sam_filename)
                read_seq = read_seq_file
                # Peek only; the reader is iterated again from the start below.
                first_read = next(iter(read_seq))
            else:
                # stdin cannot be re-read, so put the peeked record back.
                read_seq_file = HTSeq.SAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except StopIteration:
            raise EmptySamError(sam_filename)

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        # No NH field: treat the read as uniquely aligned.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M" and co.size > 0)
                else:
                    # r is a (first mate, second mate) pair; either may be None.
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0)
                        else:
                            iv_seq = (
                                invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M" and co.size > 0
                            )
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv) for co in r[1].cigar if co.type == "M" and co.size > 0),
                            )
                        else:
                            iv_seq = itertools.chain(
                                iv_seq, (co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0)
                            )
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or (
                            r[1] is not None and r[1].optional_field("NH") > 1
                        ):
                            nonunique += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "ambiguous[" + "+".join(fs) + "]")
                        ambiguous += 1
                    else:
                        assigned = list(fs)[0]
                        write_to_samout(r, assigned)
                        counts[assigned] += 1
                        # In paired-end mode r is a tuple and has no `.read`;
                        # take the name from whichever mate is present.
                        if pe_mode:
                            named = r[0] if r[0] is not None else r[1]
                        else:
                            named = r
                        count_umis(assigned, named.read.name)
                except UnknownChrom:
                    # Chromosome absent from the annotation: counted as
                    # no_feature; nothing is written to samout for it.
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs"))

        except:
            # Deliberately bare: only adds the input position, then re-raises.
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" % read_seq_file.get_line_number_string()
            )
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" % (i, "lines " if not pe_mode else "line pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # sorted feature list. features+counts
    feats = sorted(counts)
    if umis:
        counts = [len(umi_counts[fn]) for fn in feats]
    else:
        counts = [counts[fn] for fn in feats]
    # cat statistics summary to feature+count list
    feats = feats + ["no_feature", "ambiguous", "too_low_aQual", "not_aligned", "alignment_not_unique"]
    counts = counts + [empty, ambiguous, lowqual, notaligned, nonunique]
    return (feats, counts)
예제 #34
0
# Deal with any GFF file reading errors
except ValueError as e:
    e.args += ( gff.get_line_number_string(), )
    raise

try:
    # Get the first read to see if we're dealing with paired-end data
    read_seq = HTSeq.SAM_Reader(options.sam)
    first_read = iter(read_seq).next()
    pe_mode = first_read.paired_end
    
    # Re-initialize read_seq depending on if it's paired-end data or not
    read_seq = HTSeq.SAM_Reader(options.sam)
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments(read_seq)

    # Read counter, for feedback to user
    i = 0 
    total = 0
    # Here we go, through each read...
    for r in read_seq:
        spliced = False
        if not pe_mode:
            if not r.aligned:
                continue
            total += 1
            iv_seq = []

            # Check to see if it's spliced
            for co in r.cigar:
예제 #35
0
def htseq_count(data):
    """Count reads per gene for one sample and write count and stats files.

    Adapted from Simon Anders' htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html

    The settings are fixed here: overlap mode "union", feature type "exon",
    id attribute "gene_id" and a minimum alignment quality of 0.  The
    input/output paths and the strandedness are derived from ``data``.

    Args:
        data: Sample description; it is passed to ``_get_files`` and its
            "config" entry is passed to the strandedness helpers.

    Returns:
        The path of the count file.  When that file already exists nothing is
        recomputed and the path is returned immediately.

    Raises:
        SystemExit: A feature lacks the id attribute, or lacks strand
            information while counting in stranded mode.
    """

    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0

    if file_exists(out_file):
        return out_file

    logger.info("Counting reads mapping to exons in %s using %s as the "
                    "annotation and strandedness as %s." % (os.path.basename(sam_filename),
                    os.path.basename(gff_filename), _get_strandedness(data["config"])))

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Deliberately bare: only reports the GFF position, then re-raises.
        sys.stderr.write("Error occured in %s.\n"
                         % gff.get_line_number_string())
        raise

    sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n"
                         % feature_type)

    try:
        align_reader = htseq_reader(sam_filename)
        # Peek at the first record only to learn whether the data is paired.
        first_read = next(iter(align_reader))
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise

    try:
        if pe_mode:
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        else:
            # Single-end input is iterated straight from the reader; without
            # this branch `read_seq` would be undefined in the loop below.
            read_seq = align_reader
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    # No NH field: treat the read as uniquely aligned.
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar if co.type == "M"
                              and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if
                              co.type == "M" and co.size > 0)
            else:
                # r is a (first mate, second mate) pair; either may be None.
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar if
                                  co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if
                                  co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(iv_seq,
                                                 (invert_strand(co.ref_iv) for co
                                                  in r[1].cigar if co.type == "M"
                                                  and co.size > 0))
                    else:
                        iv_seq = itertools.chain(iv_seq,
                                                 (co.ref_iv for co in r[1].cigar
                                                  if co.type == "M" and co.size
                                                  > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                        r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict" or
                      overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0 or overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                # Chromosome absent from the annotation: counts as no feature.
                empty += 1

            if i % 100000 == 0:
                sys.stderr.write("%d sam %s processed.\n" %
                                 ( i, "lines " if not pe_mode else "line pairs"))

    except:
        # The reader tracks the input position in both single- and
        # paired-end mode (the pairing iterator merely wraps it).
        sys.stderr.write("Error occured in %s.\n"
                         % align_reader.get_line_number_string())
        raise

    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))

    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))

    with file_transaction(stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)

    return out_file
예제 #36
0
def count_reads_in_features(sam_filename, gff_filename, samtype, order,
                            stranded, overlap_mode, feature_type, id_attribute,
                            quiet, minaqual, samout, include_non_annotated,
                            htseq_no_ambiguous, outputDiscarded):
    """
    Annotate the reads of a SAM/BAM file with the features of a GFF file.

    This is taken from the function count_reads_in_features() from the
    script htseq-count in the HTSeq package version 0.61.p2
    The reason to do so is to fix two really small bugs related to the SAM output.
    The code of the function is small and simple so for now we
    will use the patched function here. A patch request has been sent
    to the HTSeq team.
    The description of the parameters are the same as htseq-count.
    Two parameters were added to filter out what to write in the sam output

    Every read gets an "XF" tag holding its assignment. Reads whose
    assignment passes the filters are written to ``samout`` and counted;
    the others are written to ``outputDiscarded`` when that is not None.

    :param sam_filename: path of the input SAM/BAM file
    :param gff_filename: path of the GFF annotation file
    :param samtype: "sam" or "bam"
    :param order: not used by this function
    :param stranded: "no" builds an unstranded feature array; "reverse"
                     inverts the strand of the read intervals
    :param overlap_mode: "union", "intersection-strict" or
                         "intersection-nonempty"
    :param feature_type: GFF feature type to load (third column)
    :param id_attribute: GFF attribute used as feature id
    :param quiet: not used by this function
    :param minaqual: reads with a lower alignment quality are discarded
    :param samout: path of the annotated output file
    :param include_non_annotated: when False, "__no_feature" reads are
                                  discarded instead of written to samout
    :param htseq_no_ambiguous: when True, "__ambiguous[...]" reads are
                               discarded instead of written to samout
    :param outputDiscarded: path for the discarded reads, or None
    :return: the number of records written to ``samout``
    :raises ValueError: a feature lacks the id attribute or strand
                        information, or ``samtype`` is unknown
    :raises RuntimeError: no feature of the requested type was found, the
                          input could not be opened for reading, or
                          ``overlap_mode`` is not supported

    The state (filters, output handles, counter) is kept in attributes of
    this function object, as in the original implementation.

    The HTSEQ License
    HTSeq is free software: you can redistribute it and/or modify it under the terms of
    the GNU General Public License as published by the Free Software Foundation,
    either version 3 of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

    The full text of the GNU General Public License, version 3,
    can be found here: http://www.gnu.org/licenses/gpl-3.0-standalone.html
    """
    # Set up the filters
    count_reads_in_features.filter_htseq = \
        ["__too_low_aQual", "__not_aligned", "__alignment_not_unique"]
    if not include_non_annotated:
        count_reads_in_features.filter_htseq.append("__no_feature")
    count_reads_in_features.filter_htseq_no_ambiguous = htseq_no_ambiguous

    # Open SAM/BAM output file(s); the input is opened only to serve as
    # the template for them.
    flag_write = "wb" if samtype == "bam" else "wh"
    flag_read = "rb" if samtype == "bam" else "r"
    saminfile = pysam.AlignmentFile(sam_filename, flag_read)
    try:
        count_reads_in_features.samoutfile = pysam.AlignmentFile(
            samout, flag_write, template=saminfile)
        if outputDiscarded is not None:
            count_reads_in_features.samdiscarded = pysam.AlignmentFile(
                outputDiscarded, flag_write, template=saminfile)
    finally:
        saminfile.close()

    # Counter of annotated records
    count_reads_in_features.annotated = 0

    # Function to write to SAM output
    def write_to_samout(read, assignment):
        # Creates the PySAM record
        # to_pysam_AlignedSegment is the new method in HTSeq>=0.7.0 that
        # uses the latest Pysam API and reports the correct sequences
        sam_record = read.to_pysam_AlignedSegment(
            count_reads_in_features.samoutfile)
        sam_record.set_tag("XF", assignment, "Z")
        filtered = assignment in count_reads_in_features.filter_htseq or (
            count_reads_in_features.filter_htseq_no_ambiguous
            and "__ambiguous" in assignment)
        if not filtered:
            count_reads_in_features.samoutfile.write(sam_record)
            count_reads_in_features.annotated += 1
        elif outputDiscarded is not None:
            count_reads_in_features.samdiscarded.write(sam_record)

    # From here on the output files are closed on every exit path.
    try:
        # Annotation objects
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}
        gff = HTSeq.GFF_Reader(gff_filename)

        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError("Feature %s does not contain a '%s' attribute"
                                     % (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError("Feature %s at %s does not have strand information but you are "
                                     "running htseq-count in stranded mode. Use '--stranded=no'." %
                                     (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0

        if len(counts) == 0:
            raise RuntimeError("No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        try:
            read_seq = SAM_or_BAM_Reader(sam_filename)
        except Exception:
            raise RuntimeError("Error occurred when reading beginning of SAM/BAM file.")

        for r in read_seq:
            if not r.aligned:
                write_to_samout(r, "__not_aligned")
                continue
            try:
                if r.optional_field("NH") > 1:
                    write_to_samout(r, "__alignment_not_unique")
                    # NOTE(review): there is no `continue` here, so a
                    # multi-mapped read goes on through the checks below
                    # and may be written a second time with a feature
                    # assignment. Behavior kept as is; confirm whether a
                    # `continue` is wanted.
            except KeyError:
                # No NH field: treat the read as uniquely aligned.
                pass
            if r.aQual < minaqual:
                write_to_samout(r, "__too_low_aQual")
                continue
            if stranded != "reverse":
                iv_seq = (co.ref_iv for co in r.cigar
                          if co.type == "M" and co.size > 0)
            else:
                iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                          if co.type == "M" and co.size > 0)
            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif overlap_mode == "intersection-strict" or overlap_mode == "intersection-nonempty":
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    raise RuntimeError("Illegal overlap mode.")

                if fs is None:
                    # Nothing was looked up: the read is written nowhere.
                    continue
                elif len(fs) == 0:
                    write_to_samout(r, "__no_feature")
                elif len(fs) > 1:
                    write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                else:
                    write_to_samout(r, list(fs)[0])

            except UnknownChrom:
                # Chromosome absent from the annotation: the read is dropped.
                pass
    finally:
        count_reads_in_features.samoutfile.close()
        if outputDiscarded is not None:
            count_reads_in_features.samdiscarded.close()

    return count_reads_in_features.annotated
예제 #37
0
파일: count.py 프로젝트: gturco/htseq_tools
def count_reads_in_features(sam_filename, gff_filename, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout):
    """Count SAM reads per annotated feature and print counts with RPKM.

    The annotation is loaded through ``parse_gff``.  Every single-end read
    (or read pair) is then assigned to the set of features overlapped by its
    matched ("M") CIGAR blocks, according to ``overlap_mode``, and a result
    table is printed to standard output.

    Parameters:
        sam_filename: path of the SAM file, or "-" to read standard input.
        gff_filename: annotation file, handed to ``parse_gff``.
        stranded: "no" builds an unstranded feature array; "reverse" flips
            the strand of the read intervals before the lookup; any other
            value uses the strands as reported.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: forwarded to ``parse_gff`` and used in the warning
            that is issued when no feature was found.
        id_attribute: forwarded to ``parse_gff``.
        quiet: suppress progress messages and HTSeq warnings.
        minaqual: alignments with ``aQual`` below this value are not counted.
        samout: path of an annotated SAM output file, or "" for none.

    Raises:
        SystemExit: if ``overlap_mode`` is not one of the known modes.
        Errors raised while reading the input are reported on stderr and
        re-raised unchanged.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF tag to the original SAM line(s).
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def match_intervals(almnt, invert):
        # Lazily yield the reference interval of every non-empty "M" CIGAR
        # operation, strand-inverted on request.
        if invert:
            return (invert_strand(co.ref_iv) for co in almnt.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar
                if co.type == "M" and co.size > 0)

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    samoutfile = open(samout, "w") if samout != "" else None

    # try/finally guarantees that the SAM output handle is released even when
    # parsing fails half-way through.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        counts, colgenes = parse_gff(gff_filename, features, feature_type,
                                     id_attribute, stranded, quiet, counts)

        if len(counts) == 0 and not quiet:
            sys.stderr.write(
                "Warning: No features of type '%s' found.\n" % feature_type)

        # Peek at the first record to find out whether the data is paired-end.
        try:
            if sam_filename != "-":
                sam_reader = HTSeq.SAM_Reader(sam_filename)
                read_seq = sam_reader
                first_read = next(iter(read_seq))
            else:
                sam_reader = HTSeq.SAM_Reader(sys.stdin)
                read_seq = iter(sam_reader)
                first_read = next(read_seq)
                # stdin cannot be rewound, so put the peeked record back.
                read_seq = itertools.chain([first_read], read_seq)
            pe_mode = first_read.paired_end
        except:  # noqa: E722 - only reports, the error is re-raised
            sys.stderr.write(
                "Error occured when reading first line of sam file.\n")
            raise

        reverse = stranded == "reverse"
        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    iv_seq = match_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = match_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate gets the opposite strand handling
                        # of the first one.
                        iv_seq = itertools.chain(
                            iv_seq, match_intervals(r[1], not reverse))
                    else:
                        if (r[0] is None) or not r[0].aligned:
                            write_to_samout(r, "not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or \
                            (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            # Collect everything found inside the interval.
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or \
                            overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                # "nonempty" ignores steps without features,
                                # "strict" lets them empty the intersection.
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0])
                        counts[list(fs)[0]] += 1

                except UnknownChrom:
                    # Chromosome absent from the annotation: count the read
                    # as having no feature (nothing is written to samout).
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d sam %s processed.\n" % (
                        i, "lines " if not pe_mode else "line pairs"))

        except:  # noqa: E722 - only reports, the error is re-raised
            # Ask the reader object itself for the position; the iterable
            # being consumed may be an itertools.chain (stdin) or a pairing
            # generator, neither of which can report a line number.
            sys.stderr.write(
                "Error occured in %s.\n" % sam_reader.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" % (
                i, "lines " if not pe_mode else "line pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # NOTE(review): the "<reads> <assigned>" line printed before every feature
    # row looks like leftover debugging output; it is kept so that the output
    # stays unchanged - confirm whether consumers rely on it.
    total_assigned = sum(counts.values())
    for fn in sorted(counts):
        print("%s %s" % (i, total_assigned))
        rpkm, feature_len = get_rpkm(counts[fn], i, colgenes[fn])
        print("%s\t%d\t%d\t%d" % (fn, counts[fn], feature_len, rpkm))
    print("no_feature\t%d" % empty)
    print("ambiguous\t%d" % ambiguous)
    print("too_low_aQual\t%d" % lowqual)
    print("not_aligned\t%d" % notaligned)
    print("alignment_not_unique\t%d" % nonunique)
예제 #38
0
# Copy long unaligned reads of a SAM file into a FASTQ file and save a
# histogram of the lengths of all unaligned reads next to it.
#
# usage: python <script> input.sam output.fastq
import sys
import matplotlib.pyplot as plt

USAGE = "Please enter input file (.sam) and output file (.fastq)!"
# Unaligned reads must be longer than this to be written to the FASTQ file.
MIN_READ_LENGTH = 200

if len(sys.argv) < 3:
    print(USAGE)
    sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
if not (input_file.endswith(".sam") and output_file.endswith(".fastq")):
    print(USAGE)
    sys.exit(1)
import HTSeq
import numpy as np
alignment_file = HTSeq.SAM_Reader(input_file)
len_reads = []
# The context manager closes the FASTQ file even if parsing the SAM fails.
with open(output_file, "w") as my_fastq_file:
    for aln in alignment_file:
        if not aln.aligned:
            read_length = len(aln.read.seq)
            len_reads.append(read_length)
            if read_length > MIN_READ_LENGTH:
                myread = HTSeq.SequenceWithQualities(
                    aln.read.seq, aln.read.name, aln.read.qualstr)
                myread.write_to_fastq_file(my_fastq_file)
# The IPython "%matplotlib inline" magic was dropped here: it is a syntax
# error in a plain script, and the figure is saved to disk anyway.
plt.hist(len_reads, bins=10)
plt.savefig(output_file + ".png")
예제 #39
0
      set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )
      if len( set_of_gene_names ) == 0:
         counts[ '_empty' ] += 1
      elif len( set_of_gene_names ) > 1:
         counts[ '_ambiguous' ] +=1
      else:
         for f in rs:
            counts[ f.name ] += 1
      num_reads += 1
      if num_reads % 100000 == 0:
         sys.stderr.write( "%d reads processed.\n" % num_reads )

else: # paired-end

   num_reads = 0
   for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ):
      rs = set()
      if af and ar and not af.aligned and not ar.aligned:
         counts[ '_notaligned' ] += 1
         continue
      if af and ar and not af.aQual < minaqual and ar.aQual < minaqual:
         counts[ '_lowaqual' ] += 1
         continue
      if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys():
         for cigop in af.cigar:
            if cigop.type != "M":
               continue
            if reverse:
               cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
            for iv, s in features[cigop.ref_iv].steps():
               rs = rs.union( s )
예제 #40
0
def annotate_table(exp_design_name, mouse_seq, ref_peaks_list, transcript_list, PATH_PEAKS, col_name):
    """
    Read a table of peaks and write it back with added columns:
        - motif presence score
        - relative position on the overlapping transcript
        - overlapping reference peaks and their number
        - classification flag
        - transcript annotation (only in the final summary)

    INPUT: PATH_PEAKS + col_name + '/' + exp_design_name + '_All.txt'
    OUTPUT: PATH_PEAKS + col_name + '/' + exp_design_name + '_' + col_name + '_Summary_All.txt'
                (one row per input peak)
            PATH_PEAKS + '/' + exp_design_name + '/' + exp_design_name + '_' + col_name + '_Summary.txt'
                (only peaks for which an annotated transcript was found)

    :param exp_design_name: name of the experimental design; used in the file
        names and to look up the dataset columns through ``exp_design``
    :param mouse_seq: genome sequences, indexed by chromosome name and sliced
        by position; the slice must expose a ``seq`` attribute
    :param ref_peaks_list: genomic array queried with each peak for
        overlapping reference peaks
    :param transcript_list: genomic array queried with each peak for
        overlapping ``HTSeq.GenomicFeature`` transcripts
    :param PATH_PEAKS: root folder of the peak tables
    :param col_name: sub-folder and file-name component of the tables
    :return: None
    """

    # Plain "r" is used: the former "rU" mode is rejected by Python 3.11+ and
    # universal newlines are the default in Python 3 anyway.
    with open(PATH_PEAKS + col_name + '/' + exp_design_name + '_All.txt', "r") as table_all, \
        open(PATH_PEAKS + col_name + '/' + exp_design_name + '_' + col_name + '_Summary_All.txt', "w") as annot_index_file, \
        open(PATH_PEAKS + '/' + exp_design_name + '/' + exp_design_name + '_' + col_name + '_Summary.txt', "w") as annot_final_file:

        csv_table_all = csv.DictReader(table_all, delimiter = '\t')
        df_annot = pd.read_csv(m6a_utils.PATH_ANNOT + 'gencodeVM13/gencode.vM13.annotation.entrez.uniprot_clean.txt',index_col=0, sep='\t')
        headers = ['WindowId']
        # NOTE(review): the result is unused; the call is kept in case the
        # lookup is relied on to validate the design name - confirm.
        list_biocond = exp_design.get_biocond_to_dataset('m6aExpDesign_' + exp_design_name)
        list_data = exp_design.get_data_list('m6aExpDesign_' + exp_design_name)
        headers.extend(list_data)
        headers.extend(['WindowId','Motif','Relative_pos','Ref_Peaks','Nb_ref_peaks','Classification'])
        # NOTE(review): the rows written below put peak.length under
        # 'strand_window' - confirm which of header or value is intended.
        headers.extend(['chromo_window','begin_window','end_window','strand_window','type_transcript','index_window'])
        headers.append('Transcript_ID')
        headers.extend(df_annot.columns)
        annot_index_file.write('\t'.join(headers)+'\n')
        annot_final_file.write('\t'.join(headers) + '\n')
        for row in csv_table_all:
            window_id = row['Peak_id']
            peak = HTSeq.GenomicInterval(row['chr'], int(row['start']), int(row['end']), ".")

            # Find the transcript overlapping the peak; the last step that
            # carries a feature decides.
            transcript_id = ''
            for iv, value in transcript_list[peak].steps():
                if type(value) is HTSeq.GenomicFeature:
                    transcript_id = value.attr['transcript_id']
                    if transcript_id in df_annot.index:
                        transcript_annot = df_annot.loc[transcript_id]
                        transcript_iv = HTSeq.GenomicInterval(
                            transcript_annot['chr'],
                            int(transcript_annot['begin']),
                            int(transcript_annot['end']),
                            transcript_annot['strand'])
                    else:
                        # Transcript without annotation: treat as not found.
                        transcript_id = ''

            # Calculate motif presence score
            sequence_score = 0
            sequence = mouse_seq[peak.chrom][peak.start:peak.end].seq
            for motif in m6a_utils.MOTIF_METH:
                if motif in sequence:
                    sequence_score += m6a_utils.MOTIF_METH[motif]

            # relative position on transcript (0.5 when no transcript found)
            relative_pos = 0.5
            if transcript_id != '':
                diff_start = float(peak.start + peak.length / 2) - transcript_iv.start
                if df_annot['strand'][transcript_id] == '-':
                    # NOTE(review): this uses peak.end + length/2, i.e. a point
                    # beyond the peak, whereas the '+' branch uses the peak
                    # centre - confirm this offset is intended.
                    diff_start = float(peak.end + peak.length / 2) - transcript_iv.end
                    diff_start = - diff_start

                relative_pos = diff_start / transcript_iv.length

            # Look at overlapping refpeaks
            ref_peaks = ''
            for iv, value in ref_peaks_list[peak].steps():
                if len(value):
                    ref_peaks = ref_peaks + str(value) + ';;'
            nb_ref_peaks = len(ref_peaks.split(';;')) - 1

            # apply classification: peak near a transcript end with a motif
            # score above 1
            classification = 0
            if (relative_pos < 0.3) or (relative_pos > 0.7):
                if sequence_score > 1:
                    classification = 1

            new_row = [window_id]
            for dataset in list_data:
                new_row.append(row[dataset])
            new_row.extend([window_id, str(sequence_score), str(relative_pos), ref_peaks, str(nb_ref_peaks), classification,
                   peak.chrom, str(peak.start), str(peak.end), str(peak.length)])
            if transcript_id != '':
                new_row.extend(['protein_coding', '1', transcript_id])
                for header in df_annot.columns:
                    if header == 'UniprotIDs':
                        uniprot = df_annot[header][transcript_id]
                        # A float here is a missing value (NaN) coming from
                        # pandas.  isinstance() replaces the former
                        # ``type(...) == numpy.float`` test, whose alias was
                        # removed in NumPy 1.24.
                        if uniprot != 'None' and not isinstance(uniprot, float):
                            # keep only the first identifier of the list
                            new_row.append(uniprot.split(';')[0])
                        else:
                            new_row.append('none')
                    else:
                        new_row.append(df_annot[header][transcript_id])
                annot_final_file.write('\t'.join([str(i) for i in new_row]) + '\n')
            annot_index_file.write('\t'.join([str(i) for i in new_row]) + '\n')
예제 #41
0
    """this is to cut promoter into totalbins bins and count the coverage in each bin"""
    bins = numpy.linspace(promoter.start, promoter.end, totalbins + 1)
    for i in range(totalbins):
        bin_range = HTSeq.GenomicInterval(promoter.chrom, int(bins[i]),
                                          int(bins[i + 1]), '.')
        hm_list.append(
            int(
                sum(
                    numpy.fromiter(coverage[bin_range],
                                   dtype='i',
                                   count=(int(bins[i + 1]) - int(bins[i])))) /
                (reads / 1e6)))
    return hm_list


coverage = HTSeq.GenomicArray('auto', stranded=False, typecode='i')
bedfile = open(sys.argv[1])
reads = 0
if sys.argv[6] == 'fragment':
    while True:
        line1 = bedfile.readline().rstrip()
        if not line1:
            break
        reads += 1
        line2 = bedfile.readline().rstrip()
        items1 = line1.split()
        items2 = line2.split()
        if items1[0] == items2[0]:
            chr = items1[0]
            start = min(int(items1[1]), int(items2[1]))
            end = max(int(items1[2]), int(items2[2]))
예제 #42
0
#Sort your SAM file by read ID, so that multiple mappings are in adjacent lines, and then use this script to filter the best one
#Written by Simon Anders
import sys, re
import HTSeq

insam = HTSeq.SAM_Reader(sys.stdin)

# Go through all reads, with their alignments bundled up:
for bundle in HTSeq.bundle_multiple_alignments(insam):
    bestAlmt = None
    # True while another alignment shares the best quality seen so far
    tied = False
    # Go through all alignments of a given read, looking
    # for the one with the best alignment score
    for almt in bundle:
        if bestAlmt is None or almt.aQual > bestAlmt.aQual:
            bestAlmt = almt
            tied = False
        elif almt.aQual == bestAlmt.aQual:
            # If there are more than one best alignment,
            # better skip the read.  The best alignment is kept (instead of
            # being reset) so that a later, worse alignment cannot win.
            tied = True
    if bestAlmt is not None and not tied:
        # Change the NH field to 1 and print the line
        print(re.sub(r"NH:i:\d+", "NH:i:1", bestAlmt.original_sam_line))

#call this script with the command sort samfile.sam | python chooseBest.py > filtered.sam
예제 #43
0
def count_reads_in_features(sam_filename, gff_filename, samtype, order, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout):
    """Count SAM/BAM reads per annotated feature and print the count table.

    Features of type ``feature_type`` are read from the GFF file.  Every
    single-end read (or read pair) is then assigned to the set of features
    overlapped by its matched ("M") CIGAR blocks, according to
    ``overlap_mode``.  Per-feature counts and the special "__" counters are
    printed to standard output.

    Parameters:
        sam_filename: path of the alignment file, or "-" for standard input.
        gff_filename: path of the GFF annotation file.
        samtype: "sam" or "bam"; selects the HTSeq reader class.
        order: "name" or "pos"; selects how paired-end mates are paired up.
        stranded: "no" builds an unstranded feature array; "reverse" flips
            the strand of the read intervals before the lookup; any other
            value uses the strands as reported.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: GFF feature type to count reads for.
        id_attribute: GFF attribute used as feature identifier.
        quiet: suppress progress messages.
        minaqual: alignments with ``aQual`` below this value are not counted.
        samout: path of an annotated SAM output file, or "" for none.

    Raises:
        ValueError: a feature lacks ``id_attribute`` or strand information
            (in stranded mode), or ``samtype``/``order`` is not recognised.
        SystemExit: if ``overlap_mode`` is not one of the known modes.
        Errors raised while reading the inputs are reported on stderr and
        re-raised unchanged.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF tag to the original SAM line(s).
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def match_intervals(almnt, invert):
        # Lazily yield the reference interval of every non-empty "M" CIGAR
        # operation, strand-inverted on request.
        if invert:
            return (invert_strand(co.ref_iv) for co in almnt.cigar
                    if co.type == "M" and co.size > 0)
        return (co.ref_iv for co in almnt.cigar
                if co.type == "M" and co.size > 0)

    samoutfile = open(samout, "w") if samout != "" else None

    # try/finally guarantees that the SAM output handle is released even when
    # one of the inputs turns out to be broken.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        raise ValueError(
                            "Feature %s does not contain a '%s' attribute" %
                            (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        raise ValueError(
                            "Feature %s at %s does not have strand information but you are "
                            "running htseq-count in stranded mode. Use '--stranded=no'." %
                            (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:  # noqa: E722 - only reports, the error is re-raised
            sys.stderr.write(
                "Error occured when processing GFF file (%s):\n" %
                gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        if len(counts) == 0:
            sys.stderr.write(
                "Warning: No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        # Peek at the first record to find out whether the data is paired-end.
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                # stdin cannot be rewound, so put the peeked record back.
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:  # noqa: E722 - only reports, the error is re-raised
            sys.stderr.write(
                "Error occured when reading beginning of SAM/BAM file.\n")
            raise

        reverse = stranded == "reverse"
        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d SAM alignment record%s processed.\n" %
                        (i, "s" if not pe_mode else " pairs"))

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH tag: treat the alignment as unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    iv_seq = match_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = match_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate gets the opposite strand handling
                        # of the first one.
                        iv_seq = itertools.chain(
                            iv_seq, match_intervals(r[1], not reverse))
                    else:
                        if (r[0] is None) or not r[0].aligned:
                            write_to_samout(r, "__not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or \
                            (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or \
                            overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                # "nonempty" ignores steps without features,
                                # "strict" lets them empty the intersection.
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        write_to_samout(r, list(fs)[0])
                        counts[list(fs)[0]] += 1
                except UnknownChrom:
                    # Chromosome absent from the annotation: no feature.
                    write_to_samout(r, "__no_feature")
                    empty += 1

        except:  # noqa: E722 - only reports, the error is re-raised
            sys.stderr.write(
                "Error occured when processing SAM input (%s):\n" %
                read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" % (
                i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    for fn in sorted(counts):
        print("%s\t%d" % (fn, counts[fn]))
    print("__no_feature\t%d" % empty)
    print("__ambiguous\t%d" % ambiguous)
    print("__too_low_aQual\t%d" % lowqual)
    print("__not_aligned\t%d" % notaligned)
    print("__alignment_not_unique\t%d" % nonunique)
예제 #44
0
def centipede_footprint(bed_file,
                        bam_file,
                        sites,
                        sample_name,
                        plots_dir,
                        fragmentsize=1,
                        orientation=True,
                        duplicates=True,
                        strand_specific=True):
    """
    Gets read coverage in the genomic intervals of a bed file, passes the
    coverage matrix to centipede_call_footprints and returns its result.

    :param bed_file: Bed file with the motif intervals; the window width is
        taken from the first interval.
    :type bed_file: str
    :param bam_file: Bam file opened with HTSeq.BAM_Reader and queried by
        interval.
    :type bam_file: str
    :param sites: Not used by this function.
    :param sample_name: Used in the file name of the output PDF.
    :type sample_name: str
    :param plots_dir: Directory part of the PDF path handed to
        centipede_call_footprints.
    :type plots_dir: str
    :param fragmentsize: Length every alignment interval is set to.
    :type fragmentsize: int
    :param orientation: If True, positions in windows on the "-" strand are
        counted from the opposite end of the window.
    :type orientation: bool
    :param duplicates: If False, alignments flagged as PCR or optical
        duplicates are skipped.
    :type duplicates: bool
    :param strand_specific: If True, "+" alignments and all other alignments
        are counted in two separate, concatenated halves of each row.
    :type strand_specific: bool
    :returns: The value returned by centipede_call_footprints, or an array of
        zeros (one per interval) if that call raised or its result has a
        different length than the coverage matrix.
    """
    import pybedtools
    import os
    import HTSeq
    import numpy as np

    # read in bedfile
    motifs = pybedtools.BedTool(bed_file)
    # get motif name
    motif_name = os.path.basename(bed_file.split(".")[0])
    # get motif length (length of first interval)
    motif_length = motifs[0].length

    # convert intervals to HTSeq.GenomicInterval; materialised as a list
    # because it is measured and indexed below (map() is lazy on Python 3)
    intervals = list(map(bedtools_interval_to_genomic_interval, motifs))

    # Handle bam file
    bam = HTSeq.BAM_Reader(bam_file)

    # exclude bad chroms
    chroms_exclude = ['chrM', 'chrX', 'chrY']

    # get dimensions of matrix to store profiles of Tn5 transposition
    n = len(intervals)
    m = intervals[0].length

    # create empty matrix; if "strand_specific", signal for both strands is
    # stored independently, but concatenated in the same row
    row_width = m * 2 if strand_specific else m
    coverage = np.zeros((n, row_width), dtype=np.float64)

    # Loop through intervals, get coverage, increment matrix count
    for i, feature in enumerate(intervals):
        # counter just to track
        if i % 1000 == 0:
            print(n - i)

        # Check if feature is not in bad chromosomes
        if feature.chrom in chroms_exclude:
            continue

        # Fetch alignments in interval
        for aln in bam[feature]:
            # check it's aligned
            if not aln.aligned:
                continue

            # check if duplicate
            if not duplicates and aln.pcr_or_optical_duplicate:
                continue

            aln.iv.length = fragmentsize  # adjust reads to specified size

            # get position relative to window; counted from the other end of
            # the window for "-" features when motif-oriented
            if orientation and feature.strand not in ("+", "."):
                start_in_window = feature.length - abs(feature.start -
                                                       aln.iv.end) - 1
                end_in_window = feature.length - abs(feature.start -
                                                     aln.iv.start) - 1
            else:
                start_in_window = aln.iv.start - feature.start - 1
                end_in_window = aln.iv.end - feature.start - 1

            # check fragment is within window; this is because of fragmentsize adjustment
            if start_in_window < 0 or end_in_window > feature.length:
                continue

            # add +1 to all positions overlapped by read within window
            if not strand_specific or aln.iv.strand == "+":
                coverage[i, start_in_window:end_in_window] += 1
            else:
                # second half of the row holds the other strand
                coverage[i, m + start_in_window:m + end_in_window] += 1
    # Call footprints, get posterior probabilities
    try:
        probs = centipede_call_footprints(
            coverage, np.ones([len(coverage), 1]), motif_length,
            os.path.join(plots_dir, sample_name + "." + motif_name + ".pdf"))
        if len(probs) != len(coverage):
            probs = np.zeros(len(coverage))
    except Exception:
        # if error, return zeros (KeyboardInterrupt/SystemExit still propagate)
        probs = np.zeros(len(coverage))
    return probs
예제 #45
0
def modifHTSeq(bam_filename, gff_filename, out_file, overlap_mode,
               feature_type, id_attribute, minaqual, exclude_start_distance,
               exclude_stop_distance, min_len, max_len):
    """Count aligned reads per feature, ignoring reads close to codons.

    Features of type ``feature_type`` are read from ``gff_filename`` and
    keyed by ``id_attribute`` (falling back to ``gene_id`` when a record
    lacks that attribute).  Reads from ``bam_filename`` are then assigned
    to features with the htseq-count style ``overlap_mode`` and the
    resulting table is written to ``out_file``.

    Args:
        bam_filename: Path of the BAM file with the alignments.
        gff_filename: Path of the GFF/GTF annotation.
        out_file: Path of the tab-separated count table to write.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``; anything else exits the program
            when the first read reaches the overlap step.
        feature_type: GFF feature type (third column) to count on.
        id_attribute: GFF attribute used as feature identifier.
        minaqual: Reads with a lower alignment quality are tallied as
            ``__too_low_aQual``.
        exclude_start_distance: A read whose directional start lies closer
            than this to the gene's start codon is dropped.
        exclude_stop_distance: A read whose directional end lies closer
            than this to the gene's stop codon is dropped.
        min_len: Shortest read sequence length that is considered.
        max_len: Longest read sequence length that is considered.

    Reads dropped because of their length or their distance to a codon are
    not reported in any of the summary rows.
    """

    def codon_gene(f):
        # Codon records must be keyed exactly like the counted features
        # (id_attribute, else gene_id); otherwise the per-read lookup below
        # would miss them.  Returns None when neither attribute is present.
        return f.attr.get(id_attribute, f.attr.get('gene_id'))

    # Feature lookup structure: genomic position -> set of feature ids.
    features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    counts = {}
    start_codon_sites = {}
    stop_codon_sites = {}

    gff = HTSeq.GFF_Reader(gff_filename, end_included=True)
    i = 0
    for f in gff:
        if f.type == feature_type:
            if id_attribute in f.attr:
                feature_id = f.attr[id_attribute]
            else:
                # Some CDS/exon records carry no gene_name, but gene_id is
                # expected on every record.
                feature_id = f.attr['gene_id']
            features[f.iv] += feature_id
            counts[feature_id] = 0

        # With several annotated codons per gene keep the most 5' start
        # codon and the most 3' stop codon.
        if f.type == "start_codon":
            gname = codon_gene(f)
            if gname is not None:
                site = f.iv.start_d
                if gname in start_codon_sites:
                    pick = min if f.iv.strand == "+" else max
                    site = pick(site, start_codon_sites[gname])
                start_codon_sites[gname] = site

        if f.type == "stop_codon":
            gname = codon_gene(f)
            if gname is not None:
                site = f.iv.end_d
                if gname in stop_codon_sites:
                    pick = max if f.iv.strand == "+" else min
                    site = pick(site, stop_codon_sites[gname])
                stop_codon_sites[gname] = site

        i += 1
        if i % 100000 == 0:
            sys.stderr.write("%d GFF lines processed.\n" % i)

    read_seq = HTSeq.BAM_Reader(bam_filename)

    empty = 0
    ambiguous = 0
    notaligned = 0
    lowqual = 0
    nonunique = 0
    i = 0
    for r in read_seq:
        if i > 0 and i % 100000 == 0:
            sys.stderr.write("%d SAM alignment record processed.\n" % i)
        i += 1
        if not r.aligned:
            notaligned += 1
            continue
        try:
            if r.optional_field("NH") > 1:
                nonunique += 1
                continue
        except KeyError:
            # No NH tag on this record: treat the alignment as unique, as
            # the other counting routines in this file do.
            pass
        if r.aQual < minaqual:
            lowqual += 1
            continue
        if len(r.read.seq) < min_len or len(r.read.seq) > max_len:
            continue

        iv_seq = (co.ref_iv for co in r.cigar
                  if co.type == "M" and co.size > 0)
        if overlap_mode == "union":
            fs = set()
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    fs = fs.union(fs2)
        elif overlap_mode in ("intersection-strict", "intersection-nonempty"):
            fs = None
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    if len(fs2) > 0 or overlap_mode == "intersection-strict":
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
        else:
            sys.exit("Illegal overlap mode.")

        if fs is None or len(fs) == 0:
            empty += 1
        elif len(fs) > 1:
            ambiguous += 1
        else:
            gene = list(fs)[0]
            try:
                near_start = abs(start_codon_sites[gene] -
                                 r.iv.start_d) < exclude_start_distance
                # The stop codon is only consulted when the read is not
                # already excluded by the start codon.
                near_stop = (not near_start) and abs(
                    r.iv.end_d - stop_codon_sites[gene]
                ) < exclude_stop_distance
            except KeyError:
                # The gene has no annotated start or stop codon, so the
                # distance filter does not apply and the read is counted.
                near_start = near_stop = False
            if near_start or near_stop:
                continue
            counts[gene] += 1

    with open(out_file, "w") as fout:
        fout.write("%s\t%s\n" % (id_attribute.strip(), "count"))
        for fn in sorted(counts.keys()):
            fout.write("%s\t%s\n" % (fn, counts[fn]))
        fout.write("__no_feature\t%d\n" % empty)
        fout.write("__ambiguous\t%d\n" % ambiguous)
        fout.write("__too_low_aQual\t%d\n" % lowqual)
        fout.write("__not_aligned\t%d\n" % notaligned)
        fout.write("__alignment_not_unique\t%d\n" % nonunique)
def sciRNAseq_count(sample, input_folder, exons, genes, gene_end, gene_annotat,
                    sample_ID):
    """Assign every alignment of one sample's SAM file to a gene.

    Reads ``<input_folder>/<sample>.sam`` and, for each aligned record on a
    chromosome known to ``genes``, tries in order: the intersection of the
    exon sets overlapped by its matched segments, the union of those sets,
    and then the same two steps against ``genes`` (such hits get the
    ``"_intron"`` suffix).  A single candidate is taken directly; several
    candidates are resolved with ``find_nearest_gene``.

    Writes ``<sample>.report`` (eleven ``row,cell,count`` lines) and
    ``<sample>.count`` (``annotation,cell,count`` per counted gene) into
    ``input_folder``, prints the same summary, and returns 0.
    """
    sam_file = input_folder + "/" + sample + ".sam"
    report_path = input_folder + "/" + sample + ".report"
    count_path = input_folder + "/" + sample + ".count"

    counts = collections.Counter()
    almnt_file = HTSeq.SAM_Reader(sam_file)
    cell_ID = sample_ID.index(sample) + 1

    # Slots 0-3: exon-level perfect/nearest intersect, perfect/nearest union.
    # Slots 4-7: the same four categories at gene (intron) level.
    tallies = [0] * 8

    def overlapping_gene_sets(annotation, alignment):
        # Intersection and union of the id sets under all matched segments.
        shared, merged = set(), set()
        first_step = True
        for cigop in alignment.cigar:
            if cigop.type != "M":
                continue
            for _, hits in annotation[cigop.ref_iv].steps():
                merged |= hits
                if first_step:
                    shared |= hits
                    first_step = False
                else:
                    shared &= hits
        return shared, merged

    def choose_gene(alignment, shared, merged):
        # Returns (slot offset, gene id); the slot is None when nothing hit.
        for slot, candidates in ((0, shared), (2, merged)):
            if len(candidates) == 1:
                return slot, list(candidates)[0]
            if len(candidates) > 1:
                nearest = find_nearest_gene(alignment.iv.end_d, candidates,
                                            gene_end)
                return slot + 1, nearest
        return None, None

    print("Start read the input file: " + sam_file + "....")

    for alnmt in almnt_file:
        if not alnmt.aligned or alnmt.iv.chrom not in genes.chrom_vectors:
            counts["_unmapped"] += 1
            continue

        # Exons are tried first; genes are only consulted for intronic
        # matches when no exon overlaps at all.
        for base, annotation, intronic in ((0, exons, False),
                                           (4, genes, True)):
            shared, merged = overlapping_gene_sets(annotation, alnmt)
            slot, gene_id = choose_gene(alnmt, shared, merged)
            if slot is None:
                continue
            if intronic:
                gene_id = gene_id + "_intron"
            counts[gene_id] += 1
            tallies[base + slot] += 1
            break
        else:
            counts["_no_feature"] += 1

    summary = tallies + [
        counts["_ambiguous"],
        counts["_ambiguous_intron"],
        counts["_no_feature"],
    ]
    captions = [
        "1: Perfect intersect exon match: ",
        "2: Nearest intersect exon match: ",
        "3: Perfect combine exon match: ",
        "4: Nearest combine exon match: ",
        "5: Perfect intersect gene match: ",
        "6: Nearest intersect gene match: ",
        "7: Perfect combine gene match: ",
        "8: Nearest combine gene match: ",
        "9: ambiguous match for exons: ",
        "10: ambiguous match for genes: ",
        "11: No match: ",
    ]

    print("File name: ", sam_file)
    for caption, value in zip(captions, summary):
        print(caption, value)
    print("Sam file analysis finished~")

    with open(report_path, 'w') as report_handle:
        for row_no, value in enumerate(summary, 1):
            report_handle.write(
                str(row_no) + "," + str(cell_ID) + "," + str(value) + "\n")

    # Bookkeeping keys are reported above and kept out of the gene table.
    bookkeeping = ("_unmapped", "_ambiguous", "_ambiguous_intron",
                   "_no_feature")
    with open(count_path, 'w') as count_handle:
        for gene in counts:
            if gene in bookkeeping:
                continue
            count_handle.write(
                str(gene_annotat.loc[gene, 4]) + "," + str(cell_ID) + "," +
                str(counts[gene]) + "\n")
    return 0
예제 #47
0
def count_reads_in_features(sam_filename, gff_filename, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout, allow_ambiguous,
                            allow_nonunique):
    """Map reads onto features and print their offsets within each feature.

    Features of type ``feature_type`` are loaded from ``gff_filename`` and
    grouped by ``id_attribute``; the exons of every feature are sorted in
    transcription order.  Each read (or read pair) that passes the filters
    and is assigned to at least one feature produces, for every overlapped
    step and feature, one tab-separated line on stdout::

        feature_id <TAB> start_offset <TAB> end_offset <TAB> read_name

    where the offsets are 0-based and inclusive on both ends.

    Args:
        sam_filename: Path of the SAM file, or ``"-"`` to read stdin.
        gff_filename: Path of the GFF annotation.
        stranded: ``"no"`` builds an unstranded feature array;
            ``"reverse"`` inverts the strand of the reads.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        feature_type: GFF feature type to load.
        id_attribute: GFF attribute used as feature identifier.
        quiet: Suppress progress messages and HTSeq warnings.
        minaqual: Minimum alignment quality of a read.
        samout: Path of an annotated SAM output file, or ``""`` for none.
        allow_ambiguous: ``"no"`` rejects reads overlapping several features.
        allow_nonunique: ``"no"`` rejects reads whose NH tag exceeds 1.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as XF tag to the read's original SAM line(s).
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def transcript_offset(feature_id, step_iv):
        # Sum the lengths of the exons preceding the one that contains
        # step_iv, then add the distance into that exon.
        offset = 0
        for exon in features_dict[feature_id]:
            if step_iv.start >= exon.iv.start and step_iv.end <= exon.iv.end:
                if exon.iv.strand == "+":
                    offset += step_iv.start - exon.iv.start
                else:
                    offset += exon.iv.end - step_iv.end
                break
            offset += exon.iv.end - exon.iv.start
        return offset

    if quiet:
        warnings.filterwarnings(action="ignore", module="HTSeq")

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # Everything below runs under try/finally so that the samout file is
    # closed even when an error or sys.exit interrupts the run.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        features_dict = defaultdict(list)
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        sys.exit("Feature %s does not contain a '%s' attribute" %
                                 (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        sys.exit("Feature %s at %s does not have strand information but you are "
                                 "running htseq-count in stranded mode. Use '--stranded=no'." %
                                 (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                    features_dict[feature_id].append(f)
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # Report where the GFF went wrong (also for sys.exit), then
            # re-raise unchanged.
            sys.stderr.write("Error occured in %s.\n" %
                             gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        # Order the exons of each feature in transcription direction so that
        # offsets accumulate from the 5' end.
        sys.stderr.write("Sorting exons from GFF file.\n")
        for key in features_dict:
            exons = features_dict[key]
            features_dict[key] = sorted(
                exons, key=lambda feat: feat.iv.start,
                reverse=(exons[0].iv.strand == "-"))

        if len(counts) == 0 and not quiet:
            sys.stderr.write("Warning: No features of type '%s' found.\n" %
                             feature_type)

        try:
            # sam_reader is kept separately because read_seq may be wrapped
            # below and then no longer offers get_line_number_string().
            if sam_filename != "-":
                sam_reader = HTSeq.SAM_Reader(sam_filename)
                read_seq = sam_reader
                first_read = next(iter(read_seq))
            else:
                sam_reader = HTSeq.SAM_Reader(sys.stdin)
                read_seq = iter(sam_reader)
                first_read = next(read_seq)
                # stdin cannot be re-read, so push the peeked record back.
                read_seq = itertools.chain([first_read], read_seq)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write(
                "Error occured when reading first line of sam file.\n")
            raise

        try:
            if pe_mode:
                read_seq = HTSeq.pair_SAM_alignments(read_seq)
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            i = 0
            for r in read_seq:
                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "not_aligned")
                        continue
                    try:
                        if (allow_nonunique == "no") and (r.optional_field("NH") > 1):
                            write_to_samout(r, "alignment_not_unique")
                            nonunique += 1
                            continue
                    except KeyError:
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r.cigar
                                  if co.type == "M")
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                                  if co.type == "M")
                else:
                    if r[0] is not None and r[0].aligned:
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r[0].cigar
                                      if co.type == "M")
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r[0].cigar if co.type == "M")
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate is handled with the opposite
                        # strand orientation to the first.
                        if stranded != "reverse":
                            iv_seq = itertools.chain(
                                iv_seq,
                                (invert_strand(co.ref_iv)
                                 for co in r[1].cigar if co.type == "M"))
                        else:
                            iv_seq = itertools.chain(
                                iv_seq,
                                (co.ref_iv for co in r[1].cigar
                                 if co.type == "M"))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (allow_nonunique == "no") and (
                                (r[0] is not None and
                                 r[0].optional_field("NH") > 1) or
                                (r[1] is not None and
                                 r[1].optional_field("NH") > 1)):
                            nonunique += 1
                            write_to_samout(r, "alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or \
                            (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "too_low_aQual")
                        continue

                try:
                    # Materialised because the intervals are walked twice:
                    # once to assign the read, once to emit offsets.
                    iv_seq = list(iv_seq)
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or \
                            overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                if len(fs2) > 0 or \
                                        overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")

                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "no_feature")
                        empty += 1
                    elif (len(fs) > 1) and (allow_ambiguous == "no"):
                        write_to_samout(r, "ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        if pe_mode:
                            rname = (r[0].read.name if r[0] is not None
                                     else r[1].read.name)
                        else:
                            rname = r.read.name
                        for iv in iv_seq:
                            for iv2, fs2 in features[iv].steps():
                                for fsi in fs2:
                                    offset = transcript_offset(fsi, iv2)
                                    # output is 0-based, inclusive on both ends
                                    sys.stdout.write(
                                        "%s\t%d\t%d\t%s\n" %
                                        (fsi, offset,
                                         offset + (iv2.end - iv2.start - 1),
                                         rname))
                except UnknownChrom:
                    # Reads on chromosomes absent from the GFF count as
                    # having no feature.
                    empty += 1

                if i % 100000 == 0 and not quiet:
                    sys.stderr.write(
                        "%d sam %s processed.\n" %
                        (i, "lines " if not pe_mode else "line pairs"))

        except:
            sys.stderr.write("Error occured in %s.\n" %
                             sam_reader.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d sam %s processed.\n" %
                             (i, "lines " if not pe_mode else "line pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()
예제 #48
0
def _primary_alignments_by_read(sam_path):
    """Map read name -> primary alignment, leaving out chimeric reads."""
    primary = {}
    chimeric = set()
    for alnm in HTSeq.SAM_Reader(sam_path):
        qname = alnm.read.name
        if alnm.supplementary:
            # A supplementary record marks the read as chimeric; remember
            # the name so the read stays excluded regardless of whether its
            # primary record comes before or after this one.
            chimeric.add(qname)
            primary.pop(qname, None)
        elif (alnm.aligned and not alnm.not_primary_alignment
              and qname not in chimeric):
            primary[qname] = alnm
    return primary


def _rounded_share(count, total):
    """Return count/total rounded to 4 digits, or 0.0 for an empty total."""
    if not total:
        return 0.0
    return round(count / float(total), 4)


def intron_retention(outfile, ref_t):
    """Estimate a two-state Markov model of intron retention (IR).

    Reads ``<outfile>_addedintron.gff3``, ``<outfile>_genome_alnm.sam`` and
    ``<outfile>_transcriptome_alnm.sam``.  For every read with a primary
    alignment in both SAM files, the introns of the transcript it aligned
    to are walked in GFF order and each is classified as retained or not.
    The probability of the first intron's state and the transition
    probabilities between consecutive introns are written to
    ``<outfile>_IR_markov_model``.  A row without any observation is
    written as ``0.0`` for both columns.

    Args:
        outfile: Common prefix of the input and output files.
        ref_t: Not used by this function.

    Raises:
        IndexError: If one of the two SAM files does not exist.
    """
    gff_file = outfile + "_addedintron.gff3"
    talnm_file = glob.glob(outfile + "_transcriptome_alnm.sam")[0]
    galnm_file = glob.glob(outfile + "_genome_alnm.sam")[0]

    # read intron information from GFF file
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Reading intron coordinates from GFF file\n")
    gff_features = HTSeq.GFF_Reader(gff_file, end_included=True)
    features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    dict_intron_info = {}
    for feature in gff_features:
        if "Parent" in feature.attr:
            info = feature.attr["Parent"].split(':')
            if info[0] == "transcript":
                feature_id = info[1]
                if feature_id not in dict_intron_info:
                    dict_intron_info[feature_id] = []
        if feature.type == "intron":
            # NOTE(review): feature_id is whatever the most recent
            # "transcript:" Parent set; this relies on every intron record
            # carrying such a Parent -- confirm against the GFF producer.
            features[feature.iv] += feature_id
            dict_intron_info[feature_id].append((feature.iv.start, feature.iv.end, feature.iv.length))

    # read primary genome alignment for each read
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read primary genome alignment for each read\n")
    dict_g_alnm = _primary_alignments_by_read(galnm_file)

    # read primary transcriptome alignment for each read
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read primary transcriptome alignment for each read\n")
    dict_t_alnm = _primary_alignments_by_read(talnm_file)

    # count the length of Intron retention events
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Calculating probabilites for each intron retention event\n")
    dict_first_intron_state = {False: 0, True: 0}
    dict_states = {(False, False): 0, (False, True): 0, (True, False): 0, (True, True): 0}
    for qname, galnm in dict_g_alnm.items():
        if qname not in dict_t_alnm:
            continue
        talnm = dict_t_alnm[qname]
        primary_trx = talnm.iv.chrom.split(".")[0]
        # NOTE(review): `stranded` and `invert_strand` are not defined in
        # this function; presumably module-level names -- confirm.
        if stranded != "reverse":
            iv_seq = (co.ref_iv for co in galnm.cigar
                      if co.type in ('M', '=', 'X', 'D') and co.size > 0)
        else:
            iv_seq = (invert_strand(co.ref_iv) for co in galnm.cigar
                      if co.type in ('M', '=', 'X', 'D') and co.size > 0)

        list_IR_positions = []
        pos = []
        ir_info = False
        try:
            length_IR = 0
            for iv in iv_seq:
                for iv2, fs2 in features[iv].steps():
                    if primary_trx in fs2:
                        # Still inside an intron of this transcript: extend
                        # the covered stretch.
                        length_IR += iv2.length
                        pos.append(iv2.start)
                        pos.append(iv2.end)
                    else:
                        if length_IR != 0:
                            # Left the intron: it counts as retained only
                            # if the covered length equals a full intron.
                            for intron in dict_intron_info[primary_trx]:
                                if length_IR == intron[2]:
                                    list_IR_positions.append(min(pos))
                                    list_IR_positions.append(max(pos))
                                    ir_info = True
                            length_IR = 0
                            pos = []
        except UnknownChrom:
            ir_info = False

        if not ir_info:
            introns = dict_intron_info.get(primary_trx)
            if introns:
                # No retention on this read: the first intron and every
                # transition between consecutive introns are "no IR".
                dict_first_intron_state[False] += 1
                dict_states[(False, False)] += len(introns) - 1
            continue

        introns = dict_intron_info[primary_trx]

        def is_retained(intron):
            # An intron is retained when any recorded IR boundary falls
            # within its [start, end] span.
            return any(intron[0] <= IR_pos <= intron[1]
                       for IR_pos in list_IR_positions)

        previous_state = is_retained(introns[0])
        dict_first_intron_state[previous_state] += 1
        for intron in introns[1:]:
            current_state = is_retained(intron)
            dict_states[(previous_state, current_state)] += 1
            previous_state = current_state

    sum_first_introns = dict_first_intron_state[True] + dict_first_intron_state[False]
    sum_for_noIR = dict_states[(False, False)] + dict_states[(False, True)]
    sum_for_IR = dict_states[(True, False)] + dict_states[(True, True)]

    with open(outfile + "_IR_markov_model", 'w') as fout:
        fout.write("succedent\tno_IR\tIR\n")
        fout.write("start\t"
                   + str(_rounded_share(dict_first_intron_state[False], sum_first_introns)) + "\t"
                   + str(_rounded_share(dict_first_intron_state[True], sum_first_introns)) + "\n")
        fout.write("no_IR\t"
                   + str(_rounded_share(dict_states[(False, False)], sum_for_noIR)) + "\t"
                   + str(_rounded_share(dict_states[(False, True)], sum_for_noIR)) + "\n")
        fout.write("IR\t"
                   + str(_rounded_share(dict_states[(True, False)], sum_for_IR)) + "\t"
                   + str(_rounded_share(dict_states[(True, True)], sum_for_IR)) + "\n")
예제 #49
0
    elif file[-13:-11] == 'A5':
        A5list = [x.split('\t')[1] for x in open(file).readlines()[1:]]
    elif file[-13:-11] == 'MX':
        MXlist = [x.split('\t')[1] for x in open(file).readlines()[1:]]
    elif file[-13:-11] == 'AL':
        ALlist = [x.split('\t')[1] for x in open(file).readlines()[1:]]
    elif file[-13:-11] == 'AF':
        AFlist = [x.split('\t')[1] for x in open(file).readlines()[1:]]

# GFF feature types that start a new gene record / that are treated as exons.
GENE = Set(["gene", "pseudogene", "transposable_element_gene"])
EXON = Set(["exon", "pseudogenic_exon"])

# Total number of lines in the GFF file; compared with the running feature
# counter below so that the last gene is flushed when the final line is hit.
# NOTE(review): this only lines up if every line yields one feature
# (no comment or blank lines) -- confirm for the input files.
num_lines = sum(1 for line in open(name_gff))

file_gff = open(name_gff, 'r')
gff_file = HTSeq.GFF_Reader(file_gff)

count = 0           # number of genes seen with exactly two transcripts
transcript = set()  # collected for the gene currently being scanned
lines = 0           # features read so far
gene_dict = {}      # gene ID -> len(transcript), only for genes counted above

for feature in gff_file:
    lines += 1
    # A new gene-type feature (or the last line) closes the previous gene.
    if feature.type in GENE or lines == num_lines:
        if len(transcript) == 2:
            count += 1
            gene_dict[gene_cand.attr["ID"]] = len(transcript)
        # Start collecting for the gene that just began.
        gene_cand = feature
        transcript.clear()
    # The body of this branch is cut off in this excerpt.
    if feature.type in EXON:
예제 #50
0
def Get_IPAevent(input_tuple):
    """Scan the introns of one annotation label for candidate IPA events.

    NOTE(review): "IPA" presumably stands for intronic polyadenylation --
    confirm against the project documentation.

    Args:
        input_tuple: a ``(label, all_bamfiles)`` pair. ``label`` is used as a
            key into the module-level ``annot`` mapping; ``all_bamfiles`` is
            an iterable of BAM file paths, each opened with
            ``HTSeq.BAM_Reader``.

    Returns:
        A list with one entry per accepted intron, each entry being whatever
        ``Get_IPAsite_IPUI`` returns for that intron.
    """
    label,all_bamfiles = input_tuple
    curr_label_all_gas = []
    curr_label_all_ga = []
    curr_label_all_gene_count = []
    IPA_result = []
    # Minimum value of gene_count[('intron', rank)] a BAM file must exceed
    # for its coverage to be used for that intron.
    min_count = 30
    # Collect the per-BAM data structures for this label, in the order of
    # all_bamfiles.
    for bamfile in all_bamfiles:
        bam_reader = HTSeq.BAM_Reader(bamfile)
        gas,ga,gene_count = Get_label_information(label,annot,bam_reader)
        curr_label_all_gas.append(gas)
        curr_label_all_ga.append(ga)
        curr_label_all_gene_count.append(gene_count)
    # Each annotation record is a 9-field tuple; only introns longer than 250
    # are examined.
    for feature,rank,chrom,start,end,strand,length,exon_rank_left,exon_rank_right in annot[label]:
        if feature == "intron" and int(length)>250:
            # Keep the original intron bounds: `start`/`end` may be rebound
            # below when a "Skipped" site is found.
            intron_start = start
            intron_end = end
            # Default reference count used in the skip-start filter near the
            # end (compared as value > end_value*0.8); replaced by the
            # observed count when a "Skipped" site is found.
            end_value = 15
            # Indices of the BAM files whose count for this intron exceeds
            # min_count.
            index_list = [index for index,gene_count in enumerate(curr_label_all_gene_count) if gene_count[('intron',rank)]> min_count]
            if index_list != []:
                iv = HTSeq.GenomicInterval(chrom,intron_start,intron_end,strand)
                IPAtype = "Composite"
                # One coverage list per selected BAM file. For '-' strand
                # introns the list is reversed, matching the later
                # `end - IPA_point` coordinate computation.
                curr_label_all_cov = []
                for index in index_list:
                    if strand == "-":
                        curr_label_all_cov.append(list(curr_label_all_ga[index][iv])[::-1])
                    else:
                        curr_label_all_cov.append(list(curr_label_all_ga[index][iv]))
                intron_region = chrom+":"+str(intron_start)+"-"+str(intron_end)
                # One dict per BAM file (all of them, not only index_list);
                # keys are compared as positions and values as counts below.
                skipend_dict_list = [Get_Skipend_dict(intron_region,bamfile,strand) for bamfile in all_bamfiles]
                # Look for the first entry lying more than 50 inside both
                # intron bounds with a count above 10. On a hit, drop the
                # coverage before that position, mark the event "Skipped",
                # and rebind start/end to the hit position.
                for index,skipend_dict in enumerate(skipend_dict_list):
                    for key,value in skipend_dict.items():
                        if int(start)+50 < int(key) < int(end)-50 and int(value) > 10:
                            if strand == "+":
                                skip_position = int(key)-int(start)
                            else:
                                skip_position = int(end)-int(key)
                            curr_label_all_cov = [cvg_region[skip_position:] for cvg_region in curr_label_all_cov]
                            IPAtype = "Skipped"
                            start = int(key)
                            end = int(key)
                            end_value = int(value)
                            break
                    else:
                        # Inner loop finished without a hit: try the next dict.
                        continue
                    # Inner loop hit: stop scanning the remaining dicts too.
                    break
                # NOTE(review): Get_min_mseratio_list is defined elsewhere; it
                # is used here as returning two parallel lists (ratios and
                # candidate points) -- confirm.
                min_mseratio_list,min_mse_point_list = Get_min_mseratio_list(curr_label_all_cov)
                min_mseratio = min(min_mseratio_list)
                min_mseratio_index = min_mseratio_list.index(min_mseratio)
                if min_mseratio < 0.5:
                    # Refine around the point that gave the smallest ratio.
                    min_mseratio_list_refine,min_mse_point_list_refine = Get_min_mseratio_list_refine(curr_label_all_cov,min_mse_point_list[min_mseratio_index])
                    min_mseratio_refine = min(min_mseratio_list_refine)
                    min_mseratio_index_refine = min_mseratio_list_refine.index(min_mseratio_refine)
                    # Offset of the candidate site within the coverage lists.
                    IPA_point = int(min_mse_point_list_refine[min_mseratio_index_refine])
                    # Largest (mean before the point - mean after the point)
                    # over the coverage lists.
                    up_down_diff = max([np.mean(coverage[:IPA_point])-np.mean(coverage[IPA_point:]) for coverage in curr_label_all_cov])
                    # Largest fraction of positions before the point with
                    # coverage above 5.
                    upstream_cov = max([len(list(filter(lambda x:x>5,coverage[:IPA_point])))/IPA_point for coverage in curr_label_all_cov])
                    # Mean fraction of positions from the point onward with
                    # coverage above 5.
                    downstream_cov = np.mean([len(list(filter(lambda x:x>5,coverage[IPA_point:])))/(len(coverage)-IPA_point) for coverage in curr_label_all_cov])
                    if min_mseratio_refine < 0.5 and up_down_diff > 1 and upstream_cov > 0.8 and downstream_cov < 0.5:
                        # Convert the offset back to a coordinate; `start` and
                        # `end` here may be the rebound "Skipped" position.
                        if strand == "+":
                            IPA_location = int(start)+IPA_point
                            IPA_inf = chrom+":"+str(start)+"-"+str(IPA_location)
                        else:
                            IPA_location = int(end)-IPA_point
                            IPA_inf = chrom+":"+str(IPA_location)+"-"+str(end)
                        skipstart_dict = Get_Skipstart_dict(intron_region,all_bamfiles,strand)
                        # Reject the candidate if any skip-start entry lies
                        # within 20 of it with a count above 80% of end_value;
                        # the `else` branch runs only when no entry matched.
                        for key,value in skipstart_dict.items():
                            if IPA_location-20<int(key)<IPA_location+20 and int(value) > end_value*0.8:
                                break
                        else:
                            intronPA_inf = label + ";"+feature + "_" + str(rank) + ";" + IPA_inf + ";" +  IPAtype
                            # NOTE(review): `gas` is the value left over from
                            # the last iteration of the BAM loop above;
                            # curr_label_all_gas is filled but not read in
                            # this function -- confirm this is intended.
                            IPA_information = Get_IPAsite_IPUI((intronPA_inf,curr_label_all_ga,gas))
                            IPA_result.append(IPA_information)
    return IPA_result
예제 #51
0
def test_bam_inconsistent_mate():
    """Read every record of the example BAM file; finishing without an
    exception counts as a pass."""
    print('Test inconsistent BAM file')
    reader = HTSeq.BAM_Reader("example_data/inconsistent_mate.bam")
    # The records are not inspected; the reader only has to be exhausted.
    for _record in reader:
        continue
    print("Test passed")
예제 #52
0
try:
    import HTSeq
except ImportError:
    # Without HTSeq nothing below can run: explain and stop.
    sys.stderr.write(
        "Could not import HTSeq. Please install the HTSeq Python framework\n"
    )
    sys.stderr.write(
        "available from http://www-huber.embl.de/users/anders/HTSeq\n"
    )
    sys.exit(1)

# Command line: <input GTF file> <output file>
gtf_file = sys.argv[1]
out_file = sys.argv[2]

# Step 1: Record every exon, tagged with its (gene_id, transcript_id) pair,
# in a strand-aware GenomicArrayOfSets.

exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
for f in HTSeq.GFF_Reader(gtf_file):
    if f.type == "exon":
        # Colons in gene IDs are replaced by underscores before storing.
        f.attr['gene_id'] = f.attr['gene_id'].replace(":", "_")
        exons[f.iv] += (f.attr['gene_id'], f.attr['transcript_id'])

# Step 2: Group overlapping genes

# 'gene_sets' maps every gene ID to a set of gene IDs: the genes that overlap
# it, i.e. share at least one base on the same strand. Each gene ID is a key,
# and its value is the set that contains that gene.
# Every such set makes up one 'aggregate gene'.

gene_sets = collections.defaultdict(set)
예제 #53
0
def split_by_barcode(initial_filename):
    """Split a FASTQ file into one FASTQ file per recognised barcode.

    A read's barcode is taken from ``read.seq[3:7]`` (the four bases at
    0-based positions 3 to 6). Reads whose barcode is in the table below are
    written to that barcode's file; all other reads are written to
    ``no_recognized_barcode_lane3.fastq``. Output files are created in the
    current working directory with the hard-coded lane 3 names.

    Args:
        initial_filename: path of the input FASTQ file, passed to
            ``HTSeq.FastqReader``.
    """
    # NOTE: earlier barcode tables for other lanes/experiments used to live
    # here inside unused string literals; they were dead code and have been
    # dropped (recover them from version control if needed).
    barcode_to_filename = {
        'CAAT': 'n2_sp_lane3_rt3.fastq',
        'AATA': 'n2_sp_lane3_rt15.fastq',
        'TTAA': 'n2_sp_lane3_rt16.fastq',
    }
    outfiles = {}
    missingf = None
    try:
        for bc, filename in barcode_to_filename.items():
            outfiles[bc] = open(filename, 'w')
        missingf = open('no_recognized_barcode_lane3.fastq', 'w')

        fastq_file = HTSeq.FastqReader(initial_filename)
        total_reads = 0
        for read in fastq_file:
            total_reads += 1
            if not total_reads % 100000:
                # A parenthesised single argument prints identically under
                # Python 2 and Python 3.
                print("Read: %i " % (total_reads))
            # Barcodes are unique dict keys, so a direct lookup replaces the
            # scan over every barcode; unknown barcodes go to missingf.
            destination = outfiles.get(read.seq[3:7], missingf)
            read.write_to_fastq_file(destination)
    finally:
        # Close every handle that was opened, including the "missing" file
        # (previously leaked), even when reading or writing fails part-way.
        for handle in outfiles.values():
            handle.close()
        if missingf is not None:
            missingf.close()
def get_profile(coverage, chrom, start, end, strand):
    """Return the values of `coverage` over the given interval as a NumPy
    integer array holding exactly ``end - start`` elements."""
    region = HTSeq.GenomicInterval(chrom, start, end, strand)
    n_positions = end - start
    return np.fromiter(coverage[region], dtype='i', count=n_positions)
예제 #55
0
def extract_np_arrays(cov_array, seqid, length):
    """Return the ``.array`` attribute of `cov_array` over 0..length of
    `seqid` for the "+" strand and the "-" strand, as a (plus, minus) tuple."""
    return tuple(
        cov_array[HTSeq.GenomicInterval(seqid, 0, length, strand)].array
        for strand in ("+", "-")
    )
def main(argv):
    """Write per-variant-type bedGraph and GVF files from a variant call table.

    For every variant type returned by ``VariantCallTabReader`` this builds
    an integer ``HTSeq.GenomicArray`` counting the variant calls that cover
    each position, plus a ``HTSeq.GenomicArrayOfSets`` holding the variant
    IDs, and writes ``<outdir>/<type>_dbVar.bedgraph`` and
    ``<outdir>/<type>_dbVar.gvf``.

    Args:
        argv: list of command-line tokens handed to ``OptionParser``.
            It must contain ``-r/--chromsize``, ``-v/--variantfile`` and
            ``-o/--outdir``, each followed by its value.

    Exits with status 1 (after printing the help text) when fewer than six
    tokens are supplied.
    """
    parser = OptionParser()
    parser.add_option("-r",
                      "--chromsize",
                      action="store",
                      type="string",
                      dest="chromsize",
                      help="GRCh38 chromosome size file",
                      metavar="<str>")
    parser.add_option("-v",
                      "--variantfile",
                      action="store",
                      type="string",
                      dest="variantfile",
                      metavar="<file>",
                      help="the variant calls files in a specific format")
    parser.add_option("-o",
                      "--outdir",
                      action="store",
                      type="string",
                      dest="outdir",
                      metavar="<file>",
                      help="the directory to store the output files")

    (opt, args) = parser.parse_args(argv)
    # Three options, each with a value, make six tokens at minimum.
    if len(argv) < 6:
        parser.print_help()
        sys.exit(1)

    # Read the chromosome sizes. Each line is tab-separated with the size in
    # the first column and the chromosome name in the second.
    chrom_size = {}
    with open(opt.chromsize, 'r') as chrom_size_file:
        for line in chrom_size_file:
            pline = line.strip()
            if not pline:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            sline = pline.split('\t')
            chrom_size[sline[1]] = int(sline[0])

    # Parse the variant table once; element 0 holds the intervals and
    # element 1 the IDs, both keyed by variant type.
    variant_tables = VariantCallTabReader(opt.variantfile, chrom_size)
    var_types_ga = variant_tables[0]
    var_types_id = variant_tables[1]
    for var_type in var_types_ga.keys():
        # Create a 'Genomic Array' using the HTSeq package
        ga = HTSeq.GenomicArray(chrom_size, stranded=False, typecode="i")
        nssd = HTSeq.GenomicArrayOfSets(chrom_size, stranded=False)

        variant_interval = var_types_ga[var_type]
        variant_id = var_types_id[var_type]

        # Get the count of variant calls in each region
        variant_num = len(variant_interval)
        print("For " + var_type + ", there are " + str(
            variant_num) + " variant calls from the clustersed studies...")
        skipped = 0
        for i, iv in enumerate(variant_interval):
            try:
                ga[iv] += 1
                nssd[iv] += variant_id[i]
            except Exception:
                # Best effort, as before: an interval that cannot be stored
                # is skipped. The original handler was a bare `except:`
                # whose body was the no-op comparison `iv.length == 0`
                # (presumably a note that zero-length intervals are the
                # expected failure); it also swallowed KeyboardInterrupt
                # and SystemExit.
                skipped += 1
        if skipped:
            sys.stderr.write(
                "warning: skipped %d %s interval(s) that could not be "
                "stored\n" % (skipped, var_type))

        bedgraph = opt.outdir + '/' + var_type + '_dbVar.bedgraph'
        ga.write_bedgraph_file(bedgraph, strand=".", track_options="")

        gvf = opt.outdir + '/' + var_type + '_dbVar.gvf'
        write_to_gvf(ga, nssd, var_type, gvf)
예제 #57
0
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'alignment_file',
        metavar='in.aln',
        help="input alignment file in SAM or BAM format. Use '-' to indicate "
        "that input should be taken from standard input (stdin)")
    parser.add_argument(
        'feature_file',
        metavar='in.gff3',
        help="input feature annotation file in GFF3 format. Use '-' to indicate "
        "that input should be taken from standard input (stdin)")
    parser.add_argument(
        '-m',
        '--mapping',
        metavar='in.json',
        dest='map_files',
        action=ParseSeparator,
        sep=',',
        help="input one or more relational databases, in JSON format, "
        "containing features mapped to feature categories, such as genes "
        "to gene families or exons to genes. Abundance estimates for the "
        "given feature category will be reported in place of features. "
        "Multiple input files can be provided by separating them with a "
        "comma and no spaces")
    parser.add_argument(
        '-c',
        '--category',
        metavar='FIELD',
        dest='category',
        help="field in the relational database representing how features "
        "are categorized. WARNING: if the value type of the selected field "
        "is a list, then the category abundance totals can be greater than "
        "the feature abundance totals")
    gff_group = parser.add_argument_group('GFF3 arguments')
    gff_group.add_argument(
        '-t',
        '--type',
        metavar='TYPE',
        dest='ftype',
        default='CDS',
        help="feature type (3rd column in GFF file) to estimate abundance for "
        "[default: CDS]. All features of other type will be ignored")
    gff_group.add_argument(
        '-a',
        '--attr',
        metavar='ATTRIBUTE',
        default="Name",
        help="GFF attribute to use as the ID for the calculated abundances "
        "[default: 'Name']. This value will also be used as the search "
        "ID in the relational database, if provided")
    aln_group = parser.add_argument_group('SAM/BAM arguments')
    aln_group.add_argument(
        '-f',
        '--format',
        metavar='FORMAT',
        dest='aformat',
        choices=['bam', 'sam'],
        default='bam',
        help="input alignment file format [default: bam]. Options are 'sam' "
        "or 'bam'")
    aln_group.add_argument(
        '-q',
        '--qual',
        metavar='THRESH',
        dest='minqual',
        type=int,
        default=2,
        help="skip all reads with alignment quality lower than the threshold "
        "[default: 2]")
    aln_group.add_argument(
        '-s',
        '--sorting',
        metavar='ORDER',
        dest='order',
        choices=["position", "name"],
        default='position',
        help="alignment file sorting scheme. Options are 'position' and "
        "'name' [default: position]. Alignments must be pre-sorted "
        "either by position/coordinates or by read name. This option "
        "will be ignored for single-end reads")
    aln_group.add_argument(
        '-b',
        '--buffer',
        metavar='BYTES',
        dest='buffer_size',
        type=int,
        default=3145728,
        help="buffer size for paired reads in the alignment file if sorted by "
        "position [default: 3145728 (3GB)]. This value should be "
        "increased if memory issues are encountered")
    count_group = parser.add_argument_group('quantification arguments')
    count_group.add_argument(
        '-e',
        '--mode',
        metavar='MODE',
        choices=["union", "intersection-strict", "intersection-nonempty"],
        default="union",
        help="mode for handling different alignment scenarios. Options are "
        "'union', 'intersection-strict', and 'intersection-nonempty' "
        "[default: union]. The modes will count alignments differently "
        "depending on whether a read/pair overlaps more than one feature "
        "or only partially aligns to a single feature. The most "
        "inclusive mode is 'union' when given with the nonunique flag, "
        "and the least inclusive is 'intersection-strict'")
    count_group.add_argument(
        '-u',
        '--units',
        metavar='UNITS',
        dest='norm',
        action=ParseSeparator,
        sep=',',
        default='counts',
        help="comma-separated list of units to output abundance estimates in "
        "[default: counts]. Options are 'counts', 'fpk' (fragments per "
        "kilobase of feature), 'fpkm' (fragements per kilobase of "
        "feature per million mapped fragments), 'tpm' "
        "(transcripts/fragments per million), 'prop', and 'custom'. If "
        "other than 'counts', features will be normalized by recruitment "
        "length, which will be calculated from the start and end fields "
        "of the GFF3 file. This is the sole normalization method used "
        "when transforming counts to FPK, and is useful to correct for "
        "differences in feature lengths within a sample. In addition to "
        "feature length, FPKM and TPM attempt to account for differences "
        "between samples in sequencing effort. An advantage of TMP over "
        "FPKM is that TPM is a proportional measurement, making it "
        "easier to identify the extent that the relative 'importance' of "
        "a given feature changes between samples. A custom transformation "
        "can also be performed when used with the -k/--coeff argument, in "
        "which case the length normalized proportion of a feature will be "
        "multiplied by the provided scaling factor.")
    count_group.add_argument(
        '-k',
        '--coeff',
        metavar='MUL',
        dest='sfactor',
        type=float,
        default=1,
        help="multiplier to use when 'custom' is given to -u/--units "
        "[default: 1]")
    count_group.add_argument(
        '--cdna',
        dest='transcripts',
        action='store_true',
        help="sequences represent cDNA [default: False]. Whether sequences are "
        "from gDNA or cDNA will determine how the length of a feature is "
        "calculated for normalization. If cDNA, effective length will "
        "serve as feature length")
    count_group.add_argument(
        '--nonunique',
        action='store_true',
        help="allow reads to align with more than one feature")
    output_group = parser.add_argument_group('output control arguments')
    output_group.add_argument(
        '-o',
        '--outpref',
        type=str,
        metavar='PREFIX',
        dest='outpref',
        default='sample',
        help="prefix for the output tabular files containing feature abundance "
        "estimates [default: sample]. File names will be appended with "
        "the units, file format, and compression algorithm, if relevant "
        "[e.g. sample.counts.csv.gz]")
    output_group.add_argument(
        '--filter',
        dest='cat_only',
        action='store_true',
        help="only output abundances for features with an associated feature "
        "category [default: output all]")
    compression = output_group.add_mutually_exclusive_group()
    compression.add_argument('--gzip',
                             dest='gzipped',
                             action='store_true',
                             help="compress output using the gzip algorithm")
    compression.add_argument('--bzip2',
                             dest='bzipped',
                             action='store_true',
                             help="compress output using the bzip2 algorithm")
    compression.add_argument('--lzma',
                             dest='lzma',
                             action='store_true',
                             help="compress output using the lzma algorithm")
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + __version__)
    args = parser.parse_args()

    # Argument sanity checks
    if (args.category and not args.map_files) or \
        (args.map_files and not args.category):
        parser.error("error: -m/--mapping and -c/--category must be supplied "
                     "together")

    if args.alignment_file == '-' and args.feature_file == '-':
        parser.error("error: standard input (stdin) can only be redirected to "
                     "a single positional argument")

    # Output run information
    all_args = sys.argv[1:]
    print("{} {!s}".format('count_features', __version__), file=sys.stderr)
    print(textwrap.fill("Command line parameters: {}"\
          .format(' '.join(all_args)), 79), file=sys.stderr)
    print("", file=sys.stderr)

    # Track program run-time
    start_time = time()

    # Assign variables based on user inputs
    if args.gzipped:
        compression = '.gz'
    elif args.bzipped:
        compression = '.bz2'
    elif args.lzma:
        compression = '.xz'
    else:
        compression = ''

    allowed_units = ["counts", "tpm", "custom", "prop", "fpk", "fpkm"]

    out_handles = {}
    for unit in args.norm:
        if unit not in allowed_units:
            print("warning: unknown metric of abundance '{}' provided to "
                  "-u/--unit. Please see the help message for a list of the "
                  "allowed units".format(unit),
                  file=sys.stderr)
            continue

        outfile = "{}.{}.csv{}".format(args.outpref, unit, compression)
        try:
            out_h = open_io(outfile, mode='wb')
        except AttributeError:
            print("error: unable to write to '{}'".format(outfile), \
                  file=sys.stderr)
            sys.exit(1)

        out_handles[unit] = out_h

    if not out_handles:
        print(
            "error: no output files can be created. Please re-run with one "
            "or more of the accepted units of abundance",
            file=sys.stderr)
        sys.exit(1)

    overlap_mode = args.mode
    minaqual = args.minqual
    feature_type = args.ftype
    id_field = args.attr
    category_field = args.category
    are_transcripts = args.transcripts
    category_only = args.cat_only
    multi_aln = args.nonunique

    match_types = ('M', '=', 'X')

    if args.aformat == "sam":
        align_reader = HTSeq.SAM_Reader
    else:  #must be BAM then
        align_reader = HTSeq.BAM_Reader

    if args.map_files:
        mapping = load_dbs(args.map_files, fields=[category_field], csv=False)
    else:
        mapping = None

    # Store features in genomic arrays
    features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    counts = {}

    # Iterate over GFF3 file, storing features to estimate coverage for
    no_attr = 0
    f_totals = 0
    ftype_totals = 0

    try:
        if args.feature_file == '-':
            gff = HTSeq.GFF_Reader(sys.stdin)
        else:
            gff = HTSeq.GFF_Reader(args.feature_file)

        for f in gff:
            f_totals += 1

            try:
                feature_id = f.attr[id_field]
            except KeyError:
                no_attr += 1
                feature_id = "unkwn_{:08}".format(no_attr)

            # Skip features of wrong type
            if feature_type:
                if f.type == feature_type:
                    ftype_totals += 1
                else:
                    continue

            # Store feature length for normalization
            feature_length = abs(f.iv.end - f.iv.start + 1)

            features[f.iv] += feature_id  #for mapping alignments
            counts[feature_id] = {'count': 0, 'length': feature_length}
    except:
        print("error: problem occured when processing GFF3 file at line {}".
              format(gff.get_line_number_string()),
              file=sys.stderr)
        sys.exit(1)

    # Verify GFF3 file contains features of the specified type
    if ftype_totals == 0:
        print("error: no features of type '{}' found.\n".format(feature_type),
              file=sys.stderr)
        sys.exit(1)

    if no_attr > 0:
        print("warning: found {!s} features without a '{}' attribute.\n"\
              .format(no_attr, id_field), file=sys.stderr)

    # Check alignment file formatting
    try:
        if args.alignment_file == '-':
            read_seq_file = align_reader(sys.stdin)
            read_seq_iter = iter(read_seq_file)
            first_read = next(read_seq_iter)
            read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            read_seq_file = align_reader(args.alignment_file)
            read_seq = read_seq_file
            first_read = next(iter(read_seq))
    except:
        print(
            "error: unable to read the alignment file. Please verify that "
            "the formatting is correct.",
            file=sys.stderr)
        sys.exit(1)

    pe_mode = first_read.paired_end  #reads are paired-end or single-end
    if pe_mode:
        if args.order == "name":
            read_seq = HTSeq.pair_SAM_alignments(read_seq)
        else:  #order is by position
            read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq, \
                       max_buffer_size=args.buffer_size)

    # Iterate over alignment file
    empty = 0  #reads aligned somewhere in the assembly, but not to a feature
    duplicate = 0  #reads are duplicates of other reads
    ambiguous = 0  #reads overlapping more than one feature
    notaligned = 0  #unaligned reads
    lowqual = 0  #reads not passing minimum threshold for alignment quality
    nonunique = 0  #reads having multiple alignments with similar score
    r_totals = 0  #total reads
    aln_totals = 0  #correctly mapped to a feature
    fld = []  #fragment length / insert-size distribution
    for r in read_seq:

        r_totals += 1

        if not pe_mode:  #single-end read mapping

            # Check if read aligned
            if not r.aligned:
                notaligned += 1
                continue

            # Check if the read aligned uniquely
            try:
                if r.optional_field("NH") > 1:
                    nonunique += 1
                    print("warning: read '{}' has multiple alignments with "
                          "similar score.\n".format(r.iv.chrom), \
                          file=sys.stderr)
                    continue
            except KeyError:
                pass

            # Check if the alignment passed the quality requirement
            if r.aQual < minaqual:
                lowqual += 1
                continue

            # Check whether the read was marked as a duplicate
            if r.pcr_or_optical_duplicate:
                duplicate += 1
                continue

            # Store read coordinates
            iv_seq = (co.ref_iv for co in r.cigar if co.type in match_types \
                      and co.size > 0)

        else:  #paired-end read mapping

            # Store pair coordinates
            try:
                first_r, second_r = r
            except ValueError:
                notaligned += 1
                continue

            if first_r is None or second_r is None:
                notaligned += 1
                continue

            if first_r is not None and first_r.aligned:
                iv_seq = (co.ref_iv for co in first_r.cigar if co.type in \
                          match_types and co.size > 0)
            else:
                iv_seq = tuple()

            if second_r is not None and second_r.aligned:
                iv_seq = itertools.chain(iv_seq, (co.ref_iv for co in \
                         second_r.cigar if co.type in match_types and \
                         co.size > 0))
            else:
                if (first_r is None) or not (first_r.aligned):
                    notaligned += 1
                    continue

            # Check whether either read aligned more than once
            try:
                if (first_r.optional_field("NH") > 1) or \
                   (second_r.optional_field("NH") > 1):
                    nonunique += 1
                    print("warning: read '{}' has multiple alignments with "
                          "similar score.\n".format(first_r.iv.chrom), \
                          file=sys.stderr)
                    continue
            except KeyError:
                pass

            # Check if both reads passed the quality requirement
            if first_r.aQual < minaqual or second_r.aQual < minaqual:
                lowqual += 1
                continue

            # Check if the read pair was marked as a duplicate
            if first_r.pcr_or_optical_duplicate or \
                second_r.pcr_or_optical_duplicate:
                duplicate += 1
                continue

            # Append fragment length/insert-size to distribution
            try:
                fld.append(first_r.inferred_insert_size)
            except AttributeError:
                pass

        # Handle case where reads might overlap more than one feature
        try:
            if overlap_mode == "union":
                fs = set()  #store feature names when reads align
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom

                    for iv2, fs2 in features[iv].steps():
                        fs = fs.union(fs2)

            else:  #intersection
                fs = None
                for iv in iv_seq:
                    if iv.chrom not in features.chrom_vectors:
                        raise UnknownChrom

                    for iv2, fs2 in features[iv].steps():
                        if len(fs2
                               ) > 0 or overlap_mode == "intersection-strict":
                            if fs is None:
                                fs = fs2.copy()
                            else:
                                fs = fs.intersection(fs2)

            # If a read correctly mapped to a feature, increment its abundance
            if not fs:
                empty += 1
                continue
            elif len(fs) > 1:
                ambiguous += 1
                if not multi_aln:
                    continue
            else:
                aln_totals += 1

            for fsi in list(fs):
                counts[fsi]['count'] += 1

        except UnknownChrom:
            empty += 1

    unaln_totals = empty + ambiguous + lowqual + notaligned + nonunique + \
                      duplicate
    nmapped = aln_totals + empty + ambiguous + nonunique + lowqual

    for unit in out_handles:
        # Set scaling function
        if unit == 'fpk':
            norm_method = scale_abundance_fpk
            scaling_factor = None
        elif unit == 'fpkm':
            norm_method = scale_abundance_fpkm
            # Scaling factor is all mapped reads
            scaling_factor = nmapped
            print("info: the total number of mapped reads used in calculation "
                  "of FPKM is {!s}.\n".format(nmapped),
                  file=sys.stderr)
        elif unit == 'tpm':
            norm_method = scale_abundance_tpm
            rates = [counts[j]['count'] / counts[j]['length'] for j in counts]
            rate_sum = sum(rates)
            print("info: the sum of all counts per bp rates used in "
                    "estimating fragment proportions is {:.2f}.\n"\
                  .format(rate_sum), file=sys.stderr)
            # Scaling factor is sum of all reads per base rates
            scaling_factor = rate_sum
        elif unit == 'custom':
            norm_method = scale_abundance_prop
            rates = [counts[j]['count'] / counts[j]['length'] for j in counts]
            rate_sum = sum(rates)
            print("info: the sum of all counts per bp rates used in "
                    "estimating fragment proportions is {:.2f}.\n"\
                  .format(rate_sum), file=sys.stderr)
            scaling_factor = args.sfactor / rate_sum
        elif unit == 'prop':
            norm_method = scale_abundance_prop
            rates = [counts[j]['count'] / counts[j]['length'] for j in counts]
            rate_sum = sum(rates)
            print("info: the sum of all counts per bp rates used in "
                    "estimating fragment proportions is {:.2f}.\n"\
                  .format(rate_sum), file=sys.stderr)
            scaling_factor = 1 / rate_sum
        else:  #default is counts
            norm_method = scale_abundance_none
            scaling_factor = None

        if are_transcripts and not pe_mode:
            print(
                "warning: unable to calculate effective length from single-end "
                "reads. Will use sequence length instead.\n",
                file=sys.stderr)
            calc_length = return_first_arg
        elif are_transcripts and pe_mode:
            calc_length = compute_effective_length
        else:
            calc_length = return_first_arg

        out_h = out_handles[unit]

        # Abundance normalization
        abundances = {}
        unkwn_feat = 0
        no_map = 0
        for feature in counts:

            fcount = counts[feature]['count']
            flen = calc_length(counts[feature]['length'], fld)

            feature_abundance = norm_method(fcount, flen, scaling_factor)

            # Map to higher order features, if applicable
            if category_field:
                try:
                    # Ensure that feature has corresponding entry in database
                    feature_map = mapping[feature]
                except KeyError:
                    no_map += 1
                    if not category_only:
                        # Keep all features, even the uncategorized ones
                        abundances[feature] = abundances.get(feature, 0) + \
                                              feature_abundance
                    continue
                else:
                    try:
                        # Ensure that entry has relevant category field
                        category = feature_map[category_field]
                    except KeyError:
                        unkwn_feat += 1
                        if not category_only:
                            abundances[feature] = abundances.get(feature, 0) + \
                                                  feature_abundance
                        continue

                # Handle case where feature has more than one category, such
                # as if a protein sequence is assigned to more than one gene
                # family
                categories = [category] if not type(category) == type(list()) \
                    else category
                for category in categories:
                    abundances[category.lstrip()] = \
                        abundances.get(category, 0) + feature_abundance

            else:
                abundances[feature] = abundances.get(feature, 0) + \
                                      feature_abundance

        # "UNMAPPED" can be interpreted as a single unknown gene of length one
        # kilobase recruiting all reads that failed to map to input features
        #abundances['UNMAPPED'] = unaln_totals

        # Output abundances sorted by key name
        for fn in sorted(abundances):
            if not fn.startswith("unkwn_"):
                write_io(out_h, "{}\t{!s}\n".format(fn, abundances[fn]))

        out_h.close()

    if unkwn_feat > 0:
        print("warning: found '{!s}' features without the '{}' field in the "
              "relational database.\n".format(unkwn_feat, category_field), \
              file=sys.stderr)

    if no_map > 0:
        print("warning: found {!s} features without an entry in the "
              "relational database.\n".format(no_map),
              file=sys.stderr)

    # Output statistics
    print("Features processed:", file=sys.stderr)
    print("  - feature totals:\t{!s}".format(f_totals), file=sys.stderr)
    if feature_type:
        print("  - of relevant type:\t{!s}".format(ftype_totals), \
              file=sys.stderr)
    print("  - unique features:\t{!s}".format(len(counts)), file=sys.stderr)
    print("Reads processed:", file=sys.stderr)
    print("  - read totals:\t{!s}".format(r_totals), file=sys.stderr)
    print("  - successfully mapped:\t{!s}".format(aln_totals), \
          file=sys.stderr)
    if multi_aln:
        print("    - ambiguous alignment:\t{!s}".format(ambiguous), \
              file=sys.stderr)
    print("  - unsuccessfully mapped:\t{!s}".format(unaln_totals), \
          file=sys.stderr)
    print("    - no feature\t{!s}".format(empty), file=sys.stderr)
    if not multi_aln:
        print("    - ambiguous alignment\t{!s}".format(ambiguous), \
              file=sys.stderr)
    print("    - too low alignment quality\t{!s}".format(lowqual), \
          file=sys.stderr)
    print("    - not aligned\t{!s}".format(notaligned), file=sys.stderr)
    print("    - duplicate\t{!s}".format(duplicate), file=sys.stderr)
    print("    - alignment not unique\t{!s}".format(nonunique), \
          file=sys.stderr)
    print("", file=sys.stderr)

    # Calculate and print program run-time
    end_time = time()
    total_time = (end_time - start_time) / 60.0
    print("It took {:.2e} minutes to count {!s} fragments for {!s} features"\
          .format(total_time, r_totals, f_totals), file=sys.stderr)
    print("", file=sys.stderr)
예제 #58
0
def htseq_read_gtf(fg):
    """Load exon features from a GTF/GFF file into a stranded interval index.

    Args:
        fg: Value handed unchanged to ``HTSeq.GFF_Reader`` (presumably a
            path to the annotation file -- confirm against callers).

    Returns:
        The ``HTSeq.GenomicArrayOfSets`` (created with ``"auto"`` chromosomes
        and ``stranded=True``) in which every interval covered by an
        ``exon`` record has that record's ``gene_id`` attribute added.

    Raises:
        KeyError: If an ``exon`` record has no ``gene_id`` attribute.
    """
    gtf = HTSeq.GFF_Reader(fg)
    exons = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for feat in gtf:
        # Only exon records contribute; all other feature types are ignored.
        if feat.type == 'exon':
            exons[feat.iv] += feat.attr['gene_id']
    # Hand the populated index back to the caller; previously it was built
    # and then dropped, so the function always returned None.
    return exons
예제 #59
0
파일: count.py 프로젝트: simon-anders/htseq
def count_reads_in_features(sam_filenames, gff_filename,
                            samtype,
                            order, max_buffer_size,
                            stranded, overlap_mode,
                            multimapped_mode,
                            secondary_alignment_mode,
                            supplementary_alignment_mode,
                            feature_type, id_attribute,
                            additional_attributes,
                            quiet, minaqual, samouts):
    """Count alignments per GFF feature and print a tab-separated table.

    Features of type ``feature_type`` are read from ``gff_filename`` and
    indexed by genomic interval. Each input file is then scanned; every
    read (or read pair) is either assigned to features or tallied under one
    of the special counters ``__no_feature``, ``__ambiguous``,
    ``__too_low_aQual``, ``__not_aligned`` and ``__alignment_not_unique``.
    The result is printed to standard output: one row per feature ID
    (sorted), followed by the special counters, with one count column per
    input file.

    Args:
        sam_filenames: Sequence of input paths. An entry equal to ``"-"`` is
            read from ``sys.stdin``; when the sequence is exactly ``["-"]``
            the early "can it be opened" check is skipped.
        gff_filename: Passed to ``HTSeq.GFF_Reader``.
        samtype: ``"sam"`` or ``"bam"``; selects the HTSeq reader class.
        order: ``"name"`` or ``"pos"``; only consulted for paired-end input,
            where it selects the mate-pairing routine.
        max_buffer_size: Forwarded to
            ``HTSeq.pair_SAM_alignments_with_buffer`` when ``order`` is
            ``"pos"``.
        stranded: ``"no"`` builds an unstranded feature index; ``"reverse"``
            inverts the strand of the read intervals; any other value uses
            the read strand as is.
        overlap_mode: ``"union"``, ``"intersection-strict"`` or
            ``"intersection-nonempty"``.
        multimapped_mode: ``"none"`` (count only reads assigned to exactly
            one feature, skip reads with NH > 1) or ``"all"`` (count every
            overlapped feature).
        secondary_alignment_mode: ``"ignore"`` skips records flagged as not
            primary.
        supplementary_alignment_mode: ``"ignore"`` skips records flagged as
            supplementary.
        feature_type: GFF feature type whose records are counted.
        id_attribute: GFF attribute used as the feature ID.
        additional_attributes: Names of GFF attributes printed as extra
            columns after the feature ID (empty string when absent).
        quiet: If true, suppress progress messages on stderr.
        minaqual: Records with ``aQual`` below this value are tallied as
            ``__too_low_aQual`` and skipped.
        samouts: Output SAM paths, one per input file, or an empty/falsy
            value for no SAM output. Each written record carries an ``XF``
            optional field with its assignment.

    Raises:
        ValueError: Unknown ``samtype``; ``samouts`` length differing from
            ``sam_filenames``; a counted feature lacking ``id_attribute`` or
            lacking strand information in stranded mode; illegal ``order``
            for paired-end input.
        SystemExit: Illegal ``overlap_mode`` or ``multimapped_mode`` (raised
            through ``sys.exit`` while processing a read).
    """

    def write_to_samout(r, assignment, samoutfile):
        # Tag the read (or each present mate of a pair) with an XF field and
        # write it out. `pe_mode` is read from the enclosing scope at call
        # time, i.e. it reflects the input file currently being processed.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                read.optional_fields.append(('XF', assignment))
                samoutfile.write(read.get_sam_line() + "\n")

    if samtype == "sam":
        SAM_or_BAM_Reader = HTSeq.SAM_Reader
        samname = 'SAM'
    elif samtype == "bam":
        SAM_or_BAM_Reader = HTSeq.BAM_Reader
        samname = 'BAM'
    else:
        raise ValueError("Unknown input format %s specified." % samtype)

    # Truthiness (instead of `!= []`) also treats None or an empty tuple as
    # "no SAM output requested" rather than failing on them.
    if samouts:
        if len(samouts) != len(sam_filenames):
            raise ValueError(
                    'Select the same number of {:} input and output files'.format(samname))
        # Try to open samout files early in case any of them has issues
        for samout in samouts:
            with open(samout, 'w'):
                pass

    # Try to open samfiles to fail early in case any of them is not there
    if (len(sam_filenames) != 1) or (sam_filenames[0] != '-'):
        for sam_filename in sam_filenames:
            with open(sam_filename):
                pass

    # CIGAR match characters (including alignment match, sequence match, and
    # sequence mismatch)
    com = ('M', '=', 'X')

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    counts = {}
    attributes = {}
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                            "Feature %s does not contain a '%s' attribute" %
                            (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                            "Feature %s at %s does not have strand information but you are "
                            "running htseq-count in stranded mode. Use '--stranded=no'." %
                            (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
                attributes[feature_id] = [
                        f.attr[attr] if attr in f.attr else ''
                        for attr in additional_attributes]
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
    except:
        # Report where in the GFF file the failure happened, then re-raise
        # the original exception unchanged.
        sys.stderr.write(
            "Error occured when processing GFF file (%s):\n" %
            gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
        sys.stderr.flush()

    if len(counts) == 0:
        sys.stderr.write(
            "Warning: No features of type '%s' found.\n" % feature_type)

    # Per-input-file results, in the order of `sam_filenames`.
    counts_all = []
    empty_all = []
    ambiguous_all = []
    notaligned_all = []
    lowqual_all = []
    nonunique_all = []
    for isam, sam_filename in enumerate(sam_filenames):
        samoutfile = open(samouts[isam], 'w') if samouts else None

        # The finally clause guarantees the SAM output file is closed even
        # when reading or processing the input raises.
        try:
            try:
                if sam_filename == "-":
                    read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                else:
                    read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq_iter = iter(read_seq_file)
                # Catch empty BAM files: any ordinary error while fetching
                # the first record is treated as "no reads". Only Exception
                # is caught so KeyboardInterrupt/SystemExit still propagate.
                try:
                    first_read = next(read_seq_iter)
                    pe_mode = first_read.paired_end
                except Exception:
                    first_read = None
                    pe_mode = False
                if first_read is not None:
                    # Put the peeked record back in front of the iterator.
                    read_seq = itertools.chain([first_read], read_seq_iter)
                else:
                    read_seq = []
            except:
                sys.stderr.write(
                    "Error occured when reading beginning of {:} file.\n".format(
                        samname))
                raise

            try:
                if pe_mode:
                    primary_only = (
                        (supplementary_alignment_mode == 'ignore') and
                        (secondary_alignment_mode == 'ignore'))
                    if order == "name":
                        read_seq = HTSeq.pair_SAM_alignments(
                                read_seq,
                                primary_only=primary_only)
                    elif order == "pos":
                        read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                                read_seq,
                                max_buffer_size=max_buffer_size,
                                primary_only=primary_only)
                    else:
                        raise ValueError("Illegal order specified.")
                empty = 0
                ambiguous = 0
                notaligned = 0
                lowqual = 0
                nonunique = 0
                i = 0
                for r in read_seq:
                    if i > 0 and i % 100000 == 0 and not quiet:
                        sys.stderr.write(
                            "%d %s alignment record%s processed.\n" %
                            (i, samname, "s" if not pe_mode else " pairs"))
                        sys.stderr.flush()

                    i += 1
                    if not pe_mode:
                        # Single-end: r is one alignment record.
                        if not r.aligned:
                            notaligned += 1
                            write_to_samout(r, "__not_aligned", samoutfile)
                            continue
                        if ((secondary_alignment_mode == 'ignore') and
                           r.not_primary_alignment):
                            continue
                        if ((supplementary_alignment_mode == 'ignore') and
                           r.supplementary):
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                # NOTE(review): unless multimapped_mode is
                                # 'none' the read continues below and is
                                # written to samout a second time with its
                                # feature assignment.
                                write_to_samout(
                                        r,
                                        "__alignment_not_unique",
                                        samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            # No NH field: cannot tell, treat as unique.
                            pass
                        if r.aQual < minaqual:
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue
                        if stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar
                                      if co.type in com and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv)
                                      for co in r.cigar
                                      if co.type in com and co.size > 0)
                    else:
                        # Paired-end: r is a (first, second) tuple whose
                        # entries may be None. The second mate's strand is
                        # inverted relative to the first mate's treatment.
                        if r[0] is not None and r[0].aligned:
                            if stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv)
                                          for co in r[0].cigar
                                          if co.type in com and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            if stranded != "reverse":
                                iv_seq = itertools.chain(
                                        iv_seq,
                                        (invert_strand(co.ref_iv)
                                         for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                        iv_seq,
                                        (co.ref_iv for co in r[1].cigar
                                         if co.type in com and co.size > 0))
                        else:
                            # Second mate unusable; if the first one is too,
                            # the whole pair counts as not aligned.
                            if (r[0] is None) or not (r[0].aligned):
                                write_to_samout(r, "__not_aligned", samoutfile)
                                notaligned += 1
                                continue
                        if secondary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].not_primary_alignment:
                                continue
                            elif (r[1] is not None) and r[1].not_primary_alignment:
                                continue
                        if supplementary_alignment_mode == 'ignore':
                            if (r[0] is not None) and r[0].supplementary:
                                continue
                            elif (r[1] is not None) and r[1].supplementary:
                                continue
                        try:
                            if ((r[0] is not None and r[0].optional_field("NH") > 1) or
                               (r[1] is not None and r[1].optional_field("NH") > 1)):
                                nonunique += 1
                                write_to_samout(r, "__alignment_not_unique", samoutfile)
                                if multimapped_mode == 'none':
                                    continue
                        except KeyError:
                            # No NH field: cannot tell, treat as unique.
                            pass
                        if ((r[0] and r[0].aQual < minaqual) or
                           (r[1] and r[1].aQual < minaqual)):
                            lowqual += 1
                            write_to_samout(r, "__too_low_aQual", samoutfile)
                            continue

                    try:
                        if overlap_mode == "union":
                            # Every feature touched by any aligned position.
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs.update(fs2)
                        elif overlap_mode in ("intersection-strict",
                                              "intersection-nonempty"):
                            # Features common to all steps; in "nonempty"
                            # mode steps without any feature are skipped.
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    if ((len(fs2) > 0) or
                                       (overlap_mode == "intersection-strict")):
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode.")

                        if not fs:
                            write_to_samout(r, "__no_feature", samoutfile)
                            empty += 1
                        elif len(fs) > 1:
                            # Sort the IDs so the label does not depend on
                            # set iteration order.
                            write_to_samout(
                                r,
                                "__ambiguous[" + '+'.join(sorted(fs)) + "]",
                                samoutfile)
                            ambiguous += 1
                        else:
                            write_to_samout(r, next(iter(fs)), samoutfile)

                        if fs:
                            if multimapped_mode == 'none':
                                if len(fs) == 1:
                                    counts[next(iter(fs))] += 1
                            elif multimapped_mode == 'all':
                                for fsi in fs:
                                    counts[fsi] += 1
                            else:
                                sys.exit("Illegal multimap mode.")

                    except UnknownChrom:
                        # Read lies on a chromosome with no features at all.
                        write_to_samout(r, "__no_feature", samoutfile)
                        empty += 1

            except:
                sys.stderr.write(
                    "Error occured when processing %s input (%s):\n" %
                    (samname, read_seq_file.get_line_number_string()))
                raise
        finally:
            if samoutfile is not None:
                samoutfile.close()

        if not quiet:
            sys.stderr.write(
                "%d %s %s processed.\n" %
                (i, samname, "alignments " if not pe_mode else "alignment pairs"))
            sys.stderr.flush()

        # Snapshot this file's counts, then reset the shared dict for the
        # next input file.
        counts_all.append(counts.copy())
        for fn in counts:
            counts[fn] = 0
        empty_all.append(empty)
        ambiguous_all.append(ambiguous)
        lowqual_all.append(lowqual)
        notaligned_all.append(notaligned)
        nonunique_all.append(nonunique)

    # Special-counter rows get empty cells under the additional-attribute
    # columns so that all rows have the same width.
    pad = ['' for attr in additional_attributes]
    for fn in sorted(counts.keys()):
        print('\t'.join([fn] + attributes[fn] + [str(c[fn]) for c in counts_all]))
    print('\t'.join(["__no_feature"] + pad + [str(c) for c in empty_all]))
    print('\t'.join(["__ambiguous"] + pad + [str(c) for c in ambiguous_all]))
    print('\t'.join(["__too_low_aQual"] + pad + [str(c) for c in lowqual_all]))
    print('\t'.join(["__not_aligned"] + pad + [str(c) for c in notaligned_all]))
    print('\t'.join(["__alignment_not_unique"] + pad + [str(c) for c in nonunique_all]))