Esempio n. 1
0
 def Trim_Phix(Info,exp,storing_loc,pair_end):
     """Copy the unaligned reads of each SAM library into a FASTQ file.

     Reads ``<exp>_Aligned.sam`` (and, when ``pair_end`` is true,
     ``<exp>_pairend_Aligned.sam``) from ``storing_loc``. Every record whose
     ``aligned`` flag is false is written to the matching
     ``*_PhixCleaned.fastq`` file in the same directory.
     Presumably the SAM files are alignments against the PhiX genome, so the
     kept reads are the non-PhiX ones -- confirm with the caller.

     Args:
         Info: object whose ``print_save(exp, text)`` receives one summary
             line per library.
         exp: experiment name used as the file-name prefix.
         storing_loc: directory holding the SAM files and receiving the FASTQ.
         pair_end: when true, the paired-end library is processed as well.
     """
     # Pair each input with its output up front instead of looking the
     # library up by value inside the loop.
     libraries = [(os.path.join(storing_loc, exp + '_Aligned.sam'),
                   os.path.join(storing_loc, exp + '_PhixCleaned.fastq'))]
     if pair_end:
         libraries.append(
             (os.path.join(storing_loc, exp + '_pairend_Aligned.sam'),
              os.path.join(storing_loc, exp + '_pairend_PhixCleaned.fastq')))

     for library, storage_file in libraries:
         Phix_blast = HTSeq.SAM_Reader(library)

         with open(storage_file, "w") as Library_Trimmed_Phix:
             count_total = 0
             count_selected = 0
             for read in Phix_blast:
                 count_total += 1
                 if not read.aligned:
                     read.read.write_to_fastq_file(Library_Trimmed_Phix)
                     count_selected += 1
                 if count_total % 1000000 == 0:
                     # Single-argument form prints the same text under
                     # Python 2 and Python 3.
                     print("Analyzed  %d sequences." % count_total)

         string = "\tSaved in: %s\n\t\tSelected(NO Phix) sequences %i of %i."% (storage_file,count_selected,count_total)
         Info.print_save(exp,string)
Esempio n. 2
0
def bam_count(args):
    """Count read pairs per gene and print a ``<gene>\\t<count>`` table.

    Pairs are read from the SAM file ``args.fi``. A pair is counted under the
    single gene id its two mates overlap, or under ``_no_feature`` /
    ``_ambiguous`` / ``_unmapped`` otherwise.

    NOTE: ``exons`` is not defined in this function; it must be available at
    module level and be indexable by a genomic interval, yielding sets of
    gene ids through ``.steps()``. A removed commented-out line suggested it
    came from ``htseq_read_gtf(args.fg)`` -- confirm.
    """
    bam = HTSeq.SAM_Reader(args.fi)
    cnts = collections.Counter()
    for bundle in HTSeq.pair_SAM_alignments_with_buffer(bam):
        # NOTE(review): this treats each yielded item as a list of pairs;
        # verify that the HTSeq version in use yields bundles here.
        if len(bundle) != 1:
            continue
        aln1, aln2 = bundle[0]
        # The original test `not aln1.aligned and aln2.aligned` only caught
        # "mate 1 unaligned, mate 2 aligned". Any pair that is not fully
        # aligned (or has a missing mate) has no usable interval below.
        if aln1 is None or aln2 is None or not (aln1.aligned and aln2.aligned):
            cnts["_unmapped"] += 1
            continue
        gids = set()
        for aln in (aln1, aln2):
            for iv, val in exons[aln.iv].steps():
                gids |= val
        if len(gids) == 1:
            gid = list(gids)[0]
            cnts[gid] += 1
        elif len(gids) == 0:
            cnts["_no_feature"] += 1
        else:
            cnts["_ambiguous"] += 1
    for gid in cnts:
        print("%s\t%d" % (gid, cnts[gid]))
Esempio n. 3
0
def output_merged_deletion_reads(input_sam_file):
    """Write one tab-separated row per read that ``cigar_analyse`` flags as clipped.

    The output path is the part of ``input_sam_file`` before the first ``:``
    followed by ``merged_pos.txt``. No header line is written. Columns are:
    read id, read start, clip start, clip end, read end, insert size, label.

    Returns:
        The path of the file that was written.
    """
    output_file = input_sam_file.split(":")[0] + "merged_pos.txt"
    input_sam = HTSeq.SAM_Reader(input_sam_file)
    row_format = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n"

    with open(output_file, "w") as output_list:
        for sam_line in input_sam:
            (clipping, read_start, read_start_clip, read_end_clip, read_end,
             insert_size, mapped_size) = cigar_analyse(sam_line)

            if not clipping > 0:
                continue

            # Both flanks of the gap are at least 30 long -> plain "merged";
            # otherwise several clips whose combined size expression reaches 30.
            if (read_start_clip - read_start) >= 30 and \
                    (read_end - read_end_clip) >= 30:
                label = "merged"
            elif clipping > 1 and (mapped_size + read_start + read_end_clip
                                   - read_end - read_start_clip) >= 30:
                label = "merged,multi_clip"
            else:
                continue

            read_id = sam_line.get_sam_line().split("\t")[0]
            output_list.write(row_format % (
                read_id, str(read_start), str(read_start_clip),
                str(read_end_clip), str(read_end), str(insert_size), label))

    return output_file
Esempio n. 4
0
 def __iter__(self):
     """Yield the interval of every record, widened by ``window_length`` on both sides.

     The reader is chosen from the upper-cased file type: BED, GFF/GTF, SAM
     and BAM use the matching HTSeq reader; "OTHER" calls ``func``. Any other
     type yields nothing.

     NOTE(review): ``filetype``, ``filepath``, ``window_length`` and ``func``
     are read as free (non-``self``) names, while the "OTHER" branch reads
     ``self.filetype`` / ``self.filepath``. That mix is kept exactly as found;
     confirm which one is intended.
     """
     kind = filetype.upper()
     if kind == "BED":
         records = HTSeq.BED_Reader(filepath)
     elif kind == "GFF" or kind == "GTF":
         records = HTSeq.GFF_Reader(filepath)
     elif kind == "SAM":
         records = HTSeq.SAM_Reader(filepath)
     elif kind == "BAM":
         records = HTSeq.BAM_Reader(filepath)
     elif self.filetype.upper() == "OTHER":
         records = func(self.filepath)
     else:
         return

     for record in records:
         # The record's own interval is modified in place before it is yielded.
         record.iv.start -= window_length
         record.iv.end += window_length
         yield record.iv
Esempio n. 5
0
def count_single_file(sam_file, handle):
    """Count the alignment records in ``sam_file`` and report the total.

    Args:
        sam_file: path of the SAM file to read.
        handle: writable object; receives ``"<count>  <sam_file>\\n"``.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("counting alignments in " + sam_file)
    count = sum(1 for _ in HTSeq.SAM_Reader(sam_file))
    handle.write(str(count) + "  " + sam_file + "\n")
def main(cl=None):
	'''
	Build a genome-wide read-coverage track from a SAM file.

	Reads the SAM file named by cl.args["sam_file"], adds 1 to the coverage
	of every aligned read's interval, and writes the result in bedGraph
	format to cl.args["output_prefix"] + ".wig".

	A Usage exception raised while doing so is passed to
	cl.do_usage_and_die().
	'''
	# NOTE(review): any non-None `cl` is replaced by CommandLine(['-r']),
	# so the passed-in object itself is never used -- confirm this is intended.
	if cl is None:
		cl = CommandLine()
	else:
		cl = CommandLine(['-r'])

	try:
		print(cl.args)  # print the parsed arguments
		alignment_file = HTSeq.SAM_Reader(cl.args["sam_file"])

		# Get coverage for the whole genome
		cvg = HTSeq.GenomicArray("auto", stranded=False, typecode='i')
		for alngt in alignment_file:
			if alngt.aligned:
				cvg[alngt.iv] += 1

		# The file gets a ".wig" extension but its content is bedGraph.
		cvg.write_bedgraph_file(cl.args["output_prefix"] + ".wig")

		# TODO: iterate over every gene/transcript and compute the
		# per-transcript coverage.

	except Usage as err:  # `as` form is valid on Python 2.6+ and Python 3
		cl.do_usage_and_die(err.msg)
Esempio n. 7
0
def count_features(samfile,
                   features=None,
                   gtffile=None,
                   readcounts=None,
                   merge=False):
    """Count reads in features from an alignment.

       Each alignment contributes its read count: the value stored for its
       sequence in ``readcounts`` when present, otherwise 1 (i.e. a
       non-collapsed file is assumed). Alignments are binned under the single
       gene id they overlap, or under ``_unmapped`` / ``_no_feature`` /
       ``_ambiguous``.

       Args:
           samfile: mapped sam file
           features: annotations indexable by a genomic interval; replaced by
               ``get_exons(...)`` of ``gtffile`` when that is given
           gtffile: feature file
           readcounts: read counts from original (un-collapsed) file, either a
               mapping ``sequence -> count`` or a DataFrame with ``seq`` and
               ``reads`` columns
           merge: whether to merge the gtf fields with the results
       Returns:
           dataframe with columns ``name`` and ``reads``, sorted by ``reads``
           in descending order
    """
    import collections

    if gtffile is not None:
        gtf = HTSeq.GFF_Reader(gtffile)
        features = get_exons(gtf)
    sam = HTSeq.SAM_Reader(samfile)
    if isinstance(readcounts, pd.DataFrame):
        readcounts = {r.seq: r['reads'] for i, r in readcounts.iterrows()}

    counts = collections.Counter()
    for almnt in sam:
        seq = str(almnt.read)
        if readcounts is not None and seq in readcounts:
            c = readcounts[seq]
        else:
            c = 1
        if not almnt.aligned:
            counts["_unmapped"] += c
            continue
        gene_ids = set()
        for iv, val in features[almnt.iv].steps():
            gene_ids |= val
        if len(gene_ids) == 1:
            gene_id = list(gene_ids)[0]
            counts[gene_id] += c
        elif len(gene_ids) == 0:
            counts["_no_feature"] += c
        else:
            counts["_ambiguous"] += c

    result = pd.DataFrame(list(counts.items()), columns=['name', 'reads'])
    result = result.sort_values(by='reads', ascending=False)

    um = ['_no_feature', '_unmapped']
    # `~` is the boolean-mask negation; unary minus on booleans is not.
    mapped = float(result[~result.name.isin(um)].reads.sum())
    total = result.reads.sum()
    # An input with no records would otherwise yield a NaN percentage.
    percent = mapped / total * 100 if total else 0.0

    print('%s/%s reads counted, %.2f percent' % (mapped, total, percent))
    if merge and gtffile is not None:
        result = merge_features(result, gtffile)
    return result
Esempio n. 8
0
def analyze_integration_reads(input_sam_file_1, input_sam_file_2):
    """Return the read ids that are aligned in both SAM files.

    Each id is the first tab-separated field of the SAM line, prefixed
    with ``@``.
    """
    readers = (HTSeq.SAM_Reader(input_sam_file_1),
               HTSeq.SAM_Reader(input_sam_file_2))

    # One set of aligned read ids per input file, in input order.
    aligned_ids = [
        {"@" + record.get_sam_line().split("\t")[0]
         for record in reader if record.aligned}
        for reader in readers
    ]

    return aligned_ids[0] & aligned_ids[1]
Esempio n. 9
0
def report_indel_from_amp(sortsam, ODNS, region_start, region_end):
    """Classify the alignments of a SAM file by the kind of edit they carry.

    Every alignment that is aligned and passes ``match_base_filter`` and
    ``check_AS_filter`` is counted once in ``Total_read`` and is then
    assigned to at most one category, tested in this order: ODN insertion,
    indel, substitution.

    Args:
        sortsam: path of the SAM file to read.
        ODNS: passed unchanged to ``soft_clip_filter``.
        region_start: lower bound used for indel and MD checks.
        region_end: upper bound used for indel and MD checks.

    Returns:
        Tuple ``(Total_read, ODNS_inserted, INDEL_read, substitution_read)``.
    """
    Total_read = 0
    ODNS_inserted = 0
    INDEL_read = 0
    substitution_read = 0
    for aln in HTSeq.SAM_Reader(sortsam):
        # One alignment at a time, independent of read name.
        # Only one mutation type is assigned per read, by priority
        # ODNs > INDEL > substitution.
        if aln.aligned and match_base_filter(aln) and check_AS_filter(aln):
            # Passed the AS filter and the matched-bases filter.
            Total_read += 1
            # Count the reads that were edited.
            if soft_clip_filter(ODNS, aln):
                # Soft-clipped part contains the ODN == ODN insertion.
                ODNS_inserted += 1
            else:
                cigar_property = [(c.type, c.size) for c in aln.cigar]
                # Any insertion longer than 20 is a candidate ODN insertion.
                if [p for p in cigar_property if p[0] == "I" and p[1] > 20
                    ] != []:
                    if check_ODN_presence_insertion(aln):
                        # ODN insertion reported as an "I" CIGAR operation.
                        ODNS_inserted += 1
                    else:
                        INDEL_read += 1


                # These reads were either modified or not modified, so record
                # all the indel mutations.
                # NOTE(review): `and` binds tighter than `or`, so this reads
                # as (size < 20 and type == "D") or type == "I": every
                # insertion matches, whatever its size. Confirm the intent.
                elif [
                        p[0] for p in cigar_property
                        if p[1] < 20 and p[0] == "D" or p[0] == "I"
                ]:
                    # Distinguish a plain insertion from an ODN insertion.
                    # NOTE(review): here the precedence gives
                    # type == "D" or (type == "I" and size < 20), i.e. the
                    # size limit applies to insertions only -- the opposite
                    # of the test above. Confirm which one is intended.
                    indel_region = [
                        [c.ref_iv.start, c.ref_iv.end] for c in aln.cigar
                        if c.type == "D" or c.type == "I" and c.size < 20
                    ]
                    # Keep only indels lying fully inside the region.
                    valid_indel = [
                        r for r in indel_region
                        if r[0] >= region_start and r[1] <= region_end
                    ]
                    if len(valid_indel) > 0:
                        # Indels take priority over substitutions.
                        INDEL_read += 1
                # No insertion and no short deletion: look for substitutions.
                else:
                    if "M" in [p[0] for p in cigar_property]:
                        # The read has match operations, so ask
                        # check_MD_filter whether the MD tag reports mutations.
                        SNP_genomic_coord, MD_bool = check_MD_filter(
                            aln, region_start, region_end)
                        # Python 2 print statement; writes to stdout for
                        # every read that reaches this branch.
                        print SNP_genomic_coord
                        # NOTE(review): entries whose second element equals
                        # the hard-coded 43737486 are ignored -- presumably a
                        # known variant position; confirm and name it.
                        if MD_bool == True and [
                                s[0]
                                for s in SNP_genomic_coord if s[1] != 43737486
                        ] != []:
                            substitution_read += 1
    return Total_read, ODNS_inserted, INDEL_read, substitution_read
Esempio n. 10
0
def main():
    """Pick NTARG random genes that are expressed in every sample of both groups.

    Uses the module-level ``args`` (``group1`` / ``group2`` SAM file lists),
    ``name_gtf`` (annotation path), ``NTARG`` (number of genes to pick) and
    ``meanVar`` (returns a mapping gene -> per-sample values for a group).
    Genes present in both groups whose combined values contain no zero are
    candidates; NTARG of them are sampled and written, one per line, to
    ``AS_genes_list.txt``.

    Exits with status 1 when the chromosome name of the first aligned SAM
    record does not occur in the GTF, or when there are fewer candidates
    than NTARG.
    """
    ## first group
    group1_f = args.group1

    ##### Sanity check: SAM and GTF must use the same chromosome naming
    is_chr_sam = None
    for almnt in HTSeq.SAM_Reader(group1_f[0]):
        # Unaligned records carry no interval; skip to the first aligned one.
        if almnt.aligned:
            is_chr_sam = almnt.iv.chrom
            break
    with open(name_gtf, 'r') as file_gtf:
        set_chr_gff = set(
            feature.iv.chrom for feature in HTSeq.GFF_Reader(file_gtf))

    if is_chr_sam not in set_chr_gff:
        sys.stderr.write(
            "Error: Chromosome id in SAM files and GFF file does not agree!!\n"
        )
        sys.exit(1)
    #####

    # The GTF is re-opened per group because each reader consumes the handle.
    with open(name_gtf, 'r') as file_gtf:
        counts1 = meanVar(group1_f, HTSeq.GFF_Reader(file_gtf), 'group1')
    ## second group
    group2_f = args.group2
    with open(name_gtf, 'r') as file_gtf:
        counts2 = meanVar(group2_f, HTSeq.GFF_Reader(file_gtf), 'group2')

    # Genes present in both groups; direct lookup instead of a nested scan.
    merged_count = {
        k: v + counts2[k]
        for (k, v) in counts1.items() if k in counts2
    }
    non0_exp_list = list()
    for k, v in sorted(merged_count.items()):
        value = np.array(v)
        if len(value[value == 0]) == 0:
            non0_exp_list.append(k)

    num_non0 = len(non0_exp_list)
    sys.stderr.write(
        "randomly choose %d out of %d genes as the final target genes that are to undergo AS\n"
        % (NTARG, num_non0))
    if NTARG > num_non0:
        # random.sample would raise a bare ValueError here.
        sys.stderr.write(
            "Error: only %d candidate genes available, cannot choose %d\n"
            % (num_non0, NTARG))
        sys.exit(1)
    l = random.sample(range(num_non0), NTARG)
    with open('AS_genes_list.txt', 'w') as out:
        for i in l:
            out.write(non0_exp_list[i] + "\n")
Esempio n. 11
0
def load_sam_or_bam(sam_filename):
    """Return an iterator over the alignments of a ``.sam`` or ``.bam`` file.

    The reader is chosen from the file extension (case-sensitive). For any
    other extension a message is written to stderr and the process exits
    with status 1.
    """
    samext = os.path.splitext(sam_filename)[1]
    if samext == ".sam":
        return iter(HTSeq.SAM_Reader(sam_filename))
    if samext == ".bam":
        return iter(HTSeq.BAM_Reader(sam_filename))
    # Same text as the former Python 2 `print >> sys.stderr` statement.
    sys.stderr.write("Problem with SAM/BAM File: %s\n" % sam_filename)
    sys.exit(1)
Esempio n. 12
0
def htseq_reader(align_file):
    """
    Build the HTSeq reader that matches the alignment file.

    ``bam.is_sam`` is tried first, then ``bam.is_bam``. When neither accepts
    the file an error is logged and the process exits with status 1.
    """
    if bam.is_sam(align_file):
        return HTSeq.SAM_Reader(align_file)
    if bam.is_bam(align_file):
        return HTSeq.BAM_Reader(align_file)

    logger.error("%s is not a SAM or BAM file" % (align_file))
    sys.exit(1)
Esempio n. 13
0
def htcount(samfile):
    """Count aligned records per reference sequence name.

    Progress (the number of records read so far) is written to stderr every
    1,000,000 records.

    Returns:
        defaultdict(int) mapping reference name -> number of aligned records.
    """
    ref2cnt = defaultdict(int)
    for n_seen, hit in enumerate(HTSeq.SAM_Reader(samfile), 1):
        if hit.aligned:
            ref2cnt[hit.iv.chrom] += 1
        if n_seen % 1000000 == 0:
            sys.stderr.write("%d\n" % n_seen)

    return ref2cnt
Esempio n. 14
0
    def test_htseq(self):
        """htseq basic test for sam file reading

        Smoke test: reads ``test.sam`` from ``base.datadir`` and builds a
        DataFrame from the aligned records. It passes when nothing raises;
        no values are asserted.
        """

        import HTSeq
        samfile = os.path.join(base.datadir, 'test.sam')
        rows = [(a.read.seq.decode(), a.read.name, a.iv.chrom)
                for a in HTSeq.SAM_Reader(samfile) if a.aligned]
        # Built only to exercise DataFrame construction.
        df = pd.DataFrame(rows, columns=['seq', 'read', 'name'])
Esempio n. 15
0
    def __create_genomic_signals(self,
                                 stranded=True,
                                 func=None,
                                 use_wrappers=True):
        """Prepares ``self.coverage`` and ``self.library_size`` from ``self.filepath``.

        The input format is taken from ``self.filetype`` (case-insensitive).
        For record-based formats every record adds 1 to the coverage of its
        interval and 1 to ``self.library_size``. Wrapper-based formats
        replace ``self.coverage`` with the wrapper and leave
        ``self.library_size`` at 0.

        :param stranded: passed to ``HTSeq.GenomicArray``
        :param func: callable used for filetype "OTHER"; called with
            ``self.filepath`` and expected to yield objects with an ``iv``
        :param use_wrappers: use ``BedWrapper`` for BED; for BAM this raises
            because no wrapper exists
        :raises NotImplementedError: for BAM with ``use_wrappers`` and for
            BG / BEDGRAPH
        :raises AssertionError: for an unknown filetype
        """
        stderr.write("Creating %s signal. It may take few minutes...\n" %
                     self.name)
        self.coverage = HTSeq.GenomicArray("auto",
                                           stranded=stranded,
                                           typecode="d")
        self.library_size = 0

        kind = self.filetype.upper()
        if kind == "BED":
            if use_wrappers:
                self.coverage = BedWrapper(self.filepath)
                return
            records = HTSeq.BED_Reader(self.filepath)
        elif kind in ("GFF", "GTF"):
            records = HTSeq.GFF_Reader(self.filepath)
        elif kind == "SAM":
            records = HTSeq.SAM_Reader(self.filepath)
        elif kind == "BAM":
            if use_wrappers:
                # NOTE: with the default use_wrappers=True, BAM input always
                # ends here.
                raise NotImplementedError(
                    "Bam wrapper is not yet implemented!")
            records = HTSeq.BAM_Reader(self.filepath)
        elif kind in ("BG", "BEDGRAPH"):
            raise NotImplementedError("BedGraph is not yet implemented!")
        elif kind in ("BW", "BIGWIG"):
            self.coverage = BigWigWrapper(self.filepath)
            return
        elif kind == "OTHER":
            records = func(self.filepath)
        else:
            # Explicit raise: unlike `assert`, it is not stripped by `-O`.
            raise AssertionError("I should not be here!")

        for record in records:
            self.coverage[record.iv] += 1
            self.library_size += 1
Esempio n. 16
0
def set_up_IO(fileIN, fileOUT, gff, downstream, upstream):
    '''Open the inputs and the output needed for the alignment processing.

    Returns a tuple of: the SAM alignments wrapped by
    HTSeq.bundle_multiple_alignments, the GFF reader (created with
    end_included=True), and the output file opened for writing with its
    header row already written. The caller is responsible for closing the
    output file.
    '''
    # Alignments from the SAM input, wrapped by bundle_multiple_alignments
    alignIN = HTSeq.bundle_multiple_alignments(HTSeq.SAM_Reader(fileIN))

    # Annotation
    annotation = HTSeq.GFF_Reader(gff, end_included=True)

    # Output table: header is "name" followed by one column per position
    # from -upstream up to, but not including, downstream
    countTable = open(fileOUT, 'w')
    coordinates = '\t'.join(
        str(position) for position in range(-upstream, downstream))
    countTable.write('name\t{coord}\n'.format(coord=coordinates))
    return alignIN, annotation, countTable
def aln_generator_from_single_samfile(samfile):
    """ Generator - group consecutive SAM records by read name.

    Yields (readname, alignment_object_list) tuples. Only adjacent records
    with the same name are grouped, so a name that reappears later in the
    file starts a new group.
    """
    group_name, group = None, []
    for aln in HTSeq.SAM_Reader(samfile):
        readname = aln.read.name
        # A name change closes the group collected so far.
        if group and readname != group_name:
            yield (group_name, group)
            group = []
        group_name = readname
        group.append(aln)
    # The last group is still pending when the file ends.
    if group:
        yield (group_name, group)
Esempio n. 18
0
def analyze_single_deletion_reads(input_sam_file):
    """Return the ids of paired reads that show a gap in at least one mate.

    Only pairs in which both mates are present and aligned are considered.
    A mate is "gapped" when its CIGAR has an ``N`` operation or a ``D``
    operation longer than 2. The result is the ids gapped in either mate,
    minus the ids that are ungapped in both mates. Ids are the first SAM
    field prefixed with ``@``.
    """

    def is_gapped(aln):
        # True when any CIGAR operation is a skip or a deletion longer than 2.
        return any(op.type == "N" or (op.type == "D" and op.size > 2)
                   for op in aln.cigar)

    def read_id(aln):
        return "@" + aln.get_sam_line().split("\t")[0]

    pairs = HTSeq.pair_SAM_alignments(HTSeq.SAM_Reader(input_sam_file))
    # Index 0 collects ids seen on the first mate, index 1 on the second.
    gapped = (set(), set())
    ungapped = (set(), set())

    for pair in pairs:
        if pair[0] is None or not pair[0].aligned:
            continue
        if pair[1] is None or not pair[1].aligned:
            continue
        for which in (0, 1):
            mate = pair[which]
            bucket = gapped if is_gapped(mate) else ungapped
            bucket[which].add(read_id(mate))

    gapped_any = gapped[0] | gapped[1]
    ungapped_both = ungapped[0] & ungapped[1]
    return gapped_any - ungapped_both
Esempio n. 19
0
def get_aligned_reads(samfile, collapsed=None, readcounts=None):
    """Get all aligned reads from a sam file into a pandas dataframe.

    Args:
        samfile: path of the SAM file (converted with ``str``).
        collapsed: optional path given to ``read_collapsed_file``; when set,
            its result replaces ``readcounts``.
        readcounts: optional dataframe with a ``seq`` column, merged onto the
            result.
    Returns:
        dataframe with columns seq, name (reference name), start, end,
        strand and length; plus the columns of ``readcounts`` and an
        ``align_id`` column when read counts are available.
    """

    sam = HTSeq.SAM_Reader(str(samfile))
    rows = [(a.read.seq.decode(), a.read.name, a.iv.chrom,
             a.iv.start, a.iv.end, a.iv.strand)
            for a in sam if a.aligned]
    counts = pd.DataFrame(
        rows, columns=['seq', 'read', 'name', 'start', 'end', 'strand'])
    counts['length'] = counts.seq.str.len()
    # axis must be a keyword: pandas 2.x no longer accepts it positionally.
    counts = counts.drop(['read'], axis=1)
    if collapsed is not None:
        readcounts = read_collapsed_file(collapsed)
    if readcounts is not None:
        counts = counts.merge(readcounts, on='seq')
        counts['align_id'] = counts.index
    return counts
Esempio n. 20
0
def getAllMappedReadsSam(annot_reads, htseq_no_ambiguous = False):
    ''' Creates a map from read name (suffixed with /1 or /2 for the first or
        second mate) to a tuple (mapping quality, gene, chromosome).

        The gene is the value of the XF optional field. Records are skipped
        when they are not aligned, when XF is one of the discarded markers
        below, or - with htseq_no_ambiguous - when XF contains "ambiguous".
        The chromosome is taken from the mate start position, or "Unknown"
        when there is none.
        We assume the gtf file has its gene ids replaced by gene names.
    '''
    # XF values that mark a read as not assigned to a gene.
    # (Renamed from `filter`, which shadowed the builtin.)
    discarded = frozenset(["no_feature","ambiguous",
                           "too_low_aQual","not_aligned",
                           "alignment_not_unique"])

    mapped = dict()
    sam = HTSeq.SAM_Reader(annot_reads)

    for alig in sam:

        gene_name = str(alig.optional_field("XF"))
        if gene_name in discarded or not alig.aligned or \
        (htseq_no_ambiguous and "ambiguous" in gene_name):
            continue

        which_mate = str(alig.pe_which)
        name = str(alig.read.name)
        mapping_quality = int(alig.aQual)

        if alig.mate_start:
            chromosome = alig.mate_start.chrom
        else:
            chromosome = "Unknown"

        if which_mate == "first":
            name += "/1"
        elif which_mate == "second":
            name += "/2"
        else:
            # Single-argument print works on both Python 2 and Python 3.
            print("Warning : un-strander read " + str(name))
            continue ## not possible

        mapped[name] = (mapping_quality,gene_name,chromosome)  # there should not be collisions

    return mapped
Esempio n. 21
0
def readcount(myfile):
    """Count the alignments of ``myfile`` per gene.

    Each record is counted under the single gene id it overlaps in the
    module-level ``exons`` lookup, or under ``_unmapped``, ``_no_feature``
    or ``_ambiguous``.

    Returns:
        Tuple ``(myfile, counts)`` where ``counts`` is a Counter.
    """
    counts = collections.Counter()
    for almnt in HTSeq.SAM_Reader(myfile):
        if almnt.aligned:
            overlapping = set()
            for _, val in exons[almnt.iv].steps():
                overlapping |= val

            if not overlapping:
                key = "_no_feature"
            elif len(overlapping) == 1:
                key = next(iter(overlapping))
            else:
                key = "_ambiguous"
        else:
            key = "_unmapped"
        counts[key] += 1

    return myfile, counts
Esempio n. 22
0
def analyze_merged_deletion_reads(input_sam_file):
    """Return the ids of reads that only ever align with a gap.

    An aligned record is "gapped" when its CIGAR has an ``N`` operation or a
    ``D`` operation longer than 2. The result is the ids with a gapped
    record, minus the ids that also have an ungapped record. Ids are the
    first SAM field prefixed with ``@``.
    """
    gapped_ids = set()
    ungapped_ids = set()

    for record in HTSeq.SAM_Reader(input_sam_file):
        if not record.aligned:
            continue
        has_gap = any(op.type == "N" or (op.type == "D" and op.size > 2)
                      for op in record.cigar)
        read_id = "@" + record.get_sam_line().split("\t")[0]
        if has_gap:
            gapped_ids.add(read_id)
        else:
            ungapped_ids.add(read_id)

    return gapped_ids.difference(ungapped_ids)
Esempio n. 23
0
def count_tRNAs(samples, tRNA_features, output_path='tRNA-counts.tsv'):
    """Count alignments per tRNA for every sample and write a TSV table.

    Each sample dict must provide ``sample`` (a label) and ``sam_path``. It is
    updated in place with ``counts`` (a Counter) and ``fractions`` (count
    divided by the number of alignments overlapping at least one tRNA,
    rounded to 5 places). Alignments overlapping several tRNAs are counted
    under ``unk``; ``_unmapped`` and ``_not_tRNA`` are kept in the dicts but
    left out of the file.

    Args:
        samples: iterable of sample dicts.
        tRNA_features: lookup indexable by a genomic interval whose
            ``.steps()`` yields sets of tRNA ids.
        output_path: TSV file to (over)write.

    Returns:
        List of the updated sample dicts.
    """
    samples_with_counts = []
    with open(output_path, 'w') as f:
        f.write('sample\ttRNA\tcount\tfraction\n')
    for sample in samples:
        print(f'counting tRNAs in {sample["sample"]}')
        counts = sample['counts'] = collections.Counter()
        tRNA_count_total = 0
        for almnt in HTSeq.SAM_Reader(sample['sam_path']):
            if not almnt.aligned:
                counts['_unmapped'] += 1
                continue
            # All tRNA ids the alignment overlaps.
            aligned_tRNA_IDs = set()
            for iv, val in tRNA_features[almnt.iv].steps():
                aligned_tRNA_IDs |= val
            if len(aligned_tRNA_IDs) == 1:
                counts[next(iter(aligned_tRNA_IDs))] += 1
                tRNA_count_total += 1
            elif len(aligned_tRNA_IDs) == 0:
                counts['_not_tRNA'] += 1
            else:
                counts['unk'] += 1  # ambiguous
                tRNA_count_total += 1

        sample['fractions'] = {}
        rows = []
        for tRNA, count in sorted(counts.items()):
            # A sample with no tRNA-assigned reads would divide by zero.
            if tRNA_count_total:
                fraction = round(count / tRNA_count_total, 5)
            else:
                fraction = 0.0
            sample['fractions'][tRNA] = fraction
            if tRNA != '_unmapped' and tRNA != '_not_tRNA':
                rows.append(f'{sample["sample"]}\t{tRNA}\t{count}\t{fraction}\n')
        # One append per sample instead of reopening the file for every row.
        if rows:
            with open(output_path, 'a') as f:
                f.writelines(rows)
        samples_with_counts.append(sample)
    return samples_with_counts
Esempio n. 24
0
def count_reads(start_codon_sites, stop_codon_sites, ORF_features, counts,
                map_file, stranded, min_quality, count_mode,
                first_exclude_codons, last_exclude_codons, min_read, max_read,
                exclude_min_ORF):
    """Count the reads of a SAM/BAM file per ORF and add them to ``counts``.

    A read is skipped (and tallied under a ``__...`` key) when it is not
    aligned, has an ``NH`` tag above 1, has alignment quality below
    ``min_quality``, or is shorter than ``min_read`` / longer than
    ``max_read``. The remaining reads are assigned from their ``M`` CIGAR
    intervals; only reads that resolve to exactly one ORF are counted.

    Args:
        start_codon_sites: mapping ORF id -> start codon position.
        stop_codon_sites: mapping ORF id -> stop codon position.
        ORF_features: lookup indexable by a genomic interval whose
            ``.steps()`` yields sets of ORF ids.
        counts: mapping that supports ``counts[key] += n``; updated in place.
        map_file: path of the SAM or BAM file; the format is detected
            with pysam.
        stranded: ``"reverse"`` inverts the strand of every interval first.
        min_quality: minimum alignment quality.
        count_mode: ``"intersection-strict"`` or ``"union"``.
        first_exclude_codons: reads starting closer than this many codons to
            the start codon are not counted.
        last_exclude_codons: reads ending closer than this many codons to the
            stop codon are not counted.
        min_read: minimum read length.
        max_read: maximum read length.
        exclude_min_ORF: reads shorter than this are counted without the
            start/stop exclusion check.

    Returns:
        ``counts``.
    """
    lowqual = 0
    notaligned = 0
    nonunique = 0
    too_short = 0
    too_long = 0
    min_read_string = "__too_short(<%i)" % min_read
    # NOTE(review): the label says "<" although it counts reads LONGER than
    # max_read. Left unchanged because callers may look the key up.
    max_read_string = "__too_long(<%i)" % max_read
    first_exclude_nt = first_exclude_codons * 3
    last_exclude_nt = last_exclude_codons * 3

    # pysam is used only to find out whether the file is BAM or SAM.
    pysam_fh = pysam.AlignmentFile(map_file)
    try:
        is_bam = pysam_fh.is_bam
    finally:
        pysam_fh.close()
    if is_bam:
        tracks = HTSeq.BAM_Reader(map_file)
    else:
        tracks = HTSeq.SAM_Reader(map_file)

    for r in tracks:
        if not r.aligned:
            notaligned += 1
            continue
        try:
            if r.optional_field("NH") > 1:
                nonunique += 1
                continue
        except KeyError:
            # No NH tag: the read is treated as uniquely aligned.
            pass
        if r.aQual < min_quality:
            lowqual += 1
            continue
        read_len = len(r.read.seq)
        if read_len < min_read:
            too_short += 1
            continue
        if read_len > max_read:
            too_long += 1
            continue
        if stranded != "reverse":
            iv_seq = (co.ref_iv for co in r.cigar
                      if co.type == "M" and co.size > 0)
        else:
            iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                      if co.type == "M" and co.size > 0)

        # Best effort per read: a failure is reported and the read skipped.
        # `except Exception` keeps that behaviour without swallowing
        # KeyboardInterrupt / SystemExit as the former bare `except:` did.
        try:
            # NOTE(review): any other count_mode leaves `fs` unassigned, so
            # every read ends in the error branch below.
            if count_mode == "intersection-strict":
                fs = None
                for iv in iv_seq:
                    for iv2, fs2 in ORF_features[iv].steps():
                        if fs is None:
                            fs = fs2.copy()
                        else:
                            fs = fs.intersection(fs2)
            elif count_mode == "union":
                fs = set()
                for iv in iv_seq:
                    for iv2, fs2 in ORF_features[iv].steps():
                        fs = fs.union(fs2)
            if fs is None or len(fs) == 0:
                continue
            elif len(fs) > 1:
                continue
            else:
                orf_id = list(fs)[0]
                if read_len < exclude_min_ORF:
                    counts[orf_id] += 1
                    continue
                try:
                    if abs(start_codon_sites[orf_id] -
                           r.iv.start_d) < first_exclude_nt:
                        continue
                    elif abs(r.iv.end_d -
                             stop_codon_sites[orf_id]) < last_exclude_nt:
                        continue
                    else:
                        counts[orf_id] += 1
                except Exception:
                    # E.g. no start/stop site known for this ORF: count the
                    # read without the exclusion check.
                    counts[orf_id] += 1
        except Exception:
            sys.stderr.write(
                "Error occurred when processing mapping file in line:%s\n" %
                r.get_sam_line())
    counts["__too_low_quality"] += lowqual
    counts["__not_aligned"] += notaligned
    counts[min_read_string] += too_short
    counts[max_read_string] += too_long
    counts["__alignment_not_unique"] += nonunique

    return counts
def _sci_gene_sets(alnmt, features):
    """Return ``(intersect, combine)`` gene-ID sets for one alignment.

    Only CIGAR operations of type "M" are considered.  For every step of
    ``features[cigop.ref_iv].steps()`` the step's set is OR-ed into
    ``combine``; ``intersect`` starts as the first step's set and is
    AND-ed with every following step.
    """
    intersect = set()
    combine = set()
    first_step = True
    for cigop in alnmt.cigar:
        if cigop.type != "M":
            continue
        for _, val in features[cigop.ref_iv].steps():
            combine |= val
            if first_step:
                intersect |= val
                first_step = False
            else:
                intersect &= val
    return intersect, combine


def _sci_classify(alnmt, exons, genes, gene_end):
    """Assign one alignment to a gene.

    Exons are tried first, then whole genes (reported with an "_intron"
    suffix).  Within each level the intersection set is preferred over the
    union set; a single candidate is a "perfect" match, several candidates
    are resolved with ``find_nearest_gene``.

    Returns ``(count_key, tally_key)`` or ``None`` when nothing overlaps.
    """
    levels = ((exons, "exon", ""), (genes, "gene", "_intron"))
    for features, level, suffix in levels:
        intersect, combine = _sci_gene_sets(alnmt, features)
        for kind, candidates in (("inter", intersect), ("combine", combine)):
            if len(candidates) == 1:
                gene_id = next(iter(candidates))
                match = "perfect"
            elif len(candidates) > 1:
                gene_id = find_nearest_gene(alnmt.iv.end_d, candidates,
                                            gene_end)
                match = "nearest"
            else:
                continue
            if suffix:
                gene_id = gene_id + suffix
            return gene_id, "%s_%s_%s" % (match, kind, level)
    return None


def sciRNAseq_count(sample, input_folder, exons, genes, gene_end, gene_annotat,
                    sample_ID):
    """Count reads per gene for one sample's SAM file.

    Reads ``<input_folder>/<sample>.sam`` and writes two files next to it:
    ``<sample>.report`` (one "category,cell_ID,count" line for each of the
    eleven match categories) and ``<sample>.count`` (one
    "annotation,cell_ID,count" line per counted gene key).

    Args:
        sample: Sample name; also the base name of the input/output files.
        input_folder: Directory holding the SAM file and receiving outputs.
        exons: Object indexable by a genomic interval whose result offers
            ``steps()`` yielding ``(interval, set_of_gene_ids)``.
        genes: Same protocol as ``exons``; must also expose
            ``chrom_vectors`` for the chromosome membership check.
        gene_end: Passed through unchanged to ``find_nearest_gene``.
        gene_annotat: Looked up as ``gene_annotat.loc[gene, 4]``
            (presumably a pandas DataFrame indexed by gene key -- confirm
            with the caller).
        sample_ID: Sequence of sample names; the 1-based position of
            ``sample`` in it becomes the cell ID.

    Returns:
        Always 0.
    """
    input_sam = input_folder + "/" + sample + ".sam"
    report_path = input_folder + "/" + sample + ".report"
    count_path = input_folder + "/" + sample + ".count"

    counts = collections.Counter()
    almnt_file = HTSeq.SAM_Reader(input_sam)
    cell_ID = sample_ID.index(sample) + 1

    # Per-category tallies, keyed "<perfect|nearest>_<inter|combine>_<exon|gene>".
    tally = collections.Counter()

    print("Start read the input file: " + input_sam + "....")

    for alnmt in almnt_file:
        if not alnmt.aligned:
            counts["_unmapped"] += 1
            continue

        # Reads on chromosomes absent from the gene annotation are
        # treated like unmapped reads.
        if alnmt.iv.chrom not in genes.chrom_vectors:
            counts["_unmapped"] += 1
            continue

        hit = _sci_classify(alnmt, exons, genes, gene_end)
        if hit is None:
            counts["_no_feature"] += 1
        else:
            count_key, tally_key = hit
            counts[count_key] += 1
            tally[tally_key] += 1

    # Rows appear in the fixed order used by both the console summary and
    # the report file (categories 1..11).
    summary = [
        ("Perfect intersect exon match", tally["perfect_inter_exon"]),
        ("Nearest intersect exon match", tally["nearest_inter_exon"]),
        ("Perfect combine exon match", tally["perfect_combine_exon"]),
        ("Nearest combine exon match", tally["nearest_combine_exon"]),
        ("Perfect intersect gene match", tally["perfect_inter_gene"]),
        ("Nearest intersect gene match", tally["nearest_inter_gene"]),
        ("Perfect combine gene match", tally["perfect_combine_gene"]),
        ("Nearest combine gene match", tally["nearest_combine_gene"]),
        ("ambiguous match for exons", counts["_ambiguous"]),
        ("ambiguous match for genes", counts["_ambiguous_intron"]),
        ("No match", counts["_no_feature"]),
    ]

    print("File name: ", input_sam)
    for idx, (label, value) in enumerate(summary, 1):
        print("%d: %s: " % (idx, label), value)
    print("Sam file analysis finished~")

    with open(report_path, 'w') as report_file:
        for idx, (_, value) in enumerate(summary, 1):
            report_file.write("%d,%s,%s\n" % (idx, cell_ID, value))

    # Bookkeeping keys are not genes and have no annotation row.
    reserved = ("_unmapped", "_ambiguous", "_ambiguous_intron", "_no_feature")
    with open(count_path, 'w') as count_file:
        for gene in counts:
            if gene in reserved:
                continue
            count_file.write(
                str(gene_annotat.loc[gene, 4]) + "," + str(cell_ID) + "," +
                str(counts[gene]) + "\n")
    return 0
Esempio n. 26
0
#Sort your SAM file by read ID, so that multiple mappings are in adjacent lines, and then use this script to keep only the best one
#Written by Simon Anders
import sys, re
import HTSeq

insam = HTSeq.SAM_Reader(sys.stdin)

# Go through all reads, with their alignments bundled up:
for bundle in HTSeq.bundle_multiple_alignments(insam):
    bestAlmt = None
    bestIsTied = False
    # Go through all alignments of a given read, looking
    # for the one with the best alignment score
    for almt in bundle:
        if bestAlmt is None or almt.aQual > bestAlmt.aQual:
            bestAlmt = almt
            bestIsTied = False
        elif almt.aQual == bestAlmt.aQual:
            # More than one alignment shares the best score so far.
            # Remember the tie instead of dropping bestAlmt, so that a
            # later, worse alignment cannot be picked by mistake.
            bestIsTied = True
    if bestAlmt is not None and not bestIsTied:
        # Change the NH field to 1 and print the line
        print(re.sub(r"NH:i:\d+", "NH:i:1", bestAlmt.original_sam_line))

# Call this script with the command:
#   sort samfile.sam | python chooseBest.py > filtered.sam
sys.stdout.flush()

# Build the feature lookup with the requested strandedness.
# NOTE(review): any args.stranded value other than 'yes'/'no' leaves
# feature_array unassigned by this fragment -- confirm the argument parser
# restricts the choices.
if args.stranded == 'yes':
    feature_array = hts.GenomicArrayOfSets("auto", stranded=True)
elif args.stranded == 'no':
    feature_array = hts.GenomicArrayOfSets("auto", stranded=False)

# Register the name of every feature of the requested type on its interval.
for feature in gtf:
    if feature.type == args.type:
        feature_array[feature.iv] += feature.name

print("done.\n\n")

# create Reader class for the alignment file:
if args.format == 'sam':
    alnmt_file = hts.SAM_Reader(args.alignment_file[0])
else:
    alnmt_file = hts.BAM_Reader(args.alignment_file[0])

# count reads:
print("Counting reads...")

if args.read_type == 'single_end':
    counts = ungapped_se_counter(alnmt_file, feature_array)

    print("\nSample output for ungapped SE counts:")
    # Show the last ten entries when sorted by (name, count).
    countlist = sorted(counts.items())
    for g, c in countlist[-10:]:
        print("%-10s %d" % (g, c))
else:
    counts = ungapped_pe_counter(alnmt_file, feature_array)
Esempio n. 28
0
# Extract unaligned reads longer than 200 nt from a SAM file into a FASTQ
# file, and save a histogram of all unaligned read lengths next to it.
import sys

import HTSeq
import matplotlib.pyplot as plt
import numpy as np

if len(sys.argv) < 3:
    print("Please enter input file (.sam) and output file (.fastq)!")
    # Non-zero status so calling shells/pipelines can detect the misuse.
    sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
if not (input_file.endswith(".sam") and output_file.endswith(".fastq")):
    print("Please enter input file (.sam) and output file (.fastq)!")
    sys.exit(1)

alignment_file = HTSeq.SAM_Reader(input_file)
len_reads = []
# The context manager closes the FASTQ file even if reading the SAM fails.
with open(output_file, "w") as my_fastq_file:
    for aln in alignment_file:
        if not aln.aligned:
            # Every unaligned read contributes to the length histogram ...
            len_reads.append(len(aln.read.seq))
            # ... but only reads longer than 200 nt are written out.
            if len(aln.read.seq) > 200:
                myread = HTSeq.SequenceWithQualities(
                    aln.read.seq, aln.read.name, aln.read.qualstr)
                myread.write_to_fastq_file(my_fastq_file)

# The figure is written to disk, so no notebook-only "%matplotlib inline"
# magic is needed (it is a syntax error in a plain Python script).
plt.hist(len_reads, bins=10)
plt.savefig(output_file + ".png")
Esempio n. 29
0

#We need this little helper below:
def reverse_strand(s):
    """Return the opposite strand symbol.

    Args:
        s: Strand symbol, "+" or "-".

    Returns:
        "-" for "+", and "+" for "-".

    Raises:
        SystemError: If ``s`` is neither "+" nor "-".
    """
    if s == "+":
        return "-"
    if s == "-":
        return "+"
    # Call syntax is valid on both Python 2 and Python 3.
    raise SystemError("illegal strand")


# Now go through the aligned reads

# Choose the reader class that matches the input format, then open the file.
tmp_obj = (HTSeq.BAM_Reader if is_BAM else HTSeq.SAM_Reader)(sam_file)

if not is_PE:

    num_reads = 0
    for a in tmp_obj:
        if not a.aligned:
            # Unaligned records are only tallied.
            counts['_notaligned'] += 1
        elif a.aQual < minaqual:
            # Alignments below the quality threshold are only tallied.
            counts['_lowaqual'] += 1
        else:
            rs = set()
Esempio n. 30
0
#We need this little helper below:
def reverse_strand( s ):
   """Return the opposite strand symbol.

   Args:
      s: Strand symbol, "+" or "-".

   Returns:
      "-" for "+", and "+" for "-".

   Raises:
      SystemError: If ``s`` is neither "+" nor "-".
   """
   if s == "+":
      return "-"
   if s == "-":
      return "+"
   # Call syntax is valid on both Python 2 and Python 3.
   raise SystemError( "illegal strand" )

# Now go through the aligned reads

if not is_PE:

   num_reads = 0
   for a in HTSeq.SAM_Reader( sam_file ):
      if not a.aligned:
         counts[ '_notaligned' ] += 1
         continue
      if a.aQual < minaqual:
         counts[ '_lowaqual' ] += 1
         continue
      rs = set()
      for cigop in a.cigar:
         if cigop.type != "M":
            continue
         if reverse:
            cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
         for iv, s in features[cigop.ref_iv].steps( ):
            rs = rs.union( s )
      set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )