def bam_variant_aln(args):
    """
    For each variant in a VCF file, fetch the BAM alignments overlapping it.

    :param args: parsed arguments with ``bam`` (path to an indexed BAM file)
       and ``vcf`` (path to a VCF file)
    :return: list of (reference name, alignment) tuples for alignments
       overlapping a variant position
    """
    # BUG FIX: the original referenced an undefined name (`open(vcf)`), used
    # `self` inside a plain function, reused the same loop variable for VCF
    # records and alignments, and discarded the getrname() result.
    samfile = Samfile(args.bam)
    fp = open(args.vcf)
    try:
        reader = pyvcf.Reader(fp)
        positions = []
        for rec in reader:
            # assumes PyVCF records expose CHROM and 1-based POS -- TODO confirm
            for aln in samfile.fetch(rec.CHROM, rec.POS - 1, rec.POS):
                positions.append((samfile.getrname(aln.tid), aln))
        return positions
    finally:
        fp.close()
        samfile.close()
def single_end_sam_parsing(sam_list, cov, identity_threshold): match = {} to_process = [] if sam_list[0] is None: print "The ene-to-end mapping of SE data produced an error." else: to_process.append(sam_list[0]) if sam_list[1] is None: print "The local mapping mode of SE data produced an error." else: to_process.append(sam_list[1]) for single_sam in to_process: sam = Samfile(single_sam) for align in sam: if align.tid != -1: query_name, query_len, ref_name = align.qname, float( align.rlen), sam.getrname(align.tid) if align.cigar is not None: align_len, query_aligned_len = cigar_parsing(align.cigar) nm = -1 if (query_aligned_len / query_len) * 100 >= cov: for coppia in align.tags: if coppia[0] == "NM": nm = float(coppia[1]) if align_len != 0 and nm >= 0: paired_perc_id = ((align_len - nm) / align_len) * 100 if paired_perc_id >= identity_threshold: match.setdefault(query_name, set()) match[query_name].add(ref_name) sam.close() return match
def main(args):
    """
    Group every read of a SAM/BAM file by reference and print each group.

    :param args: parsed arguments with ``bamfile`` (input path) and
       ``samformat`` (True when the input is plain SAM rather than BAM)
    """
    option = "r" if args.samformat else "rb"
    # BUG FIX: the computed open mode was previously ignored and "rb" was
    # always passed, breaking plain-SAM input.
    samfile = Samfile(args.bamfile, option)
    # Iterates over each read instead of each contig
    outputs = defaultdict(list)
    for aln in samfile.fetch(until_eof=True):
        ref = samfile.getrname(aln.tid)
        outputs[ref].append(aln)
    for ref, alns in outputs.iteritems():
        print_reads(alns, ref, samfile.header)
def main(): bam = Samfile("bedtools/tests/data/NA18152.bam", "rb") rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed") for al in bam: chrom = bam.getrname(al.rname) start = al.pos end = al.aend name = al.qname for hit in rmsk.search(chrom, start, end): print chrom, start, end, name, print hit.chrom, hit.start, hit.end, hit.name
def paired_end_sam_parsing(sam_list, cov, identity_threshold): match = {} to_process = [] if sam_list[0] is None: print "The ene-to-end mapping of SE data produced an error." else: to_process.append(sam_list[0]) if sam_list[1] is None: print "The local mapping mode of SE data produced an error." else: to_process.append(sam_list[1]) for paired_sam in to_process: r1_match = {} r2_match = {} sam = Samfile(paired_sam) for align in sam: if align.tid != -1: query_name, query_len, ref_name = align.qname, float( align.rlen), sam.getrname(align.tid) if align.cigar is not None: align_len, query_aligned_len = cigar_parsing(align.cigar) # print query_name, align_len, query_aligned_len nm = -1 if (query_aligned_len / query_len) * 100 >= cov: for coppia in align.tags: if coppia[0] == "NM": nm = float(coppia[1]) if align_len != 0 and nm >= 0: paired_perc_id = ((align_len - nm) / align_len) * 100 if paired_perc_id >= 90: if align.is_read1: r1_match.setdefault(query_name, {}) r1_match[query_name].setdefault(ref_name, []) r1_match[query_name][ref_name].append( paired_perc_id) if align.is_read2: r2_match.setdefault(query_name, {}) r2_match[query_name].setdefault(ref_name, []) r2_match[query_name][ref_name].append( paired_perc_id) sam.close() for query in set(r1_match.keys()).intersection(set(r2_match.keys())): for ref in set(r1_match[query].keys()).intersection( r2_match[query].keys()): average_perc_id = calcola_media( [max(r1_match[query][ref]), max(r2_match[query][ref])]) if average_perc_id >= identity_threshold: match.setdefault(query, set()) match[query].add(ref) return match
def main(): bam = Samfile("bedtools/tests/data/NA18152.bam", "rb") rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed") # Example 1: # Method: IntervalFile.all_hits() # Report _all_ of the rmsk features that overlap with the BAM alignment for al in bam: strand = "+" if al.is_reverse: strand = "-" i = Interval(bam.getrname(al.rname), al.pos, al.aend, strand) for hit in rmsk.all_hits(i, same_strand=True, ovlp_pct=0.75): print "\t".join(str(x) for x in [i, hit])
def main(): bam = Samfile("bedtools/tests/data/NA18152.bam", "rb") rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed") # Example 1: # Method: IntervalFile.all_hits() # Report _all_ of the rmsk features that overlap with the BAM alignment for al in bam: strand = "+" if al.is_reverse: strand = "-" i = Interval(bam.getrname(al.rname), al.pos, al.aend, strand) for hit in rmsk.all_hits(i, same_strand=True, ovlp_pct=0.75): print "\t".join(str(x) for x in [i,hit])
def _bowtie2_filter(fnam, fastq_path, unmap_out, map_out):
    """
    Divides reads in a map file in two categories: uniquely mapped, and not.
    Writes them in two files

    :param fnam: path to the SAM/BAM file produced by bowtie2
    :param fastq_path: path to the FASTQ file the reads came from
       (assumed to be in the same order as the SAM records -- TODO confirm)
    :param unmap_out: output path for unmapped / low-quality reads
    :param map_out: output path for uniquely mapped reads
    """
    try:
        fhandler = Samfile(fnam)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % fnam)
    # map target ids (tid) to chromosome names
    i = 0
    crm_dict = {}
    while True:
        try:
            crm_dict[i] = fhandler.getrname(i)
            i += 1
        except ValueError:
            break
    # iteration over reads.
    # BUG FIX: the three file handles were previously leaked if parsing
    # raised; context managers guarantee they are closed.
    with open(unmap_out, 'w') as unmap_fh, \
         open(map_out, 'w') as map_fh, \
         open(fastq_path, 'r') as fastq_in:
        for line in fhandler:
            line_in = fastq_in.readline()
            if line.is_unmapped or line.mapq < 4:
                read = '%s\t%s\t%s\t%s\t%s\n' % (
                    line_in.split('\t', 1)[0].rstrip('\n')[1:],
                    line.seq, line.qual, '-', '-')
                unmap_fh.write(read)
            else:
                read = '%s\t%s\t%s\t%s\t%s:%s:%d:%d\n' % (
                    line.qname, line.seq, line.qual, '1',
                    crm_dict[line.tid],
                    '-' if line.is_reverse else '+',
                    line.pos + 1, len(line.seq))
                map_fh.write(read)
            # skip the remaining three lines of this FASTQ record
            for _ in range(3):
                fastq_in.readline()
def _sam_filter(fnam, fastq_path, unmap_out, map_out):
    """
    Divides reads in a map file in two categories: uniquely mapped, and not.
    Writes them in two files

    :param fnam: path to the SAM/BAM file
    :param fastq_path: path to the FASTQ file the reads came from
       (assumed to be in the same order as the SAM records -- TODO confirm)
    :param unmap_out: output path for unmapped / low-quality reads
    :param map_out: output path for uniquely mapped reads
    """
    try:
        fhandler = Samfile(fnam)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % fnam)
    # map target ids (tid) to chromosome names
    i = 0
    crm_dict = {}
    while True:
        try:
            crm_dict[i] = fhandler.getrname(i)
            i += 1
        except ValueError:
            break
    # iteration over reads.
    # BUG FIX: the three file handles were previously leaked if parsing
    # raised; context managers guarantee they are closed.
    with open(unmap_out, 'w') as unmap_fh, \
         open(map_out, 'w') as map_fh, \
         open(fastq_path, 'r') as fastq_in:
        for line in fhandler:
            line_in = fastq_in.readline()
            if line.is_unmapped or line.mapq < 4:
                read = '%s\t%s\t%s\t%s\t%s\n' % (
                    line_in.split('\t', 1)[0].rstrip('\n')[1:],
                    line.seq, line.qual, '-', '-')
                unmap_fh.write(read)
            else:
                read = '%s\t%s\t%s\t%s\t%s:%s:%d:%d\n' % (
                    line.qname, line.seq, line.qual, '1',
                    crm_dict[line.tid],
                    '-' if line.is_reverse else '+',
                    line.pos + 1, len(line.seq))
                map_fh.write(read)
            # skip the remaining three lines of this FASTQ record
            for _ in range(3):
                fastq_in.readline()
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None, genome_seq=None, re_name=None, verbose=False, clean=True, mapper=None, **kwargs): """ Parse sam/bam file using pysam tools. Keep a summary of the results into 2 tab-separated files that will contain 6 columns: read ID, Chromosome, position, strand (either 0 or 1), mapped sequence lebgth, position of the closest upstream RE site, position of the closest downstream RE site :param f_names1: a list of path to sam/bam files corresponding to the mapping of read1, can also be just one file :param f_names1: a list of path to sam/bam files corresponding to the mapping of read2, can also be just one file :param out_file1: path to outfile tab separated format containing mapped read1 information :param out_file1: path to outfile tab separated format containing mapped read2 information :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`. containing the genomic sequence :param re_name: name of the restriction enzyme used :param None mapper: software used to map (supported are GEM and BOWTIE2). Guessed from file by default. 
""" # not nice, dirty fix in order to allow this function to only parse # one SAM file if not out_file1: raise Exception('ERROR: out_file1 should be given\n') if not re_name: raise Exception('ERROR: re_name should be given\n') if not genome_seq: raise Exception('ERROR: genome_seq should be given\n') if (f_names2 and not out_file2) or (not f_names2 and out_file2): raise Exception('ERROR: out_file2 AND f_names2 needed\n') frag_chunk = kwargs.get('frag_chunk', 100000) if verbose: print 'Searching and mapping RE sites to the reference genome' frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk, verbose=verbose) if isinstance(f_names1, str): f_names1 = [f_names1] if isinstance(f_names2, str): f_names2 = [f_names2] if f_names2: fnames = f_names1, f_names2 outfiles = out_file1, out_file2 else: fnames = (f_names1,) outfiles = (out_file1, ) # max number of reads per intermediate files for sorting max_size = 1000000 windows = {} multis = {} procs = [] for read in range(len(fnames)): if verbose: print 'Loading read' + str(read + 1) windows[read] = {} num = 0 # iteration over reads nfile = 0 tmp_files = [] reads = [] for fnam in fnames[read]: try: fhandler = Samfile(fnam) except IOError: print 'WARNING: file "%s" not found' % fnam continue except ValueError: raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam) # get the iteration number of the iterative mapping try: num = int(fnam.split('.')[-1].split(':')[0]) except: num += 1 # set read counter windows[read].setdefault(num, 0) # guess mapper used if not mapper: mapper = fhandler.header['PG'][0]['ID'] if mapper.lower()=='gem': condition = lambda x: x[1][0][0] != 'N' elif mapper.lower() in ['bowtie', 'bowtie2']: condition = lambda x: 'XS' in dict(x) else: warn('WARNING: unrecognized mapper used to generate file\n') condition = lambda x: x[1][1] != 1 if verbose: print 'loading SAM file from %s: %s' % (mapper, fnam) # getrname chromosome names i = 0 crm_dict = {} while True: try: crm_dict[i] = fhandler.getrname(i) 
i += 1 except ValueError: break # iteration over reads sub_count = 0 # to empty read buffer for r in fhandler: if r.is_unmapped: continue if condition(r.tags): continue positive = not r.is_reverse crm = crm_dict[r.tid] len_seq = len(r.seq) if positive: pos = r.pos + 1 else: pos = r.pos + len_seq try: frag_piece = frags[crm][pos / frag_chunk] except KeyError: # Chromosome not in hash continue idx = bisect(frag_piece, pos) try: next_re = frag_piece[idx] except IndexError: # case where part of the read is mapped outside chromosome count = 0 while idx >= len(frag_piece) and count < len_seq: pos -= 1 count += 1 frag_piece = frags[crm][pos / frag_chunk] idx = bisect(frag_piece, pos) if count >= len_seq: raise Exception('Read mapped mostly outside ' + 'chromosome\n') next_re = frag_piece[idx] prev_re = frag_piece[idx - 1 if idx else 0] name = r.qname reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % ( name, crm, pos, positive, len_seq, prev_re, next_re)) windows[read][num] += 1 sub_count += 1 if sub_count >= max_size: sub_count = 0 nfile += 1 write_reads_to_file(reads, outfiles[read], tmp_files, nfile) nfile += 1 write_reads_to_file(reads, outfiles[read], tmp_files, nfile) # we have now sorted temporary files # we do merge sort for eah pair if verbose: stdout.write('Merge sort') stdout.flush() while len(tmp_files) > 1: file1 = tmp_files.pop(0) try: file2 = tmp_files.pop(0) except IndexError: break if verbose: stdout.write('.') stdout.flush() nfile += 1 tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile)) if verbose: stdout.write('\n') tmp_name = tmp_files[0] if verbose: print 'Getting Multiple contacts' reads_fh = open(outfiles[read], 'w') ## Also pipe file header # chromosome sizes (in order) reads_fh.write('# Chromosome lengths (order matters):\n') for crm in genome_seq: reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm]))) reads_fh.write('# Mapped\treads count by iteration\n') for size in windows[read]: reads_fh.write('# MAPPED %d %d\n' % (size, 
windows[read][size])) ## Multicontacts tmp_reads_fh = open(tmp_name) try: read_line = tmp_reads_fh.next() except StopIteration: raise StopIteration('ERROR!\n Nothing parsed, check input files and' ' chromosome names (in genome.fasta and SAM/MAP' ' files).') prev_head = read_line.split('\t', 1)[0] prev_head = prev_head.split('~' , 1)[0] prev_read = read_line multis[read] = 0 for read_line in tmp_reads_fh: head = read_line.split('\t', 1)[0] head = head.split('~' , 1)[0] if head == prev_head: multis[read] += 1 prev_read = prev_read.strip() + '|||' + read_line else: reads_fh.write(prev_read) prev_read = read_line prev_head = head reads_fh.write(prev_read) reads_fh.close() if clean: os.system('rm -rf ' + tmp_name) # wait for compression to finish for p in procs: p.communicate() return windows, multis
def parse_gem_3c(f_name, out_file, genome_lengths, frags, verbose=False,
                 tmp_format=False, **kwargs):
    """
    Parse gem 3c sam file using pysam tools.

    :param f_name: path to sam file corresponding to the mapping of reads
    :param out_file: path to outfile tab separated format containing paired
       read information
    :param genome_lengths: a dictionary generated containing the length of
       the genomic sequence per chromosome
    :param frags: RE-fragment positions per chromosome, chunked by
       frag_chunk (assumed -- same layout as in parse_sam; TODO confirm)
    :param False tmp_format: If True leave the file prepared to be merged
       with other map files.
    """
    frag_chunk = kwargs.get('frag_chunk', 100000)
    try:
        fhandler = Samfile(f_name)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % f_name)
    # max number of reads in buffer
    max_size = 1000000
    # getrname chromosome names: map tid -> chromosome name until pysam
    # raises ValueError for an out-of-range tid
    i = 0
    crm_dict = {}
    while True:
        try:
            crm_dict[i] = fhandler.getrname(i)
            i += 1
        except ValueError:
            break
    # iteration over reads
    sub_count = 0
    nfile = 0
    tmp_files = []
    reads = []
    cur_name = ''
    # state machine: read1 holds the pending first mate; read2 accumulates
    # its second mates / supplementary alignments; write_pairs flips to True
    # when the next read1 (new query name) shows up and the group is flushed
    write_pairs = False
    read1 = None
    read2 = []
    samiter = fhandler.fetch(until_eof=True)
    r = None
    try:
        r = next(samiter)
    except StopIteration:
        # empty SAM file
        return None
        pass  # NOTE(review): unreachable after the return above
    while r:
        # skip unpaired, unmapped or low-quality records
        if not r.is_paired or r.is_unmapped or r.mapq < 4:
            try:
                r = next(samiter)
            except StopIteration:
                break
            continue
        if r.is_read1 and cur_name != r.qname:
            if read1 is None:
                # first mate of a new pair: remember it and advance
                read1 = r
                cur_name = r.qname
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
            else:
                # a new read1 arrived: flush the previous group below
                # (note: r is NOT advanced, it is reprocessed after the flush)
                write_pairs = True
        if not write_pairs:
            if r.is_read2 or r.is_supplementary:
                # collect mates/supplementary alignments of the current pair
                read2.append(r)
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
        else:
            if not read2:
                # read1 without any read2: drop it and reprocess r
                write_pairs = False
                read1 = None
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
            # compute RE-fragment coordinates for every read of the group
            reads_grp = []
            read_id = read1.query_name
            for read in [read1]+read2:
                if read.query_name != read_id:
                    continue
                positive = not read.is_reverse
                crm = crm_dict[read.tid]
                len_seq = read.reference_end-read.pos
                if positive:
                    pos = read.pos + 1
                else:
                    pos = read.pos + len_seq
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    # NOTE(review): this assigns `read_multi`, not
                    # `reads_grp`; a partially filled reads_grp still
                    # reaches the pairing step below -- looks like it was
                    # meant to clear reads_grp. TODO confirm.
                    read_multi = []
                    break
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                reads_grp.append([read.tid, crm, pos, positive, len_seq,
                                  prev_re, next_re])
            # merge groups of >2 alignments; drop groups of <2
            if len(reads_grp) > 2:
                _merge_multis(reads_grp)
            elif len(reads_grp) < 2:
                reads_grp = []
            # emit one record per pair combination, sorted by (tid, pos)
            reads_multi = []
            for paired_reads in combinations(reads_grp, 2):
                read_multi = [item for sublist in
                              sorted(paired_reads,
                                     key=lambda x: (x[0], x[2]))
                              for item in sublist]
                if read_multi:
                    reads_multi.append(read_multi)
            sub_count += 1
            paired_total = len(reads_multi)
            paired_nbr = 0
            for pair_read in reads_multi:
                read_name_id = read_id
                paired_nbr += 1
                if paired_total > 1:
                    # disambiguate multi-contact records of the same query
                    read_name_id += '#%d/%d' % (paired_nbr, paired_total)
                reads.append([read_name_id]+pair_read)
            # flush the buffer to a sorted temporary file when full
            if sub_count >= max_size:
                sub_count = 0
                nfile += 1
                reads = sorted(reads, key=lambda x: (x[1], x[3], x[8], x[10]))
                read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n'
                              % tuple(read) for read in reads]
                write_paired_reads_to_file(read_lines, out_file, tmp_files,
                                           nfile)
                #map_out.write('\n'.join(reads)+'\n')
                del reads[:]
            # reset the state machine; r (the pending new read1) is
            # reprocessed on the next loop iteration
            write_pairs = False
            read1 = None
            del read2[:]
    # final flush of the remaining buffered reads
    if reads:
        nfile += 1
        reads = sorted(reads, key=lambda x: (x[1], x[3], x[8], x[10]))
        read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n'
                      % tuple(read) for read in reads]
        write_paired_reads_to_file(read_lines, out_file, tmp_files, nfile)
        #map_out.write('\n'.join(reads))
        #map_out.close()
    # we have now sorted temporary files
    # we do merge sort for eah pair
    if verbose:
        stdout.write('Merge sort')
        stdout.flush()
    while len(tmp_files) > 1:
        file1 = tmp_files.pop(0)
        try:
            file2 = tmp_files.pop(0)
        except IndexError:
            break
        if verbose:
            stdout.write('.')
            stdout.flush()
        nfile += 1
        tmp_files.append(merge_sort(file1, file2, out_file, nfile,
                                    paired=True))
    if verbose:
        stdout.write('\n')
    if tmp_format:
        # leave the sorted temporary file as the output, ready for merging
        os.rename(tmp_files[0], out_file)
    else:
        # write the final file: chromosome-length header, then the records
        # with the two tid columns (fields 1 and 8) stripped out
        map_out = open(out_file, 'w')
        tmp_reads_fh = open(tmp_files[0], 'rb')
        for crm in genome_lengths:
            map_out.write('# CRM %s\t%d\n' % (crm, genome_lengths[crm]))
        for read_line in tmp_reads_fh:
            read = read_line.split('\t')
            map_out.write('\t'.join([read[0]]+read[2:8]+read[9:]))
        map_out.close()
        os.system('rm -rf ' + tmp_files[0])
    return out_file
def parse_sam(f_names1, f_names2, frags, out_file1, out_file2, genome_seq, re_name, verbose=False, **kwargs): """ Parse sam/bam file using pysam tools. Keep a summary of the results into 2 tab-separated files that will contain 6 columns: read ID, Chromosome, position, strand (either 0 or 1), mapped sequence lebgth, position of the closest upstream RE site, position of the closest downstream RE site :param f_names1: a list of path to sam/bam files corresponding to the mapping of read1, can also be just one file :param f_names1: a list of path to sam/bam files corresponding to the mapping of read2, can also be just one file :param frags: a dictionary generated by :func:`pyatdbit.mapping.restriction_enzymes.map_re_sites`. """ frags = map_re_sites(re_name, genome_seq, verbose=True) frag_chunk = kwargs.get('frag_chunk', 100000) fnames = f_names1, f_names2 outfiles = out_file1, out_file2 for read in range(2): if verbose: print 'Loading read' + str(read + 1) reads = [] for fnam in fnames[read]: if verbose: print 'loading file:', fnam try: fhandler = Samfile(fnam) except IOError: continue i = 0 crm_dict = {} while True: try: crm_dict[i] = fhandler.getrname(i).replace('chr', '') i += 1 except ValueError: break for r in fhandler: if r.is_unmapped: continue if r.tags[1][1] != 1: continue positive = not r.is_reverse crm = crm_dict[r.tid] len_seq = len(r.seq) pos = r.pos + (0 if positive else len_seq) try: frag_piece = frags[crm][pos / frag_chunk] except KeyError: # Chromosome not in hash continue idx = bisect(frag_piece, pos) prev_re = frag_piece[idx - 1] next_re = frag_piece[idx] name = r.qname reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % ( name, crm, pos, positive, len_seq, prev_re, next_re)) reads_fh = open(outfiles[read], 'w') reads_fh.write(''.join(sorted(reads))) reads_fh.close() del(reads)
usage() sys.exit() acc2node = itsonedb2node(fasta_itesondb) if outfile is None: sys.exit("Output file option is missing") match = {} # mappiamo prima in modalita' glocal if single_sam is not None: if os.path.exists(single_sam): sam = Samfile(single_sam) for align in sam: if align.tid != -1: query_name, query_len, ref_name = align.qname, float( align.rlen), sam.getrname(align.tid) if align.cigar is not None: align_len, query_aligned_len = cigar_parsing( align.cigar) nm = -1 if (query_aligned_len / query_len) * 100 >= coverage: for coppia in align.tags: if coppia[0] == "NM": nm = float(coppia[1]) if align_len != 0 and nm >= 0: paired_perc_id = ( (align_len - nm) / align_len) * 100 if paired_perc_id >= identity_threshold: match.setdefault(query_name, set()) match[query_name].add(ref_name) sam.close()
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will
    contain 6 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of path to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, basestring):
        f_names1 = [f_names1]
    if isinstance(f_names2, basestring):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000

    windows = {}
    multis = {}
    procs = []
    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads = []
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print('WARNING: file "%s" not found' % fnam)
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping.
            # BUG FIX: was a bare "except:" which also swallowed
            # KeyboardInterrupt/SystemExit; only int() can fail here.
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except ValueError:
                num += 1
            # set read counter
            windows[read].setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            if mapper.lower() == 'gem':
                condition = lambda x: x[1][0][0] != 'N'
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('loading SAM file from %s: %s' % (mapper, fnam))
            # getrname chromosome names
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            sub_count = 0  # to empty read buffer
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm = crm_dict[r.tid]
                len_seq = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                name = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[read][num] += 1
                sub_count += 1
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            nfile += 1
            write_reads_to_file(reads, outfiles[read], tmp_files, nfile)

        # we have now sorted temporary files
        # we do merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
                stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]

        if verbose:
            print('Getting Multiple contacts')
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = next(tmp_reads_fh)
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                ' chromosome names (in genome.fasta and SAM/MAP'
                                ' files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~', 1)[0]
        prev_read = read_line
        multis[read] = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~', 1)[0]
            if head == prev_head:
                # same fragment prefix: chain the lines as a multi-contact
                multis[read] += 1
                prev_read = prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        tmp_reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
def _read_one_sam(fnam, mapper, verbose, frags, frag_chunk, num): out = open(fnam + '.tsv', 'w') lwindows = {} try: fhandler = Samfile(fnam) except IOError: print 'WARNING: file "%s" not found' % fnam return {}, [] except ValueError: raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam) # get the iteration number of the iterative mapping num = int(fnam.split('.')[-1].split(':')[0]) lwindows.setdefault(num, 0) # guess mapper used if not mapper: mapper = fhandler.header['PG'][0]['ID'] if mapper.lower()=='gem': condition = lambda x: x[1][1] != 1 elif mapper.lower() in ['bowtie', 'bowtie2']: condition = lambda x: 'XS' in dict(x) else: warn('WARNING: unrecognized mapper used to generate file\n') condition = lambda x: x[1][1] != 1 if verbose: print 'loading %s file: %s\n' % (mapper, fnam), # iteration over lreads i = 0 crm_dict = {} while True: try: crm_dict[i] = fhandler.getrname(i) i += 1 except ValueError: break for r in fhandler: if r.is_unmapped: continue if condition(r.tags): continue positive = not r.is_reverse crm = crm_dict[r.tid] len_seq = len(r.seq) if positive: pos = r.pos + 1 else: pos = r.pos + len_seq + 1 try: frag_piece = frags[crm][pos / frag_chunk] except KeyError: # Chromosome not in hash continue idx = bisect(frag_piece, pos) try: next_re = frag_piece[idx] except IndexError: # case where part of the read is mapped outside chromosome count = 0 while idx >= len(frag_piece) and count < len_seq: pos -= 1 count += 1 frag_piece = frags[crm][pos / frag_chunk] idx = bisect(frag_piece, pos) if count >= len_seq: raise Exception('Read mapped mostly outside ' + 'chromosome\n') next_re = frag_piece[idx] prev_re = frag_piece[idx - 1 if idx else 0] name = r.qname out.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % ( name, crm, pos, positive, len_seq, prev_re, next_re)) lwindows[num] += 1 out.close() return lwindows
def _read_one_sam(fnam, mapper, verbose, frags, frag_chunk, num): out = open(fnam + '.tsv', 'w') lwindows = {} try: fhandler = Samfile(fnam) except IOError: print 'WARNING: file "%s" not found' % fnam return {}, [] except ValueError: raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam) # get the iteration number of the iterative mapping num = int(fnam.split('.')[-1].split(':')[0]) lwindows.setdefault(num, 0) # guess mapper used if not mapper: mapper = fhandler.header['PG'][0]['ID'] if mapper.lower() == 'gem': condition = lambda x: x[1][1] != 1 elif mapper.lower() in ['bowtie', 'bowtie2']: condition = lambda x: 'XS' in dict(x) else: warn('WARNING: unrecognized mapper used to generate file\n') condition = lambda x: x[1][1] != 1 if verbose: print 'loading %s file: %s\n' % (mapper, fnam), # iteration over lreads i = 0 crm_dict = {} while True: try: crm_dict[i] = fhandler.getrname(i) i += 1 except ValueError: break for r in fhandler: if r.is_unmapped: continue if condition(r.tags): continue positive = not r.is_reverse crm = crm_dict[r.tid] len_seq = len(r.seq) if positive: pos = r.pos + 1 else: pos = r.pos + len_seq + 1 try: frag_piece = frags[crm][pos / frag_chunk] except KeyError: # Chromosome not in hash continue idx = bisect(frag_piece, pos) try: next_re = frag_piece[idx] except IndexError: # case where part of the read is mapped outside chromosome count = 0 while idx >= len(frag_piece) and count < len_seq: pos -= 1 count += 1 frag_piece = frags[crm][pos / frag_chunk] idx = bisect(frag_piece, pos) if count >= len_seq: raise Exception('Read mapped mostly outside ' + 'chromosome\n') next_re = frag_piece[idx] prev_re = frag_piece[idx - 1 if idx else 0] name = r.qname out.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (name, crm, pos, positive, len_seq, prev_re, next_re)) lwindows[num] += 1 out.close() return lwindows
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None, genome_seq=None, re_name=None, verbose=False, mapper=None, **kwargs): """ Parse sam/bam file using pysam tools. Keep a summary of the results into 2 tab-separated files that will contain 6 columns: read ID, Chromosome, position, strand (either 0 or 1), mapped sequence lebgth, position of the closest upstream RE site, position of the closest downstream RE site :param f_names1: a list of path to sam/bam files corresponding to the mapping of read1, can also be just one file :param f_names1: a list of path to sam/bam files corresponding to the mapping of read2, can also be just one file :param out_file1: path to outfile tab separated format containing mapped read1 information :param out_file1: path to outfile tab separated format containing mapped read2 information :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`. containing the genomic sequence :param re_name: name of the restriction enzyme used :param None mapper: software used to map (supported are GEM and BOWTIE2). Guessed from file by default. 
""" # not nice, dirty fix in order to allow this function to only parse # one SAM file if not out_file1: raise Exception('ERROR: out_file1 should be given\n') if not re_name: raise Exception('ERROR: re_name should be given\n') if not genome_seq: raise Exception('ERROR: genome_seq should be given\n') if (f_names2 and not out_file2) or (not f_names2 and out_file2): raise Exception('ERROR: out_file2 AND f_names2 needed\n') frag_chunk = kwargs.get('frag_chunk', 100000) if verbose: print 'Searching and mapping RE sites to the reference genome' frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk, verbose=verbose) if isinstance(f_names1, str): f_names1 = [f_names1] if isinstance(f_names2, str): f_names2 = [f_names2] if f_names2: fnames = f_names1, f_names2 outfiles = out_file1, out_file2 else: fnames = (f_names1,) outfiles = (out_file1, ) for read in range(len(fnames)): if verbose: print 'Loading read' + str(read + 1) reads = [] for fnam in fnames[read]: if verbose: print 'loading file:', fnam try: fhandler = Samfile(fnam) except IOError: continue # guess mapper used if not mapper: mapper = fhandler.header['PG'][0]['ID'] if mapper.lower()=='gem': condition = lambda x: x[1][1] != 1 elif mapper.lower() in ['bowtie', 'bowtie2']: condition = lambda x: 'XS' in dict(x) else: warn('WARNING: unrecognized mapper used to generate file\n') condition = lambda x: x[1][1] != 1 if verbose: print 'MAPPER:', mapper # iteration over reads i = 0 crm_dict = {} while True: try: crm_dict[i] = fhandler.getrname(i) i += 1 except ValueError: break for r in fhandler: if r.is_unmapped: continue if condition(r.tags): continue positive = not r.is_reverse crm = crm_dict[r.tid] len_seq = len(r.seq) pos = r.pos + (0 if positive else len_seq) try: frag_piece = frags[crm][pos / frag_chunk] except KeyError: # Chromosome not in hash continue idx = bisect(frag_piece, pos) try: next_re = frag_piece[idx] except IndexError: # case where part of the read is mapped outside chromosome count = 0 while 
idx >= len(frag_piece) and count < len_seq: pos -= 1 count += 1 frag_piece = frags[crm][pos / frag_chunk] idx = bisect(frag_piece, pos) if count >= len_seq: raise Exception('Read mapped mostly outside ' + 'chromosome\n') next_re = frag_piece[idx] prev_re = frag_piece[idx - 1] name = r.qname reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % ( name, crm, pos, positive, len_seq, prev_re, next_re)) reads_fh = open(outfiles[read], 'w') ## write file header # chromosome sizes (in order) reads_fh.write('## Chromosome lengths (order matters):\n') for crm in genome_seq: reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm]))) reads_fh.write(''.join(sorted(reads))) reads_fh.close() del(reads)
# Group the alignments of error-affected reads by error type, collect the
# alignments of the same reads from additional BAM files, then dump one
# report file per error type.
# NOTE(review): this fragment is truncated in this chunk -- the final
# "for segment in segments:" loop has no body here.
# Map: error type -> read number -> list of pysam segments.
errors2segments = defaultdict(lambda: defaultdict(list));
samfile = Samfile(args.path)
for segment in samfile.fetch(until_eof=True):
    # read number is encoded before the first '|' of the query name
    num = segment.query_name.split("|")[0]
    # `errors` maps error type -> set of read numbers (defined elsewhere)
    for etype, eset in errors.iteritems():
        if(num in eset):
            errors2segments[etype][num].append(segment);
            break;
# Alignments of the same reads in the extra BAM files given on the
# command line, keyed by read number.
additional = defaultdict(list);
for fname in args.additional:
    tsamfile = Samfile(fname);
    for segment in tsamfile.fetch(until_eof=True):
        num = segment.query_name.split("|")[0]
        # wrap the segment with its reference name before the file closes
        additional[num].append(ArWrapper(segment,
                                         tsamfile.getrname(segment.tid)))
    tsamfile.close();
for etype, d in errors2segments.iteritems():
    # presumably etype is a 2-tuple, since it fills both %s slots of the
    # file name -- TODO confirm
    with open(os.path.join(args.outdir, "%s_%s_error.txt" % etype), 'w') as f:
        for num, segments in d.iteritems():
            # report the read sequence in its original orientation
            if(segments[0].is_reverse):
                seq = reverse_complement(segments[0].seq);
            else:
                seq = segments[0].seq
            f.write("%s\nnumber of read:\t%s\n\nSequence:\t%s\n\nSegments:\n\n" % ("_"*140, num, seq))
            for segment in segments:
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None, genome_seq=None, re_name=None, verbose=False, mapper=None, **kwargs): """ Parse sam/bam file using pysam tools. Keep a summary of the results into 2 tab-separated files that will contain 6 columns: read ID, Chromosome, position, strand (either 0 or 1), mapped sequence lebgth, position of the closest upstream RE site, position of the closest downstream RE site :param f_names1: a list of path to sam/bam files corresponding to the mapping of read1, can also be just one file :param f_names1: a list of path to sam/bam files corresponding to the mapping of read2, can also be just one file :param out_file1: path to outfile tab separated format containing mapped read1 information :param out_file1: path to outfile tab separated format containing mapped read2 information :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`. containing the genomic sequence :param re_name: name of the restriction enzyme used :param None mapper: software used to map (supported are GEM and BOWTIE2). Guessed from file by default. 
""" # not nice, dirty fix in order to allow this function to only parse # one SAM file if not out_file1: raise Exception('ERROR: out_file1 should be given\n') if not re_name: raise Exception('ERROR: re_name should be given\n') if not genome_seq: raise Exception('ERROR: genome_seq should be given\n') if (f_names2 and not out_file2) or (not f_names2 and out_file2): raise Exception('ERROR: out_file2 AND f_names2 needed\n') frag_chunk = kwargs.get('frag_chunk', 100000) if verbose: print 'Searching and mapping RE sites to the reference genome' frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk, verbose=verbose) if isinstance(f_names1, str): f_names1 = [f_names1] if isinstance(f_names2, str): f_names2 = [f_names2] if f_names2: fnames = f_names1, f_names2 outfiles = out_file1, out_file2 else: fnames = (f_names1,) outfiles = (out_file1, ) for read in range(len(fnames)): if verbose: print 'Loading read' + str(read + 1) windows = {} reads = [] num = 0 for fnam in fnames[read]: try: fhandler = Samfile(fnam) except IOError: print 'WARNING: file "%s" not found' % fnam continue except ValueError: raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam) # get the iteration number of the iterative mapping try: num = int(fnam.split('.')[-1].split(':')[0]) except: num += 1 windows.setdefault(num, 0) # guess mapper used if not mapper: mapper = fhandler.header['PG'][0]['ID'] if mapper.lower()=='gem': condition = lambda x: x[1][1] != 1 elif mapper.lower() in ['bowtie', 'bowtie2']: condition = lambda x: 'XS' in dict(x) else: warn('WARNING: unrecognized mapper used to generate file\n') condition = lambda x: x[1][1] != 1 if verbose: print 'loading %s file: %s' % (mapper, fnam) # iteration over reads i = 0 crm_dict = {} while True: try: crm_dict[i] = fhandler.getrname(i) i += 1 except ValueError: break for r in fhandler: if r.is_unmapped: continue if condition(r.tags): continue positive = not r.is_reverse crm = crm_dict[r.tid] len_seq = len(r.seq) if positive: pos = r.pos + 1 
else: pos = r.pos + len_seq + 1 try: frag_piece = frags[crm][pos / frag_chunk] except KeyError: # Chromosome not in hash continue idx = bisect(frag_piece, pos) try: next_re = frag_piece[idx] except IndexError: # case where part of the read is mapped outside chromosome count = 0 while idx >= len(frag_piece) and count < len_seq: pos -= 1 count += 1 frag_piece = frags[crm][pos / frag_chunk] idx = bisect(frag_piece, pos) if count >= len_seq: raise Exception('Read mapped mostly outside ' + 'chromosome\n') next_re = frag_piece[idx] prev_re = frag_piece[idx - 1 if idx else 0] name = r.qname reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % ( name, crm, pos, positive, len_seq, prev_re, next_re)) windows[num] += 1 reads_fh = open(outfiles[read], 'w') ## write file header # chromosome sizes (in order) reads_fh.write('## Chromosome lengths (order matters):\n') for crm in genome_seq: reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm]))) reads_fh.write('## Number of mapped reads by iteration\n') for size in windows: reads_fh.write('# MAPPED %d %d\n' % (size, windows[size])) reads_fh.write(''.join(sorted(reads))) reads_fh.close() del reads