def test_17_map_re_sites(self):
    """
    test fasta parsing and mapping re sites
    """
    ref_genome = parse_fasta(PATH + '/ref_genome/chr2L_chr4_dm3.bz2',
                             verbose=False)
    self.assertEqual(len(ref_genome['chr4']), 1351857)
    frags = map_re_sites('dpnIi', ref_genome)
    self.assertEqual(len(frags['chr2L']), 231)
    self.assertEqual(len(frags['chr2L'][230]), 16)
    self.assertEqual(frags['chr4'][10][50], 1018069)
    frags = map_re_sites('hindiii', ref_genome)
    self.assertEqual(len(frags['chr2L']), 231)
    self.assertEqual(len(frags['chr2L'][230]), 3)
    self.assertEqual(frags['chr4'][10][5], 1017223)
def test_17_map_re_sites(self):
    """
    test fasta parsing and mapping re sites
    """
    if ONLY and ONLY != "17":
        return
    if CHKTIME:
        t0 = time()
    ref_genome = parse_fasta(PATH + "/ref_genome/chr2L_chr4_dm3.bz2",
                             verbose=False)
    self.assertEqual(len(ref_genome["chr4"]), 1351857)
    frags = map_re_sites("dpnIi", ref_genome)
    self.assertEqual(len(frags["chr2L"]), 231)
    self.assertEqual(len(frags["chr2L"][230]), 16)
    self.assertEqual(frags["chr4"][10][50], 1018069)
    frags = map_re_sites("hindiii", ref_genome)
    self.assertEqual(len(frags["chr2L"]), 231)
    self.assertEqual(len(frags["chr2L"][230]), 3)
    self.assertEqual(frags["chr4"][10][5], 1017223)
    if CHKTIME:
        self.assertEqual(True, True)
        print "17", time() - t0
def test_17_map_re_sites(self):
    """
    test fasta parsing and mapping re sites
    """
    if ONLY and "17" not in ONLY:
        return
    if CHKTIME:
        t0 = time()
    ref_genome = parse_fasta(PATH + "/ref_genome/chr2L_chr4_dm3.bz2",
                             verbose=False)
    self.assertEqual(len(ref_genome["chr4"]), 1351857)
    frags = map_re_sites("dpnIi", ref_genome)
    self.assertEqual(len(frags["chr2L"]), 231)
    self.assertEqual(len(frags["chr2L"][230]), 16)
    self.assertEqual(frags["chr4"][10][50], 1018069)
    frags = map_re_sites("hindiii", ref_genome)
    self.assertEqual(len(frags["chr2L"]), 231)
    self.assertEqual(len(frags["chr2L"][230]), 3)
    self.assertEqual(frags["chr4"][10][5], 1017223)
    if CHKTIME:
        self.assertEqual(True, True)
        print "17", time() - t0
def test_17_map_re_sites(self):
    """
    test fasta parsing and mapping re sites
    """
    if ONLY and ONLY != '17':
        return
    if CHKTIME:
        t0 = time()
    ref_genome = parse_fasta(PATH + '/ref_genome/chr2L_chr4_dm3.bz2',
                             verbose=False)
    self.assertEqual(len(ref_genome['chr4']), 1351857)
    frags = map_re_sites('dpnIi', ref_genome)
    self.assertEqual(len(frags['chr2L']), 231)
    self.assertEqual(len(frags['chr2L'][230]), 16)
    self.assertEqual(frags['chr4'][10][50], 1018069)
    frags = map_re_sites('hindiii', ref_genome)
    self.assertEqual(len(frags['chr2L']), 231)
    self.assertEqual(len(frags['chr2L'][230]), 3)
    self.assertEqual(frags['chr4'][10][5], 1017223)
    if CHKTIME:
        self.assertEqual(True, True)
        print '17', time() - t0
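# The assertions above rely on the shape of the dictionary returned by
# map_re_sites: per chromosome, a dict keyed by chunk index (position divided
# by frag_chunk), each chunk holding a sorted list of RE-site positions. A
# minimal sketch of that layout, assuming the default frag_chunk of 100000;
# the helper build_frags below is hypothetical, for illustration only:
def build_frags(site_positions, frag_chunk=100000):
    """Group sorted RE-site positions into chunks of frag_chunk bases."""
    frags = {}
    for pos in site_positions:
        frags.setdefault(pos // frag_chunk, []).append(pos)
    return frags

# With this layout, chr2L of dm3 (23,011,544 bp) spans chunks 0..230, which
# is why len(frags['chr2L']) == 231, and frags['chr4'][10] lists the sites
# between positions 1,000,000 and 1,099,999 (hence the 1018069 assertion).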
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')
    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)
    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )
    # max number of reads per intermediate files for sorting
    max_size = 1000000
    windows = {}
    multis = {}
    procs = []
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads = []
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print 'WARNING: file "%s" not found' % fnam
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            # set read counter
            windows[read].setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            if mapper.lower() == 'gem':
                condition = lambda x: x[1][0][0] != 'N'
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print 'loading SAM file from %s: %s' % (mapper, fnam)
            # getrname chromosome names
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            sub_count = 0  # to empty read buffer
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm = crm_dict[r.tid]
                len_seq = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                name = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[read][num] += 1
                sub_count += 1
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files,
                                        nfile)
        nfile += 1
        write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
        # we have now sorted temporary files
        # we do merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
                stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))
        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = tmp_reads_fh.next()
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                ' chromosome names (in genome.fasta and SAM/MAP'
                                ' files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~', 1)[0]
        prev_read = read_line
        multis[read] = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~', 1)[0]
            if head == prev_head:
                multis[read] += 1
                prev_read = prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
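# A hypothetical call to the parse_sam version above (file paths are
# placeholders; the genome dictionary comes from parse_fasta as in the
# tests at the top of this file):
genome = parse_fasta('genome.fa', verbose=False)
windows, multis = parse_sam(['r1_iter1.bam', 'r1_iter2.bam'],
                            ['r2_iter1.bam', 'r2_iter2.bam'],
                            out_file1='reads1.tsv', out_file2='reads2.tsv',
                            genome_seq=genome, re_name='HindIII',
                            verbose=True, frag_chunk=100000)
# windows[0] maps iteration number -> mapped-read count for read1;
# multis[0] counts lines merged as multiple contacts ('|||'-joined).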
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              **kwargs):
    """
    Parse map files

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site. The position of reads mapped
    on the reverse strand will be computed from the end of the read (original
    position + read length - 1)

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param True clean: remove temporary files required for identification of
       multiple contacts
    :param False compress: compress (gzip) input map files. This is done in
       the background while next MAP files are parsed, or while files are
       sorted.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')
    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)
    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )
    # max number of reads per intermediate files for sorting
    max_size = 1000000
    windows = {}
    multis = {}
    procs = []
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads = []
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            # set read counter
            if verbose:
                print 'loading file: %s' % (fnam)
            # start parsing
            read_count = 0
            try:
                while True:
                    for _ in xrange(max_size):
                        try:
                            reads.append(read_read(fhandler.next(),
                                                   frags, frag_chunk))
                        except KeyError:
                            # Chromosome not in hash
                            continue
                        read_count += 1
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files,
                                        nfile)
            except StopIteration:
                fhandler.close()
                nfile += 1
                write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            windows[read][num] = read_count
            if kwargs.get('compress', False) and fnam.endswith('.map'):
                print 'compressing input MAP file'
                procs.append(Popen(['gzip', fnam]))
        nfile += 1
        write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
        # we have now sorted temporary files
        # we do merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
                stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))
        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = tmp_reads_fh.next()
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                ' chromosome names (in genome.fasta and SAM/MAP'
                                ' files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~', 1)[0]
        prev_read = read_line
        multis[read] = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~', 1)[0]
            if head == prev_head:
                multis[read] += 1
                prev_read = prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
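# write_reads_to_file and merge_sort are called above but not defined in this
# file. A minimal sketch of what such helpers could look like, hypothetical
# implementations only: tmp chunks are written sorted by whole line (the read
# ID is the first field) and merged pairwise with heapq.merge. Note that the
# fast_fragment_mapping version further down passes merge_sort an extra flag,
# so signatures clearly varied across versions.
import os
import heapq

def write_reads_to_file(reads, outfile, tmp_files, nfile):
    """Sort the buffered read lines and flush them to a new tmp chunk."""
    tmp_name = '%s_tmp_%03d' % (outfile, nfile)
    with open(tmp_name, 'w') as out:
        out.writelines(sorted(reads))
    del reads[:]  # empty the shared buffer in place
    tmp_files.append(tmp_name)

def merge_sort(file1, file2, outfile, nfile):
    """Merge two sorted tmp chunks into one and drop the inputs."""
    tmp_name = '%s_tmp_%03d' % (outfile, nfile)
    with open(file1) as fh1, open(file2) as fh2, open(tmp_name, 'w') as out:
        out.writelines(heapq.merge(fh1, fh2))
    os.remove(file1)
    os.remove(file2)
    return tmp_name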
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              **kwargs):
    """
    Parse map files

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param True clean: remove temporary files required for identification of
       multiple contacts
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')
    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)
    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows = {}
        tmp_name = os.path.join(*outfiles[read].split('/')[:-1]
                                + ['tmp_' + outfiles[read].split('/')[-1]])
        tmp_name = ('/' * outfiles[read].startswith('/')) + tmp_name
        tmp_reads_fh = open(tmp_name, 'w')
        sorter = Popen(['sort', '-k', '1,1', '-s', '-t', '\t'],
                       stdin=PIPE, stdout=tmp_reads_fh)
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            windows.setdefault(num, 0)
            if verbose:
                print 'loading file: %s' % (fnam)
            # iteration over reads
            for r in fhandler:
                name, seq, _, _, ali = r.split('\t')[:5]
                crm, strand, pos = ali.split(':')[:3]
                positive = strand == '+'
                len_seq = len(seq)
                if positive:
                    pos = int(pos)
                else:
                    pos = int(pos) + len_seq - 1  # remove 1 because all inclusive
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                sorter.stdin.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1
        if verbose:
            print 'finishing to sort'
        sorter.communicate()
        tmp_reads_fh.close()
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))
        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        read = tmp_reads_fh.next()
        prev_head = read.split('\t', 1)[0]
        prev_head = prev_head.split('~', 1)[0]
        prev_read = read
        for read in tmp_reads_fh:
            head = read.split('\t', 1)[0]
            head = head.split('~', 1)[0]
            if head == prev_head:
                prev_read = prev_read.strip() + '|||' + read
            else:
                reads_fh.write(prev_read)
                prev_read = read
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
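# The version above streams parsed lines straight into a unix `sort` child
# process instead of buffering tmp chunks in Python. A standalone sketch of
# that pattern (file name and lines are placeholders):
from subprocess import Popen, PIPE

out_fh = open('tmp_sorted.tsv', 'w')
# -k 1,1 sorts on the first tab-separated field (the read ID); -s keeps the
# sort stable so lines sharing a read ID stay in input order
sorter = Popen(['sort', '-k', '1,1', '-s', '-t', '\t'],
               stdin=PIPE, stdout=out_fh)
for line in ('id2\tchr1\t100\n', 'id1\tchr2\t200\n'):
    sorter.stdin.write(line)
sorter.stdin.close()  # signal EOF so sort can emit its output
sorter.wait()
out_fh.close()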
def parse_sam(f_names1, f_names2, frags, out_file1, out_file2, genome_seq,
              re_name, verbose=False, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param frags: a dictionary generated by
       :func:`pyatdbit.mapping.restriction_enzymes.map_re_sites`.
    """
    # note: the frags argument is immediately overridden here
    frags = map_re_sites(re_name, genome_seq, verbose=True)
    frag_chunk = kwargs.get('frag_chunk', 100000)
    fnames = f_names1, f_names2
    outfiles = out_file1, out_file2
    for read in range(2):
        if verbose:
            print 'Loading read' + str(read + 1)
        reads = []
        for fnam in fnames[read]:
            if verbose:
                print 'loading file:', fnam
            try:
                fhandler = Samfile(fnam)
            except IOError:
                continue
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i).replace('chr', '')
                    i += 1
                except ValueError:
                    break
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if r.tags[1][1] != 1:
                    continue
                positive = not r.is_reverse
                crm = crm_dict[r.tid]
                len_seq = len(r.seq)
                pos = r.pos + (0 if positive else len_seq)
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                prev_re = frag_piece[idx - 1]
                next_re = frag_piece[idx]
                name = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
        reads_fh = open(outfiles[read], 'w')
        reads_fh.write(''.join(sorted(reads)))
        reads_fh.close()
        del reads
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')
    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)
    if isinstance(f_names1, basestring):
        f_names1 = [f_names1]
    if isinstance(f_names2, basestring):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )
    # max number of reads per intermediate files for sorting
    max_size = 1000000
    windows = {}
    multis = {}
    procs = []
    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads = []
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print('WARNING: file "%s" not found' % fnam)
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            # set read counter
            windows[read].setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            if mapper.lower() == 'gem':
                condition = lambda x: x[1][0][0] != 'N'
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('loading SAM file from %s: %s' % (mapper, fnam))
            # getrname chromosome names
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            sub_count = 0  # to empty read buffer
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm = crm_dict[r.tid]
                len_seq = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                name = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[read][num] += 1
                sub_count += 1
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files,
                                        nfile)
        nfile += 1
        write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
        # we have now sorted temporary files
        # we do merge sort for each pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
                stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]
        if verbose:
            print('Getting Multiple contacts')
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))
        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = next(tmp_reads_fh)
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                ' chromosome names (in genome.fasta and SAM/MAP'
                                ' files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~', 1)[0]
        prev_read = read_line
        multis[read] = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~', 1)[0]
            if head == prev_head:
                multis[read] += 1
                prev_read = prev_read.strip() + '|||' + read_line
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        tmp_reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis
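# The core of every parser in this file is the same lookup: given a mapped
# position, bisect into the sorted RE-site list of the surrounding chunk to
# get the flanking sites. A self-contained sketch of that logic with a toy
# site list (values are made up):
from bisect import bisect

frag_chunk = 100000
# chunk 0 of a toy chromosome: RE sites at these positions
frags = {'chrT': {0: [1, 25000, 61000, 99000]}}

pos = 30000
frag_piece = frags['chrT'][pos // frag_chunk]
idx = bisect(frag_piece, pos)                # first site strictly after pos
next_re = frag_piece[idx]                    # 61000: closest downstream site
prev_re = frag_piece[idx - 1 if idx else 0]  # 25000: closest upstream site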
def parse_map(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, **kwargs):
    """
    Parse map files

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')
    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)
    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1, )
        outfiles = (out_file1, )
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows = {}
        tmp_name = ('/'.join(outfiles[read].split('/')[:-1]) + '/tmp_' +
                    outfiles[read].split('/')[-1])
        tmp_reads_fh = open(tmp_name, 'w')
        sorter = Popen(['sort', '-k', '1,1', '-s', '-t', '\t'],
                       stdin=PIPE, stdout=tmp_reads_fh)
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = magic_open(fnam)
            except IOError:
                warn('WARNING: file "%s" not found\n' % fnam)
                continue
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            windows.setdefault(num, 0)
            if verbose:
                print 'loading file: %s' % (fnam)
            # iteration over reads
            for r in fhandler:
                name, seq, _, _, ali = r.split('\t')[:5]
                crm, strand, pos = ali.split(':')[:3]
                positive = strand == '+'
                len_seq = len(seq)
                if positive:
                    pos = int(pos)
                else:
                    pos = int(pos) + len_seq - 1  # remove 1 because all inclusive
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                sorter.stdin.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1
        if verbose:
            print 'finishing to sort'
        sorter.communicate()
        tmp_reads_fh.close()
        if verbose:
            print 'Getting Multiple contacts'
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))
        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        read = tmp_reads_fh.next()
        prev_head = read.split('\t', 1)[0]
        prev_read = read.strip()
        for read in tmp_reads_fh:
            head = read.split('\t', 1)[0]
            if head == prev_head:
                prev_read += '|||' + read.strip()
            else:
                reads_fh.write(prev_read + '\n')
                prev_read = read.strip()
            prev_head = head
        reads_fh.write(prev_read + '\n')
        reads_fh.close()
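# The '# CRM' and '# MAPPED' header lines written above can be recovered
# later without re-reading the genome. A hypothetical helper showing how
# such a parsed file's header would be consumed:
def read_header(parsed_path):
    """Return chromosome lengths and per-iteration mapped-read counts
    from the commented header of a parsed-reads TSV."""
    lengths, mapped = {}, {}
    with open(parsed_path) as fh:
        for line in fh:
            if not line.startswith('#'):
                break  # header finished, data lines begin
            fields = line.split()
            if line.startswith('# CRM'):
                lengths[fields[2]] = int(fields[3])
            elif line.startswith('# MAPPED'):
                mapped[int(fields[2])] = int(fields[3])
    return lengths, mapped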
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, mapper=None,
              **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')
    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)
    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        reads = []
        for fnam in fnames[read]:
            if verbose:
                print 'loading file:', fnam
            try:
                fhandler = Samfile(fnam)
            except IOError:
                continue
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            if mapper.lower() == 'gem':
                condition = lambda x: x[1][1] != 1
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print 'MAPPER:', mapper
            # iteration over reads
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm = crm_dict[r.tid]
                len_seq = len(r.seq)
                pos = r.pos + (0 if positive else len_seq)
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1]
                name = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
        reads_fh = open(outfiles[read], 'w')
        ## write file header
        # chromosome sizes (in order)
        reads_fh.write('## Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write(''.join(sorted(reads)))
        reads_fh.close()
        del reads
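# The `condition` lambdas above decide which reads to discard as non-unique,
# based on r.tags (a list of (tag, value) pairs from pysam). The bowtie2 rule
# matches bowtie2's documented behavior: an XS tag is only emitted when a
# second valid alignment exists. The GEM rules index fixed tag positions and
# are kept as found. A toy check of the bowtie2 filter (tag values made up):
unique_tags = [('AS', -3), ('NM', 1)]
secondary_tags = [('AS', -3), ('XS', -10), ('NM', 1)]
condition = lambda x: 'XS' in dict(x)  # True means "discard this read"
assert condition(secondary_tags)
assert not condition(unique_tags)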
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, mapper=None,
              **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')
    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)
    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1)
        windows = {}
        reads = []
        num = 0
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print 'WARNING: file "%s" not found' % fnam
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            windows.setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            if mapper.lower() == 'gem':
                condition = lambda x: x[1][1] != 1
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' in dict(x)
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print 'loading %s file: %s' % (mapper, fnam)
            # iteration over reads
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm = crm_dict[r.tid]
                len_seq = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq + 1
                try:
                    frag_piece = frags[crm][pos / frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos / frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re = frag_piece[idx]
                prev_re = frag_piece[idx - 1 if idx else 0]
                name = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[num] += 1
        reads_fh = open(outfiles[read], 'w')
        ## write file header
        # chromosome sizes (in order)
        reads_fh.write('## Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('## Number of mapped reads by iteration\n')
        for size in windows:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))
        reads_fh.write(''.join(sorted(reads)))
        reads_fh.close()
        del reads
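# The iteration number `num` above is recovered from the file name, which
# appears to end with the iterative-mapping window; when that pattern is
# absent a running counter is used instead. A quick illustration of the
# parsing expression (file name is hypothetical):
fnam = 'reads.bam.25:75'
num = int(fnam.split('.')[-1].split(':')[0])
assert num == 25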
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, mapper=None,
              ncpus=1, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results in two tab-separated files that will
    contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1),
    mapped sequence length, position of the closest upstream RE site,
    position of the closest downstream RE site

    :param f_names1: a list of paths to sam/bam files corresponding to the
       mapping of read1, can also be just one file
    :param f_names2: a list of paths to sam/bam files corresponding to the
       mapping of read2, can also be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file2: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')
    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print 'Searching and mapping RE sites to the reference genome'
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)
    if isinstance(f_names1, str):
        f_names1 = [f_names1]
    if isinstance(f_names2, str):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1, )
        outfiles = (out_file1, )
    sorting = []
    for read in range(len(fnames)):
        if verbose:
            print 'Loading read' + str(read + 1) + '\n',
        pool = mu.Pool(ncpus)
        jobs = []
        num = 0
        for fnam in fnames[read]:
            num += 1
            jobs.append(pool.apply_async(
                _read_one_sam, args=(fnam, mapper, verbose, frags,
                                     frag_chunk, num)))
        pool.close()
        pool.join()
        windows = {}
        for w in jobs:
            w = w.get()
            for k in w:
                windows.setdefault(k, 0)
                windows[k] += w[k]
        reads_fh = open(outfiles[read], 'w')
        ## write file header
        # chromosome sizes (in order)
        reads_fh.write('## Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('## Number of mapped reads by iteration\n')
        for size in windows:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[size]))
        reads_fh.close()
        # write the rest of the file using bash to concatenate and sort
        # done asynchronously
        list_tsv = ' '.join([fnam + '.tsv' for fnam in fnames[read]])
        sort = ('sort -k1,1 %s ' % list_tsv)
        sorting.append(Popen(sort + '>> ' + outfiles[read], shell=True))
    if verbose:
        print 'Sorting reads'
    for s in sorting:
        if s.wait() > 0:
            raise Exception('ERROR: problem sorting file\n')
    if verbose:
        print 'Removing temporary files...'
    for read in range(len(fnames)):
        list_tsv = ' '.join([fnam + '.tsv' for fnam in fnames[read]])
        os.system('rm -f %s' % list_tsv)
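# _read_one_sam is dispatched to the worker pool above but is not defined in
# this file. A hypothetical sketch of its contract, inferred from the call
# site: parse one SAM/BAM file, write '<fnam>.tsv' next to it, and return
# {iteration_number: read_count} so the parent can merge the windows dicts.
# The mapper-specific uniqueness filter and the chromosome/boundary error
# handling of the serial versions are omitted here for brevity.
def _read_one_sam(fnam, mapper, verbose, frags, frag_chunk, num):
    counts = {num: 0}
    fhandler = Samfile(fnam)
    out = open(fnam + '.tsv', 'w')
    for r in fhandler:
        if r.is_unmapped:
            continue
        positive = not r.is_reverse
        crm = fhandler.getrname(r.tid)
        pos = r.pos + (1 if positive else len(r.seq))
        frag_piece = frags[crm][pos / frag_chunk]
        idx = bisect(frag_piece, pos)
        out.write('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
            r.qname, crm, pos, positive, len(r.seq),
            frag_piece[idx - 1 if idx else 0], frag_piece[idx]))
        counts[num] += 1
    out.close()
    return counts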
def fast_fragment_mapping(mapper_index_path, fastq_path1, fastq_path2,
                          r_enz, genome_seq, out_map, clean=True,
                          get_nread=False, mapper_binary=None,
                          mapper_params=None, samtools='samtools', **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome with the knowledge of
    the restriction enzyme used (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference
       genome using gem-index tool, bowtie2-build or hisat2-build
    :param fastq_path1: PATH to FASTQ file of read 1, either compressed or
       not.
    :param fastq_path2: PATH to FASTQ file of read 2, either compressed or
       not.
    :param out_map: path to outfile tab separated format containing mapped
       read information.
    :param r_enz: name of the restriction enzyme used in the experiment
       e.g. HindIII.
    :param genome_seq: a dictionary generated by
       :func:`pyatdbit.parser.genome_parser.parse_fasta` containing the
       genomic sequence
    :param True clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list of lists where each element
       contains a path and the number of reads processed
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will
       be written there.
    :param gem-mapper mapper_binary: path to the binary mapper
    :param None mapper_params: extra parameters for the mapper
    :param samtools samtools: path to samtools binary.

    :returns: outfile with the intersected read pairs
    """
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    samtools = which(samtools)
    # check out folder
    if not os.path.isdir(os.path.dirname(os.path.abspath(out_map))):
        raise Exception(
            '\n\nERROR: Path to store the output does not exist.\n')
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    gem_version = None
    # check that we have the GEM binary:
    gem_binary = mapper_binary or 'gem-mapper'
    gem_binary = which(gem_binary)
    if not gem_binary:
        raise Exception('\n\nERROR: GEM v3 binary not found, install it from:'
                        '\nhttps://github.com/smarco/gem3-mapper\n'
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n')
    try:
        out, err = Popen([gem_binary, '--version'], stdout=PIPE,
                         stderr=STDOUT,
                         universal_newlines=True).communicate()
        gem_version = int(out[1])
    except ValueError:
        gem_version = 2
        print('Falling back to GEM v2')
    if gem_version < 3:
        raise Exception('\n\nERROR: GEM v3 binary not found, install it from:'
                        '\nhttps://github.com/smarco/gem3-mapper\n'
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n')
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))
    # iterative mapping
    base_name1 = os.path.split(fastq_path1)[-1].replace('.gz', '')
    base_name1 = '.'.join(base_name1.split('.')[:-1])
    curr_map1, _ = transform_fastq(
        fastq_path1, mkstemp(prefix=base_name1 + '_', dir=temp_dir)[1],
        fastq=is_fastq(fastq_path1), nthreads=nthreads, light_storage=True)
    base_name2 = os.path.split(fastq_path2)[-1].replace('.gz', '')
    base_name2 = '.'.join(base_name2.split('.')[:-1])
    curr_map2, count_fastq = transform_fastq(
        fastq_path2, mkstemp(prefix=base_name2 + '_', dir=temp_dir)[1],
        fastq=is_fastq(fastq_path2), nthreads=nthreads, light_storage=True)
    out_map_path = curr_map1 + '_frag%s.map' % (suffix)
    print('Mapping fragments of remaining reads...')
    _gem_mapping(mapper_index_path, curr_map1, out_map_path,
                 fastq_path2=curr_map2, r_enz=r_enz, gem_binary=gem_binary,
                 gem_version=gem_version, **kwargs)
    # clean
    if clean:
        print('   x removing GEM 3 input %s' % (curr_map1))
        os.system('rm -f %s' % (curr_map1))
        print('   x removing GEM 3 input %s' % (curr_map2))
        os.system('rm -f %s' % (curr_map2))
    # sort sam file
    os.system(samtools + ' sort -n -O SAM -@ %d -T %s -o %s %s' % (
        nthreads, out_map_path, out_map_path, out_map_path))
    genome_lengths = dict((crm, len(genome_seq[crm])) for crm in genome_seq)
    frag_chunk = kwargs.get('frag_chunk', 100000)
    frags = map_re_sites(r_enz, genome_seq, frag_chunk=frag_chunk)
    if samtools and nthreads > 1:
        print('Splitting sam file')
        # headers
        for i in range(nthreads):
            os.system(samtools + ' view -H -O SAM %s > "%s_%d"' % (
                out_map_path, out_map_path, (i + 1)))
        # estimate lines in sam with reads and frags
        chunk_lines = int((count_fastq * 2.3) / nthreads)
        os.system(samtools + ''' view -O SAM %s | awk -v n=%d -v FS="\\t" ' BEGIN { part=0; line=n } { if( line>=n && $1!=last_read ) {part++; line=1; print $0 >> "%s_"part } else { print $0 >> "%s_"part; line++; } last_read = $1; }' ''' % (
            out_map_path, chunk_lines, out_map_path, out_map_path))
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        print('Parsing results...')
        kwargs['nthreads'] = 1
        procs = []
        pool = mu.Pool(nthreads)
        for i in range(nthreads):
            frags_shared = copy.deepcopy(frags)
            procs.append(pool.apply_async(
                parse_gem_3c, args=('%s_%d' % (out_map_path, (i + 1)),
                                    '%s_parsed_%d' % (out_map_path, (i + 1)),
                                    copy.deepcopy(genome_lengths),
                                    frags_shared, False, True),
                kwds=kwargs))
            #results.append('%s_parsed_%d' % (out_map_path,(i+1)))
        pool.close()
        pool.join()
        results = [proc.get() for proc in procs if proc.get()]
        if clean:
            for i in range(nthreads):
                print('   x removing tmp mapped %s_%d' % (out_map_path,
                                                          (i + 1)))
                os.system('rm -f %s_%d' % (out_map_path, (i + 1)))
        # final sort and merge
        nround = 0
        while len(results) > 1:
            nround += 1
            num_procs = min(nthreads, int(len(results) / 2))
            pool = mu.Pool(num_procs)
            procs = [pool.apply_async(merge_sort,
                                      (results.pop(0), results.pop(0),
                                       out_map_path + '_%d' % nround,
                                       i, True))
                     for i in range(num_procs)]
            pool.close()
            pool.join()
            results = [proc.get() for proc in procs if proc.get()]
        map_out = open(out_map, 'w')
        tmp_reads_fh = open(results[0], 'r')
        for crm in genome_seq:
            map_out.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        for read_line in tmp_reads_fh:
            read = read_line.split('\t')
            map_out.write('\t'.join([read[0]] + read[2:8] + read[9:]))
        map_out.close()
        if clean:
            print('   x removing tmp mapped %s' % results[0])
            os.system('rm -f %s' % (results[0]))
    else:
        print('Parsing result...')
        parse_gem_3c(out_map_path, out_map, genome_lengths, frags,
                     verbose=False, tmp_format=False, **kwargs)
        # clean
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
    if get_nread:
        return [(out_map, count_fastq)]
    return out_map
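# A hypothetical call to fast_fragment_mapping (all paths are placeholders;
# the genome dictionary comes from parse_fasta as elsewhere in this module):
genome = parse_fasta('dm3.fa', verbose=False)
out = fast_fragment_mapping('dm3_index.gem',
                            'sample_r1.fastq.gz', 'sample_r2.fastq.gz',
                            'HindIII', genome, '/results/sample_frag.tsv',
                            clean=True, nthreads=8, temp_dir='/scratch/tmp')
# With get_nread=True it would instead return [(out_map, count_fastq)].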