def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None, add_site=True,
                    min_seq_len=15, fastq=True, verbose=True):
    """
    Given a FASTQ file it can split it into chunks of a given number of reads,
    trim each read according to start/end positions or split them into
    restriction enzyme fragments

    :param True add_site: when splitting the sequence at ligation sites found,
       remove the ligation site and put back the original RE site.
    """
    ## define local functions to process reads and sequences
    def _get_fastq_read(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()    # skip the '+' separator line, not needed
        qal = fhandler.next()  # quality line, kept
        # header now also contains the original read and qualities
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_map_read(line):
        header = line.split('\t', 1)[0]
        seq, qal = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _split_read_re(seq, qal, pattern, max_seq_len=None, site='', cnt=0):
        """
        Recursive generator that splits reads according to the predefined
        restriction enzyme.
        RE fragments yielded are followed by the RE site if a ligation site was
        found after the fragment.
        The RE site before the fragment is added outside this function
        """
        try:
            cnt += 1
            pos = seq.index(pattern)
            if pos < min_seq_len:
                # leading fragment too short: skip it and keep splitting the
                # remainder of the read
                for subseq, subqal, cnt in split_read(seq[pos + len_relg:],
                                                      qal[pos + len_relg:],
                                                      pattern, max_seq_len,
                                                      cnt=cnt):
                    yield subseq, subqal, cnt
            else:
                yield seq[:pos] + site, qal[:pos] + ('H' * len(site)), cnt
                for subseq, subqal, cnt in split_read(seq[pos + len_relg:],
                                                      qal[pos + len_relg:],
                                                      pattern, max_seq_len,
                                                      cnt=cnt):
                    yield subseq, subqal, cnt
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal, cnt

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        sub_enz_pattern = enz_pattern[:len(enz_pattern) / 2]
        len_relg = len(enz_pattern)
        print ' - splitting into restriction enzyme (RE) fragments using ligation sites'
        print ' - ligation sites are replaced by RE sites to match the reference genome'
        print ' * enzyme: %s, ligation site: %s, RE site: %s' % (r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        enzyme = ''
        enz_pattern = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y, 1)

    # function to yield reads from input file
    get_seq = _get_fastq_read if fastq else _get_map_read

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print ' - conversion to MAP format'
        if trim:
            print ' - trimming reads %d-%d' % tuple(trim)
    # open input file
    fhandler = magic_open(fastq_path)
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    site = enzyme if add_site else ''  # put back the RE site when requested
    for header in fhandler:
        header, seq, qal = get_seq(header)
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_pattern, len(seq), site)
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal, cnt = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # no ligation site found, in which case we try with half the
            # ligation site in case there was a sequencing error (half a
            # ligation site is a RE site or nearly, and thus should not be
            # found anyway)
            iter_frags = split_read(seq, qal, sub_enz_pattern, len(seq), '')
            try:
                seq, qal, cnt = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                        seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        for seq, qal, cnt in iter_frags:
            out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                            seq + site, qal + 'H' * len(site),
                                            '0', '-\n'))))
    out.close()
    return out_name

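# Usage sketch (hedged illustration, not part of the original module): a
# typical call to the transform_fastq() variant above, converting a FASTQ file
# to the tab-separated MAP-like format while splitting reads at ligation sites.
# The enzyme name 'HindIII' and both file paths are assumptions made for this
# example; any enzyme present in RESTRICTION_ENZYMES would do.
def _example_transform_fastq():
    # trim=(1, 75) keeps positions 1 to 75 (1-based, inclusive) of each read
    out_name = transform_fastq('reads_1.fastq.gz', 'reads_1_trimmed.map',
                               trim=(1, 75), r_enz='HindIII',
                               add_site=True, min_seq_len=15)
    return out_name
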
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necessary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: whether the input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    elif fnam.endswith('.dsrc'):
        proc = Popen(['dsrc', 'd', '-t8', '-s', fnam], stdout=PIPE)
        fhandler = proc.stdout
    else:
        fhandler = open(fnam)
    if not r_enz:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site * 2 == l_site:
            # in case the religated site equals 2 restriction sites (like DpnII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = zip(*quals)
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
    ax.patch.set_facecolor('lightgrey')
    ax.patch.set_alpha(0.4)
    ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
    ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
    ax.set_axisbelow(True)
    # remove tick marks
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False)
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False, which='minor')
    ax.errorbar(range(len(line.strip())), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')
    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in xrange(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        seq_len = len(line) - max((len(r_site), len(l_site), len(d_site)))
        sites = [sites.count(k) for k in xrange(seq_len)]  # Undigested
        liges = [liges.count(k) for k in xrange(seq_len)]  # OK
        fixes = [fixes.count(k) for k in xrange(seq_len)]  # DE
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k - pos] for k in xrange(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k - pos] for k in xrange(pos, seq_len)])
        site_len = max((len(r_site), len(l_site), len(d_site)))
        if paired:
            sites[len(line) / 2 - site_len: len(line) / 2] = [float('nan')] * site_len
            liges[len(line) / 2 - site_len: len(line) / 2] = [float('nan')] * site_len
            fixes[len(line) / 2 - site_len: len(line) / 2] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        if any([f > 0 for f in fixes]):
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends (%s)' % r_site)
        ax2.set_xlim((0, len(line)))

        lig_cnt = (np.nansum(liges) - liges[0] - liges[len(line) / 2])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[len(line) / 2])
        des = ((100. * (fixes[0] + (fixes[(len(line) / 2)] if paired else 0)))
               / nreads) if any([f > 0 for f in fixes]) else (
            100. * (sites[0] + (sites[(len(line) / 2)] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') % (
                      (100. * lig_cnt) / (lig_cnt + sit_cnt),
                      des,
                      (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads

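# Usage sketch (hedged illustration, not part of the original module): calling
# quality_plot() above on a gzipped FASTQ with an enzyme name to obtain the
# dangling-end and ligation-site estimates.  The file path, enzyme name and
# output image name are assumptions made for this example.
def _example_quality_plot():
    dangling_ends, ligated = quality_plot('reads_1.fastq.gz', r_enz='MboI',
                                          nreads=100000, paired=False,
                                          savefig='reads_1_quality.png')
    print 'dangling-ends: %.1f%%  reads with ligation site: %.1f%%' % (
        dangling_ends, ligated)
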
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None, add_site=True,
                    min_seq_len=15, fastq=True, verbose=True,
                    light_storage=False, **kwargs):
    """
    Given a FASTQ file it can split it into chunks of a given number of reads,
    trim each read according to start/end positions or split them into
    restriction enzyme fragments

    :param True add_site: when splitting the sequence at ligation sites found,
       remove the ligation site and put back the original RE site.
    """
    skip = kwargs.get('skip', False)
    ## define local functions to process reads and sequences
    def _get_fastq_read_heavy(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()    # skip the '+' separator line, not needed
        qal = fhandler.next()  # quality line, kept
        # header now also contains the original read and qualities
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_fastq_read_light(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()    # skip the '+' separator line, not needed
        qal = fhandler.next()  # quality line, kept
        return rlines, seq.strip(), qal.strip()

    def _get_map_read_heavy(line):
        header = line.split('\t', 1)[0]
        seq, qal = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _get_map_read_light(line):
        header, seq, qal, _ = line.split('\t', 3)
        return header, seq, qal

    def _split_read_re(seq, qal, pattern, max_seq_len=None, site='', cnt=0):
        """
        Recursive generator that splits reads according to the predefined
        restriction enzyme.
        RE fragments yielded are followed and/or preceded by the RE site if a
        ligation site was found after and/or before the fragment.

        EXAMPLE:
           seq = '-------oGATCo========oGATCGATCo_____________oGATCGATCo~~~~~~~~~~~~'
           qal = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

        should yield these fragments:

           -------oGATCo========oGATC
           xxxxxxxxxxxxxxxxxxxxxxHHHH

           GATCo_____________oGATC
           HHHHxxxxxxxxxxxxxxxHHHH

           GATCo~~~~~~~~~~~~
           HHHHxxxxxxxxxxxxx
        """
        cnt += 1
        try:
            pos = seq.index(pattern)
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal, cnt
            return
        xqal = ('H' * len(site))
        if pos < min_seq_len:
            # leading fragment too short: skip it and keep splitting the
            # remainder of the read
            for sseq, sqal, cnt in split_read(site + seq[pos + len_relg:],
                                              xqal + qal[pos + len_relg:],
                                              pattern, max_seq_len, cnt=cnt):
                yield sseq, sqal, cnt
        else:
            yield seq[:pos] + site, qal[:pos] + xqal, cnt
            new_pos = pos + len_relg
            for sseq, sqal, cnt in split_read(site + seq[new_pos:],
                                              xqal + qal[new_pos:],
                                              pattern, max_seq_len,
                                              site=site, cnt=cnt):
                yield sseq, sqal, cnt

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        sub_enz_pattern = enz_pattern[:len(enz_pattern) / 2]
        len_relg = len(enz_pattern)
        print ' - splitting into restriction enzyme (RE) fragments using ligation sites'
        print ' - ligation sites are replaced by RE sites to match the reference genome'
        print ' * enzyme: %s, ligation site: %s, RE site: %s' % (r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        enzyme = ''
        enz_pattern = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y, 1)

    # function to yield reads from input file
    if light_storage:
        get_seq = _get_fastq_read_light if fastq else _get_map_read_light
        insert_mark = insert_mark_light
    else:
        get_seq = _get_fastq_read_heavy if fastq else _get_map_read_heavy
        insert_mark = insert_mark_heavy

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print ' - conversion to MAP format'
        if trim:
            print ' - trimming reads %d-%d' % tuple(trim)
    counter = 0
    if skip:
        if fastq:
            print ' ... skipping, only counting lines'
        counter = sum(1 for _ in magic_open(fastq_path,
                                            cpus=kwargs.get('nthreads')))
        counter /= 4 if fastq else 1
        print ' ' + fastq_path, counter, fastq
        return out_fastq, counter
    # open input file
    fhandler = magic_open(fastq_path, cpus=kwargs.get('nthreads'))
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    site = enzyme if add_site else ''
    for header in fhandler:
        header, seq, qal = get_seq(header)
        counter += 1
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_pattern, len(seq), site)
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal, cnt = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # no ligation site found, in which case we try with half the
            # ligation site in case there was a sequencing error (half a
            # ligation site is a RE site or nearly, and thus should not be
            # found anyway)
            iter_frags = split_read(seq, qal, sub_enz_pattern, len(seq), '')
            try:
                seq, qal, cnt = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                        seq, qal, '0', '-\n'))))
        # the next fragments are already preceded by the RE site
        for seq, qal, cnt in iter_frags:
            out.write(_map2fastq('\t'.join((insert_mark(header, cnt),
                                            seq, qal, '0', '-\n'))))
    out.close()
    return out_name, counter

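# Usage sketch (hedged illustration, not part of the original module): the
# `skip` and `light_storage` options of the transform_fastq() variant above.
# With skip=True the input is only iterated to count reads; with
# light_storage=True the original sequence and qualities are not copied into
# the header.  Paths, enzyme name and the nthreads value are assumptions made
# for this example.
def _example_transform_fastq_light():
    # only count reads (4 FASTQ lines per read); nothing is written
    _, nreads = transform_fastq('reads_1.fastq.gz', 'reads_1.map',
                                r_enz='MboI', skip=True, nthreads=4)
    # actual conversion, keeping headers small
    out_name, counter = transform_fastq('reads_1.fastq.gz', 'reads_1.map',
                                        r_enz='MboI', light_storage=True,
                                        nthreads=4)
    return out_name, counter
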
def quality_plot(fnam, r_enz=None, nreads=None, axe=None, savefig=None,
                 paired=False):
    """
    Plots the sequencing quality of a given FASTQ file. If a restriction
    enzyme (RE) name is provided, can also represent the distribution of
    digested and undigested RE sites and estimate an expected proportion of
    dangling-ends.

    Proportion of dangling-ends is inferred by counting the number of times a
    dangling-end site is found at the beginning of any of the reads (divided
    by the number of reads).

    :param fnam: path to FASTQ file
    :param None nreads: max number of reads to read, not necessary to read all
    :param None savefig: path to a file where to save the image generated;
       if None, the image will be shown using matplotlib GUI (the extension
       of the file name will determine the desired format).
    :param False paired: whether the input FASTQ contains both ends

    :returns: the percentage of dangling-ends (sensu stricto) and the
       percentage of reads with at least a ligation site.
    """
    phred = dict([(c, i) for i, c in enumerate(
        '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')])
    quals = []
    henes = []
    sites = []
    fixes = []
    liges = []
    ligep = 0
    tkw = dict(size=4, width=1.5)
    if fnam.endswith('.gz'):
        fhandler = gopen(fnam)
    else:
        fhandler = open(fnam)
    if not r_enz:
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    else:
        r_site = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        l_site = religated(r_enz)
        d_site = repaired(r_enz)
        if r_site * 2 == l_site:
            # in case the religated site equals 2 restriction sites (like DpnII)
            site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
            fixe = re.compile('(?<!%s)' % d_site + d_site + '(?!%s)' % d_site)
        else:
            site = re.compile(r_site)
            fixe = re.compile(d_site)
        lige = re.compile(l_site)
        if nreads:
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
                if len(quals) > nreads:
                    break
        else:  # do this because it's faster
            while True:
                try:
                    next(fhandler)
                except StopIteration:
                    break
                seq = next(fhandler)
                sites.extend([m.start() for m in site.finditer(seq)])
                fixes.extend([m.start() for m in fixe.finditer(seq)])
                liges.extend([m.start() for m in lige.finditer(seq)])
                ligep += l_site in seq
                if 'N' in seq:
                    henes.extend([i for i, s in enumerate(seq) if s == 'N'])
                next(fhandler)
                line = next(fhandler)
                quals.append([phred[i] for i in line.strip()])
    fhandler.close()
    if not nreads:
        nreads = len(quals)
    quals = zip(*quals)
    meanquals = [np.mean(q) for q in quals]
    errorquals = [np.std(q) for q in quals]

    if axe:
        ax = axe
        fig = axe.get_figure()
        ax2 = fig.add_subplot(212)
    else:
        if r_enz:
            _, (ax, ax2) = plt.subplots(2, 1, figsize=(15, 12))
        else:
            _, ax = plt.subplots(1, 1, figsize=(15, 6))
    ax.patch.set_facecolor('lightgrey')
    ax.patch.set_alpha(0.4)
    ax.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
    ax.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
    ax.set_axisbelow(True)
    # remove tick marks
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False)
    ax.tick_params(axis='both', direction='out', top=False, right=False,
                   left=False, bottom=False, which='minor')
    ax.errorbar(range(len(line.strip())), meanquals,
                linewidth=1, elinewidth=1, color='darkblue',
                yerr=errorquals, ecolor='orange')
    ax.set_xlim((0, len(line)))
    ax.set_xlabel('Nucleotidic position')
    ax.set_ylabel('PHRED score')
    ax.set_title('Sequencing Quality (%d reads)' % (nreads))
    ax.yaxis.label.set_color('darkblue')
    ax.tick_params(axis='y', colors='darkblue', **tkw)
    axb = ax.twinx()
    axb.plot([henes.count(i) for i in xrange(len(line))], linewidth=1,
             color='black', linestyle='--')
    axb.yaxis.label.set_color('black')
    axb.tick_params(axis='y', colors='black', **tkw)
    axb.set_ylabel('Number of "N" per position')
    try:  # no Ns found (yes... it happens)
        axb.set_yscale('log')
        axb.set_ylim((0, axb.get_ylim()[1] * 1000))
    except ValueError:
        axb.set_yscale('linear')
    ax.set_ylim((0, ax.get_ylim()[1]))
    ax.set_xlim((0, len(line)))

    if r_enz:
        ax.set_title('Sequencing Quality and deconvolution (%s %d reads)' % (
            r_enz, nreads))
        ax.set_xlabel('')
        plt.setp(ax.get_xticklabels(), visible=False)
        ax2.patch.set_facecolor('lightgrey')
        ax2.patch.set_alpha(0.4)
        ax2.grid(ls='-', color='w', lw=1.5, alpha=0.6, which='major')
        ax2.grid(ls='-', color='w', lw=1, alpha=0.3, which='minor')
        ax2.set_axisbelow(True)
        ax2.set_xlabel('Nucleotidic position')

        seq_len = len(line) - max((len(r_site), len(l_site), len(d_site)))
        sites = [sites.count(k) for k in xrange(seq_len)]  # Undigested
        liges = [liges.count(k) for k in xrange(seq_len)]  # OK
        fixes = [fixes.count(k) for k in xrange(seq_len)]  # DE
        if d_site in r_site:
            pos = r_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - sites[k - pos] for k in xrange(pos, seq_len)])
        if d_site in l_site:
            pos = l_site.find(d_site)
            fixes = (fixes[:pos] +
                     [fixes[k] - liges[k - pos] for k in xrange(pos, seq_len)])
        site_len = max((len(r_site), len(l_site), len(d_site)))
        if paired:
            sites[len(line) / 2 - site_len: len(line) / 2] = [float('nan')] * site_len
            liges[len(line) / 2 - site_len: len(line) / 2] = [float('nan')] * site_len
            fixes[len(line) / 2 - site_len: len(line) / 2] = [float('nan')] * site_len
        ax2.plot(sites, linewidth=2, color='darkred')
        ax2.set_ylabel('Undigested RE site (%s)' % r_site)
        ax2.yaxis.label.set_color('darkred')
        ax2.tick_params(axis='y', colors='darkred', **tkw)
        ax3 = ax2.twinx()
        ax3.plot(liges, linewidth=2, color='darkblue')
        ax3.yaxis.label.set_color('darkblue')
        ax3.tick_params(axis='y', colors='darkblue', **tkw)
        ax3.set_ylabel('Religated (%s)' % l_site)
        if any([f > 0 for f in fixes]):
            ax4 = ax2.twinx()
            ax4.spines["right"].set_position(("axes", 1.07))
            make_patch_spines_invisible(ax4)
            ax4.spines["right"].set_visible(True)
            ax4.plot(fixes, linewidth=2, color='darkorange')
            ax4.yaxis.label.set_color('darkorange')
            ax4.tick_params(axis='y', colors='darkorange', **tkw)
            ax4.set_ylabel('Dangling-ends (%s)' % d_site)
        else:
            ax2.set_ylabel('RE site & Dangling-ends (%s)' % r_site)
        ax2.set_xlim((0, len(line)))

        lig_cnt = (np.nansum(liges) - liges[0] - liges[len(line) / 2])
        sit_cnt = (np.nansum(sites) - sites[0] - sites[len(line) / 2])
        des = ((100. * (fixes[0] + (fixes[(len(line) / 2)] if paired else 0)))
               / nreads) if any([f > 0 for f in fixes]) else (
            100. * (sites[0] + (sites[(len(line) / 2)] if paired else 0))) / nreads
        plt.title(('Percentage of digested sites: %.0f%%, of dangling-ends: %.0f%%\n' +
                   'Percentage of reads with ligation site: %.0f%%') % (
                      (100. * lig_cnt) / (lig_cnt + sit_cnt),
                      des,
                      (ligep * 100.) / nreads))
        plt.subplots_adjust(right=0.85)
    if savefig:
        tadbit_savefig(savefig)
        plt.close('all')
    elif not axe:
        plt.show()
    return des, (ligep * 100.) / nreads

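# Sketch (hedged illustration, not part of the original module) of the
# look-around trick used above for enzymes such as DpnII, whose religated site
# is exactly two RE sites: '(?<!GATC)GATC(?!GATC)' matches GATC occurrences
# that are NOT part of a GATCGATC ligation site, so undigested sites and
# ligation sites are not double-counted.
def _example_dpnii_regex():
    import re
    r_site = 'GATC'              # DpnII restriction site
    l_site = r_site * 2          # religated site: GATCGATC
    site = re.compile('(?<!%s)' % r_site + r_site + '(?!%s)' % r_site)
    seq = 'AAGATCTTGATCGATCAA'   # one isolated site + one ligation site
    return ([m.start() for m in site.finditer(seq)],                  # -> [2]
            [m.start() for m in re.compile(l_site).finditer(seq)])    # -> [8]
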
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None,
                    min_seq_len=20, fastq=True, verbose=True):
    """
    Given a FASTQ file it can split it into chunks of a given number of reads,
    trim each read according to start/end positions or split them into
    restriction enzyme fragments
    """
    ## define local functions to process reads and sequences
    def _get_fastq_read(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        """
        rlines = rlines.rstrip('\n')
        line = fhandler.next()
        _ = fhandler.next()  # skip the '+' separator line, not needed
        _ = fhandler.next()  # skip the quality line, not needed
        return rlines, line.strip()

    def _split_read_re(x, max_seq_len=None):
        """
        Recursive generator that splits reads according to the predefined
        restriction enzyme.
        RE fragments yielded are followed by the RE site if a ligation site was
        found after the fragment.
        The RE site before the fragment is added outside this function
        """
        try:
            pos = x.index(enz_pattern)
            if pos < min_seq_len:
                # leading fragment too short: skip it and keep splitting the
                # remainder of the read
                for subx in split_read(x[pos + len_relg:], max_seq_len):
                    yield subx
            else:
                yield x[:pos] + enzyme
                for x in split_read(x[pos + len_relg:], max_seq_len):
                    yield x
        except ValueError:
            if len(x) > min_seq_len:
                if len(x) == max_seq_len:
                    raise StopIteration
                yield x

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        len_relg = len(enz_pattern)
        print ' - splitting into restriction enzyme (RE) fragments using ligation sites'
        print ' - ligation sites are replaced by RE sites to match the reference genome'
        print ' * enzyme: %s, ligation site: %s, RE site: %s' % (r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        split_read = lambda x, y: (yield x)

    # function to yield reads from input file
    get_seq = _get_fastq_read if fastq else lambda x: x.split('\t', 2)[:2]

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print ' - conversion to MAP format'
        if trim:
            print ' - trimming reads %d-%d' % tuple(trim)
    # open input file
    fhandler = magic_open(fastq_path)
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    for header in fhandler:
        header, line = get_seq(header)
        # trim on wanted region of the read
        line = strip_line(line)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(line, len(line))
        # the first fragment should not be preceded by the RE site
        try:
            frag = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        out.write('\t'.join((header, frag, 'H' * len(frag), '0', '-\n')))
        # the next fragments should be preceded by the RE site
        for frag in iter_frags:
            out.write('\t'.join((header, frag + enzyme,
                                 'H' * (len(frag) + len(enzyme)), '0', '-\n')))
    out.close()
    return out_name

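# Stand-alone sketch (hedged illustration, not part of the original module) of
# the idea behind _split_read_re() above: recursively cut a read at every
# ligation site and replace the ligation site by the RE site on the fragment
# ends, so that fragments match the reference genome.  The sites and the
# example read are assumptions made for the illustration; the real function
# also handles qualities and minimum fragment lengths.
def _example_split_at_ligation_sites(seq, lig_site='GATCGATC', re_site='GATC'):
    """Yield RE fragments of `seq`, cut at each occurrence of `lig_site`."""
    pos = seq.find(lig_site)
    if pos == -1:
        yield seq
        return
    yield seq[:pos] + re_site  # fragment ends with the RE site
    # the remainder starts with the RE site left by the ligation event
    for frag in _example_split_at_ligation_sites(
            re_site + seq[pos + len(lig_site):], lig_site, re_site):
        yield frag
# e.g. list(_example_split_at_ligation_sites('AAAGATCGATCTTT'))
#      -> ['AAAGATC', 'GATCTTT']
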
def transform_fastq(fastq_path, out_fastq, trim=None, r_enz=None, add_site=True,
                    min_seq_len=15, fastq=True, verbose=True):
    """
    Given a FASTQ file it can split it into chunks of a given number of reads,
    trim each read according to start/end positions or split them into
    restriction enzyme fragments

    :param True add_site: when splitting the sequence at ligation sites found,
       remove the ligation site and put back the original RE site.
    """
    ## define local functions to process reads and sequences
    def _get_fastq_read(rlines):
        """
        returns header and sequence of 1 FASTQ entry
        Note: header also contains the sequence
        """
        rlines = rlines.rstrip('\n').split()[0][1:]
        seq = fhandler.next()
        _ = fhandler.next()    # skip the '+' separator line, not needed
        qal = fhandler.next()  # quality line, kept
        # header now also contains the original read and qualities
        return (rlines + ' ' + seq.strip() + ' ' + qal.strip(),
                seq.strip(), qal.strip())

    def _get_map_read(line):
        header = line.split('\t', 1)[0]
        seq, qal = header.rsplit(' ', 2)[-2:]
        return header, seq, qal

    def _split_read_re(seq, qal, pattern, max_seq_len=None, site=''):
        """
        Recursive generator that splits reads according to the predefined
        restriction enzyme.
        RE fragments yielded are followed by the RE site if a ligation site was
        found after the fragment.
        The RE site before the fragment is added outside this function
        """
        try:
            pos = seq.index(pattern)
            if pos < min_seq_len:
                # leading fragment too short: skip it and keep splitting the
                # remainder of the read
                for subseq, subqal in split_read(seq[pos + len_relg:],
                                                 qal[pos + len_relg:],
                                                 pattern, max_seq_len):
                    yield subseq, subqal
            else:
                yield seq[:pos] + site, qal[:pos] + ('H' * len(site))
                for subseq, subqal in split_read(seq[pos + len_relg:],
                                                 qal[pos + len_relg:],
                                                 pattern, max_seq_len):
                    yield subseq, subqal
        except ValueError:
            if len(seq) == max_seq_len:
                raise ValueError
            if len(seq) > min_seq_len:
                yield seq, qal

    # Define function for stripping lines according to focus
    if isinstance(trim, tuple):
        beg, end = trim
        beg -= 1
        strip_line = lambda x: x[beg:end]
    else:
        strip_line = lambda x: x

    # define function to split reads according to restriction enzyme sites
    if isinstance(r_enz, str):
        enzyme = RESTRICTION_ENZYMES[r_enz].replace('|', '')
        enz_pattern = religated(r_enz)
        sub_enz_pattern = enz_pattern[:len(enz_pattern) / 2]
        len_relg = len(enz_pattern)
        print ' - splitting into restriction enzyme (RE) fragments using ligation sites'
        print ' - ligation sites are replaced by RE sites to match the reference genome'
        print ' * enzyme: %s, ligation site: %s, RE site: %s' % (
            r_enz, enz_pattern, enzyme)
        split_read = _split_read_re
    else:
        enzyme = ''
        enz_pattern = ''
        split_read = lambda x, y, z, after_z, after_after_z: (yield x, y)

    # function to yield reads from input file
    get_seq = _get_fastq_read if fastq else _get_map_read

    ## Start processing the input file
    if verbose:
        print 'Preparing %s file' % ('FASTQ' if fastq else 'MAP')
        if fastq:
            print ' - conversion to MAP format'
        if trim:
            print ' - trimming reads %d-%d' % tuple(trim)
    # open input file
    fhandler = magic_open(fastq_path)
    # create output file
    out_name = out_fastq
    out = open(out_fastq, 'w')
    # iterate over reads and strip them
    site = enzyme if add_site else ''  # put back the RE site when requested
    for header in fhandler:
        header, seq, qal = get_seq(header)
        # trim on wanted region of the read
        seq = strip_line(seq)
        qal = strip_line(qal)
        # get the generator of restriction enzyme fragments
        iter_frags = split_read(seq, qal, enz_pattern, len(seq), site)
        # the first fragment should not be preceded by the RE site
        try:
            seq, qal = iter_frags.next()
        except StopIteration:
            # read full of ligation events, fragments not reaching minimum
            continue
        except ValueError:
            # no ligation site found, in which case we try with half the
            # ligation site in case there was a sequencing error (half a
            # ligation site is a RE site or nearly, and thus should not be
            # found anyway)
            iter_frags = split_read(seq, qal, sub_enz_pattern, len(seq), site)
            try:
                seq, qal = iter_frags.next()
            except ValueError:
                continue
            except StopIteration:
                continue
        out.write(_map2fastq('\t'.join((header, seq, qal, '0', '-\n'))))
        # the next fragments should be preceded by the RE site
        for seq, qal in iter_frags:
            out.write(_map2fastq('\t'.join((header, seq + site,
                                            qal + 'H' * len(site),
                                            '0', '-\n'))))
    out.close()
    return out_name

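# Note (hedged illustration, not part of the original module): the trim tuple
# is read as 1-based, inclusive coordinates, hence the `beg -= 1` above before
# building the Python slice.  For instance trim=(1, 50) becomes seq[0:50],
# i.e. the first 50 nucleotides of the read.
def _example_trim_slice(seq, trim=(1, 50)):
    beg, end = trim
    beg -= 1  # convert the 1-based start to a 0-based slice index
    return seq[beg:end]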