def make_reference(data_folder, adaID, fragments, refname, VERBOSE=0, summary=True): '''Make reference sequence trimmed to the necessary parts''' from hivwholeseq.reference import load_custom_reference seq = load_custom_reference(refname) output_filename = get_reference_premap_filename(data_folder, adaID) if fragments is None: seq_trim = seq else: # Look for the first fwd and the last rev primers to trim the reference # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)! # If more than one primer is used for the first or last fragment, take the # longest reference from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2 if '+' in fragments[0]: fragment_subs = [ fragments[0][:2] + fsub + fragments[0][-1] for fsub in fragments[0][2:-1].split('+') ] fr_pos_subs = [ primers_coordinates_HXB2[fsub][0][0] for fsub in fragment_subs ] fragments[0] = fragment_subs[np.argmin(fr_pos_subs)] pr_fwd = primers_PCR[fragments[0]][0] if '+' in fragments[-1]: fragment_subs = [ fragments[-1][:2] + fsub + fragments[-1][-1] for fsub in fragments[-1][2:-1].split('+') ] fr_pos_subs = [ primers_coordinates_HXB2[fsub][1][1] for fsub in fragment_subs ] fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)] pr_rev = primers_PCR[fragments[-1]][1] smat = np.array(seq) # Get all possible primers from ambiguous nucleotides and get the best match from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1') n_matches_fwd = [ (smat[i:i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max() for i in xrange(len(seq) - len(pr_fwd)) ] pr_fwd_pos = np.argmax(n_matches_fwd) pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1') n_matches_rev = [ (smat[i:i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max() for i in xrange(pr_fwd_pos + len(pr_fwd), len(seq) - len(pr_rev)) ] # Here you come from the right, i.e. look in the 3' LTR first pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax( n_matches_rev[::-1]) output = [['Reference name:', refname]] output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd]) output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev]) output = '\n'.join(map(' '.join, output)) if VERBOSE: print output if summary: with open(get_premap_summary_filename(data_folder, adaID), 'a') as f: f.write(output) f.write('\n') # The reference includes both the first fwd primer and the last rev one seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)] seq_trim.id = '_'.join( [seq_trim.id, str(pr_fwd_pos + 1), str(pr_rev_pos + len(pr_rev))]) seq_trim.name = '_'.join([ seq_trim.name, str(pr_fwd_pos + 1), str(pr_rev_pos + len(pr_rev)) ]) seq_trim.description = ' '.join([ seq_trim.description, 'from', str(pr_fwd_pos + 1), 'to', str(pr_rev_pos + len(pr_rev)), '(indices from 1, extremes included)' ]) SeqIO.write(seq_trim, output_filename, 'fasta') if summary: with open(get_premap_summary_filename(data_folder, adaID), 'a') as f: f.write('Reference sequence written to: ' + output_filename) f.write('\n')
def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments, maxreads=-1, VERBOSE=0, minisize=100, include_tests=False, summary=True): '''Trim reads and divide them into fragments''' if VERBOSE: print 'Trim and divide into fragments: adaID '+adaID+', fragments: '+\ ' '.join(fragments) if summary: with open(get_divide_summary_filename(data_folder, adaID), 'a') as f: f.write('Fragments used: '+' '.join(fragments)+'\n') ref_filename = get_reference_premap_filename(data_folder, adaID) refseq = SeqIO.read(ref_filename, 'fasta') smat = np.array(refseq, 'S1') len_reference = len(refseq) # Get the positions of fragment start/end, w/ and w/o primers frags_pos = get_fragment_positions(smat, fragments) store_reference_fragmented(data_folder, adaID, refseq, dict(zip(fragments, frags_pos['trim']))) if summary: with open(get_divide_summary_filename(data_folder, adaID), 'a') as f: f.write('Primer positions (for fragments):\n') for (fragment, poss_full, poss_trim) in izip(fragments, frags_pos['full'], frags_pos['trim']): f.write(fragment+': fwd '+str(poss_full[0])+' '+str(poss_trim[0])+\ ', rev '+str(poss_trim[1])+' '+str(poss_full[1])+'\n') write_fragment_positions(data_folder, adaID, fragments, frags_pos) # Get the positions of the unwanted outer primers (in case we DO nested PCR # for that fragment) # NOTE: the LTRs make no problem, because the rev outer primer of F6 # is not in the reference anymore if F6 has undergone nested PCR # FIXME: this might not work if we have mixed fragments (e.g. F5a+b) AND nesting from re import findall primers_out = {'fwd': [], 'rev': []} for i, fr in enumerate(fragments): if (i != 0) and findall(r'F[2-6][a-z]?i', fr): primers_out['fwd'].append(fr[:-1]+'o') if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr): primers_out['rev'].append(fr[:-1]+'o') # Get all possible unambiguous primers for the unwanted outer primers from hivwholeseq.data.primers import primers_PCR from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])), 'S1', ndmin=2) for fr in primers_out['fwd']], 'rev': [np.array(map(list, eas(primers_PCR[fr][1])), 'S1', ndmin=2) for fr in primers_out['rev']], } primers_out_pos = {'fwd': [], 'rev': []} if primers_out['fwd']: primers_out_pos['fwd'] = map(itemgetter(0), get_primer_positions(smat, primers_out['fwd'], 'fwd')) if primers_out['rev']: primers_out_pos['rev'] = map(itemgetter(1), get_primer_positions(smat, primers_out['rev'], 'rev')) # Input and output files input_filename = get_premapped_filename(data_folder, adaID, type='bam') if not os.path.isfile(input_filename): convert_sam_to_bam(input_filename) output_filenames = get_divided_filenames(data_folder, adaID, fragments, type='bam') with pysam.Samfile(input_filename, 'rb') as bamfile: try: file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile) for ofn in output_filenames[:len(fragments)]] fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile) fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile) fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile) fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile) # Iterate over the mapped reads and assign fragments n_mapped = [0 for fragment in fragments] n_unmapped = 0 n_crossfrag = 0 n_ambiguous = 0 n_outer = 0 n_lowq = 0 for irp, reads in enumerate(pair_generator(bamfile)): if irp == maxreads: if VERBOSE: print 'Maximal number of read pairs reached:', maxreads break if VERBOSE >= 2: if not ((irp+1) % 10000): print irp+1 i_fwd = reads[0].is_reverse # If unmapped or unpaired, mini, or insert size mini, or # divergent read pair (fully cross-overlapping), discard if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \ reads[1].is_unmapped or (not reads[1].is_proper_pair) or \ (reads[0].rlen < 50) or (reads[1].rlen < 50) or \ (reads[i_fwd].isize < minisize): if VERBOSE >= 3: print 'Read pair unmapped/unpaired/tiny/divergent:', reads[0].qname n_unmapped += 1 fo_um.write(reads[0]) fo_um.write(reads[1]) continue # If the insert is a misamplification from the outer primers # in fragments that underwent nested PCR, # trash it (it will have skewed amplification anyway). We cannot # find all of those, rather only the ones still carrying the # primer itself (some others have lost it while shearing). For # those, no matter what happens at the end (reading into adapters, # etc.), ONE of the reads in the pair will start exactly with one # outer primer: if the rev read with a rev primer, if the fwd # with a fwd one. Test all six. if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \ test_outer_primer(reads, primers_out_pos, primers_out_seq, len_reference): if VERBOSE >= 3: print 'Read pair from outer primer:', reads[0].qname n_outer += 1 fo_um.write(reads[0]) fo_um.write(reads[1]) continue # FIXME: the following becomes a bit harder when we mix parallel # PCRs, e.g. F5a+b, to get more product # Assign to a fragment now, so that primer trimming is faster pair_identity = assign_to_fragment(reads, frags_pos['full'], VERBOSE=VERBOSE) # 1. If no fragments are possible (e.g. one read crosses the # fragment boundary, they map to different fragments), dump it # into a special bucket if pair_identity == 'cross': n_crossfrag += 1 fo_cm.write(reads[0]) fo_cm.write(reads[1]) continue # 2. If 2+ fragments are possible (tie), put into a special bucket # (essentially excluded, because we want two independent measurements # in the overlapping region, but we might want to recover them) elif pair_identity == 'ambiguous': n_ambiguous += 1 fo_am.write(reads[0]) fo_am.write(reads[1]) continue # 3. If the intersection is a single fragment, good: trim the primers # NB: n_frag is the index IN THE POOL. If we sequence only F2-F5, F2 is n_frag = 0 n_frag = int(pair_identity) frag_pos = frags_pos['trim'][n_frag] if not np.isscalar(frag_pos[0]): frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']] trashed_primers = trim_primers(reads, frag_pos, include_tests=include_tests) if trashed_primers or (reads[i_fwd].isize < 100): n_unmapped += 1 if VERBOSE >= 3: print 'Read pair is mismapped:', reads[0].qname fo_um.write(reads[0]) fo_um.write(reads[1]) continue # Quality trimming: if no decently long pair survives, trash #trashed_quality = main_block_low_quality(reads, phred_min=20, # include_tests=include_tests) trashed_quality = trim_low_quality(reads, phred_min=20, include_tests=include_tests) if trashed_quality or (reads[i_fwd].isize < 100): n_lowq += 1 if VERBOSE >= 3: print 'Read pair has low phred quality:', reads[0].qname fo_lq.write(reads[0]) fo_lq.write(reads[1]) continue # Check for cross-overhangs or COH (reading into the adapters) # ---------------> # <----------- # In that case, trim to perfect overlap. if test_coh(reads, VERBOSE=False): trim_coh(reads, trim=0, include_tests=include_tests) # Change coordinates into the fragmented reference (primer-trimmed) for read in reads: read.pos -= frag_pos[0] read.mpos -= frag_pos[0] # Here the tests if include_tests: lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0] if test_sanity(reads, n_frag, lfr): print 'Tests failed:', reads[0].qname import ipdb; ipdb.set_trace() # There we go! n_mapped[n_frag] += 1 file_handles[n_frag].write(reads[0]) file_handles[n_frag].write(reads[1]) finally: for f in file_handles: f.close() fo_am.close() fo_cm.close() fo_um.close() fo_lq.close() if VERBOSE: print 'Trim and divide results: adaID '+adaID print 'Total:\t\t', irp print 'Mapped:\t\t', sum(n_mapped), n_mapped print 'Unmapped/unpaired/tiny:\t', n_unmapped print 'Outer primer\t', n_outer print 'Crossfrag:\t', n_crossfrag print 'Ambiguous:\t', n_ambiguous print 'Low-quality:\t', n_lowq # Write summary to file if summary: with open(get_divide_summary_filename(data_folder, adaID), 'a') as f: f.write('\n') f.write('Trim and divide results: adaID '+adaID+'\n') f.write('Total:\t\t'+str(irp + 1)+'\n') f.write('Mapped:\t\t'+str(sum(n_mapped))+' '+str(n_mapped)+'\n') f.write('Unmapped/unpaired/tiny insert:\t'+str(n_unmapped)+'\n') f.write('Outer primer\t'+str(n_outer)+'\n') f.write('Crossfrag:\t'+str(n_crossfrag)+'\n') f.write('Ambiguous:\t'+str(n_ambiguous)+'\n') f.write('Low-quality:\t'+str(n_lowq)+'\n')
def get_primer_positions(smat, fragments, type='both'): '''Get the primer positions for fwd, rev, or both primers''' from hivwholeseq.data.primers import primers_PCR from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas # j controls the direction: j = 0 --> FWD, j = 1 --> REV types = {'fwd': [0], 'rev': [1], 'both': [0, 1]} js = types[type] primer_poss = [[], []] for j in js: pr_old_pos = 0 pr_old = '' for ifr, fragment in enumerate(fragments): # Sometimes, the PCR is performed with different primer sets in parallel # to get more DNA, so we must take the innermost to be conservative (we # lose some reads, but the ones we keep are good). # NOTE: F6 is not ambiguous, so we do not treat it specially here (see below) if '+' in fragment: if fragment[-1] == 'o': pcs = pcos elif fragment[-1] == 'i': pcs = pcis else: raise ValueError('Neither PCR1 nor PCR2??') fragment_subs = [fragment[:2]+fsub for fsub in fragment[2:-1].split('+')] pco_inn = [pcs[fsub][j][not j] for fsub in fragment_subs] if not j: fragment_inn = fragment_subs[np.argmax(pco_inn)]+fragment[-1] fragment_out = fragment_subs[np.argmin(pco_inn)]+fragment[-1] else: fragment_inn = fragment_subs[np.argmin(pco_inn)]+fragment[-1] fragment_out = fragment_subs[np.argmax(pco_inn)]+fragment[-1] if VERBOSE >= 3: print j, fragment_subs, pco_inn, fragment_inn, fragment_out pr_pos_pair = {} # Get the left first if not j: fragment = fragment_out label = 'outer' else: fragment = fragment_inn label = 'inner' # Expand ambiguous primers in a list of all possible unambiguous ones, # and look for the best approximate match between all primers and a # sliding window in the HIV genome pr = primers_PCR[fragment][j] pr_mat = np.array(map(list, eas(pr)), 'S1') n_matches = [(smat[i: i + len(pr)] == pr_mat).sum(axis=1).max() for i in xrange(pr_old_pos + len(pr_old), len(smat) - len(pr) + 1)] pr_pos = pr_old_pos + len(pr_old) + np.argmax(n_matches) pr_pos_pair[label] = [pr_pos, pr_pos + len(pr)] # Get the right one second if not j: fragment = fragment_inn label = 'inner' else: fragment = fragment_out label = 'outer' pr = primers_PCR[fragment][j] pr_mat = np.array(map(list, eas(pr)), 'S1') n_matches = [(smat[i: i + len(pr)] == pr_mat).sum(axis=1).max() for i in xrange(pr_pos, len(smat) - len(pr) + 1)] pr_pos = pr_pos + np.argmax(n_matches) pr_pos_pair[label] = [pr_pos, pr_pos + len(pr)] else: # Expand ambiguous primers in a list of all possible unambiguous ones, # and look for the best approximate match between all primers and a # sliding window in the HIV genome pr = primers_PCR[fragment][j] pr_mat = np.array(map(list, eas(pr)), 'S1') n_matches = [(smat[i: i + len(pr)] == pr_mat).sum(axis=1).max() for i in xrange(pr_old_pos + len(pr_old), len(smat) - len(pr) + 1)] # NOTE: F6 rev lies in the LTR, so we risk reading the other LTR. # Treat it as a special case: come from the right! if j and ('F6' in fragment): pr_pos = len(smat) - len(pr) - np.argmax(n_matches[::-1]) else: pr_pos = pr_old_pos + len(pr_old) + np.argmax(n_matches) pr_pos_pair = [pr_pos, pr_pos + len(pr)] primer_poss[j].append(pr_pos_pair) pr_old_pos = pr_pos pr_old = pr if type != 'both': return primer_poss[type == 'rev'] else: return primer_poss