def get_allele_count_trajectories(pname, samplenames, fragment, use_PCR1=1, VERBOSE=0): '''Get allele counts for a single patient sample''' if VERBOSE >= 1: print 'Getting allele counts:', pname, fragment from hivwholeseq.patients.filenames import get_initial_reference_filename, \ get_allele_counts_filename refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta') fns = [] samplenames_out = [] for samplename_pat in samplenames: # PCR1 filter here fn1 = get_allele_counts_filename(pname, samplename_pat, fragment, PCR=1) fn2 = get_allele_counts_filename(pname, samplename_pat, fragment, PCR=2) if use_PCR1 == 0: for PCR, fn in enumerate((fn1, fn2), 1): if os.path.isfile(fn): fns.append(fn) samplenames_out.append((samplename_pat, PCR)) if VERBOSE >= 3: print samplename_pat, PCR elif use_PCR1 == 1: if os.path.isfile(fn1): fns.append(fn1) samplenames_out.append((samplename_pat, 1)) if VERBOSE >= 3: print samplename_pat, 1 elif os.path.isfile(fn2): fns.append(fn2) samplenames_out.append((samplename_pat, 2)) if VERBOSE >= 3: print samplename_pat, 2 elif use_PCR1 == 2: if os.path.isfile(fn1): fns.append(fn1) samplenames_out.append((samplename_pat, 1)) if VERBOSE >= 3: print samplename_pat, 1 act = np.zeros((len(fns), len(alpha), len(refseq)), int) for i, fn in enumerate(fns): # Average directly over read types? act[i] = np.load(fn).sum(axis=0) return (samplenames_out, act)
def get_allele_frequency_trajectories(pname, samples, fragment, qual_min=30, VERBOSE=0): '''Scan the reads of all samples and write to a single file''' if VERBOSE >= 1: print 'Getting allele frequency trajectories:', pname, fragment from hivwholeseq.patients.filenames import get_initial_reference_filename, \ get_mapped_to_initial_filename, get_allele_frequency_trajectories_filename, \ get_allele_count_trajectories_filename from hivwholeseq.utils.one_site_statistics import get_allele_counts_insertions_from_file, \ get_allele_counts_insertions_from_file_unfiltered, \ filter_nus refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta') # Prepare output data structures cos_traj = np.zeros((len(samples), len(alpha), len(refseq)), int) nus_traj = np.zeros((len(samples), len(alpha), len(refseq))) for it, sample in enumerate(samples): if VERBOSE >= 2: print pname, it, sample input_filename = get_mapped_to_initial_filename(pname, sample, fragment, type='bam') (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered( input_filename, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE) # Take the total counts, blending in the read types cou = counts.sum(axis=0) cos_traj[it] = cou # Take the filtered frequencies, blending in the read types nu = filter_nus(counts) nus_traj[it] = nu #FIXME: test, etc. return (cos_traj, nus_traj)
def make_index_and_hash(pname, fragment, VERBOSE=0): '''Make index and hash files for reference''' # 1. Make genome index file stdout = sp.check_output([stampy_bin, '--overwrite', '--species="HIV fragment '+fragment+'"', '-G', get_initial_index_filename(pname, fragment, ext=False), get_initial_reference_filename(pname, fragment), ], stderr=sp.STDOUT) if VERBOSE: print 'Built index: '+pname+' '+fragment # 2. Build a hash file stdout = sp.check_output([stampy_bin, '--overwrite', '-g', get_initial_index_filename(pname, fragment, ext=False), '-H', get_initial_hash_filename(pname, fragment, ext=False), ], stderr=sp.STDOUT) if VERBOSE: print 'Built hash: '+pname+' '+fragment
# Script chunk: collect insertions per fragment across all patient samples.
# NOTE(review): relies on names from the surrounding (unseen) script scope:
# VERBOSE, fragments, samples, submit, PCR, qual_min, save_to_file, gac, etc.
if VERBOSE >= 3:
    print 'fragments', fragments

for fragment in fragments:
    inses = []
    for samplename, sample in samples.iterrows():
        # Optionally delegate this (sample, fragment) pair to a cluster job
        if submit:
            fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min)
            continue

        if VERBOSE >= 1:
            print fragment, samplename

        # Wrap the table row into a patient-sample object
        sample = SamplePat(sample)
        pname = sample.patient
        refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')

        fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR)
        if not os.path.isfile(fn):
            # Best-effort: warn and skip samples without a mapped BAM
            warn('No BAM file found', NoDataWarning)
            continue

        # gac presumably returns (counts, insertions) -- TODO confirm;
        # only the insertions are used here
        _, inse = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
        inses.append(inse)

        if save_to_file:
            fn_out = sample.get_insertions_filename(fragment, PCR=PCR, qual_min=qual_min)
            save_insertions(fn_out, inse)

            # NOTE(review): chunk is truncated here in this view
            if VERBOSE >= 2:
# Script chunk: compute allele co-counts per fragment and sample.
# NOTE(review): truncated at BOTH ends in this view -- the first line below
# is the tail of a call (likely a cluster-submission fork) started earlier,
# and the chunk relies on unseen scope: fragments, samples, PCR, qual_min,
# maxreads, use_tests, save_to_file, gac.
                  maxreads=maxreads, use_tests=use_tests)
    sys.exit()

counts_all = []
for fragment in fragments:
    counts = []
    for samplename, sample in samples.iterrows():
        # Wrap the table row into a patient-sample object
        sample = SamplePat(sample)
        pname = sample.patient

        if VERBOSE >= 2:
            print pname, fragment, samplename

        refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')

        fn_out = sample.get_allele_cocounts_filename(fragment, PCR=PCR,
                                                     qual_min=qual_min,
                                                     compressed=True)
        fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR,
                                                 decontaminated=True) #FIXME

        if save_to_file:
            # gac here presumably returns the co-count matrix -- TODO confirm
            cocount = gac(fn, len(refseq), maxreads=maxreads, VERBOSE=VERBOSE,
                          qual_min=qual_min, use_tests=use_tests)
def get_reference_filename(self, fragment, format='fasta'):
    '''Return the path of the mapping reference for this patient/fragment.'''
    from hivwholeseq.patients.filenames import get_initial_reference_filename
    fn = get_initial_reference_filename(self.name, fragment, format)
    return fn
# Script chunk: build the initial patient consensus for one fragment from a
# sequencing-run consensus. NOTE(review): relies on unseen scope (sample_seq,
# data_folder, adaID, fragment, patient, pname, samplen, VERBOSE).
print sample_seq.adapter

cons_rec = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment), 'fasta')
# Map the generic fragment name to its full specification for this sample
frag_spec = sample_seq.regions_complete[\
        sample_seq.regions_generic.index(fragment)]

# Complement PCR2 initial reference with tails from a later sample
if int(sample_seq.PCR) == 2:
    (frag_spec, cons_rec) = complement_consensus_PCR2(cons_rec, patient,
                                                      fragment, samplen,
                                                      VERBOSE=VERBOSE)

conss = str(cons_rec.seq)
output_filename = get_initial_reference_filename(pname, fragment)

seq_in = SeqRecord(Seq(conss, unambiguous_dna),
                   id='cons_init_p'+pname+'_'+frag_spec,
                   name='cons_init_p'+pname+'_'+frag_spec,
                   description='Initial consensus of patient '+pname+\
                               ', fragment '+frag_spec)

# If absent, just copy the thing over
if not os.path.isfile(output_filename):
    if VERBOSE >= 1:
        print pname+': initial consensus file created for sample', \
                sample_seq.name, 'fragment', fragment
    SeqIO.write(seq_in, output_filename, 'fasta')

# if present, check whether the sequences are the same (if so, no
# NOTE(review): comment and chunk truncated here in this view
# Script chunk: restrict the sample table, then gather per-fragment allele
# counts against each patient's initial references.
# NOTE(review): relies on unseen scope (pnames, samplenames, samples, VERBOSE)
# and is truncated at the end of this view.
if pnames is not None:
    samples = samples.loc[samples.patient.isin(pnames)]
elif samplenames is not None:
    samples = samples.loc[samples.index.isin(samplenames)]

if VERBOSE >= 2:
    print 'samples', samples.index.tolist()

for samplename, sample in samples.iterrows():
    if VERBOSE >= 1:
        print samplename

    # Wrap the table row into a patient-sample object
    sample = SamplePat(sample)
    pname = sample.patient
    conss_genomewide = SeqIO.read(get_initial_reference_filename(pname, 'genomewide'), 'fasta')

    # Collect the allele counts (where possible)
    acs = []
    for fragment in ['F'+str(i) for i in xrange(1, 7)]:
        try:
            ref = ''.join(SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta'))
            ac = sample.get_allele_counts(fragment, merge_read_types=False)
            acs.append((fragment, ref, ac))
        # Missing counts file for this fragment: skip it silently
        except IOError:
            continue

    # No fragment had data for this sample
    if not len(acs):
        # NOTE(review): chunk truncated here in this view
        if VERBOSE >= 1:
def filter_mapped_reads(sample, fragment, PCR=1, maxreads=-1, VERBOSE=0, n_cycles=600, max_mismatches=100, match_len_min=30, trim_bad_cigars=3, summary=True): '''Filter the reads to good chunks''' pname = sample.patient samplename_pat = sample.name samplenames_seq = sample.samples_seq.index.tolist() if VERBOSE >= 1: print 'Filtering reads:', pname, samplename_pat, fragment, PCR reffilename = get_initial_reference_filename(pname, fragment) refseq = SeqIO.read(reffilename, 'fasta') ref = np.array(refseq) outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment, type='bam', PCR=PCR, decontaminated=False) trashfilename = outfilename[:-4] + '_trashed.bam' infilenames = [ get_mapped_to_initial_filename(pname, samplename_pat, samplename, fragment, type='bam', PCR=PCR) for samplename in samplenames_seq ] infilenames = filter(os.path.isfile, infilenames) if not len(infilenames): print('WARNING: No mapped files found: ' + ', '.join([pname, samplename_pat, fragment, str(PCR)])) return # Take reads evenly distributed across sequencing repetitions maxreads /= len(infilenames) if VERBOSE >= 2: print 'Input mapped filenames:', if len(infilenames) >= 2: print '' print '\n'.join(infilenames) # Use first file as template for the new bamfile infilename = infilenames[0] if not os.path.isfile(infilename): convert_sam_to_bam(infilename) with pysam.Samfile(infilename, 'rb') as bamfile: with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\ pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile: n_good = 0 n_wrongname = 0 n_unmapped = 0 n_unpaired = 0 n_mutator = 0 n_badcigar = 0 n_tiny = 0 binsize = 200 hist_distance_from_consensus = np.zeros(n_cycles + 1, int) hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int) # Iterate over input files, the first is already open for infilename in infilenames: if infilename != infilename[0]: file_open = lambda: pysam.Samfile(infilename, 'rb') file_close = lambda f: f.close() if not 
os.path.isfile(infilename): convert_sam_to_bam(infilename) else: file_open = lambda: bamfile file_close = lambda f: None try: bamfile = file_open() for irp, reads in enumerate(pair_generator(bamfile)): if irp == maxreads: break pair_type = filter_read_pair( reads, ref, hist_distance_from_consensus, hist_dist_along, binsize, max_mismatches=max_mismatches, match_len_min=match_len_min, trim_bad_cigars=trim_bad_cigars, VERBOSE=VERBOSE) if pair_type == 'unmapped': n_unmapped += 1 map(trashfile.write, reads) elif pair_type == 'unpaired': n_unpaired += 1 map(trashfile.write, reads) elif pair_type == 'mutator': n_mutator += 1 map(trashfile.write, reads) elif pair_type == 'bad_cigar': n_badcigar += 1 map(trashfile.write, reads) elif pair_type == 'tiny': n_tiny += 1 map(trashfile.write, reads) else: n_good += 1 map(outfile.write, reads) finally: file_close(bamfile) if VERBOSE >= 1: print 'Read pairs: ' print 'Good:', n_good print 'Unmapped:', n_unmapped print 'Unpaired:', n_unpaired print 'Many-mutations:', n_mutator print 'Bad CIGARs:', n_badcigar print 'Tiny:', n_tiny print if summary: sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR) with open(sfn, 'a') as f: f.write('Filter results: pname ' + pname + ', ' + samplename_pat + ', ' + fragment + '\n') f.write('Total:\t\t\t' + str(irp + 1) + '\n') f.write('Good:\t\t\t' + str(n_good) + '\n') f.write('Unmapped:\t\t' + str(n_unmapped) + '\n') f.write('Unpaired:\t\t' + str(n_unpaired) + '\n') f.write('Many-mutations:\t\t' + str(n_mutator) + '\n') f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n') f.write('Tiny:\t\t\t' + str(n_tiny) + '\n')
def filter_mapped_reads(sample, fragment, PCR=1, maxreads=-1, VERBOSE=0, n_cycles=600, max_mismatches=100, match_len_min=30, trim_bad_cigars=3, summary=True): '''Filter the reads to good chunks''' pname = sample.patient samplename_pat = sample.name samplenames_seq = sample.samples_seq.index.tolist() if VERBOSE >= 1: print 'Filtering reads:', pname, samplename_pat, fragment, PCR reffilename = get_initial_reference_filename(pname, fragment) refseq = SeqIO.read(reffilename, 'fasta') ref = np.array(refseq) outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment, type='bam', PCR=PCR, decontaminated=False) trashfilename = outfilename[:-4]+'_trashed.bam' infilenames = [get_mapped_to_initial_filename(pname, samplename_pat, samplename, fragment, type='bam', PCR=PCR) for samplename in samplenames_seq] infilenames = filter(os.path.isfile, infilenames) if not len(infilenames): print ('WARNING: No mapped files found: '+', '.join([pname, samplename_pat, fragment, str(PCR)])) return # Take reads evenly distributed across sequencing repetitions maxreads /= len(infilenames) if VERBOSE >= 2: print 'Input mapped filenames:', if len(infilenames) >= 2: print '' print '\n'.join(infilenames) # Use first file as template for the new bamfile infilename = infilenames[0] if not os.path.isfile(infilename): convert_sam_to_bam(infilename) with pysam.Samfile(infilename, 'rb') as bamfile: with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\ pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile: n_good = 0 n_wrongname = 0 n_unmapped = 0 n_unpaired = 0 n_mutator = 0 n_badcigar = 0 n_tiny = 0 binsize = 200 hist_distance_from_consensus = np.zeros(n_cycles + 1, int) hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int) # Iterate over input files, the first is already open for infilename in infilenames: if infilename != infilename[0]: file_open = lambda: pysam.Samfile(infilename, 'rb') file_close = lambda f: f.close() if not 
os.path.isfile(infilename): convert_sam_to_bam(infilename) else: file_open = lambda: bamfile file_close = lambda f: None try: bamfile = file_open() for irp, reads in enumerate(pair_generator(bamfile)): if irp == maxreads: break pair_type = filter_read_pair(reads, ref, hist_distance_from_consensus, hist_dist_along, binsize, max_mismatches=max_mismatches, match_len_min=match_len_min, trim_bad_cigars=trim_bad_cigars, VERBOSE=VERBOSE) if pair_type == 'unmapped': n_unmapped += 1 map(trashfile.write, reads) elif pair_type == 'unpaired': n_unpaired += 1 map(trashfile.write, reads) elif pair_type == 'mutator': n_mutator += 1 map(trashfile.write, reads) elif pair_type == 'bad_cigar': n_badcigar += 1 map(trashfile.write, reads) elif pair_type == 'tiny': n_tiny += 1 map(trashfile.write, reads) else: n_good += 1 map(outfile.write, reads) finally: file_close(bamfile) if VERBOSE >= 1: print 'Read pairs: ' print 'Good:', n_good print 'Unmapped:', n_unmapped print 'Unpaired:', n_unpaired print 'Many-mutations:', n_mutator print 'Bad CIGARs:', n_badcigar print 'Tiny:', n_tiny print if summary: sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR) with open(sfn, 'a') as f: f.write('Filter results: pname '+pname+', '+samplename_pat+', '+fragment+'\n') f.write('Total:\t\t\t'+str(irp + 1)+'\n') f.write('Good:\t\t\t'+str(n_good)+'\n') f.write('Unmapped:\t\t'+str(n_unmapped)+'\n') f.write('Unpaired:\t\t'+str(n_unpaired)+'\n') f.write('Many-mutations:\t\t'+str(n_mutator)+'\n') f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n') f.write('Tiny:\t\t\t'+str(n_tiny)+'\n')
# Script chunk: load and restrict the sample table, then gather per-fragment
# allele counts against each patient's initial references.
# NOTE(review): lssp presumably returns a pandas DataFrame of patient samples
# (it is filtered with .loc/.isin below) -- confirm against its definition.
samples = lssp()
if pnames is not None:
    samples = samples.loc[samples.patient.isin(pnames)]
elif samplenames is not None:
    samples = samples.loc[samples.index.isin(samplenames)]

if VERBOSE >= 2:
    print 'samples', samples.index.tolist()

for samplename, sample in samples.iterrows():
    if VERBOSE >= 1:
        print samplename

    # Wrap the table row into a patient-sample object
    sample = SamplePat(sample)
    pname = sample.patient
    conss_genomewide = SeqIO.read(get_initial_reference_filename(pname, 'genomewide'), 'fasta')

    # Collect the allele counts (where possible)
    acs = []
    for fragment in ['F'+str(i) for i in xrange(1, 7)]:
        try:
            ref = ''.join(SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta'))
            ac = sample.get_allele_counts(fragment, merge_read_types=False)
            acs.append((fragment, ref, ac))
        # Missing counts file for this fragment: skip it silently
        except IOError:
            continue

    # No fragment had data for this sample: move on to the next one
    if not len(acs):
        if VERBOSE >= 1:
            print 'No data found: skipping'
        continue
# Script chunk: build the initial patient consensus for one fragment from a
# sequencing-run consensus. NOTE(review): relies on unseen scope (sample_seq,
# data_folder, adaID, fragment, patient, pname, samplen, VERBOSE).
cons_rec = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment), 'fasta')
# Map the generic fragment name to its full specification for this sample
frag_spec = sample_seq.regions_complete[\
        sample_seq.regions_generic.index(fragment)]

# Complement PCR2 initial reference with tails from a later sample
if int(sample_seq.PCR) == 2:
    (frag_spec, cons_rec) = complement_consensus_PCR2(cons_rec, patient,
                                                      fragment, samplen,
                                                      VERBOSE=VERBOSE)

conss = str(cons_rec.seq)
output_filename = get_initial_reference_filename(pname, fragment)

seq_in = SeqRecord(Seq(conss, unambiguous_dna),
                   id='cons_init_p'+pname+'_'+frag_spec,
                   name='cons_init_p'+pname+'_'+frag_spec,
                   description='Initial consensus of patient '+pname+\
                               ', fragment '+frag_spec)

# If absent, just copy the thing over
if not os.path.isfile(output_filename):
    if VERBOSE >= 1:
        print pname+': initial consensus file created for sample', \
                sample_seq.name, 'fragment', fragment
    SeqIO.write(seq_in, output_filename, 'fasta')

# if present, check whether the sequences are the same (if so, no
# NOTE(review): comment and chunk truncated here in this view