def build_read_kmers_index(reads_filename, kmer_ii, kmer_size):
    """
    Return two dictionaries and the average read length. The first dictionary
    contains the counts of ambiguous k-mers, while the second contains the
    number of unique k-mers that map to a given contig.
    """
    ambiguous_kmer_counts = defaultdict(int)
    contig_counts = defaultdict(int)

    pf = SeqIO.ParseFastQ(reads_filename)

    kmer = None
    contigs_containing_kmer = []
    unalignable_kmers = 0
    num_reads = 0
    total_bases = 0

    for read in pf:
        # For each k-mer in the read...
        for i in xrange(0, len(read[1]) - kmer_size + 1):
            # ... find what contigs contain it.
            kmer = read[1][i:i + kmer_size]

            if kmer in kmer_ii or revcompl(kmer) in kmer_ii:
                if kmer not in kmer_ii:
                    kmer = revcompl(kmer)

                # Collapse the index hits for this k-mer; more than one entry
                # means the k-mer maps to multiple contigs.
                contigs_containing_kmer = list(accumulate(kmer_ii[kmer]))

                if len(contigs_containing_kmer) > 1:
                    ambiguous_kmer_counts[kmer] += 1
                else:
                    contig_counts[contigs_containing_kmer[0][0]] += 1
            else:
                unalignable_kmers += 1

        if num_reads % 100000 == 0:
            sys.stderr.write('Processed reads:\t' + str(num_reads) + '\r')

        total_bases += len(read[1])
        num_reads += 1

    return ambiguous_kmer_counts, contig_counts, total_bases / num_reads
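
# A minimal sketch, with hypothetical toy data, of the inverted-index shape
# the functions in this section assume: kmer_ii maps each k-mer string to a
# list of tuples whose first element is a contig name (only index [0] is used
# in this section; the remaining tuple fields are an assumption here). A
# k-mer is "ambiguous" when it occurs in more than one contig.
def _ambiguity_sketch():
    toy_kmer_ii = {
        'ACGT': [('contig_1', 0), ('contig_2', 3)],  # ambiguous: two contigs
        'CGTA': [('contig_1', 1)],                   # unique: one contig
    }
    for kmer, hits in toy_kmer_ii.items():
        contigs = set(hit[0] for hit in hits)
        print kmer + '\t' + ('ambiguous' if len(contigs) > 1 else 'unique')
# Calling _ambiguity_sketch() prints one line per toy k-mer.
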
def assign_read_kmers_to_contigs(reads_filename, kmer_ii, kmer_size):
    """
    Given a set of reads and a k-mer length, assign k-mer counts to the
    contigs. Return the contig counts and the average read length.
    """
    contig_counts = defaultdict(int)

    pf = SeqIO.ParseFastQ(reads_filename)

    kmer = None
    contig = None
    unalignable_kmers = 0
    num_reads = 0
    total_bases = 0

    for read in pf:
        # For each k-mer in the read...
        for i in xrange(0, len(read[1]) - kmer_size + 1):
            # ... find what contigs contain it.
            kmer = read[1][i:i + kmer_size]

            if kmer in kmer_ii:
                # ... and randomly assign the count to one of those contigs.
                contig = random.choice(kmer_ii[kmer])[0]
                contig_counts[contig] += 1
            elif revcompl(kmer) in kmer_ii:
                contig = random.choice(kmer_ii[revcompl(kmer)])[0]
                contig_counts[contig] += 1
            else:
                unalignable_kmers += 1

        total_bases += len(read[1])
        num_reads += 1

    return contig_counts, total_bases / num_reads
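
# revcompl() is used throughout this section but defined elsewhere in the
# module; a standard DNA reverse-complement helper behaves like this sketch
# (the name _revcompl_sketch is ours, not part of the original code).
_COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}

def _revcompl_sketch(seq):
    """Return the reverse complement of a DNA sequence."""
    return ''.join(_COMPLEMENT[base] for base in reversed(seq))
# For example, _revcompl_sketch('AACG') == 'CGTT'.
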
def assign_read_kmers_to_contigs_iterative(reads_filename, kmer_ii, kmer_size,
                                           contig_abundances):
    """
    Given a set of reads and a k-mer length, assign k-mer counts to the
    contigs weighted by their abundances. Return the contig counts and the
    average read length.
    """
    contig_counts = defaultdict(int)

    pf = SeqIO.ParseFastQ(reads_filename)

    kmer = None
    contigs_containing_kmer = []
    unalignable_kmers = 0
    total_abundance = 0
    num_reads = 0
    total_bases = 0

    for read in pf:
        # For each k-mer in the read...
        for i in xrange(0, len(read[1]) - kmer_size + 1):
            # ... find what contigs contain it.
            kmer = read[1][i:i + kmer_size]

            if kmer in kmer_ii or revcompl(kmer) in kmer_ii:
                if kmer not in kmer_ii:
                    kmer = revcompl(kmer)

                contigs_containing_kmer = list(accumulate(kmer_ii[kmer]))

                # Calculate the total abundance of the contigs containing
                # this k-mer.
                for contig in contigs_containing_kmer:
                    total_abundance += contig_abundances[contig[0]]

                # Choose a contig at random, weighted by its abundance.
                choice = random.randint(1, total_abundance)
                curr_abundance = 0
                chosen_contig_tuple = None

                for contig in contigs_containing_kmer:
                    curr_abundance += contig_abundances[contig[0]]

                    # Have we found the right contig?
                    if curr_abundance >= choice:
                        chosen_contig_tuple = contig
                        break

                contig_counts[chosen_contig_tuple[0]] += 1
                total_abundance = 0
            else:
                unalignable_kmers += 1

        total_bases += len(read[1])
        num_reads += 1

    return contig_counts, total_bases / num_reads
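
# The selection loop above is a roulette-wheel draw over integer abundances.
# Isolated, the same technique looks like this (hypothetical helper, not part
# of the original module); items with larger weights are chosen
# proportionally more often.
def _weighted_choice_sketch(items, weights):
    """Pick one item with probability proportional to its integer weight."""
    choice = random.randint(1, sum(weights))
    running = 0
    for item, weight in zip(items, weights):
        running += weight
        if running >= choice:
            return item
# For example, _weighted_choice_sketch(['contig_1', 'contig_2'], [30, 10])
# returns 'contig_1' roughly 75% of the time.
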
def main():
    if len(sys.argv) < 2:
        print USAGE
        sys.exit()

    parser = OptionParser()
    parser.add_option("-n", "--num_trials", dest="num_trials", default="1000")
    parser.add_option("-s", "--sample_size", dest="sample_size", default="10000")
    parser.add_option("-i", "--input", dest="input", default=None)
    parser.add_option("-1", "--1", dest="first_mates")
    parser.add_option("-2", "--2", dest="second_mates")
    parser.add_option("-k", "--samples", dest="samples", default=0)
    parser.add_option("-o", "--output_dir", dest="output_dir", default="./")
    parser.add_option("-t", "--trials", dest="trials", default=0)
    parser.add_option("-d", "--debug_level", dest="debug_level", default=0)
    parser.set_usage(USAGE)

    (options, args) = parser.parse_args(sys.argv[1:])

    debug_level = int(options.debug_level)

    # Read through each read file and add its input number to the sample set:
    # [1 1 1 1 2 2 2 2 2 ... 6 6 6]
    # This way we can choose how many reads of each input file we should keep
    # based on their abundances.
    # TODO(cmhill): Inefficient, but works fine for 100 million reads.
    total_read_set = []

    # We have to process the mates together, in order.
    first_mate_files = options.first_mates.split(',')
    second_mate_files = options.second_mates.split(',')

    if len(first_mate_files) != len(second_mate_files):
        print "Error: the number of first and second mate files must match."
        sys.exit(1)

    # Handle the option of multiple sample sizes.
    for samples in options.samples.split(','):
        samples = int(samples)

        output_dir = options.output_dir + '/' + str(samples) + '/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Re-open all read files.
        first_mate_readers = []
        second_mate_readers = []
        for i in range(len(first_mate_files)):
            first_mate_readers.append(SeqIO.ParseFastQ(first_mate_files[i]))
            second_mate_readers.append(SeqIO.ParseFastQ(second_mate_files[i]))

        sample_reads_dict = {}
        sample_reads = []

        k = samples
        index = 0
        file_index = 0

        while file_index < len(first_mate_readers):
            second_mate = second_mate_readers[file_index].next()

            for first_mate in first_mate_readers[file_index]:
                index += 1

                # Reservoir sampling: keep the first k pairs, then replace a
                # random reservoir slot with probability k / index.
                if len(sample_reads) < k:
                    sample_reads.append((file_index, (first_mate, second_mate)))
                else:
                    r = random.randrange(index)
                    if r < k:
                        sample_reads[r] = (file_index, (first_mate, second_mate))

                try:
                    second_mate = second_mate_readers[file_index].next()
                except StopIteration:
                    pass

            if debug_level > 0:
                print 'File Index: ' + str(file_index)
                print 'Reads needed: ' + str(k)
                print sample_reads

            file_index += 1

            # TODO(cmhill): Remove, since we print the reads out right away.
            sample_reads_dict[file_index] = sample_reads

        # Write out these sampled reads, one pair of output files per input
        # file pair.
        file_index = 0
        first_mate_writers = []
        second_mate_writers = []
        for i in range(len(first_mate_files)):
            first_mate_writers.append(
                open(output_dir + '/' + str(file_index) + '_1.fastq', 'w'))
            second_mate_writers.append(
                open(output_dir + '/' + str(file_index) + '_2.fastq', 'w'))
            file_index += 1

        for reads in sample_reads:
            first_mate_writers[reads[0]].write('\n'.join(reads[1][0]) + '\n')
            second_mate_writers[reads[0]].write('\n'.join(reads[1][1]) + '\n')
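
# The sampling loop in main() is standard reservoir sampling (Algorithm R):
# keep the first k items, then replace a random reservoir slot with
# probability k / index for each later item, yielding a uniform sample in a
# single pass. A self-contained sketch (hypothetical helper, not part of the
# original module):
def _reservoir_sample_sketch(stream, k):
    """Return k items drawn uniformly at random from an iterable."""
    sample = []
    index = 0
    for item in stream:
        index += 1
        if len(sample) < k:
            sample.append(item)
        else:
            r = random.randrange(index)  # 0 <= r < index
            if r < k:
                sample[r] = item
    return sample
# For example, _reservoir_sample_sketch(xrange(1000000), 5) returns five
# numbers drawn uniformly from 0..999999.
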