def build_subgraph(reads_in_subgraph, barcodes_unzipped):
    """Build a k-mer graph from the barcode reads at the given file offsets.

    Every read yielded by ``IO_utils.read_fastq_random`` for the requested
    offsets is decomposed with ``IO_utils.get_cyclic_kmers`` and the
    occurrences of each k-mer are tallied. Each distinct k-mer then becomes
    one ``Edge`` from the k-mer without its last character to the k-mer
    without its first character, carrying the tally as its third argument.

    Args:
        reads_in_subgraph: offsets handed to ``IO_utils.read_fastq_random``.
        barcodes_unzipped: path of the barcodes file; opened in binary mode.

    Returns:
        A ``Graph`` constructed from the list of edges (one per distinct
        k-mer; empty list when no reads were yielded).
    """
    # These settings do not change between reads, so convert them once.
    kmer_size = int(args['kmer_size'])
    barcode_start = int(args['barcode_start'])
    barcode_end = int(args['barcode_end'])

    subgraph_kmer_counts = Counter()
    # The context manager guarantees the file is closed even if reading or
    # k-mer extraction raises part-way through.
    with open(barcodes_unzipped, 'rb') as bc_file:
        barcodes_iter = IO_utils.read_fastq_random(
            bc_file, offsets=reads_in_subgraph)
        for barcode_data, _ in barcodes_iter:
            read_kmers = IO_utils.get_cyclic_kmers(
                barcode_data, kmer_size, barcode_start, barcode_end)
            # get_cyclic_kmers yields pairs; only the first element (the
            # k-mer itself) is counted.
            subgraph_kmer_counts.update(kmer for kmer, _ in read_kmers)

    edges = [
        Edge(kmer[:-1], kmer[1:], count)
        for kmer, count in subgraph_kmer_counts.items()
    ]
    return Graph(edges)
def _write_cell_files(cell_name, cell_files, cell_offsets,
                      reads_unzipped, barcodes_unzipped):
    """Write the reads, barcodes and UMIs of one cell; return the read count.

    ``cell_offsets`` interleaves two kinds of offsets: even positions are
    passed to ``IO_utils.read_fastq_random`` for the reads file, odd
    positions for the barcodes file. Records are consumed in lockstep and
    iteration stops as soon as either stream is exhausted.
    """
    import gzip

    offsets = list(cell_offsets)
    read_offsets = offsets[0::2]
    barcode_offsets = offsets[1::2]

    # Loop-invariant values, computed once per cell.
    cell_tag = cell_name.replace('_', ':')
    umi_start = int(args['umi_start'])
    umi_end = int(args['umi_end'])

    reads_in_cell = 0
    # All five handles are released even if a read or write fails midway.
    with gzip.open(cell_files['reads'], 'wb') as reads_writer, \
            gzip.open(cell_files['barcodes'], 'wb') as barcodes_writer, \
            open(cell_files['umi'], 'wb') as umi_writer, \
            open(reads_unzipped, 'rb') as reads_f, \
            open(barcodes_unzipped, 'rb') as barcodes_f:
        reads_iter = IO_utils.read_fastq_random(
            reads_f, offsets=read_offsets)
        barcodes_iter = IO_utils.read_fastq_random(
            barcodes_f, offsets=barcode_offsets)

        for (reads_data, _), (barcodes_data, _) in zip(reads_iter,
                                                       barcodes_iter):
            reads_in_cell += 1

            # Append the cell tag to each header line, then replace every
            # space in the header with an underscore.
            reads_data[0] += ' %s' % cell_tag
            reads_data[0] = reads_data[0].replace(' ', '_')
            barcodes_data[0] += ' %s' % cell_tag
            barcodes_data[0] = barcodes_data[0].replace(' ', '_')

            # The UMI is a slice of the second line of the barcode record.
            umi = barcodes_data[1][umi_start:umi_end]

            reads_writer.write(
                ('\n'.join(reads_data) + '\n').encode('utf-8'))
            barcodes_writer.write(
                ('\n'.join(barcodes_data) + '\n').encode('utf-8'))
            umi_writer.write((umi + '\n').encode('utf-8'))

    return reads_in_cell


def write_split_fastqs(params):
    """Split reads and barcodes into per-cell gzipped FASTQ files.

    For every entry of ``consensus_bcs`` (plus ``'unassigned'``) this writes
    ``cell_<bc>_reads.fastq.gz``, ``cell_<bc>_barcodes.fastq.gz`` and
    ``cell_<bc>.umi.txt`` under ``<output_dir>/reads_split``, and appends a
    tab-separated line (cell name, UMI path, reads path) to ``batch.txt`` in
    the same directory.

    Args:
        params: a 6-tuple ``(consensus_bcs, reads_assigned_db,
            reads_assigned_pipe, output_dir, reads_unzipped,
            barcodes_unzipped)``. ``consensus_bcs`` must support ``add`` and
            is modified in place: ``'unassigned'`` is added to it.
            ``reads_assigned_db`` is unpacked but not used here.

    Returns:
        ``(output_files, reads_per_cell)`` where ``output_files`` maps
        ``'batch'`` to the batch file path and each ``'cell_<bc>'`` to a dict
        with ``'reads'``, ``'barcodes'`` and ``'umi'`` paths, and
        ``reads_per_cell`` maps each cell to the number of records written.
    """
    (consensus_bcs,
     reads_assigned_db,
     reads_assigned_pipe,
     output_dir,
     reads_unzipped,
     barcodes_unzipped) = params

    split_dir = '%s/reads_split' % output_dir
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)

    output_files = {'batch': '%s/batch.txt' % (split_dir)}
    reads_per_cell = {}
    consensus_bcs.add('unassigned')

    with open(output_files['batch'], 'w') as batch_file:
        for cell in consensus_bcs:
            try:
                cell_offsets = IO_utils.get_from_db(
                    reads_assigned_pipe, [cell])
            except IndexError:
                # The lookup failed for this cell. Fall back to an empty
                # offset list so that the cell gets empty output files,
                # instead of hitting an unbound name or silently re-using
                # the previous cell's offsets.
                cell_offsets = []

            cell_name = 'cell_%s' % cell
            # Register the output paths for this cell.
            cell_files = {
                'reads': '%s/%s_reads.fastq.gz' % (split_dir, cell_name),
                'barcodes': '%s/%s_barcodes.fastq.gz' % (split_dir,
                                                         cell_name),
                'umi': '%s/%s.umi.txt' % (split_dir, cell_name),
            }
            output_files[cell_name] = cell_files
            batch_file.write('%s\t%s\t%s\n' % (
                cell_name, cell_files['umi'], cell_files['reads']))

            reads_in_cell = _write_cell_files(
                cell_name, cell_files, cell_offsets,
                reads_unzipped, barcodes_unzipped)

            print('\tWrote %i reads to file:\t%s' % (
                reads_in_cell, cell_name))
            reads_per_cell[cell] = reads_in_cell

    return output_files, reads_per_cell