def build_subgraph(reads_in_subgraph, barcodes_unzipped):
    bc_file = open(barcodes_unzipped, 'rb')
    barcodes_iter = IO_utils.read_fastq_random(
        bc_file, offsets=reads_in_subgraph)
    subgraph_kmer_counts = Counter()
    while True:
        try:
            barcode_data, _ = next(barcodes_iter)
        except StopIteration:
            break
        read_kmers = IO_utils.get_cyclic_kmers(
            barcode_data,
            int(args['kmer_size']),
            int(args['barcode_start']),
            int(args['barcode_end']))
        for (kmer, _) in read_kmers:
            subgraph_kmer_counts[kmer] += 1
    bc_file.close()

    edges = []
    for (kmer, count) in subgraph_kmer_counts.items():
        edge = Edge(kmer[0:-1], kmer[1:], count)
        edges.append(edge)
    subgraph = Graph(edges)
    return subgraph
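
# Illustrative sketch only (not called by the pipeline): how counted k-mers
# become weighted de Bruijn edges, using the same Edge/Graph classes that
# build_subgraph uses above. The toy k-mers and counts are hypothetical.
def _example_kmers_to_edges():
    toy_counts = Counter({'GATT': 3, 'ATTA': 2, 'TTAC': 2})
    # each 4-mer contributes one edge between its two overlapping 3-mers,
    # e.g. 'GATT' -> Edge('GAT', 'ATT', 3)
    edges = [Edge(kmer[0:-1], kmer[1:], count)
        for (kmer, count) in toy_counts.items()]
    return Graph(edges)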

def run_all(cmdline_args):
    print('Splitting reads by barcodes')
    global args
    global output_files
    global output_dir

    args = cmdline_args
    output_dir = args['output_dir']
    output_files['log'] = '%s/run_log.txt' % output_dir
    Logger.start(output_files['log'])
    start_time = time.time()
    reads_unzipped = args['reads']
    barcodes_unzipped = args['barcodes']

    print('Building kmer index')
    kmer_index, kmer_counts, subsamp_pearson = get_kmer_index(
        barcodes_unzipped)
    output_files['subsamp_pearson_plot'] = subsamp_pearson
    print('\t%i unique kmers indexed' % len(kmer_counts))

    print('Finding cyclic paths in the barcode de Bruijn graph')
    cyclic_paths = find_paths(
        (kmer_index,
        kmer_counts,
        barcodes_unzipped,
        reads_unzipped,
        output_dir))
    print('\t%i cyclic paths found' % len(cyclic_paths))
    output_files['all_paths'] = IO_utils.save_paths_text(
        output_dir, cyclic_paths, prefix='all')

    print('Thresholding paths')
    (top_paths, fit_out) = threshold_paths(
        output_dir, cyclic_paths, args['num_cells'])
    output_files.update(fit_out)
    consensus_bcs = set(tup[0] for tup in top_paths)

    print('Assigning reads by kmer compatibility')
    reads_assigned_pickled = assign_all_reads(
        (consensus_bcs, reads_unzipped, barcodes_unzipped))

    print('Splitting reads by cell')
    output_files['split'], reads_per_cell = write_split_fastqs(
        (consensus_bcs,
        reads_assigned_pickled,
        output_dir,
        reads_unzipped,
        barcodes_unzipped))

    # delete temp pickle files
    for pickled in reads_assigned_pickled:
        os.unlink(pickled)

    # update paths list
    top_paths = update_paths_list(top_paths, reads_per_cell)
    output_files['thresholded_paths'] = IO_utils.save_paths_text(
        output_dir, top_paths, prefix='threshold')

    current_time = time.time()
    elapsed_time = current_time - start_time
    Logger.stop()
    return (output_files, elapsed_time)

def find_paths(params, starting_kmers=None):
    (kmer_index,
     kmer_counts,
     barcodes_unzipped,
     reads_unzipped,
     output_dir) = params
    barcode_length = args['barcode_end'] - args['barcode_start']
    kmers_sorted = [
        tup[0] for tup in sorted(
            kmer_counts.items(), key=lambda tup: tup[1], reverse=True)]

    if starting_kmers is None:
        starting_kmers = []
        for kmer in kmers_sorted:
            if kmer[0] == '$':
                starting_kmers.append((kmer, kmer_index[kmer]))
            if len(starting_kmers) >= args['breadth']:
                break
    else:
        starting_kmers_tmp = []
        for kmer in starting_kmers:
            # append a (kmer, offsets) tuple; list.append takes one argument
            starting_kmers_tmp.append((kmer, kmer_index[kmer]))
        starting_kmers = starting_kmers_tmp

    pool = Pool(processes=args['threads'])
    paths = []
    for group in IO_utils.grouper(starting_kmers, args['threads']):
        kmers_group = [tup[0] for tup in group]
        offsets_group = [tup[1] for tup in group]
        paths_group = pool.map(
            find_path_from_kmer,
            zip(kmers_group,
                offsets_group,
                repeat(barcodes_unzipped),
                repeat(barcode_length)))
        paths += [item for sublist in paths_group for item in sublist]
    pool.close()
    return paths

def assign_read_kmers(params):
    """
    Assigns a single read to a cell barcode by kmer compatibility

    args (tuple)
        kmer_map: dict of kmer -> list of consensus barcodes that contain it
        min_kmer_size
        max_kmer_size
        (reads_data, reads_offset): fastq lines and line offset for read 2
        (barcodes_data, barcodes_offset): fastq lines and line offset
            for read 1 (barcode)
    """
    (kmer_map,
     min_kmer_size,
     max_kmer_size,
     (reads_data, reads_offset),
     (barcodes_data, barcodes_offset)) = params

    for kmer_size in range(max_kmer_size, min_kmer_size, -1):
        read_kmers = IO_utils.get_cyclic_kmers(
            barcodes_data,
            kmer_size,
            args['barcode_start'],
            args['barcode_end'],
            indel=True)
        bcs, is_assigned, is_unique = get_most_common_bc(
            kmer_map, read_kmers)
        if is_assigned and is_unique:
            return (bcs[0], reads_offset, barcodes_offset)
        # otherwise decrement kmer size and try again
    return ('unassigned', reads_offset, barcodes_offset)
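
# Illustrative sketch only (not called by the pipeline): the voting idea
# behind assign_read_kmers, using plain Counter arithmetic in place of the
# get_most_common_bc helper. The toy kmer map and read k-mers are
# hypothetical; the real loop starts at the longest k and retries with
# shorter, more error-tolerant k-mers until the vote is assigned and unique.
def _example_kmer_vote():
    toy_kmer_map = {
        'GAT': ['AAGAT'],
        'ATT': ['AAGAT', 'CCATT'],
        'TTA': ['CCATT']}
    toy_read_kmers = ['GAT', 'ATT']
    votes = Counter()
    for kmer in toy_read_kmers:
        for bc in toy_kmer_map.get(kmer, []):
            votes[bc] += 1
    # 'AAGAT' gets 2 votes vs 1 for 'CCATT': assigned and unique
    return votes.most_common(1)[0][0]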

def get_num_unassigned(simulation_output_dir):
    fq_fname = '%s/reads_split/cell_unassigned_barcodes.fastq.gz' % \
        simulation_output_dir
    num_unassigned = 0
    # context manager ensures the gzip handle is closed
    with gzip.open(fq_fname, 'rb') as fq_file:
        fq_iter = IO_utils.read_fastq_sequential(fq_file)
        for (lines, _) in fq_iter:
            num_unassigned += 1
    return num_unassigned

def map_kmers_to_bcs_fixed_k(consensus_bcs, kmer_size):
    kmers_to_paths = {}
    for cell_barcode in consensus_bcs:
        # minimal fake fastq entry: only the sequence field matters here
        kmers = IO_utils.get_cyclic_kmers(
            ['na', cell_barcode, 'na', cell_barcode],
            kmer_size,
            0,
            len(cell_barcode),
            indel=True)
        for (kmer, _) in kmers:
            if kmer not in kmers_to_paths:
                kmers_to_paths[kmer] = []
            kmers_to_paths[kmer].append(cell_barcode)
    return kmers_to_paths
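
# Illustrative note: for two hypothetical 5 bp consensus barcodes and k = 3,
#
#     map_kmers_to_bcs_fixed_k({'AAGAT', 'CCGAT'}, 3)
#
# maps each (cyclic) k-mer back to every barcode that contains it, e.g.
# 'AAG' -> ['AAGAT'] but the shared 'GAT' -> ['AAGAT', 'CCGAT'], so 'GAT'
# alone cannot distinguish the two cells during read assignment.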

def index_read(params):
    """
    Args
        params (tuple):
            barcodes_data (str): sequence of read_1 (barcode)
            barcodes_offset (int): line offset for this read
    Returns
        kmer_index (dict): map of kmer -> list of line offsets for the
            reads that contain it
    """
    (barcodes_data, barcodes_offset) = params
    kmer_index = {}
    read_kmers = IO_utils.get_cyclic_kmers(
        barcodes_data,
        args['kmer_size'],
        args['barcode_start'],
        args['barcode_end'])
    for (kmer, _) in read_kmers:
        if kmer not in kmer_index:
            kmer_index[kmer] = []
        kmer_index[kmer].append(barcodes_offset)
    return kmer_index

def get_fraction_consistent(pred_bc, simulation_output_dir):
    fq_fname = '%s/reads_split/cell_%s_barcodes.fastq.gz' % \
        (simulation_output_dir, pred_bc)
    if not os.path.exists(fq_fname):
        return 0
    assignments = Counter()
    total_reads = 0
    with gzip.open(fq_fname, 'rb') as fq_file:
        fq_iter = IO_utils.read_fastq_sequential(fq_file)
        for (lines, _) in fq_iter:
            read_name = lines[0]
            # ground-truth barcode encoded in the simulated read name
            true_bc = read_name.split(':')[-2].split('_')[0]
            assignments.update([true_bc])
            total_reads += 1
    try:
        common_bc, count = assignments.most_common(1)[0]
        return count / total_reads
    except IndexError:
        return 0

def get_fraction_correct_reads(pred_bc, simulation_output_dir):
    fq_fname = '%s/reads_split/cell_%s_barcodes.fastq.gz' % \
        (simulation_output_dir, pred_bc)
    if not os.path.exists(fq_fname):
        return (0, 0)
    tpr = 0.
    fpr = 0.
    with gzip.open(fq_fname, 'rb') as fq_file:
        fq_iter = IO_utils.read_fastq_sequential(fq_file)
        for (lines, _) in fq_iter:
            read_name = lines[0]
            assigned_bc = read_name.split(':')[-1]
            true_bc = read_name.split(':')[-2].split('_')[0]
            if assigned_bc == true_bc:
                tpr += 1.
            else:
                fpr += 1.
    total_reads = tpr + fpr
    # guard against an empty fastq file
    if total_reads == 0:
        return (0, 0)
    return (tpr / total_reads, fpr / total_reads)
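
# Illustrative note: the two evaluation helpers above assume the simulator
# encodes both barcodes in the read name as colon-separated fields, with the
# ground-truth barcode (plus a '_'-delimited suffix) second from last and the
# assigned barcode last, e.g. for a hypothetical name
#
#     @read_0042:AAGATCCT_mut1:AAGATCCT
#
# split(':')[-1] -> 'AAGATCCT' (assigned) and
# split(':')[-2].split('_')[0] -> 'AAGATCCT' (true), so this read counts as
# correctly assigned.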

def run_all(args):
    print('\nInspecting and pre-processing inputs')
    if args['output_dir'][-1] == '/':
        args['output_dir'] = args['output_dir'][0:-1]
    if not os.path.exists(args['output_dir']):
        os.makedirs(args['output_dir'])
    if not os.path.exists(args['output_dir'] + '/plots'):
        os.makedirs(args['output_dir'] + '/plots')

    with (Path(__file__).parent / 'params.json').open() as r:
        kallisto = json.load(r)['kallisto']
    assert kallisto

    split_args = {}
    check_pipeline_input(args, kallisto)
    if args['dropseq']:
        args['barcode_start'] = 0
        args['barcode_end'] = 12
        args['umi_start'] = 12
        args['umi_end'] = 20
        if args['kmer_size'] is None:
            args['kmer_size'] = 8
        print('Unzipping files (temporary)')
        reads_unzipped = IO_utils.unzip(args['reads'].split(','))
        barcodes_unzipped = IO_utils.unzip(args['barcodes'].split(','))
        args['reads'] = reads_unzipped
        args['barcodes'] = barcodes_unzipped
    else:
        if args['kmer_size'] is None:
            args['kmer_size'] = 8
        print('Unzipping (temporary)')
        reads_unzipped = IO_utils.unzip(args['reads'].split(','))
        barcodes_unzipped = IO_utils.unzip(args['barcodes'].split(','))
        args['reads'] = reads_unzipped
        args['barcodes'] = barcodes_unzipped
    # 10x Genomics branch disabled; kept for reference
    """
    elif args['10xgenomics']:
        args['barcode_start'] = 0
        args['barcode_end'] = 26
        args['umi_start'] = 26
        args['umi_end'] = 34
        if args['kmer_size'] is None:
            args['kmer_size'] = 20
        print('Unzipping and merging files (temporary)')
        reads_unzipped = IO_utils.unzip(args['reads'].split(','))
        barcodes_unzipped = IO_utils.merge_barcodefiles_10x(
            args['barcodes'].split(','),
            args['umis'].split(','))
        args['reads'] = reads_unzipped
        args['barcodes'] = barcodes_unzipped
    """
    check_split_input(args)
    output_files, elapsed_time = Split_reads.run_all(args)
    output_files['args'] = args
    print('Done identifying barcodes and splitting reads.\n' +
        '\tTime elapsed: %0.2f seconds\n' % elapsed_time)

    if args['kallisto_idx'] is not None:
        print('Running kallisto')
        kallisto_dir = '%s/kallisto_outputs' % args['output_dir']
        if not os.path.exists(kallisto_dir):
            os.makedirs(kallisto_dir)
        output_files['kallisto'] = run_kallisto(
            args, kallisto, kallisto_dir, output_files)
        print('Getting transcript compatibility counts')
        output_files['tcc'] = write_transcript_compatability_counts(
            args, output_files, kallisto_dir)

    print('Removing temp files')
    os.unlink(reads_unzipped)
    os.unlink(barcodes_unzipped)

    output_files['run_outputs'] = '%s/run_outputs.json' % args['output_dir']
    with open(output_files['run_outputs'], 'w') as writer:
        writer.write(json.dumps(output_files, indent=3))
    print('Done.')
    return output_files

def write_split_fastqs(params):
    import gzip
    (consensus_bcs,
     reads_assigned_db,
     reads_assigned_pipe,
     output_dir,
     reads_unzipped,
     barcodes_unzipped) = params

    split_dir = '%s/reads_split' % output_dir
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    output_files = {'batch': '%s/batch.txt' % split_dir}
    batch_file = open(output_files['batch'], 'w')
    reads_per_cell = {}
    consensus_bcs.add('unassigned')
    for cell in consensus_bcs:
        try:
            cell_offsets = IO_utils.get_from_db(reads_assigned_pipe, [cell])
        except IndexError:
            # no reads were assigned to this cell; skip it rather than
            # fall through with cell_offsets undefined
            continue
        cell_name = 'cell_%s' % cell
        # initialize all readers and writers
        output_files[cell_name] = {
            'reads': '%s/%s_reads.fastq.gz' % (split_dir, cell_name),
            'barcodes': '%s/%s_barcodes.fastq.gz' % (split_dir, cell_name),
            'umi': '%s/%s.umi.txt' % (split_dir, cell_name),
        }
        batch_file.write('%s\t%s\t%s\n' %
            (cell_name,
            output_files[cell_name]['umi'],
            output_files[cell_name]['reads']))
        reads_writer = gzip.open(output_files[cell_name]['reads'], 'wb')
        barcodes_writer = gzip.open(
            output_files[cell_name]['barcodes'], 'wb')
        umi_writer = open(output_files[cell_name]['umi'], 'wb')

        reads_f = open(reads_unzipped, 'rb')
        barcodes_f = open(barcodes_unzipped, 'rb')
        # offsets are interleaved: even indices are read-2 offsets,
        # odd indices are the matching read-1 (barcode) offsets
        reads_iter = IO_utils.read_fastq_random(
            reads_f,
            offsets=[cell_offsets[i]
                for i in range(len(cell_offsets)) if i % 2 == 0])
        barcodes_iter = IO_utils.read_fastq_random(
            barcodes_f,
            offsets=[cell_offsets[i]
                for i in range(len(cell_offsets)) if i % 2 == 1])
        reads_in_cell = 0
        while True:
            try:
                reads_data, _ = next(reads_iter)
                barcodes_data, _ = next(barcodes_iter)
                reads_in_cell += 1
            except StopIteration:
                break
            reads_data[0] += ' %s' % cell_name.replace('_', ':')
            reads_data[0] = reads_data[0].replace(' ', '_')
            barcodes_data[0] += ' %s' % cell_name.replace('_', ':')
            barcodes_data[0] = barcodes_data[0].replace(' ', '_')
            umi = barcodes_data[1][
                int(args['umi_start']):int(args['umi_end'])]
            reads_writer.write(
                ('\n'.join(reads_data) + '\n').encode('utf-8'))
            barcodes_writer.write(
                ('\n'.join(barcodes_data) + '\n').encode('utf-8'))
            umi_writer.write((umi + '\n').encode('utf-8'))
        reads_writer.close()
        umi_writer.close()
        barcodes_writer.close()
        reads_f.close()
        barcodes_f.close()
        print('\tWrote %i reads to file:\t%s' % (reads_in_cell, cell_name))
        reads_per_cell[cell] = reads_in_cell
    batch_file.close()
    return output_files, reads_per_cell
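
# Illustrative note: write_split_fastqs assumes the per-cell offset list from
# the database alternates (read-2 offset, read-1 offset) pairs, matching the
# encode_tup(offset1, offset2) values pushed by assign_all_reads below. For a
# hypothetical cell_offsets = [0, 0, 512, 480], the slices above yield read
# offsets [0, 512] and barcode offsets [0, 480], keeping the two fastq
# writers in lockstep.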

def assign_all_reads(params):
    (consensus_bcs, reads_unzipped, barcodes_unzipped) = params
    BUFFER_SIZE = 100000
    MAX_KMER_SIZE = args['barcode_end'] - args['barcode_start']
    MIN_KMER_SIZE = 6

    reads_assigned_db, reads_assigned_pipe = \
        IO_utils.initialize_redis_pipeline()
    pool = Pool(processes=args['threads'])
    if args['split_levenshtein']:
        print('\tAssigning reads to consensus barcodes '
            'using Levenshtein distance')
    else:
        print('\tAssigning reads to consensus barcodes '
            'using kmer compatibility')
    kmer_map = map_kmers_to_bcs(consensus_bcs, MIN_KMER_SIZE, MAX_KMER_SIZE)

    read_count = 0
    num_unassigned = 0
    reads_f = open(reads_unzipped, 'rb')
    barcodes_f = open(barcodes_unzipped, 'rb')
    # store each (read offset, barcode offset) pair as e.g. b'123,456'
    encode = lambda i: str(i).encode('utf-8')
    encode_tup = lambda i, j: encode(i) + b',' + encode(j)

    for reads_chunk, barcodes_chunk in zip(
        IO_utils.get_read_chunks(
            reads_f, random=False, BUFFER_SIZE=BUFFER_SIZE),
        IO_utils.get_read_chunks(
            barcodes_f, random=False, BUFFER_SIZE=BUFFER_SIZE)):
        read_count += len(reads_chunk)
        if args['split_levenshtein']:
            assignments = pool.map(
                assign_read_levenshtein,
                zip(repeat(args),
                    repeat(consensus_bcs),
                    reads_chunk,
                    barcodes_chunk))
        else:
            assignments = pool.map(
                assign_read_kmers,
                zip(repeat(kmer_map),
                    repeat(MIN_KMER_SIZE),
                    repeat(MAX_KMER_SIZE),
                    reads_chunk,
                    barcodes_chunk))
        for (assignment, offset1, offset2) in assignments:
            if assignment == 'unassigned':
                num_unassigned += 1
            reads_assigned_pipe.rpush(
                assignment.encode('utf-8'), encode_tup(offset1, offset2))
        reads_assigned_pipe.execute()
        print('\tProcessed %i reads' % read_count)
    reads_f.close()
    barcodes_f.close()
    pool.close()
    print('\t%i reads could not be assigned' % num_unassigned)
    return reads_assigned_db, reads_assigned_pipe

def get_kmer_index(barcodes_unzipped):
    """
    Args:
        barcodes_unzipped (str): filename for unzipped barcodes fq
    Returns
        kmer_idx (dict): map of kmer to list of line offsets for
            reads that contain that kmer
        kmer_counts (dict): map of kmer to absolute counts

    This method returns a kmer index and counts dict for a random
    subset of the dataset. The size of the subset attempts to be the
    minimal number of reads whose kmer spectrum is representative of
    the full dataset.

    General approach:
        initialize:
            get a random chunk of reads based on line offsets
            compute kmer counts
        loop:
            get a new chunk of reads and combine with previous chunks
            compute kmer counts for the new chunk
            compare kmer counts with the previous iteration
        terminate when:
            pearsonr >= some cutoff value
    """
    PEARSONR_CUTOFF = 0.999
    MIN_ITERS = 10
    BUFFER_SIZE = 10000

    length = args['barcode_end'] - args['barcode_start']
    pool = Pool(processes=args['threads'])
    read_count = 0
    kmer_idx = {}
    counts_corr_coefs = []
    num_reads = []

    bc_file = open(barcodes_unzipped, 'rb')
    read_chunks_iter = IO_utils.get_read_chunks(
        bc_file, random=True, BUFFER_SIZE=BUFFER_SIZE)
    chunk_num = 0
    while True:
        try:
            reads_chunk = next(read_chunks_iter)
            chunk_num += 1
        except StopIteration:
            break
        read_count += len(reads_chunk)
        num_reads.append(read_count)

        # chunk_kmer_indices is a list of dicts, one per read
        chunk_kmer_indices = pool.map(index_read, reads_chunk)
        # kmer counts before updating with chunk_kmer_indices
        old_kmer_counts = get_kmer_counts(kmer_idx)
        for element in chunk_kmer_indices:
            for (key, read_offsets) in element.items():
                # read_offsets: [offset1, offset2, offset3, ...]
                if key not in kmer_idx:
                    kmer_idx[key] = []
                kmer_idx[key] = kmer_idx[key] + read_offsets
        del chunk_kmer_indices
        _ = gc.collect()
        new_kmer_counts = get_kmer_counts(kmer_idx)

        # check kmer count correlation against the previous iteration
        counts_corr_coef = get_kmer_count_correlation(
            old_kmer_counts, new_kmer_counts)
        counts_corr_coefs.append(counts_corr_coef)
        print('\t%i reads indexed. Running pearsonr is %f' %
            (read_count, counts_corr_coef))
        if (len(counts_corr_coefs) >= MIN_ITERS) and \
            (counts_corr_coef > PEARSONR_CUTOFF):
            break
    bc_file.close()
    pool.close()
    return (
        kmer_idx,
        new_kmer_counts,
        Plot_utils.plot_kmer_subsamp_pearson(
            output_dir, counts_corr_coefs, num_reads))
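
# Illustrative sketch only (not called by the pipeline): one plausible way a
# helper like get_kmer_count_correlation could compare two k-mer spectra,
# assuming scipy is available. Counts are aligned over the union of k-mers
# (missing k-mers count as 0) before computing Pearson's r; r approaching 1
# means newly indexed reads are no longer changing the spectrum.
def _example_spectrum_correlation(old_counts, new_counts):
    from scipy.stats import pearsonr
    kmers = sorted(set(old_counts) | set(new_counts))
    old_vec = [old_counts.get(k, 0) for k in kmers]
    new_vec = [new_counts.get(k, 0) for k in kmers]
    r, _pval = pearsonr(old_vec, new_vec)
    return r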

def assign_all_reads(params):
    (consensus_bcs, reads_unzipped, barcodes_unzipped) = params
    BUFFER_SIZE = 10000
    PICKLE_SIZE = 1000000
    MAX_KMER_SIZE = args['barcode_end'] - args['barcode_start']
    MIN_KMER_SIZE = 7

    pool = Pool(processes=args['threads'])
    print('\tMapping kmers to consensus barcodes')
    kmer_map = map_kmers_to_bcs(consensus_bcs, MIN_KMER_SIZE, MAX_KMER_SIZE)
    reads_assigned = initialize_reads_assigned(consensus_bcs)

    print('\tAssigning reads to consensus barcodes')
    read_count = 0
    num_unassigned = 0
    reads_f = open(reads_unzipped, 'rb')
    barcodes_f = open(barcodes_unzipped, 'rb')
    pickle_files = []
    for reads_chunk, barcodes_chunk in zip(
        IO_utils.get_read_chunks(
            reads_f, random=False, BUFFER_SIZE=BUFFER_SIZE),
        IO_utils.get_read_chunks(
            barcodes_f, random=False, BUFFER_SIZE=BUFFER_SIZE)):
        read_count += len(reads_chunk)
        if not args['split_levenshtein']:
            assignments = pool.map(
                assign_read_kmers,
                zip(repeat(kmer_map),
                    repeat(MIN_KMER_SIZE),
                    repeat(MAX_KMER_SIZE),
                    reads_chunk,
                    barcodes_chunk))
        else:
            # this pipeline exists for reviewer experiments only;
            # it works quite poorly, see simulation results
            assignments = pool.map(
                assign_read_levenshtein,
                zip(repeat(consensus_bcs),
                    reads_chunk,
                    barcodes_chunk))
        for (assignment, offset1, offset2) in assignments:
            if assignment == 'unassigned':
                num_unassigned += 1
            reads_assigned[assignment].append((offset1, offset2))
        print('\tProcessed %i reads' % read_count)
        # pickle-dump read assignments every PICKLE_SIZE (1M) reads
        # to bound memory use
        if read_count % PICKLE_SIZE == 0:
            pickle_files.append(IO_utils.write_to_pickle(reads_assigned))
            reads_assigned = initialize_reads_assigned(consensus_bcs)
    pickle_files.append(IO_utils.write_to_pickle(reads_assigned))
    reads_f.close()
    barcodes_f.close()
    pool.close()
    print('\t%i reads could not be assigned' % num_unassigned)
    return pickle_files