def test_write_to_files(data, tmpdir):
    import gzip
    import scipy.io
    import scipy.sparse

    io.write_to_files(
        pytest.sparse_matrix,
        pytest.top_cells,
        pytest.ordered_tags_map,
        pytest.data_type,
        tmpdir,
    )
    file = tmpdir.join('umi_count/matrix.mtx.gz')
    with gzip.open(file, 'rb') as mtx_file:
        assert isinstance(scipy.io.mmread(mtx_file), scipy.sparse.coo_matrix)
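# Note: `data` is a pytest fixture defined elsewhere in the test suite; the test
# above assumes it attaches its inputs as attributes of the pytest module. The
# fixture below is only an illustrative sketch of that pattern: the matrix, cell
# barcodes and tag names are made up and are not the project's real test data.
@pytest.fixture
def data_sketch():
    import scipy.sparse

    # Hypothetical 3 tags x 2 cells UMI count matrix and matching metadata.
    pytest.sparse_matrix = scipy.sparse.coo_matrix([[1, 0], [0, 2], [0, 0]])
    pytest.top_cells = {'ACTGTTTTATTGGCCT', 'TTCATAAGGTAGGGAT'}
    pytest.ordered_tags_map = {'tag1': 0, 'tag2': 1, 'unmapped': 2}
    pytest.data_type = 'umi'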
def main():
    # Create logger and stream handler
    logger = logging.getLogger("cite_seq_count")
    logger.setLevel(logging.CRITICAL)
    ch = logging.StreamHandler()
    ch.setLevel(logging.CRITICAL)
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    start_time = time.time()
    parser = get_args()
    if not sys.argv[1:]:
        parser.print_help(file=sys.stderr)
        sys.exit(2)

    # Parse arguments.
    args = parser.parse_args()
    if args.whitelist:
        print("Loading whitelist")
        (whitelist, args.bc_threshold) = preprocessing.parse_whitelist_csv(
            filename=args.whitelist,
            barcode_length=args.cb_last - args.cb_first + 1,
            collapsing_threshold=args.bc_threshold,
        )
    else:
        whitelist = False

    # Load TAGs/ABs.
    ab_map = preprocessing.parse_tags_csv(args.tags)
    ab_map = preprocessing.check_tags(ab_map, args.max_error)

    # Identify input file(s)
    read1_paths, read2_paths = preprocessing.get_read_paths(
        args.read1_path, args.read2_path
    )

    # Preprocessing and processing occur in separate loops so the program can
    # fail early if one of the inputs is not valid.
    read1_lengths = []
    read2_lengths = []
    for read1_path, read2_path in zip(read1_paths, read2_paths):
        # Get reads length. So far, there is no validation for Read2.
        read1_lengths.append(preprocessing.get_read_length(read1_path))
        read2_lengths.append(preprocessing.get_read_length(read2_path))
        # Check Read1 length against CELL and UMI barcodes length.
        (
            barcode_slice,
            umi_slice,
            barcode_umi_length,
        ) = preprocessing.check_barcodes_lengths(
            read1_lengths[-1],
            args.cb_first,
            args.cb_last,
            args.umi_first,
            args.umi_last,
        )
    # Ensure all files have the same input length
    # if len(set(read1_lengths)) != 1:
    #     sys.exit('Input barcode fastqs (read1) do not all have same length.\nExiting')

    # Initialize the counts dicts that will be generated from each input fastq pair
    final_results = defaultdict(lambda: defaultdict(Counter))
    umis_per_cell = Counter()
    reads_per_cell = Counter()
    merged_no_match = Counter()
    number_of_samples = len(read1_paths)
    n_reads = 0

    # Print a statement if multiple files are run.
    if number_of_samples != 1:
        print("Detected {} files to run on.".format(number_of_samples))

    for read1_path, read2_path in zip(read1_paths, read2_paths):
        if args.first_n:
            n_lines = (args.first_n * 4) / number_of_samples
        else:
            n_lines = preprocessing.get_n_lines(read1_path)
        n_reads += int(n_lines / 4)
        n_threads = args.n_threads
        print("Started mapping")
        print("Processing {:,} reads".format(n_reads))

        # Run with one process
        if n_threads <= 1 or n_reads < 1000001:
            print("CITE-seq-Count is running with one core.")
            (_final_results, _merged_no_match) = processing.map_reads(
                read1_path=read1_path,
                read2_path=read2_path,
                tags=ab_map,
                barcode_slice=barcode_slice,
                umi_slice=umi_slice,
                indexes=[0, n_reads],
                whitelist=whitelist,
                debug=args.debug,
                start_trim=args.start_trim,
                maximum_distance=args.max_error,
                sliding_window=args.sliding_window,
            )
            print("Mapping done")
            _umis_per_cell = Counter()
            _reads_per_cell = Counter()
            for cell_barcode, counts in _final_results.items():
                _umis_per_cell[cell_barcode] = sum(
                    [len(counts[UMI]) for UMI in counts]
                )
                _reads_per_cell[cell_barcode] = sum(
                    [sum(counts[UMI].values()) for UMI in counts]
                )
        else:
            # Run with multiple processes
            print("CITE-seq-Count is running with {} cores.".format(n_threads))
            p = Pool(processes=n_threads)
            chunk_indexes = preprocessing.chunk_reads(n_reads, n_threads)
            parallel_results = []
            for indexes in chunk_indexes:
                p.apply_async(
                    processing.map_reads,
                    args=(
                        read1_path,
                        read2_path,
                        ab_map,
                        barcode_slice,
                        umi_slice,
                        indexes,
                        whitelist,
                        args.debug,
                        args.start_trim,
                        args.max_error,
                        args.sliding_window,
                    ),
                    callback=parallel_results.append,
                    error_callback=sys.stderr,
                )
            p.close()
            p.join()
            print("Mapping done")
            print("Merging results")
            (
                _final_results,
                _umis_per_cell,
                _reads_per_cell,
                _merged_no_match,
            ) = processing.merge_results(parallel_results=parallel_results)
            del parallel_results

        # Update the overall counts dicts
        umis_per_cell.update(_umis_per_cell)
        reads_per_cell.update(_reads_per_cell)
        merged_no_match.update(_merged_no_match)
        for cell_barcode in _final_results:
            for tag in _final_results[cell_barcode]:
                if tag in final_results[cell_barcode]:
                    # Counter + Counter = Counter
                    final_results[cell_barcode][tag] += _final_results[cell_barcode][tag]
                else:
                    # Explicitly save the counter to that tag
                    final_results[cell_barcode][tag] = _final_results[cell_barcode][tag]

    ordered_tags_map = OrderedDict()
    for i, tag in enumerate(ab_map.values()):
        ordered_tags_map[tag] = i
    ordered_tags_map["unmapped"] = i + 1

    # Correct cell barcodes
    if args.bc_threshold > 0:
        if len(umis_per_cell) <= args.expected_cells:
            print(
                "Number of expected cells, {}, is higher "
                "than number of cells found {}.\nNot performing "
                "cell barcode correction"
                "".format(args.expected_cells, len(umis_per_cell))
            )
            bcs_corrected = 0
        else:
            print("Correcting cell barcodes")
            if not whitelist:
                (
                    final_results,
                    umis_per_cell,
                    bcs_corrected,
                ) = processing.correct_cells(
                    final_results=final_results,
                    reads_per_cell=reads_per_cell,
                    umis_per_cell=umis_per_cell,
                    expected_cells=args.expected_cells,
                    collapsing_threshold=args.bc_threshold,
                    ab_map=ordered_tags_map,
                )
            else:
                (
                    final_results,
                    umis_per_cell,
                    bcs_corrected,
                ) = processing.correct_cells_whitelist(
                    final_results=final_results,
                    umis_per_cell=umis_per_cell,
                    whitelist=whitelist,
                    collapsing_threshold=args.bc_threshold,
                    ab_map=ordered_tags_map,
                )
    else:
        bcs_corrected = 0

    # If given, use whitelist for top cells
    if whitelist:
        top_cells = whitelist
        # Add potential missing cell barcodes.
        for missing_cell in whitelist:
            if missing_cell in final_results:
                continue
            else:
                final_results[missing_cell] = dict()
                for TAG in ordered_tags_map:
                    final_results[missing_cell][TAG] = Counter()
                top_cells.add(missing_cell)
    else:
        # Select top cells based on total umis per cell
        top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
        top_cells = set([pair[0] for pair in top_cells_tuple])

    # UMI correction
    if args.no_umi_correction:
        # Don't correct
        umis_corrected = 0
        aberrant_cells = set()
    else:
        # Correct UMIS
        (final_results, umis_corrected, aberrant_cells) = processing.correct_umis(
            final_results=final_results,
            collapsing_threshold=args.umi_threshold,
            top_cells=top_cells,
            max_umis=20000,
        )

    # Remove aberrant cells from the top cells
    for cell_barcode in aberrant_cells:
        top_cells.remove(cell_barcode)

    # Create sparse aberrant cells matrix
    (umi_aberrant_matrix, read_aberrant_matrix) = processing.generate_sparse_matrices(
        final_results=final_results,
        ordered_tags_map=ordered_tags_map,
        top_cells=aberrant_cells,
    )

    # Write uncorrected cells to dense output
    io.write_dense(
        sparse_matrix=umi_aberrant_matrix,
        index=list(ordered_tags_map.keys()),
        columns=aberrant_cells,
        outfolder=os.path.join(args.outfolder, "uncorrected_cells"),
        filename="dense_umis.tsv",
    )

    # Create sparse matrices for results
    (umi_results_matrix, read_results_matrix) = processing.generate_sparse_matrices(
        final_results=final_results,
        ordered_tags_map=ordered_tags_map,
        top_cells=top_cells,
    )

    # Write umis to file
    io.write_to_files(
        sparse_matrix=umi_results_matrix,
        top_cells=top_cells,
        ordered_tags_map=ordered_tags_map,
        data_type="umi",
        outfolder=args.outfolder,
    )

    # Write reads to file
    io.write_to_files(
        sparse_matrix=read_results_matrix,
        top_cells=top_cells,
        ordered_tags_map=ordered_tags_map,
        data_type="read",
        outfolder=args.outfolder,
    )

    # Write unmapped sequences
    io.write_unmapped(
        merged_no_match=merged_no_match,
        top_unknowns=args.unknowns_top,
        outfolder=args.outfolder,
        filename=args.unmapped_file,
    )

    # Create report and write it to disk
    create_report(
        n_reads=n_reads,
        reads_per_cell=reads_per_cell,
        no_match=merged_no_match,
        version=version,
        start_time=start_time,
        ordered_tags_map=ordered_tags_map,
        umis_corrected=umis_corrected,
        bcs_corrected=bcs_corrected,
        bad_cells=aberrant_cells,
        args=args,
    )

    # Write dense matrix to disk if requested
    if args.dense:
        print("Writing dense format output")
        io.write_dense(
            sparse_matrix=umi_results_matrix,
            index=list(ordered_tags_map.keys()),
            columns=top_cells,
            outfolder=args.outfolder,
            filename="dense_umis.tsv",
        )
def main():
    # Create logger and stream handler
    logger = logging.getLogger('cite_seq_count')
    logger.setLevel(logging.CRITICAL)
    ch = logging.StreamHandler()
    ch.setLevel(logging.CRITICAL)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    start_time = time.time()
    parser = get_args()
    if not sys.argv[1:]:
        parser.print_help(file=sys.stderr)
        sys.exit(2)

    # Parse arguments.
    args = parser.parse_args()
    if args.whitelist:
        (whitelist, args.bc_threshold) = preprocessing.parse_whitelist_csv(
            filename=args.whitelist,
            barcode_length=args.cb_last - args.cb_first + 1,
            collapsing_threshold=args.bc_threshold)
    else:
        whitelist = False

    # Load TAGs/ABs.
    ab_map = preprocessing.parse_tags_csv(args.tags)
    ab_map = preprocessing.check_tags(ab_map, args.max_error)

    # Get reads length. So far, there is no validation for Read2.
    read1_length = preprocessing.get_read_length(args.read1_path)
    read2_length = preprocessing.get_read_length(args.read2_path)

    # Check Read1 length against CELL and UMI barcodes length.
    (barcode_slice, umi_slice,
     barcode_umi_length) = preprocessing.check_barcodes_lengths(
         read1_length, args.cb_first, args.cb_last, args.umi_first,
         args.umi_last)

    if args.first_n:
        n_lines = args.first_n * 4
    else:
        n_lines = preprocessing.get_n_lines(args.read1_path)
    n_reads = int(n_lines / 4)
    n_threads = args.n_threads

    print('Started mapping')
    print('Processing {:,} reads'.format(n_reads))

    # Run with one process
    if n_threads <= 1 or n_reads < 1000001:
        print('CITE-seq-Count is running with one core.')
        (final_results, merged_no_match) = processing.map_reads(
            read1_path=args.read1_path,
            read2_path=args.read2_path,
            tags=ab_map,
            barcode_slice=barcode_slice,
            umi_slice=umi_slice,
            indexes=[0, n_reads],
            whitelist=whitelist,
            debug=args.debug,
            start_trim=args.start_trim,
            maximum_distance=args.max_error,
            sliding_window=args.sliding_window)
        print('Mapping done')
        umis_per_cell = Counter()
        reads_per_cell = Counter()
        for cell_barcode, counts in final_results.items():
            umis_per_cell[cell_barcode] = sum(
                [len(counts[UMI]) for UMI in counts])
            reads_per_cell[cell_barcode] = sum(
                [sum(counts[UMI].values()) for UMI in counts])
    else:
        # Run with multiple processes
        print('CITE-seq-Count is running with {} cores.'.format(n_threads))
        p = Pool(processes=n_threads)
        chunk_indexes = preprocessing.chunk_reads(n_reads, n_threads)
        parallel_results = []
        for indexes in chunk_indexes:
            p.apply_async(
                processing.map_reads,
                args=(args.read1_path, args.read2_path, ab_map, barcode_slice,
                      umi_slice, indexes, whitelist, args.debug,
                      args.start_trim, args.max_error, args.sliding_window),
                callback=parallel_results.append,
                error_callback=sys.stderr)
        p.close()
        p.join()
        print('Mapping done')
        print('Merging results')
        (final_results, umis_per_cell, reads_per_cell,
         merged_no_match) = processing.merge_results(
             parallel_results=parallel_results)
        del parallel_results

    ordered_tags_map = OrderedDict()
    for i, tag in enumerate(ab_map.values()):
        ordered_tags_map[tag] = i
    ordered_tags_map['unmapped'] = i + 1

    # Correct cell barcodes
    if len(umis_per_cell) <= args.expected_cells:
        print("Number of expected cells, {}, is higher "
              "than number of cells found {}.\nNot performing "
              "cell barcode correction"
              "".format(args.expected_cells, len(umis_per_cell)))
        bcs_corrected = 0
    else:
        print('Correcting cell barcodes')
        if not whitelist:
            (final_results, umis_per_cell,
             bcs_corrected) = processing.correct_cells(
                 final_results=final_results,
                 umis_per_cell=umis_per_cell,
                 expected_cells=args.expected_cells,
                 collapsing_threshold=args.bc_threshold)
        else:
            (final_results, umis_per_cell,
             bcs_corrected) = processing.correct_cells_whitelist(
                 final_results=final_results,
                 umis_per_cell=umis_per_cell,
                 whitelist=whitelist,
                 collapsing_threshold=args.bc_threshold)

    # Correct umi barcodes
    if not whitelist:
        # Sort cells by number of mapped umis
        top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
        top_cells = set([pair[0] for pair in top_cells_tuple])
    else:
        top_cells = whitelist
        # Add potential missing cell barcodes.
        for missing_cell in whitelist:
            if missing_cell in final_results:
                continue
            else:
                final_results[missing_cell] = dict()
                for TAG in ordered_tags_map:
                    final_results[missing_cell][TAG] = Counter()
                top_cells.add(missing_cell)

    # If we want umi correction
    if not args.no_umi_correction:
        (final_results, umis_corrected,
         aberrant_cells) = processing.correct_umis(
             final_results=final_results,
             collapsing_threshold=args.umi_threshold,
             top_cells=top_cells,
             max_umis=20000)
    else:
        umis_corrected = 0
        aberrant_cells = set()

    for cell_barcode in aberrant_cells:
        top_cells.remove(cell_barcode)

    # Create sparse aberrant cells matrix
    (umi_aberrant_matrix,
     read_aberrant_matrix) = processing.generate_sparse_matrices(
         final_results=final_results,
         ordered_tags_map=ordered_tags_map,
         top_cells=aberrant_cells)

    # Write uncorrected cells to dense output
    io.write_dense(
        sparse_matrix=umi_aberrant_matrix,
        index=list(ordered_tags_map.keys()),
        columns=aberrant_cells,
        outfolder=os.path.join(args.outfolder, 'uncorrected_cells'),
        filename='dense_umis.tsv')

    (umi_results_matrix,
     read_results_matrix) = processing.generate_sparse_matrices(
         final_results=final_results,
         ordered_tags_map=ordered_tags_map,
         top_cells=top_cells)

    # Write umis to file
    io.write_to_files(
        sparse_matrix=umi_results_matrix,
        top_cells=top_cells,
        ordered_tags_map=ordered_tags_map,
        data_type='umi',
        outfolder=args.outfolder)

    # Write reads to file
    io.write_to_files(
        sparse_matrix=read_results_matrix,
        top_cells=top_cells,
        ordered_tags_map=ordered_tags_map,
        data_type='read',
        outfolder=args.outfolder)

    # Write the most common unmapped sequences to file
    top_unmapped = merged_no_match.most_common(args.unknowns_top)
    with open(os.path.join(args.outfolder, args.unmapped_file),
              'w') as unknown_file:
        unknown_file.write('tag,count\n')
        for element in top_unmapped:
            unknown_file.write('{},{}\n'.format(element[0], element[1]))

    # Create the run report
    create_report(
        n_reads=n_reads,
        reads_per_cell=reads_per_cell,
        no_match=merged_no_match,
        version=version,
        start_time=start_time,
        ordered_tags_map=ordered_tags_map,
        umis_corrected=umis_corrected,
        bcs_corrected=bcs_corrected,
        bad_cells=aberrant_cells,
        args=args)

    if args.dense:
        print('Writing dense format output')
        io.write_dense(
            sparse_matrix=umi_results_matrix,
            index=list(ordered_tags_map.keys()),
            columns=top_cells,
            outfolder=args.outfolder,
            filename='dense_umis.tsv')
def main():
    start_time = time.time()
    parser = get_args()
    if not sys.argv[1:]:
        parser.print_help(file=sys.stderr)
        sys.exit(2)

    # Parse arguments.
    args = parser.parse_args()
    if args.whitelist:
        whitelist = preprocessing.parse_whitelist_csv(
            args.whitelist, args.cb_last - args.cb_first + 1)
    else:
        whitelist = None

    # Load TAGs/ABs.
    ab_map = preprocessing.parse_tags_csv(args.tags)
    ab_map = preprocessing.check_tags(ab_map, args.max_error)

    # Get reads length. So far, there is no validation for Read2.
    read1_length = preprocessing.get_read_length(args.read1_path)
    read2_length = preprocessing.get_read_length(args.read2_path)

    # Check Read1 length against CELL and UMI barcodes length.
    (barcode_slice, umi_slice,
     barcode_umi_length) = preprocessing.check_barcodes_lengths(
         read1_length, args.cb_first, args.cb_last, args.umi_first,
         args.umi_last)

    if args.first_n:
        n_lines = args.first_n * 4
    else:
        n_lines = preprocessing.get_n_lines(args.read1_path)
    n_reads = int(n_lines / 4)
    n_threads = args.n_threads

    print('Started mapping')

    # Run with one process
    if n_threads <= 1 or n_reads < 1000001:
        print('CITE-seq-Count is running with one core.')
        (final_results, merged_no_match) = processing.map_reads(
            read1_path=args.read1_path,
            read2_path=args.read2_path,
            tags=ab_map,
            barcode_slice=barcode_slice,
            umi_slice=umi_slice,
            indexes=[0, n_reads],
            whitelist=whitelist,
            debug=args.debug,
            start_trim=args.start_trim,
            maximum_distance=args.max_error)
        print('Mapping done')
        umis_per_cell = Counter()
        reads_per_cell = Counter()
        for cell_barcode, counts in final_results.items():
            umis_per_cell[cell_barcode] = sum(
                [len(counts[UMI]) for UMI in counts if UMI != 'unmapped'])
            reads_per_cell[cell_barcode] = sum([
                sum(counts[UMI].values()) for UMI in counts
                if UMI != 'unmapped'
            ])
    else:
        # Run with multiple processes
        print('CITE-seq-Count is running with {} cores.'.format(n_threads))
        p = Pool(processes=n_threads)
        chunk_indexes = preprocessing.chunk_reads(n_reads, n_threads)
        parallel_results = []
        for indexes in chunk_indexes:
            p.apply_async(
                processing.map_reads,
                args=(args.read1_path, args.read2_path, ab_map, barcode_slice,
                      umi_slice, indexes, whitelist, args.debug,
                      args.start_trim, args.max_error),
                callback=parallel_results.append,
                error_callback=sys.stderr)
        p.close()
        p.join()
        print('Mapping done')
        print('Merging results')
        (final_results, umis_per_cell, reads_per_cell,
         merged_no_match) = processing.merge_results(
             parallel_results=parallel_results)
        del parallel_results

    # Correct cell barcodes
    (final_results, umis_per_cell, bcs_corrected) = processing.correct_cells(
        final_results=final_results,
        umis_per_cell=umis_per_cell,
        expected_cells=args.expected_cells,
        collapsing_threshold=args.bc_threshold)

    # Correct umi barcodes
    (final_results, umis_corrected) = processing.correct_umis(
        final_results=final_results,
        collapsing_threshold=args.umi_threshold)

    ordered_tags_map = OrderedDict()
    for i, tag in enumerate(ab_map.values()):
        ordered_tags_map[tag] = i
    ordered_tags_map['unmapped'] = i + 1

    # Sort cells by number of mapped umis
    if not whitelist:
        top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
        top_cells = set([pair[0] for pair in top_cells_tuple])
    else:
        top_cells = whitelist
        # Add potential missing cell barcodes.
        for missing_cell in whitelist:
            if missing_cell in final_results:
                continue
            else:
                final_results[missing_cell] = dict()
                for TAG in ordered_tags_map:
                    final_results[missing_cell][TAG] = 0
                top_cells.add(missing_cell)

    (umi_results_matrix,
     read_results_matrix) = processing.generate_sparse_matrices(
         final_results=final_results,
         ordered_tags_map=ordered_tags_map,
         top_cells=top_cells)
    io.write_to_files(
        sparse_matrix=umi_results_matrix,
        top_cells=top_cells,
        ordered_tags_map=ordered_tags_map,
        data_type='umi',
        outfolder=args.outfolder)
    io.write_to_files(
        sparse_matrix=read_results_matrix,
        top_cells=top_cells,
        ordered_tags_map=ordered_tags_map,
        data_type='read',
        outfolder=args.outfolder)

    top_unmapped = merged_no_match.most_common(args.unknowns_top)
    with open(os.path.join(args.outfolder, args.unmapped_file),
              'w') as unknown_file:
        unknown_file.write('tag,count\n')
        for element in top_unmapped:
            unknown_file.write('{},{}\n'.format(element[0], element[1]))

    create_report(
        n_reads=n_reads,
        reads_per_cell=reads_per_cell,
        no_match=merged_no_match,
        version=version,
        start_time=start_time,
        ordered_tags_map=ordered_tags_map,
        umis_corrected=umis_corrected,
        bcs_corrected=bcs_corrected,
        args=args)

    if args.dense:
        print('Writing dense format output')
        io.write_dense(
            sparse_matrix=umi_results_matrix,
            index=list(ordered_tags_map.keys()),
            columns=top_cells,
            file_path=os.path.join(args.outfolder, 'dense_umis.tsv'))