def create_report(n_reads, reads_per_cell, no_match, version, start_time,
                  ordered_tags_map, umis_corrected, bcs_corrected, bad_cells,
                  args):
    """
    Creates a report with details about the run in a yaml format.

    The report is written to ``run_report.yaml`` inside ``args.outfolder``.

    Args:
        n_reads (int): Number of reads that have been processed. When it is
            0, both percentages are reported as 0 instead of raising
            ``ZeroDivisionError``.
        reads_per_cell (mapping): Read counts whose values are summed to get
            the total number of reads seen (presumably keyed by cell
            barcode -- confirm against the caller).
        no_match (Counter): Counter of unmapped tags.
        version (string): CITE-seq-Count package version.
        start_time (float): Start time of the run, on the same clock as
            ``time.time()``.
        ordered_tags_map: Not used by this function; kept in the signature
            so existing callers keep working.
        umis_corrected: Value written as "UMIs corrected".
        bcs_corrected: Value written as "Cell barcodes corrected".
        bad_cells: Sized collection; its length is written as
            "Uncorrected cells".
        args (arg_parse): Arguments provided by the user.
    """
    total_unmapped = sum(no_match.values())
    total_mapped = sum(reads_per_cell.values()) - total_unmapped

    # Guard against an empty run: with no reads there is nothing to divide by.
    if n_reads:
        mapped_perc = round((total_mapped / n_reads) * 100)
        unmapped_perc = round((total_unmapped / n_reads) * 100)
    else:
        mapped_perc = 0
        unmapped_perc = 0

    # One entry per output line; nested keys are indented with tabs.
    template = "\n".join([
        "Date: {}",
        "Running time: {}",
        "CITE-seq-Count Version: {}",
        "Reads processed: {}",
        "Percentage mapped: {}",
        "Percentage unmapped: {}",
        "Uncorrected cells: {}",
        "Correction:",
        "\tCell barcodes collapsing threshold: {}",
        "\tCell barcodes corrected: {}",
        "\tUMI collapsing threshold: {}",
        "\tUMIs corrected: {}",
        "Run parameters:",
        "\tRead1_filename: {}",
        "\tRead2_filename: {}",
        "\tCell barcode:",
        "\t\tFirst position: {}",
        "\t\tLast position: {}",
        "\tUMI barcode:",
        "\t\tFirst position: {}",
        "\t\tLast position: {}",
        "\tExpected cells: {}",
        "\tTags max errors: {}",
        "\tStart trim: {}",
        "",  # trailing newline at the end of the report
    ])

    # Build the full text first so a formatting error cannot leave behind an
    # empty or truncated report file.
    report_text = template.format(
        datetime.datetime.today().strftime('%Y-%m-%d'),
        secondsToText.secondsToText(time.time() - start_time),
        version,
        n_reads,
        mapped_perc,
        unmapped_perc,
        len(bad_cells),
        args.bc_threshold,
        bcs_corrected,
        args.umi_threshold,
        umis_corrected,
        args.read1_path,
        args.read2_path,
        args.cb_first,
        args.cb_last,
        args.umi_first,
        args.umi_last,
        args.expected_cells,
        args.max_error,
        args.start_trim)

    with open(os.path.join(args.outfolder, 'run_report.yaml'), 'w') as report_file:
        report_file.write(report_text)
def map_reads(read1_path, read2_path, tags, barcode_slice, umi_slice, indexes,
              whitelist, debug, start_trim, maximum_distance):
    """Map a window of paired R1/R2 reads to their best matching tag.

    Both gzipped files are read in lockstep. Starting at line
    ``indexes[0] * 4 + 1`` and stopping before line ``indexes[1] * 4 + 1``,
    every fourth line pair is taken (the second line of each 4-line record).
    The cell barcode and UMI are sliced from the R1 line, the tag sequence is
    the R2 line with ``start_trim`` leading characters removed.

    Args:
        read1_path (string): Path to R1.fastq.gz
        read2_path (string): Path to R2.fastq.gz
        tags (dict): A dictionary with the TAGs + TAG Names; passed on to
            ``find_best_match``.
        barcode_slice (slice): A slice for extracting the Barcode portion
            from the sequence.
        umi_slice (slice): A slice for extracting the UMI portion from the
            sequence.
        indexes (list): Pair of first and last record index for islice.
        whitelist (set): The set of white-listed barcodes. Not used in this
            function.
        debug (bool): Print debug messages for every read.
        start_trim (int): Number of bases to trim at the start of R2.
        maximum_distance (int): Maximum distance given by the user; passed
            on to ``find_best_match``.

    Returns:
        results (dict): cell barcode -> best match -> Counter of UMIs
            (UMIs are stored as ascii ``bytes``).
        no_match (Counter): A counter with unmapped tag sequences.
    """
    results = {}
    no_match = Counter()
    processed = 0
    lap_start = time.time()

    with gzip.open(read1_path, 'rt') as r1_handle, \
            gzip.open(read2_path, 'rt') as r2_handle:
        # Line 1 of every 4-line record holds the sequence, hence the +1
        # offset and the step of 4.
        sequence_pairs = islice(
            zip(r1_handle, r2_handle),
            indexes[0] * 4 + 1,
            indexes[1] * 4 + 1,
            4)

        for processed, (raw_r1, raw_r2) in enumerate(sequence_pairs, start=1):
            read1 = raw_r1.strip()
            read2 = raw_r2.strip()

            # Progress info, once per million reads handled by this process.
            if processed % 1000000 == 0:
                message = ("Processed 1,000,000 reads in {}. Total "
                           "reads: {:,} in child {}")
                print(message.format(
                    secondsToText.secondsToText(time.time() - lap_start),
                    processed,
                    os.getpid()))
                sys.stdout.flush()
                lap_start = time.time()

            cell_barcode = read1[barcode_slice]
            # umi_tools needs the UMI as bytes for its correction step.
            UMI = bytes(read1[umi_slice], 'ascii')
            # Drop potential leading bases before matching.
            TAG_seq = read2[start_trim:]

            cell_entry = results.get(cell_barcode)
            if cell_entry is None:
                cell_entry = defaultdict(Counter)
                results[cell_barcode] = cell_entry

            best_match = find_best_match(TAG_seq, tags, maximum_distance)
            cell_entry[best_match][UMI] += 1

            if best_match == 'unmapped':
                no_match[TAG_seq] += 1

            if debug:
                joined = read1 + read2
                print(
                    "\nline:{0}\n"
                    "cell_barcode:{1}\tUMI:{2}\tTAG_seq:{3}\n"
                    "line length:{4}\tcell barcode length:{5}\tUMI length:{6}\tTAG sequence length:{7}\n"
                    "Best match is: {8}".format(
                        joined,
                        cell_barcode,
                        UMI,
                        TAG_seq,
                        len(joined),
                        len(cell_barcode),
                        len(UMI),
                        len(TAG_seq),
                        best_match))
                sys.stdout.flush()

    print("Mapping done for process {}. Processed {:,} reads".format(
        os.getpid(), processed))
    sys.stdout.flush()
    return (results, no_match)