Exemple #1
0
def create_report(n_reads, reads_per_cell, no_match, version, start_time,
                  ordered_tags_map, umis_corrected, bcs_corrected, bad_cells,
                  args):
    """
    Write a summary of the run to ``run_report.yaml`` inside ``args.outfolder``.

    The mapped/unmapped percentages are rounded to whole numbers. When
    ``n_reads`` is 0 both percentages are reported as 0 instead of raising
    ``ZeroDivisionError``.

    Args:
        n_reads (int): Number of reads that have been processed.
        reads_per_cell (mapping): Read counts keyed by cell; the sum of its
            values is used as the total number of counted reads.
        no_match (Counter): Counter of unmapped tags.
        version (string): CITE-seq-Count package version.
        start_time (float): Start time of the run, on the same clock as
            ``time.time()``.
        ordered_tags_map: Not used by this function; kept so existing callers
            keep working.
        umis_corrected: Value written under "UMIs corrected".
        bcs_corrected: Value written under "Cell barcodes corrected".
        bad_cells (sized collection): Its length is written under
            "Uncorrected cells".
        args (arg_parse): Arguments provided by the user. The attributes read
            are ``outfolder``, ``bc_threshold``, ``umi_threshold``,
            ``read1_path``, ``read2_path``, ``cb_first``, ``cb_last``,
            ``umi_first``, ``umi_last``, ``expected_cells``, ``max_error``
            and ``start_trim``.

    """
    total_unmapped = sum(no_match.values())
    total_mapped = sum(reads_per_cell.values()) - total_unmapped

    if n_reads:
        mapped_perc = round((total_mapped / n_reads) * 100)
        unmapped_perc = round((total_unmapped / n_reads) * 100)
    else:
        # An empty run has nothing to map; avoid dividing by zero.
        mapped_perc = 0
        unmapped_perc = 0

    # NOTE: nested keys are indented with tab characters, which the YAML
    # specification does not allow for indentation. The layout is kept
    # unchanged so that existing consumers of the report see the same bytes.
    report_template = """Date: {}
Running time: {}
CITE-seq-Count Version: {}
Reads processed: {}
Percentage mapped: {}
Percentage unmapped: {}
Uncorrected cells: {}
Correction:
\tCell barcodes collapsing threshold: {}
\tCell barcodes corrected: {}
\tUMI collapsing threshold: {}
\tUMIs corrected: {}
Run parameters:
\tRead1_filename: {}
\tRead2_filename: {}
\tCell barcode:
\t\tFirst position: {}
\t\tLast position: {}
\tUMI barcode:
\t\tFirst position: {}
\t\tLast position: {}
\tExpected cells: {}
\tTags max errors: {}
\tStart trim: {}
"""

    # Values are listed in the same order as the placeholders above.
    report_values = (
        datetime.datetime.today().strftime('%Y-%m-%d'),
        secondsToText.secondsToText(time.time() - start_time),
        version,
        n_reads,
        mapped_perc,
        unmapped_perc,
        len(bad_cells),
        args.bc_threshold,
        bcs_corrected,
        args.umi_threshold,
        umis_corrected,
        args.read1_path,
        args.read2_path,
        args.cb_first,
        args.cb_last,
        args.umi_first,
        args.umi_last,
        args.expected_cells,
        args.max_error,
        args.start_trim,
    )

    with open(os.path.join(args.outfolder, 'run_report.yaml'),
              'w') as report_file:
        report_file.write(report_template.format(*report_values))
def map_reads(read1_path, read2_path, tags, barcode_slice, umi_slice, indexes,
              whitelist, debug, start_trim, maximum_distance):
    """Map one window of paired R1/R2 reads to their best matching TAG.

    Both gzipped files are read in lockstep. Only the second line of every
    4-line chunk is used, restricted to the read window given by ``indexes``.
    The cell barcode and UMI are sliced out of the R1 line, the (trimmed) R2
    line is passed to ``find_best_match``, and the outcome is counted per
    cell barcode, per match and per UMI.

    Args:
        read1_path (string): Path to R1.fastq.gz
        read2_path (string): Path to R2.fastq.gz
        tags (dict): A dictionary with the TAGs + TAG Names.
        barcode_slice (slice): A slice for extracting the Barcode portion
            from the R1 sequence.
        umi_slice (slice): A slice for extracting the UMI portion from the
            R1 sequence.
        indexes (list): Pair of first and last read index for islice; the
            last index is exclusive.
        whitelist (set): The set of white-listed barcodes. Not consulted by
            this function.
        debug (bool): Print a detailed message for every read.
        start_trim (int): Number of bases to trim at the start of the R2
            sequence.
        maximum_distance (int): Maximum distance given by the user, passed
            on to ``find_best_match``.

    Returns:
        results (dict): Cell barcode -> best match -> Counter of UMIs (as
            ASCII bytes).
        no_match (Counter): Counts of trimmed R2 sequences whose best match
            was 'unmapped'.
    """
    results = {}
    no_match = Counter()
    processed = 0
    lap_start = time.time()

    with gzip.open(read1_path, 'rt') as textfile1, \
         gzip.open(read2_path, 'rt') as textfile2:

        # The wanted line is at offset 1 inside each 4-line chunk, so the
        # window [indexes[0], indexes[1]) of reads maps to these file lines.
        first_line = indexes[0] * 4 + 1
        stop_line = indexes[1] * 4 + 1
        paired_lines = zip(textfile1, textfile2)

        for raw1, raw2 in islice(paired_lines, first_line, stop_line, 4):
            processed += 1
            read1 = raw1.strip()
            read2 = raw2.strip()

            # Report progress once per million reads, timing each lap.
            if processed % 1000000 == 0:
                elapsed = secondsToText.secondsToText(time.time() - lap_start)
                print("Processed 1,000,000 reads in {}. Total "
                      "reads: {:,} in child {}".format(
                          elapsed, processed, os.getpid()),
                      flush=True)
                lap_start = time.time()

            cell_barcode = read1[barcode_slice]
            # umi_tools expects UMIs as bytes for its correction step.
            UMI = read1[umi_slice].encode('ascii')
            # Drop potential leading bases before matching.
            TAG_seq = read2[start_trim:]

            per_cell = results.get(cell_barcode)
            if per_cell is None:
                per_cell = defaultdict(Counter)
                results[cell_barcode] = per_cell

            best_match = find_best_match(TAG_seq, tags, maximum_distance)
            per_cell[best_match][UMI] += 1

            if best_match == 'unmapped':
                no_match[TAG_seq] += 1

            if debug:
                full_line = read1 + read2
                print(
                    "\nline:{0}\n"
                    "cell_barcode:{1}\tUMI:{2}\tTAG_seq:{3}\n"
                    "line length:{4}\tcell barcode length:{5}\tUMI length:{6}\tTAG sequence length:{7}\n"
                    "Best match is: {8}".format(
                        full_line, cell_barcode, UMI, TAG_seq,
                        len(full_line), len(cell_barcode), len(UMI),
                        len(TAG_seq), best_match),
                    flush=True)

    print("Mapping done for process {}. Processed {:,} reads".format(
        os.getpid(), processed), flush=True)
    return (results, no_match)