Example #1
def test_write_to_files(data, tmpdir):
    import gzip
    import scipy.io
    import scipy.sparse
    io.write_to_files(pytest.sparse_matrix, pytest.top_cells,
                      pytest.ordered_tags_map, pytest.data_type, tmpdir)
    file = tmpdir.join('umi_count/matrix.mtx.gz')
    with gzip.open(file, 'rb') as mtx_file:
        assert isinstance(scipy.io.mmread(mtx_file),
                          scipy.sparse.coo_matrix)
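The assertion above only verifies that matrix.mtx.gz parses back into a COO sparse matrix. A minimal follow-up sketch of how the written folder could be loaded back for inspection; the barcodes.tsv.gz and features.tsv.gz names are an assumption based on a 10x-style output layout, not something the test asserts.

def load_umi_counts(outfolder):
    # Sketch only: the companion file names below are assumptions, not taken
    # from the test above.
    import gzip
    import os
    import scipy.io
    import scipy.sparse

    with gzip.open(os.path.join(outfolder, 'umi_count/matrix.mtx.gz'), 'rb') as handle:
        matrix = scipy.io.mmread(handle).tocsr()  # rows: TAGs, columns: cells
    with gzip.open(os.path.join(outfolder, 'umi_count/barcodes.tsv.gz'), 'rt') as handle:
        barcodes = [line.strip() for line in handle]
    with gzip.open(os.path.join(outfolder, 'umi_count/features.tsv.gz'), 'rt') as handle:
        features = [line.strip() for line in handle]
    return matrix, features, barcodes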
Example #2
def main():
    # Create logger and stream handler
    logger = logging.getLogger("cite_seq_count")
    logger.setLevel(logging.CRITICAL)
    ch = logging.StreamHandler()
    ch.setLevel(logging.CRITICAL)
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    start_time = time.time()
    parser = get_args()
    if not sys.argv[1:]:
        parser.print_help(file=sys.stderr)
        sys.exit(2)

    # Parse arguments.
    args = parser.parse_args()
    if args.whitelist:
        print("Loading whitelist")
        (whitelist, args.bc_threshold) = preprocessing.parse_whitelist_csv(
            filename=args.whitelist,
            barcode_length=args.cb_last - args.cb_first + 1,
            collapsing_threshold=args.bc_threshold,
        )
    else:
        whitelist = False

    # Load TAGs/ABs.
    ab_map = preprocessing.parse_tags_csv(args.tags)
    ab_map = preprocessing.check_tags(ab_map, args.max_error)

    # Identify input file(s)
    read1_paths, read2_paths = preprocessing.get_read_paths(
        args.read1_path, args.read2_path
    )

    # Preprocessing and processing run in separate loops so the program can
    # fail early if one of the inputs is not valid.
    read1_lengths = []
    read2_lengths = []
    for read1_path, read2_path in zip(read1_paths, read2_paths):
        # Get read lengths. So far, there is no validation for Read2.
        read1_lengths.append(preprocessing.get_read_length(read1_path))
        read2_lengths.append(preprocessing.get_read_length(read2_path))
        # Check Read1 length against CELL and UMI barcode lengths.
        (
            barcode_slice,
            umi_slice,
            barcode_umi_length,
        ) = preprocessing.check_barcodes_lengths(
            read1_lengths[-1],
            args.cb_first,
            args.cb_last,
            args.umi_first,
            args.umi_last,
        )
    # Ensure all files have the same read length (check currently disabled).
    # if len(set(read1_lengths)) != 1:
    #    sys.exit('Input barcode fastqs (read1) do not all have same length.\nExiting')

    # Initialize the counts dicts that will be generated from each input fastq pair
    final_results = defaultdict(lambda: defaultdict(Counter))
    umis_per_cell = Counter()
    reads_per_cell = Counter()
    merged_no_match = Counter()
    number_of_samples = len(read1_paths)
    n_reads = 0

    # Print a statement if multiple files are run.
    if number_of_samples != 1:
        print("Detected {} files to run on.".format(number_of_samples))

    for read1_path, read2_path in zip(read1_paths, read2_paths):
        if args.first_n:
            n_lines = (args.first_n * 4) // number_of_samples
        else:
            n_lines = preprocessing.get_n_lines(read1_path)
        n_reads += int(n_lines / 4)
        n_threads = args.n_threads
        print("Started mapping")
        print("Processing {:,} reads".format(n_reads))
        # Run with one process
        if n_threads <= 1 or n_reads < 1000001:
            print("CITE-seq-Count is running with one core.")
            (_final_results, _merged_no_match) = processing.map_reads(
                read1_path=read1_path,
                read2_path=read2_path,
                tags=ab_map,
                barcode_slice=barcode_slice,
                umi_slice=umi_slice,
                indexes=[0, n_reads],
                whitelist=whitelist,
                debug=args.debug,
                start_trim=args.start_trim,
                maximum_distance=args.max_error,
                sliding_window=args.sliding_window,
            )
            print("Mapping done")
            _umis_per_cell = Counter()
            _reads_per_cell = Counter()
            for cell_barcode, counts in _final_results.items():
                _umis_per_cell[cell_barcode] = sum([len(counts[UMI]) for UMI in counts])
                _reads_per_cell[cell_barcode] = sum(
                    [sum(counts[UMI].values()) for UMI in counts]
                )
        else:
            # Run with multiple processes
            print("CITE-seq-Count is running with {} cores.".format(n_threads))
            p = Pool(processes=n_threads)
            chunk_indexes = preprocessing.chunk_reads(n_reads, n_threads)
            parallel_results = []

            for indexes in chunk_indexes:
                p.apply_async(
                    processing.map_reads,
                    args=(
                        read1_path,
                        read2_path,
                        ab_map,
                        barcode_slice,
                        umi_slice,
                        indexes,
                        whitelist,
                        args.debug,
                        args.start_trim,
                        args.max_error,
                        args.sliding_window,
                    ),
                    callback=parallel_results.append,
                    error_callback=lambda err: print(err, file=sys.stderr),
                )
            p.close()
            p.join()
            print("Mapping done")
            print("Merging results")

            (
                _final_results,
                _umis_per_cell,
                _reads_per_cell,
                _merged_no_match,
            ) = processing.merge_results(parallel_results=parallel_results)
            del parallel_results

        # Update the overall counts dicts
        umis_per_cell.update(_umis_per_cell)
        reads_per_cell.update(_reads_per_cell)
        merged_no_match.update(_merged_no_match)
        for cell_barcode in _final_results:
            for tag in _final_results[cell_barcode]:
                if tag in final_results[cell_barcode]:
                    # Counter + Counter = Counter
                    final_results[cell_barcode][tag] += _final_results[cell_barcode][
                        tag
                    ]
                else:
                    # Explicitly save the counter to that tag
                    final_results[cell_barcode][tag] = _final_results[cell_barcode][tag]
    ordered_tags_map = OrderedDict()
    for i, tag in enumerate(ab_map.values()):
        ordered_tags_map[tag] = i
    ordered_tags_map["unmapped"] = i + 1

    # Correct cell barcodes
    if args.bc_threshold > 0:
        if len(umis_per_cell) <= args.expected_cells:
            print(
                "Number of expected cells, {}, is higher "
                "than number of cells found, {}.\nNot performing "
                "cell barcode correction."
                "".format(args.expected_cells, len(umis_per_cell))
            )
            bcs_corrected = 0
        else:
            print("Correcting cell barcodes")
            if not whitelist:
                (
                    final_results,
                    umis_per_cell,
                    bcs_corrected,
                ) = processing.correct_cells(
                    final_results=final_results,
                    reads_per_cell=reads_per_cell,
                    umis_per_cell=umis_per_cell,
                    expected_cells=args.expected_cells,
                    collapsing_threshold=args.bc_threshold,
                    ab_map=ordered_tags_map,
                )
            else:
                (
                    final_results,
                    umis_per_cell,
                    bcs_corrected,
                ) = processing.correct_cells_whitelist(
                    final_results=final_results,
                    umis_per_cell=umis_per_cell,
                    whitelist=whitelist,
                    collapsing_threshold=args.bc_threshold,
                    ab_map=ordered_tags_map,
                )
    else:
        bcs_corrected = 0

    # If given, use whitelist for top cells
    if whitelist:
        top_cells = whitelist
        # Add potential missing cell barcodes.
        for missing_cell in whitelist:
            if missing_cell in final_results:
                continue
            else:
                final_results[missing_cell] = dict()
                for TAG in ordered_tags_map:
                    final_results[missing_cell][TAG] = Counter()
                top_cells.add(missing_cell)
    else:
        # Select top cells based on total umis per cell
        top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
        top_cells = set([pair[0] for pair in top_cells_tuple])

    # UMI correction

    if args.no_umi_correction:
        # Don't correct
        umis_corrected = 0
        aberrant_cells = set()
    else:
        # Correct UMIS
        (final_results, umis_corrected, aberrant_cells) = processing.correct_umis(
            final_results=final_results,
            collapsing_threshold=args.umi_threshold,
            top_cells=top_cells,
            max_umis=20000,
        )

    # Remove aberrant cells from the top cells
    for cell_barcode in aberrant_cells:
        top_cells.remove(cell_barcode)

    # Create sparse aberrant cells matrix
    (umi_aberrant_matrix, read_aberrant_matrix) = processing.generate_sparse_matrices(
        final_results=final_results,
        ordered_tags_map=ordered_tags_map,
        top_cells=aberrant_cells,
    )

    # Write uncorrected cells to dense output
    io.write_dense(
        sparse_matrix=umi_aberrant_matrix,
        index=list(ordered_tags_map.keys()),
        columns=aberrant_cells,
        outfolder=os.path.join(args.outfolder, "uncorrected_cells"),
        filename="dense_umis.tsv",
    )

    # Create sparse matrices for results
    (umi_results_matrix, read_results_matrix) = processing.generate_sparse_matrices(
        final_results=final_results,
        ordered_tags_map=ordered_tags_map,
        top_cells=top_cells,
    )

    # Write umis to file
    io.write_to_files(
        sparse_matrix=umi_results_matrix,
        top_cells=top_cells,
        ordered_tags_map=ordered_tags_map,
        data_type="umi",
        outfolder=args.outfolder,
    )

    # Write reads to file
    io.write_to_files(
        sparse_matrix=read_results_matrix,
        top_cells=top_cells,
        ordered_tags_map=ordered_tags_map,
        data_type="read",
        outfolder=args.outfolder,
    )

    # Write unmapped sequences
    io.write_unmapped(
        merged_no_match=merged_no_match,
        top_unknowns=args.unknowns_top,
        outfolder=args.outfolder,
        filename=args.unmapped_file,
    )

    # Create report and write it to disk
    create_report(
        n_reads=n_reads,
        reads_per_cell=reads_per_cell,
        no_match=merged_no_match,
        version=version,
        start_time=start_time,
        ordered_tags_map=ordered_tags_map,
        umis_corrected=umis_corrected,
        bcs_corrected=bcs_corrected,
        bad_cells=aberrant_cells,
        args=args,
    )

    # Write dense matrix to disk if requested
    if args.dense:
        print("Writing dense format output")
        io.write_dense(
            sparse_matrix=umi_results_matrix,
            index=list(ordered_tags_map.keys()),
            columns=top_cells,
            outfolder=args.outfolder,
            filename="dense_umis.tsv",
        )
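In the multiprocessing branch above, each worker returns the same (final_results, merged_no_match) pair that map_reads returns in the single-core branch, and processing.merge_results folds the chunks together. A hypothetical sketch of that merge, reconstructed from the per-file merge loop above rather than taken from the CITE-seq-Count source:

from collections import Counter, defaultdict

def merge_chunk_results(parallel_results):
    # Illustrative only: mirrors the per-file merge loop in main(), not the
    # actual processing.merge_results implementation.
    final_results = defaultdict(lambda: defaultdict(Counter))
    merged_no_match = Counter()
    umis_per_cell = Counter()
    reads_per_cell = Counter()
    for chunk_results, chunk_no_match in parallel_results:
        merged_no_match.update(chunk_no_match)
        for cell_barcode, tags in chunk_results.items():
            for tag, umi_counts in tags.items():
                # Counter + Counter sums the read count of each UMI.
                final_results[cell_barcode][tag] += umi_counts
    # Recompute per-cell totals from the merged nested structure.
    for cell_barcode, tags in final_results.items():
        umis_per_cell[cell_barcode] = sum(len(umis) for umis in tags.values())
        reads_per_cell[cell_barcode] = sum(sum(umis.values()) for umis in tags.values())
    return final_results, umis_per_cell, reads_per_cell, merged_no_match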
Example #3
def main():
    # Create logger and stream handler
    logger = logging.getLogger('cite_seq_count')
    logger.setLevel(logging.CRITICAL)
    ch = logging.StreamHandler()
    ch.setLevel(logging.CRITICAL)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    start_time = time.time()
    parser = get_args()
    if not sys.argv[1:]:
        parser.print_help(file=sys.stderr)
        sys.exit(2)

    # Parse arguments.
    args = parser.parse_args()
    if args.whitelist:
        (whitelist, args.bc_threshold) = preprocessing.parse_whitelist_csv(
            filename=args.whitelist,
            barcode_length=args.cb_last - args.cb_first + 1,
            collapsing_threshold=args.bc_threshold)
    else:
        whitelist = False

    # Load TAGs/ABs.
    ab_map = preprocessing.parse_tags_csv(args.tags)
    ab_map = preprocessing.check_tags(ab_map, args.max_error)
    # Get read lengths. So far, there is no validation for Read2.
    read1_length = preprocessing.get_read_length(args.read1_path)
    read2_length = preprocessing.get_read_length(args.read2_path)
    # Check Read1 length against CELL and UMI barcode lengths.
    (barcode_slice, umi_slice,
     barcode_umi_length) = preprocessing.check_barcodes_lengths(
         read1_length, args.cb_first, args.cb_last, args.umi_first,
         args.umi_last)

    if args.first_n:
        n_lines = args.first_n * 4
    else:
        n_lines = preprocessing.get_n_lines(args.read1_path)
    n_reads = int(n_lines / 4)
    n_threads = args.n_threads
    print('Started mapping')
    print('Processing {:,} reads'.format(n_reads))
    # Run with one process
    if n_threads <= 1 or n_reads < 1000001:
        print('CITE-seq-Count is running with one core.')
        (final_results, merged_no_match) = processing.map_reads(
            read1_path=args.read1_path,
            read2_path=args.read2_path,
            tags=ab_map,
            barcode_slice=barcode_slice,
            umi_slice=umi_slice,
            indexes=[0, n_reads],
            whitelist=whitelist,
            debug=args.debug,
            start_trim=args.start_trim,
            maximum_distance=args.max_error,
            sliding_window=args.sliding_window)
        print('Mapping done')
        umis_per_cell = Counter()
        reads_per_cell = Counter()
        for cell_barcode, counts in final_results.items():
            umis_per_cell[cell_barcode] = sum(
                [len(counts[UMI]) for UMI in counts])
            reads_per_cell[cell_barcode] = sum(
                [sum(counts[UMI].values()) for UMI in counts])
    else:
        # Run with multiple processes
        print('CITE-seq-Count is running with {} cores.'.format(n_threads))
        p = Pool(processes=n_threads)
        chunk_indexes = preprocessing.chunk_reads(n_reads, n_threads)
        parallel_results = []

        for indexes in chunk_indexes:
            p.apply_async(processing.map_reads,
                          args=(args.read1_path, args.read2_path, ab_map,
                                barcode_slice, umi_slice, indexes, whitelist,
                                args.debug, args.start_trim, args.max_error,
                                args.sliding_window),
                          callback=parallel_results.append,
                          error_callback=lambda err: print(err, file=sys.stderr))
        p.close()
        p.join()
        print('Mapping done')
        print('Merging results')

        (final_results, umis_per_cell, reads_per_cell,
         merged_no_match) = processing.merge_results(
             parallel_results=parallel_results)
        del parallel_results

    ordered_tags_map = OrderedDict()
    for i, tag in enumerate(ab_map.values()):
        ordered_tags_map[tag] = i
    ordered_tags_map['unmapped'] = i + 1

    # Correct cell barcodes
    if len(umis_per_cell) <= args.expected_cells:
        print("Number of expected cells, {}, is higher "
              "than number of cells found, {}.\nNot performing "
              "cell barcode correction."
              "".format(args.expected_cells, len(umis_per_cell)))
        bcs_corrected = 0
    else:
        print('Correcting cell barcodes')
        if not whitelist:
            (final_results, umis_per_cell,
             bcs_corrected) = processing.correct_cells(
                 final_results=final_results,
                 umis_per_cell=umis_per_cell,
                 expected_cells=args.expected_cells,
                 collapsing_threshold=args.bc_threshold)
        else:
            (final_results, umis_per_cell,
             bcs_corrected) = processing.correct_cells_whitelist(
                 final_results=final_results,
                 umis_per_cell=umis_per_cell,
                 whitelist=whitelist,
                 collapsing_threshold=args.bc_threshold)

    # Select top cells, either by number of mapped umis or from the whitelist
    if not whitelist:
        top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
        top_cells = set([pair[0] for pair in top_cells_tuple])
    else:
        top_cells = whitelist
        # Add potential missing cell barcodes.
        for missing_cell in whitelist:
            if missing_cell in final_results:
                continue
            else:
                final_results[missing_cell] = dict()
                for TAG in ordered_tags_map:
                    final_results[missing_cell][TAG] = Counter()
                top_cells.add(missing_cell)
    # Correct UMIs unless correction is disabled
    if not args.no_umi_correction:
        (final_results, umis_corrected,
         aberrant_cells) = processing.correct_umis(
             final_results=final_results,
             collapsing_threshold=args.umi_threshold,
             top_cells=top_cells,
             max_umis=20000)
    else:
        umis_corrected = 0
        aberrant_cells = set()
    for cell_barcode in aberrant_cells:
        top_cells.remove(cell_barcode)
    # Create sparse aberrant cells matrix
    (umi_aberrant_matrix,
     read_aberrant_matrix) = processing.generate_sparse_matrices(
         final_results=final_results,
         ordered_tags_map=ordered_tags_map,
         top_cells=aberrant_cells)

    # Write uncorrected cells to dense output
    io.write_dense(sparse_matrix=umi_aberrant_matrix,
                   index=list(ordered_tags_map.keys()),
                   columns=aberrant_cells,
                   outfolder=os.path.join(args.outfolder, 'uncorrected_cells'),
                   filename='dense_umis.tsv')

    (umi_results_matrix,
     read_results_matrix) = processing.generate_sparse_matrices(
         final_results=final_results,
         ordered_tags_map=ordered_tags_map,
         top_cells=top_cells)
    # Write umis to file
    io.write_to_files(sparse_matrix=umi_results_matrix,
                      top_cells=top_cells,
                      ordered_tags_map=ordered_tags_map,
                      data_type='umi',
                      outfolder=args.outfolder)
    # Write reads to file
    io.write_to_files(sparse_matrix=read_results_matrix,
                      top_cells=top_cells,
                      ordered_tags_map=ordered_tags_map,
                      data_type='read',
                      outfolder=args.outfolder)

    top_unmapped = merged_no_match.most_common(args.unknowns_top)

    with open(os.path.join(args.outfolder, args.unmapped_file),
              'w') as unknown_file:
        unknown_file.write('tag,count\n')
        for element in top_unmapped:
            unknown_file.write('{},{}\n'.format(element[0], element[1]))
    create_report(n_reads=n_reads,
                  reads_per_cell=reads_per_cell,
                  no_match=merged_no_match,
                  version=version,
                  start_time=start_time,
                  ordered_tags_map=ordered_tags_map,
                  umis_corrected=umis_corrected,
                  bcs_corrected=bcs_corrected,
                  bad_cells=aberrant_cells,
                  args=args)
    if args.dense:
        print('Writing dense format output')
        io.write_dense(sparse_matrix=umi_results_matrix,
                       index=list(ordered_tags_map.keys()),
                       columns=top_cells,
                       outfolder=args.outfolder,
                       filename='dense_umis.tsv')
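The per-cell totals above (umis_per_cell, reads_per_cell) are derived from the nested structure that map_reads returns: cell barcode -> TAG -> Counter mapping each UMI to its read count. A toy illustration with made-up barcodes and a hypothetical TAG name:

from collections import Counter

final_results = {
    'ACGTACGTACGTACGT': {
        'CD3_TAG': Counter({'AAACCCGG': 3, 'TTTGGGCC': 1}),  # 2 UMIs, 4 reads
        'unmapped': Counter({'GGGTTTAA': 1}),                # 1 UMI, 1 read
    },
}

for cell_barcode, counts in final_results.items():
    umis = sum(len(counts[tag]) for tag in counts)            # -> 3
    reads = sum(sum(counts[tag].values()) for tag in counts)  # -> 5
    print(cell_barcode, umis, reads)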
Example #4
def main():
    start_time = time.time()
    parser = get_args()
    if not sys.argv[1:]:
        parser.print_help(file=sys.stderr)
        sys.exit(2)

    # Parse arguments.
    args = parser.parse_args()
    if args.whitelist:
        whitelist = preprocessing.parse_whitelist_csv(
            args.whitelist, args.cb_last - args.cb_first + 1)
    else:
        whitelist = None

    # Load TAGs/ABs.
    ab_map = preprocessing.parse_tags_csv(args.tags)
    ab_map = preprocessing.check_tags(ab_map, args.max_error)
    # Get read lengths. So far, there is no validation for Read2.
    read1_length = preprocessing.get_read_length(args.read1_path)
    read2_length = preprocessing.get_read_length(args.read2_path)
    # Check Read1 length against CELL and UMI barcode lengths.
    (barcode_slice, umi_slice,
     barcode_umi_length) = preprocessing.check_barcodes_lengths(
         read1_length, args.cb_first, args.cb_last, args.umi_first,
         args.umi_last)

    if args.first_n:
        n_lines = args.first_n * 4
    else:
        n_lines = preprocessing.get_n_lines(args.read1_path)
    n_reads = int(n_lines / 4)
    n_threads = args.n_threads

    print('Started mapping')
    # Run with one process
    if n_threads <= 1 or n_reads < 1000001:
        print('CITE-seq-Count is running with one core.')
        (final_results, merged_no_match) = processing.map_reads(
            read1_path=args.read1_path,
            read2_path=args.read2_path,
            tags=ab_map,
            barcode_slice=barcode_slice,
            umi_slice=umi_slice,
            indexes=[0, n_reads],
            whitelist=whitelist,
            debug=args.debug,
            start_trim=args.start_trim,
            maximum_distance=args.max_error)
        print('Mapping done')
        umis_per_cell = Counter()
        reads_per_cell = Counter()
        for cell_barcode, counts in final_results.items():
            umis_per_cell[cell_barcode] = sum(
                [len(counts[UMI]) for UMI in counts if UMI != 'unmapped'])
            reads_per_cell[cell_barcode] = sum([
                sum(counts[UMI].values()) for UMI in counts
                if UMI != 'unmapped'
            ])
    else:
        # Run with multiple processes
        print('CITE-seq-Count is running with {} cores.'.format(n_threads))
        p = Pool(processes=n_threads)
        chunk_indexes = preprocessing.chunk_reads(n_reads, n_threads)
        parallel_results = []

        for indexes in chunk_indexes:
            p.apply_async(processing.map_reads,
                          args=(args.read1_path, args.read2_path, ab_map,
                                barcode_slice, umi_slice, indexes, whitelist,
                                args.debug, args.start_trim, args.max_error),
                          callback=parallel_results.append,
                          error_callback=lambda err: print(err, file=sys.stderr))
        p.close()
        p.join()
        print('Mapping done')
        print('Merging results')
        (final_results, umis_per_cell, reads_per_cell,
         merged_no_match) = processing.merge_results(
             parallel_results=parallel_results)
        del parallel_results

    # Correct cell barcodes
    (final_results, umis_per_cell, bcs_corrected) = processing.correct_cells(
        final_results=final_results,
        umis_per_cell=umis_per_cell,
        expected_cells=args.expected_cells,
        collapsing_threshold=args.bc_threshold)

    # Correct umi barcodes
    (final_results, umis_corrected) = processing.correct_umis(
        final_results=final_results, collapsing_threshold=args.umi_threshold)

    ordered_tags_map = OrderedDict()
    for i, tag in enumerate(ab_map.values()):
        ordered_tags_map[tag] = i
    ordered_tags_map['unmapped'] = i + 1

    # Sort cells by number of mapped umis
    if not whitelist:
        top_cells_tuple = umis_per_cell.most_common(args.expected_cells)
        top_cells = set([pair[0] for pair in top_cells_tuple])
    else:
        top_cells = whitelist
        # Add potential missing cell barcodes.
        for missing_cell in whitelist:
            if missing_cell in final_results:
                continue
            else:
                final_results[missing_cell] = dict()
                for TAG in ordered_tags_map:
                    final_results[missing_cell][TAG] = 0
                top_cells.add(missing_cell)

    (umi_results_matrix,
     read_results_matrix) = processing.generate_sparse_matrices(
         final_results=final_results,
         ordered_tags_map=ordered_tags_map,
         top_cells=top_cells)
    io.write_to_files(sparse_matrix=umi_results_matrix,
                      top_cells=top_cells,
                      ordered_tags_map=ordered_tags_map,
                      data_type='umi',
                      outfolder=args.outfolder)
    io.write_to_files(sparse_matrix=read_results_matrix,
                      top_cells=top_cells,
                      ordered_tags_map=ordered_tags_map,
                      data_type='read',
                      outfolder=args.outfolder)

    top_unmapped = merged_no_match.most_common(args.unknowns_top)
    with open(os.path.join(args.outfolder, args.unmapped_file),
              'w') as unknown_file:
        unknown_file.write('tag,count\n')
        for element in top_unmapped:
            unknown_file.write('{},{}\n'.format(element[0], element[1]))
    create_report(n_reads=n_reads,
                  reads_per_cell=reads_per_cell,
                  no_match=merged_no_match,
                  version=version,
                  start_time=start_time,
                  ordered_tags_map=ordered_tags_map,
                  umis_corrected=umis_corrected,
                  bcs_corrected=bcs_corrected,
                  args=args)
    if args.dense:
        print('Writing dense format output')
        io.write_dense(sparse_matrix=umi_results_matrix,
                       index=list(ordered_tags_map.keys()),
                       columns=top_cells,
                       file_path=os.path.join(args.outfolder,
                                              'dense_umis.tsv'))
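io.write_dense above takes the sparse UMI matrix, the TAG names as row index, the selected cell barcodes as columns, and a destination path. A minimal sketch of what such an export could look like, assuming pandas is available; this is an illustration, not the project's io.write_dense:

import pandas as pd

def write_dense_sketch(sparse_matrix, index, columns, file_path):
    # Sketch only: densify the TAG-by-cell matrix and write it as a TSV.
    # `columns` may be a set, so the column order here is arbitrary.
    dense = pd.DataFrame(sparse_matrix.todense(), index=index, columns=list(columns))
    dense.to_csv(file_path, sep='\t')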