This is the third and final script of the python pipeline. It is meant to aggregate the results from each recover_cdr3s worker and summarize the information Run using python -m summary <args> Requirements: Python >3.8.5, samtools, pysam, biopython, pandas """ import argparse from textwrap import dedent, indent import os import tcrgo.io as io from tcrgo.collapse import collapse from tcrgo import Log log = Log('root') def main(args): log.init(args.verbosity) if args.workers == "ALL": worker_range = io.list_cdr3_files(args.input_path) else: worker_range = [int(i) for i in args.workers.strip(':').split(':')] if worker_range[0] < 1: log.error("Please enter positive value for the start of the range.") if len(worker_range) > 1: worker_range = range(worker_range[0], worker_range[-1]+1) else: worker_range = range(1, worker_range[-1]+1) log.info("Aggregating CDR3 info files outputted from the recover_cdr3s script.")
Requirements: Python >3.8.5, samtools, pysam, biopython, pandas """ import argparse import os.path from textwrap import dedent, indent import pysam from typing import List, Dict, Iterator, Set from pathlib import Path from tcrgo.bam import BAMDict, ReferenceDict import tcrgo.io as io from tcrgo import Log log = Log("root") def main(args): log.init(args.verbosity) log.info("Parsing input data...") bam = io.sort_and_index(args.bam, args.output_path) log.info("Reading CDR3 positions file and FASTA file...") cdr3_positions = None if args.cdr3_positions_file is not None: cdr3_positions = io.read_cdr3_positions(args.cdr3_positions_file) log.verbose(f"{cdr3_positions}") entries = io.parse_fasta(args.fasta) refdict = ReferenceDict() refdict.build(entries, cdr3_positions, args.zero_indexed)
"""Alignment and preprocessing via Drop-Seq Tools 2.4.0 and Bowtie2 version 2.4.1""" import tcrgo.dropseq_tools as ds import tcrgo.io as io import argparse from textwrap import dedent, indent from pathlib import Path import os import pysam from tcrgo import Log log = Log(name=__name__) def main(args): """ 0. Raw BAM -> Single-end FASTQ -> Pair-end FASTQs -> Unmapped BAM 1. Unmapped BAM -> aligned and tagged BAM a. Tag cell barcodes b. Tag molecular barcodes c. Trim 5’ primer sequence d. Trim 3’ polyA sequence e. SAM -> Fastq f. STAR alignment g. Sort STAR alignment in queryname order h. Merge STAR alignment tagged SAM to recover cell/molecular barcodes i. Add gene/exon and other annotation tags j. Barcode Repair i. Repair substitution errors (DetectBeadSubstitutionErrors) ii. Repair indel errors (DetectBeadSynthesisErrors) """ log.init(args.verbosity)